line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Bio::Search::SearchUtils - Utility functions for Bio::Search:: objects |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# This module is just a collection of subroutines, not an object. |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 DESCRIPTION |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
The SearchUtils.pm module is a collection of subroutines used |
12
|
|
|
|
|
|
|
primarily by Bio::Search::Hit::HitI objects for some of the additional |
13
|
|
|
|
|
|
|
functionality, such as HSP tiling. Right now, the SearchUtils is just |
14
|
|
|
|
|
|
|
a collection of methods, not an object. |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 AUTHOR |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Steve Chervitz Esac@bioperl.orgE |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Sendu Bala, bix@sendu.me.uk |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=cut |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
package Bio::Search::SearchUtils; |
27
|
29
|
|
|
29
|
|
994
|
use Bio::Root::Version; |
|
29
|
|
|
|
|
44
|
|
|
29
|
|
|
|
|
200
|
|
28
|
|
|
|
|
|
|
|
29
|
29
|
|
|
29
|
|
952
|
use strict; |
|
29
|
|
|
|
|
55
|
|
|
29
|
|
|
|
|
81885
|
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head2 tile_hsps |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Usage : tile_hsps( $sbjct ); |
34
|
|
|
|
|
|
|
: This is called automatically by methods in Bio::Search::Hit::GenericHit |
35
|
|
|
|
|
|
|
: that rely on having tiled data. |
36
|
|
|
|
|
|
|
: |
37
|
|
|
|
|
|
|
: If you are interested in getting data about the constructed HSP contigs: |
38
|
|
|
|
|
|
|
: my ($qcontigs, $scontigs) = Bio::Search::SearchUtils::tile_hsps($hit); |
39
|
|
|
|
|
|
|
: if (ref $qcontigs) { |
40
|
|
|
|
|
|
|
: print STDERR "Query contigs:\n"; |
41
|
|
|
|
|
|
|
: foreach (@{$qcontigs}) { |
42
|
|
|
|
|
|
|
: print "contig start is $_->{'start'}\n"; |
43
|
|
|
|
|
|
|
: print "contig stop is $_->{'stop'}\n"; |
44
|
|
|
|
|
|
|
: } |
45
|
|
|
|
|
|
|
: } |
46
|
|
|
|
|
|
|
: See below for more information about the contig data structure. |
47
|
|
|
|
|
|
|
: |
48
|
|
|
|
|
|
|
Purpose : Collect statistics about the aligned sequences in a set of HSPs. |
49
|
|
|
|
|
|
|
: Calculates the following data across all HSPs: |
50
|
|
|
|
|
|
|
: -- total alignment length |
51
|
|
|
|
|
|
|
: -- total identical residues |
52
|
|
|
|
|
|
|
: -- total conserved residues |
53
|
|
|
|
|
|
|
Returns : If there was only a single HSP (so no tiling was necessary) |
54
|
|
|
|
|
|
|
tile_hsps() returns a list of two non-zero integers. |
55
|
|
|
|
|
|
|
If there were multiple HSP, |
56
|
|
|
|
|
|
|
tile_hsps() returns a list of two array references containin HSP contig data. |
57
|
|
|
|
|
|
|
The first array ref contains a list of HSP contigs on the query sequence. |
58
|
|
|
|
|
|
|
The second array ref contains a list of HSP contigs on the subject sequence. |
59
|
|
|
|
|
|
|
Each contig is a hash reference with the following data fields: |
60
|
|
|
|
|
|
|
'start' => start coordinate of the contig |
61
|
|
|
|
|
|
|
'stop' => start coordinate of the contig |
62
|
|
|
|
|
|
|
'iden' => number of identical residues in the contig |
63
|
|
|
|
|
|
|
'cons' => number of conserved residues in the contig |
64
|
|
|
|
|
|
|
'strand'=> strand of the contig |
65
|
|
|
|
|
|
|
'frame' => frame of the contig |
66
|
|
|
|
|
|
|
Argument : A Bio::Search::Hit::HitI object |
67
|
|
|
|
|
|
|
Throws : n/a |
68
|
|
|
|
|
|
|
Comments : |
69
|
|
|
|
|
|
|
: This method performs more careful summing of data across |
70
|
|
|
|
|
|
|
: all HSPs in the Sbjct object. Only HSPs that are in the same strand |
71
|
|
|
|
|
|
|
: and frame are tiled. Simply summing the data from all HSPs |
72
|
|
|
|
|
|
|
: in the same strand and frame will overestimate the actual |
73
|
|
|
|
|
|
|
: length of the alignment if there is overlap between different HSPs |
74
|
|
|
|
|
|
|
: (often the case). |
75
|
|
|
|
|
|
|
: |
76
|
|
|
|
|
|
|
: The strategy is to tile the HSPs and sum over the |
77
|
|
|
|
|
|
|
: contigs, collecting data separately from overlapping and |
78
|
|
|
|
|
|
|
: non-overlapping regions of each HSP. To facilitate this, the |
79
|
|
|
|
|
|
|
: HSP.pm object now permits extraction of data from sub-sections |
80
|
|
|
|
|
|
|
: of an HSP. |
81
|
|
|
|
|
|
|
: |
82
|
|
|
|
|
|
|
: Additional useful information is collected from the results |
83
|
|
|
|
|
|
|
: of the tiling. It is possible that sub-sequences in |
84
|
|
|
|
|
|
|
: different HSPs will overlap significantly. In this case, it |
85
|
|
|
|
|
|
|
: is impossible to create a single unambiguous alignment by |
86
|
|
|
|
|
|
|
: concatenating the HSPs. The ambiguity may indicate the |
87
|
|
|
|
|
|
|
: presence of multiple, similar domains in one or both of the |
88
|
|
|
|
|
|
|
: aligned sequences. This ambiguity is recorded using the |
89
|
|
|
|
|
|
|
: ambiguous_aln() method. |
90
|
|
|
|
|
|
|
: |
91
|
|
|
|
|
|
|
: This method does not attempt to discern biologically |
92
|
|
|
|
|
|
|
: significant vs. insignificant overlaps. The allowable amount of |
93
|
|
|
|
|
|
|
: overlap can be set with the overlap() method or with the -OVERLAP |
94
|
|
|
|
|
|
|
: parameter used when constructing the Hit object. |
95
|
|
|
|
|
|
|
: |
96
|
|
|
|
|
|
|
: For a given hit, both the query and the sbjct sequences are |
97
|
|
|
|
|
|
|
: tiled independently. |
98
|
|
|
|
|
|
|
: |
99
|
|
|
|
|
|
|
: -- If only query sequence HSPs overlap, |
100
|
|
|
|
|
|
|
: this may suggest multiple domains in the sbjct. |
101
|
|
|
|
|
|
|
: -- If only sbjct sequence HSPs overlap, |
102
|
|
|
|
|
|
|
: this may suggest multiple domains in the query. |
103
|
|
|
|
|
|
|
: -- If both query & sbjct sequence HSPs overlap, |
104
|
|
|
|
|
|
|
: this suggests multiple domains in both. |
105
|
|
|
|
|
|
|
: -- If neither query & sbjct sequence HSPs overlap, |
106
|
|
|
|
|
|
|
: this suggests either no multiple domains in either |
107
|
|
|
|
|
|
|
: sequence OR that both sequences have the same |
108
|
|
|
|
|
|
|
: distribution of multiple similar domains. |
109
|
|
|
|
|
|
|
: |
110
|
|
|
|
|
|
|
: This method can deal with the special case of when multiple |
111
|
|
|
|
|
|
|
: HSPs exactly overlap. |
112
|
|
|
|
|
|
|
: |
113
|
|
|
|
|
|
|
: Efficiency concerns: |
114
|
|
|
|
|
|
|
: Speed will be an issue for sequences with numerous HSPs. |
115
|
|
|
|
|
|
|
: |
116
|
|
|
|
|
|
|
Bugs : Currently, tile_hsps() does not properly account for |
117
|
|
|
|
|
|
|
: the number of non-tiled but overlapping HSPs, which becomes a problem |
118
|
|
|
|
|
|
|
: as overlap() grows. Large values overlap() may thus lead to |
119
|
|
|
|
|
|
|
: incorrect statistics for some hits. For best results, keep overlap() |
120
|
|
|
|
|
|
|
: below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and |
121
|
|
|
|
|
|
|
: Ambiguous Alignments" section in L. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
See Also : L<_adjust_contigs>(), L |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=cut |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
#-------------- |
128
|
|
|
|
|
|
|
sub tile_hsps { |
129
|
|
|
|
|
|
|
#-------------- |
130
|
48
|
|
|
48
|
1
|
69
|
my $sbjct = shift; |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
#print STDERR "Calling tile_hsps(): $sbjct\n"; |
133
|
|
|
|
|
|
|
#$sbjct->verbose(1); # to activate debugging |
134
|
48
|
|
|
|
|
119
|
$sbjct->tiled_hsps(1); |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# changed to not rely on n() (which is unreliable here) --cjfields 4/6/10 |
137
|
48
|
50
|
|
|
|
131
|
if( $sbjct->num_hsps == 0) { |
|
|
100
|
|
|
|
|
|
138
|
|
|
|
|
|
|
#print STDERR "_tile_hsps(): no hsps, nothing to tile! (", $sbjct->num_hsps, ")\n"; |
139
|
0
|
|
|
|
|
0
|
_warn_about_no_hsps($sbjct); |
140
|
0
|
|
|
|
|
0
|
return (undef, undef); |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
} elsif($sbjct->num_hsps == 1) { |
143
|
|
|
|
|
|
|
## Simple summation scheme. Valid if there is only one HSP. |
144
|
|
|
|
|
|
|
#print STDERR "_tile_hsps(): single HSP, easy stats.\n"; |
145
|
38
|
|
|
|
|
103
|
my $hsp = $sbjct->hsp; |
146
|
38
|
|
|
|
|
141
|
$sbjct->length_aln('query', $hsp->length('query')); |
147
|
38
|
|
|
|
|
96
|
$sbjct->length_aln('hit', $hsp->length('sbjct')); |
148
|
38
|
|
|
|
|
93
|
$sbjct->length_aln('total', $hsp->length('total')); |
149
|
38
|
|
|
|
|
138
|
$sbjct->matches( $hsp->matches() ); |
150
|
38
|
|
|
|
|
118
|
$sbjct->gaps('query', $hsp->gaps('query')); |
151
|
38
|
|
|
|
|
81
|
$sbjct->gaps('sbjct', $hsp->gaps('sbjct')); |
152
|
|
|
|
|
|
|
|
153
|
38
|
|
|
|
|
107
|
_adjust_length_aln($sbjct); |
154
|
38
|
|
|
|
|
72
|
return (1, 1); |
155
|
|
|
|
|
|
|
} else { |
156
|
|
|
|
|
|
|
#print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n"; |
157
|
10
|
|
|
|
|
41
|
$sbjct->length_aln('query', 0); |
158
|
10
|
|
|
|
|
29
|
$sbjct->length_aln('sbjct', 0); |
159
|
10
|
|
|
|
|
33
|
$sbjct->length_aln('total', 0); |
160
|
10
|
|
|
|
|
35
|
$sbjct->matches( 0, 0); |
161
|
10
|
|
|
|
|
50
|
$sbjct->gaps('query', 0); |
162
|
10
|
|
|
|
|
24
|
$sbjct->gaps('hit', 0); |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
## More than one HSP. Must tile HSPs. |
166
|
|
|
|
|
|
|
# print "\nTiling HSPs for $sbjct\n"; |
167
|
10
|
|
|
|
|
41
|
my($hsp, $qstart, $qstop, $sstart, $sstop); |
168
|
10
|
|
|
|
|
0
|
my($frame, $strand, $qstrand, $sstrand); |
169
|
10
|
|
|
|
|
0
|
my(@qcontigs, @scontigs); |
170
|
10
|
|
|
|
|
16
|
my $qoverlap = 0; |
171
|
10
|
|
|
|
|
16
|
my $soverlap = 0; |
172
|
10
|
|
|
|
|
46
|
my $max_overlap = $sbjct->overlap; |
173
|
10
|
|
|
|
|
14
|
my $hit_qgaps = 0; |
174
|
10
|
|
|
|
|
14
|
my $hit_sgaps = 0; |
175
|
10
|
|
|
|
|
16
|
my $hit_len_aln = 0; |
176
|
10
|
|
|
|
|
14
|
my %start_stop; |
177
|
10
|
|
|
|
|
28
|
my $v = $sbjct->verbose; |
178
|
10
|
|
|
|
|
35
|
foreach $hsp ( $sbjct->hsps() ) { |
179
|
|
|
|
|
|
|
#$sbjct->debug( sprintf(" HSP: %s %d..%d\n",$hsp->query->seq_id, $hsp->query->start, $hsp->hit->end)) if $v > 0; #$hsp->str('query'); |
180
|
|
|
|
|
|
|
# printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), |
181
|
|
|
|
|
|
|
# $hsp->length(-TYPE=>'cons'), |
182
|
|
|
|
|
|
|
# $hsp->length(-TYPE=>'cons', |
183
|
|
|
|
|
|
|
# -START=>0,-STOP=>10); |
184
|
|
|
|
|
|
|
|
185
|
53
|
|
|
|
|
151
|
($qstart, $qstop) = $hsp->range('query'); |
186
|
53
|
|
|
|
|
117
|
($sstart, $sstop) = $hsp->range('sbjct'); |
187
|
53
|
|
|
|
|
140
|
$frame = $hsp->frame('hit'); |
188
|
53
|
50
|
|
|
|
96
|
$frame = -1 unless defined $frame; |
189
|
|
|
|
|
|
|
|
190
|
53
|
|
|
|
|
107
|
($qstrand, $sstrand) = ($hsp->query->strand, |
191
|
|
|
|
|
|
|
$hsp->hit->strand); |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Note: No correction for overlap. |
194
|
|
|
|
|
|
|
|
195
|
53
|
|
|
|
|
134
|
my ($qgaps, $sgaps) = ($hsp->gaps('query'), $hsp->gaps('hit')); |
196
|
53
|
|
|
|
|
82
|
$hit_qgaps += $qgaps; |
197
|
53
|
|
|
|
|
58
|
$hit_sgaps += $sgaps; |
198
|
53
|
|
|
|
|
113
|
$hit_len_aln += $hsp->length; |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
## Collect contigs in the query sequence. |
201
|
53
|
|
|
|
|
143
|
$qoverlap += &_adjust_contigs('query', $hsp, $qstart, $qstop, |
202
|
|
|
|
|
|
|
\@qcontigs, $max_overlap, $frame, |
203
|
|
|
|
|
|
|
$qstrand); |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
## Collect contigs in the sbjct sequence |
206
|
|
|
|
|
|
|
# (needed for domain data and gapped Blast). |
207
|
53
|
|
|
|
|
119
|
$soverlap += &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, |
208
|
|
|
|
|
|
|
\@scontigs, $max_overlap, $frame, |
209
|
|
|
|
|
|
|
$sstrand); |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
## Collect overall start and stop data for query and |
212
|
|
|
|
|
|
|
# sbjct over all HSPs. |
213
|
53
|
100
|
|
|
|
109
|
unless ( defined $start_stop{'qstart'} ) { |
214
|
10
|
|
|
|
|
25
|
$start_stop{'qstart'} = $qstart; |
215
|
10
|
|
|
|
|
20
|
$start_stop{'qstop'} = $qstop; |
216
|
10
|
|
|
|
|
23
|
$start_stop{'sstart'} = $sstart; |
217
|
10
|
|
|
|
|
24
|
$start_stop{'sstop'} = $sstop; |
218
|
|
|
|
|
|
|
} else { |
219
|
|
|
|
|
|
|
$start_stop{'qstart'} = ($qstart < $start_stop{'qstart'} ? |
220
|
43
|
100
|
|
|
|
87
|
$qstart : $start_stop{'qstart'} ); |
221
|
|
|
|
|
|
|
$start_stop{'qstop'} = ($qstop > $start_stop{'qstop'} ? |
222
|
43
|
100
|
|
|
|
89
|
$qstop : $start_stop{'qstop'} ); |
223
|
|
|
|
|
|
|
$start_stop{'sstart'} = ($sstart < $start_stop{'sstart'} ? |
224
|
43
|
100
|
|
|
|
88
|
$sstart : $start_stop{'sstart'} ); |
225
|
|
|
|
|
|
|
$start_stop{'sstop'} = ($sstop > $start_stop{'sstop'} ? |
226
|
43
|
100
|
|
|
|
106
|
$sstop : $start_stop{'sstop'} ); |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# Store the collected data in the Hit object |
231
|
10
|
|
|
|
|
45
|
$sbjct->gaps('query', $hit_qgaps); |
232
|
10
|
|
|
|
|
30
|
$sbjct->gaps('hit', $hit_sgaps); |
233
|
10
|
|
|
|
|
37
|
$sbjct->length_aln('total', $hit_len_aln); |
234
|
|
|
|
|
|
|
|
235
|
10
|
|
|
|
|
54
|
$sbjct->start('query',$start_stop{'qstart'}); |
236
|
10
|
|
|
|
|
46
|
$sbjct->end('query', $start_stop{'qstop'}); |
237
|
10
|
|
|
|
|
28
|
$sbjct->start('hit', $start_stop{'sstart'}); |
238
|
10
|
|
|
|
|
27
|
$sbjct->end('hit', $start_stop{'sstop'}); |
239
|
|
|
|
|
|
|
## Collect data across the collected contigs. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
#$sbjct->debug( "\nQUERY CONTIGS:\n"." gaps = $sbjct->{'_gaps_query'}\n"); |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# Account for strand/frame. |
244
|
|
|
|
|
|
|
# Strategy: collect data on a per strand+frame basis and |
245
|
|
|
|
|
|
|
# save the most significant one. |
246
|
10
|
|
|
|
|
16
|
my (%qctg_dat); |
247
|
10
|
|
|
|
|
27
|
foreach (@qcontigs) { |
248
|
21
|
|
|
|
|
47
|
($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); |
249
|
|
|
|
|
|
|
|
250
|
21
|
50
|
|
|
|
37
|
if( $v > 0 ) { |
251
|
|
|
|
|
|
|
#$sbjct->debug(sprintf( "$frame/$strand len is getting %d for %d..%d\n", |
252
|
|
|
|
|
|
|
# ($_->{'stop'} - $_->{'start'} + 1), $_->{'start'}, $_->{'stop'})); |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
21
|
|
|
|
|
68
|
$qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1; |
256
|
21
|
|
|
|
|
46
|
$qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'}; |
257
|
21
|
|
|
|
|
42
|
$qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'}; |
258
|
21
|
|
|
|
|
51
|
$qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Find longest contig. |
262
|
10
|
|
|
|
|
51
|
my @sortedkeys = sort { $qctg_dat{$b}->{'length_aln_query'} |
263
|
1
|
|
|
|
|
7
|
<=> $qctg_dat{$a}->{'length_aln_query'} } |
264
|
|
|
|
|
|
|
keys %qctg_dat; |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Save the largest to the sbjct: |
267
|
10
|
|
|
|
|
33
|
my $longest = $sortedkeys[0]; |
268
|
|
|
|
|
|
|
#$sbjct->debug( "longest is ". $qctg_dat{ $longest }->{'length_aln_query'}. "\n"); |
269
|
10
|
|
|
|
|
36
|
$sbjct->length_aln('query', $qctg_dat{ $longest }->{'length_aln_query'}); |
270
|
|
|
|
|
|
|
$sbjct->matches($qctg_dat{ $longest }->{'totalIdentical'}, |
271
|
10
|
|
|
|
|
36
|
$qctg_dat{ $longest }->{'totalConserved'}); |
272
|
10
|
|
|
|
|
57
|
$sbjct->strand('query', $qctg_dat{ $longest }->{'qstrand'}); |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
## Collect data for sbjct contigs. Important for gapped Blast. |
275
|
|
|
|
|
|
|
## The totalIdentical and totalConserved numbers will be the same |
276
|
|
|
|
|
|
|
## as determined for the query contigs. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
#$sbjct->debug( "\nSBJCT CONTIGS:\n"." gaps = ". $sbjct->gaps('sbjct'). "\n"); |
279
|
10
|
|
|
|
|
12
|
my (%sctg_dat); |
280
|
10
|
|
|
|
|
21
|
foreach(@scontigs) { |
281
|
|
|
|
|
|
|
#$sbjct->debug(" sbjct contig: $_->{'start'} - $_->{'stop'}\n". |
282
|
|
|
|
|
|
|
# " iden = $_->{'iden'}; cons = $_->{'cons'}\n"); |
283
|
35
|
|
|
|
|
64
|
($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); |
284
|
35
|
|
|
|
|
76
|
$sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1; |
285
|
35
|
|
|
|
|
52
|
$sctg_dat{ "$frame$strand" }->{'frame'} = $frame; |
286
|
35
|
|
|
|
|
56
|
$sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
10
|
|
|
|
|
35
|
@sortedkeys = sort { $sctg_dat{ $b }->{'length_aln_sbjct'} |
290
|
2
|
|
|
|
|
9
|
<=> $sctg_dat{ $a }->{'length_aln_sbjct'} |
291
|
|
|
|
|
|
|
} keys %sctg_dat; |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Save the largest to the sbjct: |
294
|
10
|
|
|
|
|
20
|
$longest = $sortedkeys[0]; |
295
|
|
|
|
|
|
|
|
296
|
10
|
|
|
|
|
27
|
$sbjct->length_aln('sbjct', $sctg_dat{ $longest }->{'length_aln_sbjct'}); |
297
|
10
|
|
|
|
|
40
|
$sbjct->frame( $sctg_dat{ $longest }->{'frame'} ); |
298
|
10
|
|
|
|
|
26
|
$sbjct->strand('hit', $sctg_dat{ $longest }->{'sstrand'}); |
299
|
|
|
|
|
|
|
|
300
|
10
|
100
|
|
|
|
27
|
if($qoverlap) { |
|
|
100
|
|
|
|
|
|
301
|
7
|
100
|
|
|
|
15
|
if($soverlap) { $sbjct->ambiguous_aln('qs'); |
|
5
|
|
|
|
|
23
|
|
302
|
|
|
|
|
|
|
#$sbjct->debug("\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n"); |
303
|
|
|
|
|
|
|
} |
304
|
2
|
|
|
|
|
13
|
else { $sbjct->ambiguous_aln('q'); |
305
|
|
|
|
|
|
|
#$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Query\n\n"); |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
} elsif($soverlap) { |
308
|
2
|
|
|
|
|
8
|
$sbjct->ambiguous_aln('s'); |
309
|
|
|
|
|
|
|
#$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n"); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
|
312
|
10
|
|
|
|
|
26
|
_adjust_length_aln($sbjct); |
313
|
|
|
|
|
|
|
|
314
|
10
|
|
|
|
|
80
|
return ( [@qcontigs], [@scontigs] ); |
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# Title : _adjust_length_aln |
320
|
|
|
|
|
|
|
# Usage : n/a; internal use only; called by tile_hsps. |
321
|
|
|
|
|
|
|
# Purpose : Adjust length of aligment based on BLAST flavor. |
322
|
|
|
|
|
|
|
# Comments : See comments in logica_length() |
323
|
|
|
|
|
|
|
sub _adjust_length_aln { |
324
|
48
|
|
|
48
|
|
67
|
my $sbjct = shift; |
325
|
48
|
|
|
|
|
106
|
my $algo = $sbjct->algorithm; |
326
|
48
|
|
|
|
|
100
|
my $hlen = $sbjct->length_aln('sbjct'); |
327
|
48
|
|
|
|
|
93
|
my $qlen = $sbjct->length_aln('query'); |
328
|
|
|
|
|
|
|
|
329
|
48
|
|
|
|
|
107
|
$sbjct->length_aln('sbjct', logical_length($algo, 'sbjct', $hlen)); |
330
|
48
|
|
|
|
|
179
|
$sbjct->length_aln('query', logical_length($algo, 'query', $qlen)); |
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=head2 logical_length |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Usage : logical_length( $alg_name, $seq_type, $length ); |
336
|
|
|
|
|
|
|
Purpose : Determine the logical length of an aligned sequence based on |
337
|
|
|
|
|
|
|
: algorithm name and sequence type. |
338
|
|
|
|
|
|
|
Returns : integer representing the logical aligned length. |
339
|
|
|
|
|
|
|
Argument : $alg_name = name of algorigthm (e.g., blastx, tblastn) |
340
|
|
|
|
|
|
|
: $seq_type = type of sequence (e.g., query or hit) |
341
|
|
|
|
|
|
|
: $length = physical length of the sequence in the alignment. |
342
|
|
|
|
|
|
|
Throws : n/a |
343
|
|
|
|
|
|
|
Comments : This function is used to account for the fact that number of identities |
344
|
|
|
|
|
|
|
and conserved residues is reported in peptide space while the query |
345
|
|
|
|
|
|
|
length (in the case of BLASTX and TBLASTX) and/or the hit length |
346
|
|
|
|
|
|
|
(in the case of TBLASTN and TBLASTX) are in nucleotide space. |
347
|
|
|
|
|
|
|
The adjustment affects the values reported by the various frac_XXX |
348
|
|
|
|
|
|
|
methods in GenericHit and GenericHSP. |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
sub logical_length { |
353
|
128
|
|
|
128
|
1
|
233
|
my ($algo, $type, $len) = @_; |
354
|
128
|
|
|
|
|
148
|
my $logical = $len; |
355
|
128
|
50
|
|
|
|
575
|
if($algo =~ /^(?:PSI)?T(?:BLASTN|FAST(?:X|Y|XY))/oi ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
356
|
0
|
0
|
|
|
|
0
|
$logical = $len/3 if $type =~ /sbjct|hit|tot/i; |
357
|
|
|
|
|
|
|
} elsif($algo =~ /^(?:BLASTX|FAST(?:X|Y|XY))/oi ) { |
358
|
4
|
100
|
|
|
|
23
|
$logical = $len/3 if $type =~ /query|tot/i; |
359
|
|
|
|
|
|
|
} elsif($algo =~ /^TBLASTX/oi ) { |
360
|
40
|
|
|
|
|
52
|
$logical = $len/3; |
361
|
|
|
|
|
|
|
} |
362
|
128
|
|
|
|
|
288
|
return $logical; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
#=head2 _adjust_contigs |
367
|
|
|
|
|
|
|
# |
368
|
|
|
|
|
|
|
# Usage : n/a; internal function called by tile_hsps |
369
|
|
|
|
|
|
|
# Purpose : Builds HSP contigs for a given BLAST hit. |
370
|
|
|
|
|
|
|
# : Utility method called by _tile_hsps() |
371
|
|
|
|
|
|
|
# Returns : |
372
|
|
|
|
|
|
|
# Argument : |
373
|
|
|
|
|
|
|
# Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches() |
374
|
|
|
|
|
|
|
# : for invalid sub-sequence ranges. |
375
|
|
|
|
|
|
|
# Status : Experimental |
376
|
|
|
|
|
|
|
# Comments : This method supports gapped alignments through a patch by maj |
377
|
|
|
|
|
|
|
# : to B:S:HSP:HSPI::matches(). |
378
|
|
|
|
|
|
|
# : It does not keep track of the number of HSPs that |
379
|
|
|
|
|
|
|
# : overlap within the amount specified by overlap(). |
380
|
|
|
|
|
|
|
# : This will lead to significant tracking errors for large |
381
|
|
|
|
|
|
|
# : overlap values. |
382
|
|
|
|
|
|
|
# |
383
|
|
|
|
|
|
|
#See Also : L(), L |
384
|
|
|
|
|
|
|
# |
385
|
|
|
|
|
|
|
#=cut |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
sub _adjust_contigs { |
388
|
106
|
|
|
106
|
|
235
|
my ($seqType, $hsp, $start, $stop, $contigs_ref, |
389
|
|
|
|
|
|
|
$max_overlap, $frame, $strand) = @_; |
390
|
106
|
|
|
|
|
109
|
my $overlap = 0; |
391
|
106
|
|
|
|
|
120
|
my ($numID, $numCons); |
392
|
|
|
|
|
|
|
|
393
|
106
|
|
|
|
|
170
|
foreach (@$contigs_ref) { |
394
|
|
|
|
|
|
|
# Don't merge things unless they have matching strand/frame. |
395
|
211
|
100
|
100
|
|
|
875
|
next unless ($_->{'frame'} == $frame && $_->{'strand'} == $strand); |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
# Test special case of a nested HSP. Skip it. |
398
|
189
|
100
|
100
|
|
|
428
|
if ($start >= $_->{'start'} && $stop <= $_->{'stop'}) { |
399
|
18
|
|
|
|
|
28
|
$overlap = 1; |
400
|
18
|
|
|
|
|
30
|
next; |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# Test for overlap at beginning of contig, or precedes consecutively |
404
|
171
|
100
|
100
|
|
|
332
|
if ($start < $_->{'start'} && $stop >= ($_->{'start'} + $max_overlap - 1)) { |
405
|
12
|
|
|
|
|
21
|
eval { |
406
|
|
|
|
|
|
|
($numID, $numCons) = $hsp->matches(-SEQ =>$seqType, |
407
|
|
|
|
|
|
|
-START => $start, |
408
|
12
|
|
|
|
|
50
|
-STOP => $_->{'start'} - 1); |
409
|
12
|
50
|
|
|
|
36
|
if ($numID eq '') { |
410
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0"); |
411
|
0
|
|
|
|
|
0
|
$numID = 0; |
412
|
|
|
|
|
|
|
} |
413
|
12
|
50
|
|
|
|
29
|
if ($numCons eq '') { |
414
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0"); |
415
|
0
|
|
|
|
|
0
|
$numCons = 0; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
}; |
418
|
12
|
50
|
|
|
|
21
|
if($@) { warn "\a\n$@\n"; } |
|
0
|
|
|
|
|
0
|
|
419
|
|
|
|
|
|
|
else { |
420
|
12
|
|
|
|
|
22
|
$_->{'start'} = $start; # Assign a new start coordinate to the contig |
421
|
12
|
|
|
|
|
29
|
$_->{'iden'} += $numID; # and add new data to #identical, #conserved. |
422
|
12
|
|
|
|
|
19
|
$_->{'cons'} += $numCons; |
423
|
12
|
|
|
|
|
14
|
push(@{$_->{hsps}}, $hsp); |
|
12
|
|
|
|
|
32
|
|
424
|
12
|
|
|
|
|
18
|
$overlap = 1; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# Test for overlap at end of contig, or follows consecutively |
429
|
171
|
100
|
100
|
|
|
415
|
if ($stop > $_->{'stop'} and $start <= ($_->{'stop'} - $max_overlap + 1)) { |
430
|
17
|
|
|
|
|
30
|
eval { |
431
|
|
|
|
|
|
|
($numID,$numCons) = $hsp->matches(-SEQ =>$seqType, |
432
|
17
|
|
|
|
|
68
|
-START => $_->{'stop'} + 1, |
433
|
|
|
|
|
|
|
-STOP => $stop); |
434
|
17
|
50
|
|
|
|
47
|
if ($numID eq '') { |
435
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0"); |
436
|
0
|
|
|
|
|
0
|
$numID = 0; |
437
|
|
|
|
|
|
|
} |
438
|
17
|
50
|
|
|
|
631
|
if ($numCons eq '') { |
439
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0"); |
440
|
0
|
|
|
|
|
0
|
$numCons = 0; |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
}; |
443
|
17
|
50
|
|
|
|
41
|
if($@) { warn "\a\n$@\n"; } |
|
0
|
|
|
|
|
0
|
|
444
|
|
|
|
|
|
|
else { |
445
|
17
|
|
|
|
|
29
|
$_->{'stop'} = $stop; # Assign a new stop coordinate to the contig |
446
|
17
|
|
|
|
|
34
|
$_->{'iden'} += $numID; # and add new data to #identical, #conserved. |
447
|
17
|
|
|
|
|
30
|
$_->{'cons'} += $numCons; |
448
|
17
|
|
|
|
|
23
|
push(@{$_->{hsps}}, $hsp); |
|
17
|
|
|
|
|
46
|
|
449
|
17
|
|
|
|
|
27
|
$overlap = 1; |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
|
453
|
171
|
100
|
|
|
|
254
|
last if $overlap; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
|
456
|
106
|
100
|
100
|
|
|
352
|
if ($overlap && @$contigs_ref > 1) { |
|
|
100
|
|
|
|
|
|
457
|
|
|
|
|
|
|
## Merge any contigs that now overlap |
458
|
20
|
|
|
|
|
24
|
my $max = $#{$contigs_ref}; |
|
20
|
|
|
|
|
34
|
|
459
|
20
|
|
|
|
|
47
|
for my $i (0..$max) { |
460
|
50
|
100
|
|
|
|
53
|
${$contigs_ref}[$i] || next; |
|
50
|
|
|
|
|
88
|
|
461
|
44
|
|
|
|
|
47
|
my ($i_start, $i_stop) = (${$contigs_ref}[$i]->{start}, ${$contigs_ref}[$i]->{stop}); |
|
44
|
|
|
|
|
51
|
|
|
44
|
|
|
|
|
71
|
|
462
|
|
|
|
|
|
|
|
463
|
44
|
|
|
|
|
71
|
for my $u ($i+1..$max) { |
464
|
55
|
100
|
|
|
|
54
|
${$contigs_ref}[$u] || next; |
|
55
|
|
|
|
|
84
|
|
465
|
52
|
|
|
|
|
49
|
my ($u_start, $u_stop) = (${$contigs_ref}[$u]->{start}, ${$contigs_ref}[$u]->{stop}); |
|
52
|
|
|
|
|
80
|
|
|
52
|
|
|
|
|
73
|
|
466
|
|
|
|
|
|
|
|
467
|
52
|
100
|
100
|
|
|
259
|
if ($u_start < $i_start && $u_stop >= ($i_start + $max_overlap - 1)) { |
|
|
100
|
100
|
|
|
|
|
|
|
100
|
100
|
|
|
|
|
468
|
|
|
|
|
|
|
# find the hsps within the contig that have sequence |
469
|
|
|
|
|
|
|
# extending before $i_start |
470
|
2
|
|
|
|
|
5
|
my ($ids, $cons) = (0, 0); |
471
|
2
|
|
|
|
|
3
|
my $use_start = $i_start; |
472
|
2
|
|
|
|
|
5
|
foreach my $hsp (sort { $b->end($seqType) <=> $a->end($seqType) } @{${$contigs_ref}[$u]->{hsps}}) { |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
13
|
|
473
|
4
|
|
|
|
|
12
|
my $hsp_start = $hsp->start($seqType); |
474
|
4
|
50
|
|
|
|
10
|
$hsp_start < $use_start || next; |
475
|
|
|
|
|
|
|
|
476
|
4
|
|
|
|
|
11
|
my ($these_ids, $these_cons); |
477
|
4
|
|
|
|
|
4
|
eval { |
478
|
4
|
|
|
|
|
12
|
($these_ids, $these_cons) = $hsp->matches(-SEQ => $seqType, -START => $hsp_start, -STOP => $use_start - 1); |
479
|
4
|
50
|
|
|
|
11
|
if ($these_ids eq '') { |
480
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0"); |
481
|
0
|
|
|
|
|
0
|
$these_ids = 0; |
482
|
|
|
|
|
|
|
} |
483
|
4
|
50
|
|
|
|
126
|
if ($these_cons eq '') { |
484
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0"); |
485
|
0
|
|
|
|
|
0
|
$these_cons = 0; |
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
}; |
488
|
4
|
50
|
|
|
|
8
|
if($@) { warn "\a\n$@\n"; } |
|
0
|
|
|
|
|
0
|
|
489
|
|
|
|
|
|
|
else { |
490
|
4
|
|
|
|
|
5
|
$ids += $these_ids; |
491
|
4
|
|
|
|
|
6
|
$cons += $these_cons; |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
|
494
|
4
|
100
|
|
|
|
10
|
last if $hsp_start == $u_start; |
495
|
2
|
|
|
|
|
3
|
$use_start = $hsp_start; |
496
|
|
|
|
|
|
|
} |
497
|
2
|
|
|
|
|
4
|
${$contigs_ref}[$i]->{start} = $u_start; |
|
2
|
|
|
|
|
4
|
|
498
|
2
|
|
|
|
|
24
|
${$contigs_ref}[$i]->{'iden'} += $ids; |
|
2
|
|
|
|
|
5
|
|
499
|
2
|
|
|
|
|
3
|
${$contigs_ref}[$i]->{'cons'} += $cons; |
|
2
|
|
|
|
|
3
|
|
500
|
2
|
|
|
|
|
4
|
push(@{${$contigs_ref}[$i]->{hsps}}, @{${$contigs_ref}[$u]->{hsps}}); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
6
|
|
501
|
|
|
|
|
|
|
|
502
|
2
|
|
|
|
|
2
|
${$contigs_ref}[$u] = undef; |
|
2
|
|
|
|
|
10
|
|
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
elsif ($u_stop > $i_stop && $u_start <= ($i_stop - $max_overlap + 1)) { |
505
|
|
|
|
|
|
|
# find the hsps within the contig that have sequence |
506
|
|
|
|
|
|
|
# extending beyond $i_stop |
507
|
3
|
|
|
|
|
7
|
my ($ids, $cons) = (0, 0); |
508
|
3
|
|
|
|
|
7
|
my $use_stop = $i_stop; |
509
|
3
|
|
|
|
|
5
|
foreach my $hsp (sort { $a->start($seqType) <=> $b->start($seqType) } @{${$contigs_ref}[$u]->{hsps}}) { |
|
1
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
16
|
|
510
|
4
|
|
|
|
|
18
|
my $hsp_end = $hsp->end($seqType); |
511
|
4
|
50
|
|
|
|
10
|
$hsp_end > $use_stop || next; |
512
|
|
|
|
|
|
|
|
513
|
4
|
|
|
|
|
7
|
my ($these_ids, $these_cons); |
514
|
4
|
|
|
|
|
6
|
eval { |
515
|
4
|
|
|
|
|
15
|
($these_ids, $these_cons) = $hsp->matches(-SEQ => $seqType, -START => $use_stop + 1, -STOP => $hsp_end); |
516
|
4
|
50
|
|
|
|
13
|
if ($these_ids eq '') { |
517
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0"); |
518
|
0
|
|
|
|
|
0
|
$these_ids = 0; |
519
|
|
|
|
|
|
|
} |
520
|
4
|
50
|
|
|
|
13
|
if ($these_cons eq '') { |
521
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0"); |
522
|
0
|
|
|
|
|
0
|
$these_cons = 0; |
523
|
|
|
|
|
|
|
} |
524
|
|
|
|
|
|
|
}; |
525
|
4
|
50
|
|
|
|
8
|
if($@) { warn "\a\n$@\n"; } |
|
0
|
|
|
|
|
0
|
|
526
|
|
|
|
|
|
|
else { |
527
|
4
|
|
|
|
|
6
|
$ids += $these_ids; |
528
|
4
|
|
|
|
|
7
|
$cons += $these_cons; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
|
531
|
4
|
100
|
|
|
|
10
|
last if $hsp_end == $u_stop; |
532
|
1
|
|
|
|
|
2
|
$use_stop = $hsp_end; |
533
|
|
|
|
|
|
|
} |
534
|
3
|
|
|
|
|
5
|
${$contigs_ref}[$i]->{'stop'} = $u_stop; |
|
3
|
|
|
|
|
6
|
|
535
|
3
|
|
|
|
|
5
|
${$contigs_ref}[$i]->{'iden'} += $ids; |
|
3
|
|
|
|
|
6
|
|
536
|
3
|
|
|
|
|
4
|
${$contigs_ref}[$i]->{'cons'} += $cons; |
|
3
|
|
|
|
|
6
|
|
537
|
3
|
|
|
|
|
3
|
push(@{${$contigs_ref}[$i]->{hsps}}, @{${$contigs_ref}[$u]->{hsps}}); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
6
|
|
538
|
|
|
|
|
|
|
|
539
|
3
|
|
|
|
|
5
|
${$contigs_ref}[$u] = undef; |
|
3
|
|
|
|
|
14
|
|
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
elsif ($u_start >= $i_start && $u_stop <= $i_stop) { |
542
|
|
|
|
|
|
|
# nested, drop this contig |
543
|
|
|
|
|
|
|
#*** ideally we might do some magic to keep the stats of the |
544
|
|
|
|
|
|
|
# better hsp... |
545
|
1
|
|
|
|
|
2
|
${$contigs_ref}[$u] = undef; |
|
1
|
|
|
|
|
4
|
|
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
|
550
|
20
|
|
|
|
|
23
|
my @merged; |
551
|
20
|
|
|
|
|
29
|
foreach (@$contigs_ref) { |
552
|
50
|
|
100
|
|
|
84
|
push(@merged, $_ || next); |
553
|
|
|
|
|
|
|
} |
554
|
20
|
|
|
|
|
22
|
@{$contigs_ref} = @merged; |
|
20
|
|
|
|
|
37
|
|
555
|
|
|
|
|
|
|
} |
556
|
|
|
|
|
|
|
elsif (! $overlap) { |
557
|
|
|
|
|
|
|
## If there is no overlap, add the complete HSP data. |
558
|
62
|
|
|
|
|
171
|
($numID,$numCons) = $hsp->matches(-SEQ=>$seqType); |
559
|
62
|
50
|
|
|
|
126
|
if ($numID eq '') { |
560
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0"); |
561
|
0
|
|
|
|
|
0
|
$numID = 0; |
562
|
|
|
|
|
|
|
} |
563
|
62
|
50
|
|
|
|
98
|
if ($numCons eq '') { |
564
|
0
|
|
|
|
|
0
|
$hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0"); |
565
|
0
|
|
|
|
|
0
|
$numCons = 0; |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
|
568
|
62
|
|
|
|
|
342
|
push @$contigs_ref, {'start' =>$start, 'stop' =>$stop, |
569
|
|
|
|
|
|
|
'iden' =>$numID, 'cons' =>$numCons, |
570
|
|
|
|
|
|
|
'strand'=>$strand,'frame'=>$frame,'hsps'=>[$hsp]}; |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
106
|
|
|
|
|
175
|
return $overlap; |
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head2 get_exponent |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
Usage : &get_exponent( number ); |
579
|
|
|
|
|
|
|
Purpose : Determines the power of 10 exponent of an integer, float, |
580
|
|
|
|
|
|
|
: or scientific notation number. |
581
|
|
|
|
|
|
|
Example : &get_exponent("4.0e-206"); |
582
|
|
|
|
|
|
|
: &get_exponent("0.00032"); |
583
|
|
|
|
|
|
|
: &get_exponent("10."); |
584
|
|
|
|
|
|
|
: &get_exponent("1000.0"); |
585
|
|
|
|
|
|
|
: &get_exponent("e+83"); |
586
|
|
|
|
|
|
|
Argument : Float, Integer, or scientific notation number |
587
|
|
|
|
|
|
|
Returns : Integer representing the exponent part of the number (+ or -). |
588
|
|
|
|
|
|
|
: If argument == 0 (zero), return value is "-999". |
589
|
|
|
|
|
|
|
Comments : Exponents are rounded up (less negative) if the mantissa is >= 5. |
590
|
|
|
|
|
|
|
: Exponents are rounded down (more negative) if the mantissa is <= -5. |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=cut |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
sub get_exponent { |
595
|
0
|
|
|
0
|
1
|
0
|
my $data = shift; |
596
|
|
|
|
|
|
|
|
597
|
0
|
|
|
|
|
0
|
my($num, $exp) = split /[eE]/, $data; |
598
|
|
|
|
|
|
|
|
599
|
0
|
0
|
|
|
|
0
|
if( defined $exp) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
600
|
0
|
0
|
|
|
|
0
|
$num = 1 if not $num; |
601
|
0
|
0
|
|
|
|
0
|
$num >= 5 and $exp++; |
602
|
0
|
0
|
|
|
|
0
|
$num <= -5 and $exp--; |
603
|
|
|
|
|
|
|
} elsif( $num == 0) { |
604
|
0
|
|
|
|
|
0
|
$exp = -999; |
605
|
|
|
|
|
|
|
} elsif( not $num =~ /\./) { |
606
|
0
|
|
|
|
|
0
|
$exp = CORE::length($num) -1; |
607
|
|
|
|
|
|
|
} else { |
608
|
0
|
|
|
|
|
0
|
$exp = 0; |
609
|
0
|
0
|
|
|
|
0
|
$num .= '0' if $num =~ /\.$/; |
610
|
0
|
|
|
|
|
0
|
my ($c); |
611
|
0
|
|
|
|
|
0
|
my $rev = 0; |
612
|
0
|
0
|
|
|
|
0
|
if($num !~ /^0/) { |
613
|
0
|
|
|
|
|
0
|
$num = reverse($num); |
614
|
0
|
|
|
|
|
0
|
$rev = 1; |
615
|
|
|
|
|
|
|
} |
616
|
0
|
|
|
|
|
0
|
do { $c = chop($num); |
|
0
|
|
|
|
|
0
|
|
617
|
0
|
0
|
|
|
|
0
|
$c == 0 && $exp++; |
618
|
|
|
|
|
|
|
} while( $c ne '.'); |
619
|
|
|
|
|
|
|
|
620
|
0
|
0
|
0
|
|
|
0
|
$exp = -$exp if $num == 0 and not $rev; |
621
|
0
|
0
|
|
|
|
0
|
$exp -= 1 if $rev; |
622
|
|
|
|
|
|
|
} |
623
|
0
|
|
|
|
|
0
|
return $exp; |
624
|
|
|
|
|
|
|
} |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
=head2 collapse_nums |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
Usage : @cnums = collapse_nums( @numbers ); |
629
|
|
|
|
|
|
|
Purpose : Collapses a list of numbers into a set of ranges of consecutive terms: |
630
|
|
|
|
|
|
|
: Useful for condensing long lists of consecutive numbers. |
631
|
|
|
|
|
|
|
: EXPANDED: |
632
|
|
|
|
|
|
|
: 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32 |
633
|
|
|
|
|
|
|
: COLLAPSED: |
634
|
|
|
|
|
|
|
: 1-6 10 12-15 17 18 20-22 24 26 30-32 |
635
|
|
|
|
|
|
|
Argument : List of numbers sorted numerically. |
636
|
|
|
|
|
|
|
Returns : List of numbers mixed with ranges of numbers (see above). |
637
|
|
|
|
|
|
|
Throws : n/a |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
See Also : L |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=cut |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
sub collapse_nums { |
644
|
|
|
|
|
|
|
# This is probably not the slickest connectivity algorithm, but will do for now. |
645
|
50
|
|
|
50
|
1
|
518
|
my @a = @_; |
646
|
50
|
|
|
|
|
83
|
my ($from, $to, $i, @ca, $consec); |
647
|
|
|
|
|
|
|
|
648
|
50
|
|
|
|
|
69
|
$consec = 0; |
649
|
50
|
|
|
|
|
141
|
for($i=0; $i < @a; $i++) { |
650
|
6512
|
100
|
|
|
|
7191
|
not $from and do{ $from = $a[$i]; next; }; |
|
47
|
|
|
|
|
85
|
|
|
47
|
|
|
|
|
112
|
|
651
|
|
|
|
|
|
|
# pass repeated positions (gap inserts) |
652
|
6465
|
100
|
|
|
|
7917
|
next if $a[$i] == $a[$i-1]; |
653
|
6290
|
100
|
|
|
|
6980
|
if($a[$i] == $a[$i-1]+1) { |
654
|
4436
|
|
|
|
|
4100
|
$to = $a[$i]; |
655
|
4436
|
|
|
|
|
5775
|
$consec++; |
656
|
|
|
|
|
|
|
} else { |
657
|
1854
|
100
|
|
|
|
1933
|
if($consec == 1) { $from .= ",$to"; } |
|
290
|
|
|
|
|
364
|
|
658
|
1564
|
100
|
|
|
|
2168
|
else { $from .= $consec>1 ? "\-$to" : ""; } |
659
|
1854
|
|
|
|
|
3066
|
push @ca, split(',', $from); |
660
|
1854
|
|
|
|
|
2029
|
$from = $a[$i]; |
661
|
1854
|
|
|
|
|
1551
|
$consec = 0; |
662
|
1854
|
|
|
|
|
2602
|
$to = undef; |
663
|
|
|
|
|
|
|
} |
664
|
|
|
|
|
|
|
} |
665
|
50
|
100
|
|
|
|
99
|
if(defined $to) { |
666
|
23
|
100
|
|
|
|
50
|
if($consec == 1) { $from .= ",$to"; } |
|
3
|
|
|
|
|
9
|
|
667
|
20
|
50
|
|
|
|
60
|
else { $from .= $consec>1 ? "\-$to" : ""; } |
668
|
|
|
|
|
|
|
} |
669
|
50
|
100
|
|
|
|
130
|
push @ca, split(',', $from) if $from; |
670
|
|
|
|
|
|
|
|
671
|
50
|
|
|
|
|
1299
|
@ca; |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=head2 strip_blast_html |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
Usage : $boolean = &strip_blast_html( string_ref ); |
678
|
|
|
|
|
|
|
: This method is exported. |
679
|
|
|
|
|
|
|
Purpose : Removes HTML formatting from a supplied string. |
680
|
|
|
|
|
|
|
: Attempts to restore the Blast report to enable |
681
|
|
|
|
|
|
|
: parsing by Bio::SearchIO::blast.pm |
682
|
|
|
|
|
|
|
Returns : Boolean: true if string was stripped, false if not. |
683
|
|
|
|
|
|
|
Argument : string_ref = reference to a string containing the whole Blast |
684
|
|
|
|
|
|
|
: report containing HTML formatting. |
685
|
|
|
|
|
|
|
Throws : Croaks if the argument is not a scalar reference. |
686
|
|
|
|
|
|
|
Comments : Based on code originally written by Alex Dong Li |
687
|
|
|
|
|
|
|
: (ali@genet.sickkids.on.ca). |
688
|
|
|
|
|
|
|
: This method does some Blast-specific stripping |
689
|
|
|
|
|
|
|
: (adds back a '>' character in front of each HSP |
690
|
|
|
|
|
|
|
: alignment listing). |
691
|
|
|
|
|
|
|
: |
692
|
|
|
|
|
|
|
: THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES! |
693
|
|
|
|
|
|
|
: |
694
|
|
|
|
|
|
|
: Removal of the HTML tags and accurate reconstitution of the |
695
|
|
|
|
|
|
|
: non-HTML-formatted report is highly dependent on structure of |
696
|
|
|
|
|
|
|
: the HTML-formatted version. For example, it assumes that first |
697
|
|
|
|
|
|
|
: line of each alignment section (HSP listing) starts with a |
698
|
|
|
|
|
|
|
: anchor tag. This permits the reconstruction of the |
699
|
|
|
|
|
|
|
: original report in which these lines begin with a ">". |
700
|
|
|
|
|
|
|
: This is required for parsing. |
701
|
|
|
|
|
|
|
: |
702
|
|
|
|
|
|
|
: If the structure of the Blast report itself is not intended to |
703
|
|
|
|
|
|
|
: be a standard, the structure of the HTML-formatted version |
704
|
|
|
|
|
|
|
: is even less so. Therefore, the use of this method to |
705
|
|
|
|
|
|
|
: reconstitute parsable Blast reports from HTML-format versions |
706
|
|
|
|
|
|
|
: should be considered a temporary solution. |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=cut |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
sub strip_blast_html { |
711
|
|
|
|
|
|
|
# This may not best way to remove html tags. However, it is simple. |
712
|
|
|
|
|
|
|
# it won't work under following conditions: |
713
|
|
|
|
|
|
|
# 1) if quoted > appears in a tag (does this ever happen?) |
714
|
|
|
|
|
|
|
# 2) if a tag is split over multiple lines and this method is |
715
|
|
|
|
|
|
|
# used to process one line at a time. |
716
|
|
|
|
|
|
|
|
717
|
0
|
|
|
0
|
1
|
0
|
my ($string_ref) = shift; |
718
|
|
|
|
|
|
|
|
719
|
0
|
0
|
|
|
|
0
|
ref $string_ref eq 'SCALAR' or |
720
|
|
|
|
|
|
|
croak ("Can't strip HTML: ". |
721
|
0
|
|
|
|
|
0
|
"Argument is should be a SCALAR reference not a ${\ref $string_ref}\n"); |
722
|
|
|
|
|
|
|
|
723
|
0
|
|
|
|
|
0
|
my $str = $$string_ref; |
724
|
0
|
|
|
|
|
0
|
my $stripped = 0; |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# Removing "" and adding the '>' character for |
727
|
|
|
|
|
|
|
# HSP alignment listings. |
728
|
0
|
0
|
|
|
|
0
|
$str =~ s/(\A|\n)]+> ?/>/sgi and $stripped = 1; |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# Removing all "<>" tags. |
731
|
0
|
0
|
|
|
|
0
|
$str =~ s/<[^>]+>| //sgi and $stripped = 1; |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
# Re-uniting any lone '>' characters. |
734
|
0
|
0
|
|
|
|
0
|
$str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1; |
735
|
|
|
|
|
|
|
|
736
|
0
|
|
|
|
|
0
|
$$string_ref = $str; |
737
|
0
|
|
|
|
|
0
|
$stripped; |
738
|
|
|
|
|
|
|
} |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
=head2 result2hash |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
Title : result2hash |
743
|
|
|
|
|
|
|
Usage : my %data = &Bio::Search::SearchUtils($result) |
744
|
|
|
|
|
|
|
Function : converts ResultI data to simple hash |
745
|
|
|
|
|
|
|
Returns : hash |
746
|
|
|
|
|
|
|
Args : ResultI |
747
|
|
|
|
|
|
|
Note : used mainly as a utility for running SearchIO tests |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=cut |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
sub result2hash { |
752
|
4
|
|
|
4
|
1
|
1092
|
my ($result) = @_; |
753
|
4
|
|
|
|
|
8
|
my %hash; |
754
|
4
|
|
|
|
|
10
|
$hash{'query_name'} = $result->query_name; |
755
|
4
|
|
|
|
|
5
|
my $hitcount = 1; |
756
|
4
|
|
|
|
|
7
|
my $hspcount = 1; |
757
|
4
|
|
|
|
|
14
|
foreach my $hit ( $result->hits ) { |
758
|
8
|
|
|
|
|
18
|
$hash{"hit$hitcount\_name"} = $hit->name; |
759
|
|
|
|
|
|
|
# only going to test order of magnitude |
760
|
|
|
|
|
|
|
# too hard as these don't always match |
761
|
|
|
|
|
|
|
# $hash{"hit$hitcount\_signif"} = |
762
|
|
|
|
|
|
|
# ( sprintf("%.0e",$hit->significance) =~ /e\-?(\d+)/ ); |
763
|
8
|
|
|
|
|
20
|
$hash{"hit$hitcount\_bits"} = sprintf("%d",$hit->bits); |
764
|
|
|
|
|
|
|
|
765
|
8
|
|
|
|
|
19
|
foreach my $hsp ( $hit->hsps ) { |
766
|
22
|
|
|
|
|
47
|
$hash{"hsp$hspcount\_bits"} = sprintf("%d",$hsp->bits); |
767
|
|
|
|
|
|
|
# only going to test order of magnitude |
768
|
|
|
|
|
|
|
# too hard as these don't always match |
769
|
|
|
|
|
|
|
# $hash{"hsp$hspcount\_evalue"} = |
770
|
|
|
|
|
|
|
# ( sprintf("%.0e",$hsp->evalue) =~ /e\-?(\d+)/ ); |
771
|
22
|
|
|
|
|
52
|
$hash{"hsp$hspcount\_qs"} = $hsp->query->start; |
772
|
22
|
|
|
|
|
44
|
$hash{"hsp$hspcount\_qe"} = $hsp->query->end; |
773
|
22
|
|
|
|
|
42
|
$hash{"hsp$hspcount\_qstr"} = $hsp->query->strand; |
774
|
22
|
|
|
|
|
45
|
$hash{"hsp$hspcount\_hs"} = $hsp->hit->start; |
775
|
22
|
|
|
|
|
50
|
$hash{"hsp$hspcount\_he"} = $hsp->hit->end; |
776
|
22
|
|
|
|
|
42
|
$hash{"hsp$hspcount\_hstr"} = $hsp->hit->strand; |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
#$hash{"hsp$hspcount\_pid"} = sprintf("%d",$hsp->percent_identity); |
779
|
|
|
|
|
|
|
#$hash{"hsp$hspcount\_fid"} = sprintf("%.2f",$hsp->frac_identical); |
780
|
22
|
|
|
|
|
52
|
$hash{"hsp$hspcount\_gaps"} = $hsp->gaps('total'); |
781
|
22
|
|
|
|
|
39
|
$hspcount++; |
782
|
|
|
|
|
|
|
} |
783
|
8
|
|
|
|
|
11
|
$hitcount++; |
784
|
|
|
|
|
|
|
} |
785
|
4
|
|
|
|
|
133
|
return %hash; |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
sub _warn_about_no_hsps { |
789
|
0
|
|
|
0
|
|
|
my $hit = shift; |
790
|
0
|
|
|
|
|
|
my $prev_func=(caller(1))[3]; |
791
|
0
|
|
|
|
|
|
$hit->warn("There is no HSP data for hit '".$hit->name."'.\n". |
792
|
|
|
|
|
|
|
"You have called a method ($prev_func)\n". |
793
|
|
|
|
|
|
|
"that requires HSP data and there was no HSP data for this hit,\n". |
794
|
|
|
|
|
|
|
"most likely because it was absent from the BLAST report.\n". |
795
|
|
|
|
|
|
|
"Note that by default, BLAST lists alignments for the first 250 hits,\n". |
796
|
|
|
|
|
|
|
"but it lists descriptions for 500 hits. If this is the case,\n". |
797
|
|
|
|
|
|
|
"and you care about these hits, you should re-run BLAST using the\n". |
798
|
|
|
|
|
|
|
"-b option (or equivalent if not using blastall) to increase the number\n". |
799
|
|
|
|
|
|
|
"of alignments.\n" |
800
|
|
|
|
|
|
|
); |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
1; |