line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::Align::ProteinStatistics |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Jason Stajich |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Jason Stajich |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances) |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Bio::Align::ProteinStatistics; |
21
|
|
|
|
|
|
|
use Bio::AlignIO; |
22
|
|
|
|
|
|
|
my $in = Bio::AlignIO->new(-format => 'fasta', |
23
|
|
|
|
|
|
|
-file => 'pep-104.fasaln'); |
24
|
|
|
|
|
|
|
my $aln = $in->next_aln; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my $pepstats = Bio::Align::ProteinStatistics->new(); |
27
|
|
|
|
|
|
|
$kimura = $protstats->distance(-align => $aln, |
28
|
|
|
|
|
|
|
-method => 'Kimura'); |
29
|
|
|
|
|
|
|
print $kimura->print_matrix; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
This object is for generating various statistics from a protein |
35
|
|
|
|
|
|
|
alignment. Mostly it is where pairwise protein distances can be |
36
|
|
|
|
|
|
|
calculated. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 REFERENCES |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, |
41
|
|
|
|
|
|
|
Cambridge. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 FEEDBACK |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head2 Mailing Lists |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
48
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
49
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
52
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head2 Support |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
I |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
61
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
62
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
63
|
|
|
|
|
|
|
with code and data examples if at all possible. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head2 Reporting Bugs |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
68
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via the |
69
|
|
|
|
|
|
|
web: |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=head1 AUTHOR - Jason Stajich |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
Email jason-at-bioperl.org |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head1 APPENDIX |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
80
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=cut |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# Let the code begin... |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
package Bio::Align::ProteinStatistics; |
89
|
2
|
|
|
2
|
|
2020
|
use vars qw(%DistanceMethods $Precision $DefaultGapPenalty); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
109
|
|
90
|
2
|
|
|
2
|
|
8
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
39
|
|
91
|
|
|
|
|
|
|
|
92
|
2
|
|
|
2
|
|
6
|
use Bio::Align::PairwiseStatistics; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
28
|
|
93
|
2
|
|
|
2
|
|
7
|
use Bio::Matrix::PhylipDist; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
76
|
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
%DistanceMethods = ('kimura|k' => 'Kimura', |
96
|
|
|
|
|
|
|
); |
97
|
|
|
|
|
|
|
$Precision = 5; |
98
|
|
|
|
|
|
|
$DefaultGapPenalty = 0; |
99
|
|
|
|
|
|
|
|
100
|
2
|
|
|
2
|
|
7
|
use base qw(Bio::Root::Root Bio::Align::StatisticsI); |
|
2
|
|
|
|
|
1
|
|
|
2
|
|
|
|
|
1660
|
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head2 new |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Title : new |
105
|
|
|
|
|
|
|
Usage : my $obj = Bio::Align::ProteinStatistics->new(); |
106
|
|
|
|
|
|
|
Function: Builds a new Bio::Align::ProteinStatistics object |
107
|
|
|
|
|
|
|
Returns : an instance of Bio::Align::ProteinStatistics |
108
|
|
|
|
|
|
|
Args : |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub new { |
114
|
2
|
|
|
2
|
1
|
455
|
my($class,@args) = @_; |
115
|
|
|
|
|
|
|
|
116
|
2
|
|
|
|
|
16
|
my $self = $class->SUPER::new(@args); |
117
|
2
|
|
|
|
|
13
|
$self->pairwise_stats( Bio::Align::PairwiseStatistics->new()); |
118
|
|
|
|
|
|
|
|
119
|
2
|
|
|
|
|
4
|
return $self; |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 distance |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Title : distance |
125
|
|
|
|
|
|
|
Usage : my $distance_mat = $stats->distance(-align => $aln, |
126
|
|
|
|
|
|
|
-method => $method); |
127
|
|
|
|
|
|
|
Function: Calculates a distance matrix for all pairwise distances of |
128
|
|
|
|
|
|
|
sequences in an alignment. |
129
|
|
|
|
|
|
|
Returns : L object |
130
|
|
|
|
|
|
|
Args : -align => Bio::Align::AlignI object |
131
|
|
|
|
|
|
|
-method => String specifying specific distance method |
132
|
|
|
|
|
|
|
(implementing class may assume a default) |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=cut |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub distance{ |
137
|
2
|
|
|
2
|
1
|
10
|
my ($self,@args) = @_; |
138
|
2
|
|
|
|
|
14
|
my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args); |
139
|
2
|
50
|
33
|
|
|
21
|
if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) { |
|
|
|
33
|
|
|
|
|
140
|
0
|
|
|
|
|
0
|
$self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance"); |
141
|
|
|
|
|
|
|
} |
142
|
2
|
|
50
|
|
|
5
|
$method ||= 'Kimura'; |
143
|
2
|
|
|
|
|
7
|
foreach my $m ( keys %DistanceMethods ) { |
144
|
2
|
50
|
33
|
|
|
69
|
if(defined $m && $method =~ /$m/i ) { |
145
|
2
|
|
|
|
|
6
|
my $mtd = "D_$DistanceMethods{$m}"; |
146
|
2
|
|
|
|
|
9
|
return $self->$mtd($aln); |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
} |
149
|
0
|
|
|
|
|
0
|
$self->warn("Unrecognized distance method $method must be one of [". |
150
|
|
|
|
|
|
|
join(',',$self->available_distance_methods())."]"); |
151
|
0
|
|
|
|
|
0
|
return; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=head2 available_distance_methods |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Title : available_distance_methods |
157
|
|
|
|
|
|
|
Usage : my @methods = $stats->available_distance_methods(); |
158
|
|
|
|
|
|
|
Function: Enumerates the possible distance methods |
159
|
|
|
|
|
|
|
Returns : Array of strings |
160
|
|
|
|
|
|
|
Args : none |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=cut |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub available_distance_methods{ |
166
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
167
|
0
|
|
|
|
|
0
|
return values %DistanceMethods; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head2 D - distance methods |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=cut |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head2 D_Kimura |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Title : D_Kimura |
179
|
|
|
|
|
|
|
Usage : my $matrix = $pepstats->D_Kimura($aln); |
180
|
|
|
|
|
|
|
Function: Calculate Kimura protein distance (Kimura 1983) which |
181
|
|
|
|
|
|
|
approximates PAM distance |
182
|
|
|
|
|
|
|
D = -ln ( 1 - p - 0.2 * p^2 ) |
183
|
|
|
|
|
|
|
Returns : L |
184
|
|
|
|
|
|
|
Args : L |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=cut |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
sub D_Kimura{ |
192
|
2
|
|
|
2
|
1
|
3
|
my ($self,$aln) = @_; |
193
|
2
|
50
|
|
|
|
6
|
return 0 unless $self->_check_arg($aln); |
194
|
|
|
|
|
|
|
# ambiguities ignored at this point |
195
|
2
|
|
|
|
|
2
|
my (@seqs,@names,@values,%dist); |
196
|
2
|
|
|
|
|
4
|
my $seqct = 0; |
197
|
2
|
|
|
|
|
13
|
foreach my $seq ( $aln->each_seq) { |
198
|
20
|
|
|
|
|
24
|
push @names, $seq->display_id; |
199
|
20
|
|
|
|
|
27
|
push @seqs, uc $seq->seq(); |
200
|
20
|
|
|
|
|
20
|
$seqct++; |
201
|
|
|
|
|
|
|
} |
202
|
2
|
|
|
|
|
9
|
my $len = $aln->length; |
203
|
2
|
|
|
|
|
7
|
my $precisionstr = "%.$Precision"."f"; |
204
|
|
|
|
|
|
|
|
205
|
2
|
|
|
|
|
9
|
for( my $i = 0; $i < $seqct-1; $i++ ) { |
206
|
|
|
|
|
|
|
# (diagonals) distance is 0 for same sequence |
207
|
18
|
|
|
|
|
61
|
$dist{$names[$i]}->{$names[$i]} = [$i,$i]; |
208
|
18
|
|
|
|
|
91
|
$values[$i][$i] = sprintf($precisionstr,0); |
209
|
18
|
|
|
|
|
45
|
for( my $j = $i+1; $j < $seqct; $j++ ) { |
210
|
106
|
|
|
|
|
98
|
my ($scored,$match) = (0,0); |
211
|
106
|
|
|
|
|
142
|
for( my $k=0; $k < $len; $k++ ) { |
212
|
42184
|
|
|
|
|
32839
|
my $m1 = substr($seqs[$i],$k,1); |
213
|
42184
|
|
|
|
|
26687
|
my $m2 = substr($seqs[$j],$k,1); |
214
|
42184
|
100
|
100
|
|
|
94505
|
if( $m1 ne '-' && $m2 ne '-' ) { |
215
|
|
|
|
|
|
|
# score is number of scored bases (alignable bases) |
216
|
|
|
|
|
|
|
# it could have also come from |
217
|
|
|
|
|
|
|
# my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); |
218
|
|
|
|
|
|
|
# match is number of matches weighting ambiguity bases |
219
|
|
|
|
|
|
|
# as well |
220
|
31488
|
|
|
|
|
26234
|
$match += _check_ambiguity_protein($m1,$m2); |
221
|
31488
|
|
|
|
|
41750
|
$scored++; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
# From Felsenstein's PHYLIP documentation: |
225
|
|
|
|
|
|
|
# This is very quick to do but has some obvious |
226
|
|
|
|
|
|
|
# limitations. It does not take into account which amino |
227
|
|
|
|
|
|
|
# acids differ or to what amino acids they change, so some |
228
|
|
|
|
|
|
|
# information is lost. The units of the distance measure |
229
|
|
|
|
|
|
|
# are fraction of amino acids differing, as also in the |
230
|
|
|
|
|
|
|
# case of the PAM distance. If the fraction of amino acids |
231
|
|
|
|
|
|
|
# differing gets larger than 0.8541 the distance becomes |
232
|
|
|
|
|
|
|
# infinite. |
233
|
|
|
|
|
|
|
|
234
|
106
|
|
|
|
|
134
|
my $D = 1 - ( $match / $scored ); |
235
|
106
|
100
|
|
|
|
124
|
if( $D < 0.8541 ) { |
236
|
97
|
|
|
|
|
250
|
$D = - log ( 1 - $D - (0.2 * ($D ** 2))); |
237
|
97
|
|
|
|
|
729
|
$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D); |
238
|
|
|
|
|
|
|
} else { |
239
|
9
|
|
|
|
|
24
|
$values[$j][$i] = $values[$i][$j] = ' NaN'; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
# fwd and rev lookup |
242
|
106
|
|
|
|
|
334
|
$dist{$names[$i]}->{$names[$j]} = [$i,$j]; |
243
|
106
|
|
|
|
|
240
|
$dist{$names[$j]}->{$names[$i]} = [$i,$j]; |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# (diagonals) distance is 0 for same sequence |
246
|
106
|
|
|
|
|
155
|
$dist{$names[$j]}->{$names[$j]} = [$j,$j]; |
247
|
106
|
|
|
|
|
516
|
$values[$j][$j] = sprintf($precisionstr,0); |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
} |
251
|
2
|
|
|
|
|
41
|
return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats', |
252
|
|
|
|
|
|
|
-matrix => \%dist, |
253
|
|
|
|
|
|
|
-names => \@names, |
254
|
|
|
|
|
|
|
-values => \@values); |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# some methods from EMBOSS distmat |
259
|
|
|
|
|
|
|
sub _check_ambiguity_protein |
260
|
|
|
|
|
|
|
{ |
261
|
31488
|
|
|
31488
|
|
20876
|
my ($t1,$t2) = @_; |
262
|
31488
|
|
|
|
|
17920
|
my $n = 0; |
263
|
|
|
|
|
|
|
|
264
|
31488
|
100
|
66
|
|
|
145324
|
if( $t1 ne 'X' && $t1 eq $t2 ) { |
|
|
50
|
33
|
|
|
|
|
|
|
50
|
33
|
|
|
|
|
|
|
50
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
265
|
18672
|
|
|
|
|
12126
|
$n = 1.0; |
266
|
|
|
|
|
|
|
} elsif( (($t1 eq 'B' && $t2 =~ /[DN]/ ) || |
267
|
|
|
|
|
|
|
($t2 eq 'B' && $t1 =~ /[DN]/ )) || |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
(($t1 eq 'Z' && $t2 =~ /[EQ]/) || |
270
|
|
|
|
|
|
|
($t2 eq 'Z' && $t1 =~ /[EQ]/ ))) { |
271
|
0
|
|
|
|
|
0
|
$n = 0.5; |
272
|
|
|
|
|
|
|
} elsif ( $t1 eq 'X' && $t2 eq 'X' ) { |
273
|
0
|
|
|
|
|
0
|
$n = 0.0025; |
274
|
|
|
|
|
|
|
} elsif( $t1 eq 'X' || $t2 eq 'X' ) { |
275
|
0
|
|
|
|
|
0
|
$n = 0.05; |
276
|
|
|
|
|
|
|
} |
277
|
31488
|
|
|
|
|
26541
|
return $n; |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head2 Data Methods |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=cut |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=head2 pairwise_stats |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
Title : pairwise_stats |
287
|
|
|
|
|
|
|
Usage : $obj->pairwise_stats($newval) |
288
|
|
|
|
|
|
|
Function: |
289
|
|
|
|
|
|
|
Returns : value of pairwise_stats |
290
|
|
|
|
|
|
|
Args : newvalue (optional) |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=cut |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
sub pairwise_stats{ |
296
|
2
|
|
|
2
|
1
|
4
|
my ($self,$value) = @_; |
297
|
2
|
50
|
|
|
|
9
|
if( defined $value) { |
298
|
2
|
|
|
|
|
14
|
$self->{'_pairwise_stats'} = $value; |
299
|
|
|
|
|
|
|
} |
300
|
2
|
|
|
|
|
4
|
return $self->{'_pairwise_stats'}; |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
sub _check_arg { |
305
|
2
|
|
|
2
|
|
2
|
my($self,$aln ) = @_; |
306
|
2
|
50
|
33
|
|
|
24
|
if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { |
|
|
50
|
|
|
|
|
|
307
|
0
|
|
|
|
|
0
|
$self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics"); |
308
|
0
|
|
|
|
|
0
|
return 0; |
309
|
|
|
|
|
|
|
} elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) { |
310
|
0
|
|
|
|
|
0
|
$self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet); |
311
|
0
|
|
|
|
|
0
|
return 0; |
312
|
|
|
|
|
|
|
} |
313
|
2
|
|
|
|
|
7
|
return 1; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
1; |