File Coverage

Bio/Align/ProteinStatistics.pm
Criterion Covered Total %
statement 70 82 85.3
branch 15 24 62.5
condition 19 47 40.4
subroutine 11 12 91.6
pod 5 5 100.0
total 120 170 70.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Align::ProteinStatistics
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances)
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Align::ProteinStatistics;
21             use Bio::AlignIO;
22             my $in = Bio::AlignIO->new(-format => 'fasta',
23             -file => 'pep-104.fasaln');
24             my $aln = $in->next_aln;
25              
26             my $pepstats = Bio::Align::ProteinStatistics->new();
27             $kimura = $protstats->distance(-align => $aln,
28             -method => 'Kimura');
29             print $kimura->print_matrix;
30              
31              
32             =head1 DESCRIPTION
33              
34             This object is for generating various statistics from a protein
35             alignment. Mostly it is where pairwise protein distances can be
36             calculated.
37              
38             =head1 REFERENCES
39              
40             D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP,
41             Cambridge.
42              
43             =head1 FEEDBACK
44              
45             =head2 Mailing Lists
46              
47             User feedback is an integral part of the evolution of this and other
48             Bioperl modules. Send your comments and suggestions preferably to
49             the Bioperl mailing list. Your participation is much appreciated.
50              
51             bioperl-l@bioperl.org - General discussion
52             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53              
54             =head2 Support
55              
56             Please direct usage questions or support issues to the mailing list:
57              
58             I
59              
60             rather than to the module maintainer directly. Many experienced and
61             reponsive experts will be able look at the problem and quickly
62             address it. Please include a thorough description of the problem
63             with code and data examples if at all possible.
64              
65             =head2 Reporting Bugs
66              
67             Report bugs to the Bioperl bug tracking system to help us keep track
68             of the bugs and their resolution. Bug reports can be submitted via the
69             web:
70              
71             https://github.com/bioperl/bioperl-live/issues
72              
73             =head1 AUTHOR - Jason Stajich
74              
75             Email jason-at-bioperl.org
76              
77             =head1 APPENDIX
78              
79             The rest of the documentation details each of the object methods.
80             Internal methods are usually preceded with a _
81              
82             =cut
83              
84              
85             # Let the code begin...
86              
87              
88             package Bio::Align::ProteinStatistics;
89 2     2   1545 use vars qw(%DistanceMethods $Precision $DefaultGapPenalty);
  2         2  
  2         90  
90 2     2   7 use strict;
  2         2  
  2         31  
91              
92 2     2   5 use Bio::Align::PairwiseStatistics;
  2         1  
  2         25  
93 2     2   5 use Bio::Matrix::PhylipDist;
  2         2  
  2         62  
94              
95             %DistanceMethods = ('kimura|k' => 'Kimura',
96             );
97             $Precision = 5;
98             $DefaultGapPenalty = 0;
99              
100 2     2   5 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
  2         3  
  2         1577  
101              
102             =head2 new
103              
104             Title : new
105             Usage : my $obj = Bio::Align::ProteinStatistics->new();
106             Function: Builds a new Bio::Align::ProteinStatistics object
107             Returns : an instance of Bio::Align::ProteinStatistics
108             Args :
109              
110              
111             =cut
112              
113             sub new {
114 2     2 1 599 my($class,@args) = @_;
115              
116 2         16 my $self = $class->SUPER::new(@args);
117 2         13 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
118              
119 2         3 return $self;
120             }
121              
122             =head2 distance
123              
124             Title : distance
125             Usage : my $distance_mat = $stats->distance(-align => $aln,
126             -method => $method);
127             Function: Calculates a distance matrix for all pairwise distances of
128             sequences in an alignment.
129             Returns : L object
130             Args : -align => Bio::Align::AlignI object
131             -method => String specifying specific distance method
132             (implementing class may assume a default)
133              
134             =cut
135              
136             sub distance{
137 2     2 1 8 my ($self,@args) = @_;
138 2         16 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
139 2 50 33     21 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
      33        
140 0         0 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
141             }
142 2   50     5 $method ||= 'Kimura';
143 2         7 foreach my $m ( keys %DistanceMethods ) {
144 2 50 33     70 if(defined $m && $method =~ /$m/i ) {
145 2         5 my $mtd = "D_$DistanceMethods{$m}";
146 2         8 return $self->$mtd($aln);
147             }
148             }
149 0         0 $self->warn("Unrecognized distance method $method must be one of [".
150             join(',',$self->available_distance_methods())."]");
151 0         0 return;
152             }
153              
154             =head2 available_distance_methods
155              
156             Title : available_distance_methods
157             Usage : my @methods = $stats->available_distance_methods();
158             Function: Enumerates the possible distance methods
159             Returns : Array of strings
160             Args : none
161              
162              
163             =cut
164              
165             sub available_distance_methods{
166 0     0 1 0 my ($self,@args) = @_;
167 0         0 return values %DistanceMethods;
168             }
169              
170             =head2 D - distance methods
171              
172              
173             =cut
174              
175              
176             =head2 D_Kimura
177              
178             Title : D_Kimura
179             Usage : my $matrix = $pepstats->D_Kimura($aln);
180             Function: Calculate Kimura protein distance (Kimura 1983) which
181             approximates PAM distance
182             D = -ln ( 1 - p - 0.2 * p^2 )
183             Returns : L
184             Args : L
185              
186              
187             =cut
188              
189             # Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
190              
191             sub D_Kimura{
192 2     2 1 3 my ($self,$aln) = @_;
193 2 50       6 return 0 unless $self->_check_arg($aln);
194             # ambiguities ignored at this point
195 2         4 my (@seqs,@names,@values,%dist);
196 2         2 my $seqct = 0;
197 2         4 foreach my $seq ( $aln->each_seq) {
198 20         25 push @names, $seq->display_id;
199 20         24 push @seqs, uc $seq->seq();
200 20         20 $seqct++;
201             }
202 2         8 my $len = $aln->length;
203 2         6 my $precisionstr = "%.$Precision"."f";
204              
205 2         7 for( my $i = 0; $i < $seqct-1; $i++ ) {
206             # (diagonals) distance is 0 for same sequence
207 18         35 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
208 18         56 $values[$i][$i] = sprintf($precisionstr,0);
209 18         38 for( my $j = $i+1; $j < $seqct; $j++ ) {
210 106         87 my ($scored,$match) = (0,0);
211 106         140 for( my $k=0; $k < $len; $k++ ) {
212 42184         32097 my $m1 = substr($seqs[$i],$k,1);
213 42184         25862 my $m2 = substr($seqs[$j],$k,1);
214 42184 100 100     95282 if( $m1 ne '-' && $m2 ne '-' ) {
215             # score is number of scored bases (alignable bases)
216             # it could have also come from
217             # my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
218             # match is number of matches weighting ambiguity bases
219             # as well
220 31488         26886 $match += _check_ambiguity_protein($m1,$m2);
221 31488         41138 $scored++;
222             }
223             }
224             # From Felsenstein's PHYLIP documentation:
225             # This is very quick to do but has some obvious
226             # limitations. It does not take into account which amino
227             # acids differ or to what amino acids they change, so some
228             # information is lost. The units of the distance measure
229             # are fraction of amino acids differing, as also in the
230             # case of the PAM distance. If the fraction of amino acids
231             # differing gets larger than 0.8541 the distance becomes
232             # infinite.
233              
234 106         130 my $D = 1 - ( $match / $scored );
235 106 100       105 if( $D < 0.8541 ) {
236 97         200 $D = - log ( 1 - $D - (0.2 * ($D ** 2)));
237 97         641 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
238             } else {
239 9         24 $values[$j][$i] = $values[$i][$j] = ' NaN';
240             }
241             # fwd and rev lookup
242 106         253 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
243 106         167 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
244              
245             # (diagonals) distance is 0 for same sequence
246 106         138 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
247 106         459 $values[$j][$j] = sprintf($precisionstr,0);
248              
249             }
250             }
251 2         32 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
252             -matrix => \%dist,
253             -names => \@names,
254             -values => \@values);
255            
256             }
257              
258             # some methods from EMBOSS distmat
259             sub _check_ambiguity_protein
260             {
261 31488     31488   20741 my ($t1,$t2) = @_;
262 31488         17118 my $n = 0;
263              
264 31488 100 66     146931 if( $t1 ne 'X' && $t1 eq $t2 ) {
    50 33        
    50 33        
    50 33        
      33        
      33        
      33        
      33        
      33        
      33        
265 18672         11675 $n = 1.0;
266             } elsif( (($t1 eq 'B' && $t2 =~ /[DN]/ ) ||
267             ($t2 eq 'B' && $t1 =~ /[DN]/ )) ||
268            
269             (($t1 eq 'Z' && $t2 =~ /[EQ]/) ||
270             ($t2 eq 'Z' && $t1 =~ /[EQ]/ ))) {
271 0         0 $n = 0.5;
272             } elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
273 0         0 $n = 0.0025;
274             } elsif( $t1 eq 'X' || $t2 eq 'X' ) {
275 0         0 $n = 0.05;
276             }
277 31488         25399 return $n;
278             }
279              
280             =head2 Data Methods
281              
282             =cut
283              
284             =head2 pairwise_stats
285              
286             Title : pairwise_stats
287             Usage : $obj->pairwise_stats($newval)
288             Function:
289             Returns : value of pairwise_stats
290             Args : newvalue (optional)
291              
292              
293             =cut
294              
295             sub pairwise_stats{
296 2     2 1 4 my ($self,$value) = @_;
297 2 50       5 if( defined $value) {
298 2         12 $self->{'_pairwise_stats'} = $value;
299             }
300 2         3 return $self->{'_pairwise_stats'};
301              
302             }
303              
304             sub _check_arg {
305 2     2   3 my($self,$aln ) = @_;
306 2 50 33     18 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
    50          
307 0         0 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
308 0         0 return 0;
309             } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) {
310 0         0 $self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
311 0         0 return 0;
312             }
313 2         5 return 1;
314             }
315              
316             1;