line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::PopGen::Statistics |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Jason Stajich |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Jason Stajich |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::PopGen::Statistics - Population Genetics statistical tests |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Bio::PopGen::Statistics; |
21
|
|
|
|
|
|
|
use Bio::AlignIO; |
22
|
|
|
|
|
|
|
use Bio::PopGen::IO; |
23
|
|
|
|
|
|
|
use Bio::PopGen::Simulation::Coalescent; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $tree = $sim->next_tree; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$sim->add_Mutations($tree,20); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my $stats = Bio::PopGen::Statistics->new(); |
32
|
|
|
|
|
|
|
my $individuals = [ $tree->get_leaf_nodes]; |
33
|
|
|
|
|
|
|
my $pi = $stats->pi($individuals); |
34
|
|
|
|
|
|
|
my $D = $stats->tajima_D($individuals); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Alternatively to do this on input data from |
37
|
|
|
|
|
|
|
# See the tests in t/PopGen.t for more examples |
38
|
|
|
|
|
|
|
my $parser = Bio::PopGen::IO->new(-format => 'prettybase', |
39
|
|
|
|
|
|
|
-file => 't/data/popstats.prettybase'); |
40
|
|
|
|
|
|
|
my $pop = $parser->next_population; |
41
|
|
|
|
|
|
|
# Note that you can also call the stats as a class method if you like |
42
|
|
|
|
|
|
|
# the only reason to instantiate it (as above) is if you want |
43
|
|
|
|
|
|
|
# to set the verbosity for debugging |
44
|
|
|
|
|
|
|
$pi = Bio::PopGen::Statistics->pi($pop); |
45
|
|
|
|
|
|
|
$theta = Bio::PopGen::Statistics->theta($pop); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Pi and Theta also take additional arguments, |
48
|
|
|
|
|
|
|
# see the documentation for more information |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
use Bio::PopGen::Utilities; |
51
|
|
|
|
|
|
|
use Bio::AlignIO; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $in = Bio::AlignIO->new(-file => 't/data/t7.aln', |
54
|
|
|
|
|
|
|
-format => 'clustalw'); |
55
|
|
|
|
|
|
|
my $aln = $in->next_aln; |
56
|
|
|
|
|
|
|
# get a population, each sequence is an individual and |
57
|
|
|
|
|
|
|
# for the default case, every site which is not monomorphic |
58
|
|
|
|
|
|
|
# is a 'marker'. Each individual will have a 'genotype' for the |
59
|
|
|
|
|
|
|
# site which will be the specific base in the alignment at that |
60
|
|
|
|
|
|
|
# site |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head1 DESCRIPTION |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
This object is intended to provide implementations some standard |
68
|
|
|
|
|
|
|
population genetics statistics about alleles in populations. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
This module was previously named Bio::Tree::Statistics. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
This object is a place to accumulate routines for calculating various |
73
|
|
|
|
|
|
|
statistics from the coalescent simulation, marker/allele, or from |
74
|
|
|
|
|
|
|
aligned sequence data given that you can calculate alleles, number of |
75
|
|
|
|
|
|
|
segregating sites. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
Currently implemented: |
78
|
|
|
|
|
|
|
Fu and Li's D (fu_and_li_D) |
79
|
|
|
|
|
|
|
Fu and Li's D* (fu_and_li_D_star) |
80
|
|
|
|
|
|
|
Fu and Li's F (fu_and_li_F) |
81
|
|
|
|
|
|
|
Fu and Li's F* (fu_and_li_F_star) |
82
|
|
|
|
|
|
|
Tajima's D (tajima_D) |
83
|
|
|
|
|
|
|
Watterson's theta (theta) |
84
|
|
|
|
|
|
|
pi (pi) - number of pairwise differences |
85
|
|
|
|
|
|
|
composite_LD (composite_LD) |
86
|
|
|
|
|
|
|
McDonald-Kreitman (mcdonald_kreitman or MK) |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Count based methods also exist in case you have already calculated the |
89
|
|
|
|
|
|
|
key statistics (seg sites, num individuals, etc) and just want to |
90
|
|
|
|
|
|
|
compute the statistic. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
In all cases where a the method expects an arrayref of |
93
|
|
|
|
|
|
|
L objects and L |
94
|
|
|
|
|
|
|
object will also work. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head2 REFERENCES |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of |
99
|
|
|
|
|
|
|
Mutations." Genetics 133:693-709. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples |
102
|
|
|
|
|
|
|
from a Population." Genetics 143:557-570. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
McDonald J, Kreitman M. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Tajima F. (1989) "Statistical method for testing the neutral mutation |
107
|
|
|
|
|
|
|
hypothesis by DNA polymorphism." Genetics 123:585-595. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head2 CITING THIS WORK |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Please see this reference for use of this implementation. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
If you use these Bio::PopGen modules please cite the Bioperl |
117
|
|
|
|
|
|
|
publication (see FAQ) and the above reference. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head1 FEEDBACK |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 Mailing Lists |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
125
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
126
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
129
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 Support |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
I |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
138
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
139
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
140
|
|
|
|
|
|
|
with code and data examples if at all possible. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head2 Reporting Bugs |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
145
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via |
146
|
|
|
|
|
|
|
the web: |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head1 AUTHOR - Jason Stajich, Matthew Hahn |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Email jason-at-bioperl-dot-org |
153
|
|
|
|
|
|
|
Email matthew-dot-hahn-at-duke-dot-edu |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
McDonald-Kreitman implementation based on work by Alisha Holloway at |
156
|
|
|
|
|
|
|
UC Davis. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 APPENDIX |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
162
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Let the code begin... |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
package Bio::PopGen::Statistics; |
171
|
3
|
|
|
3
|
|
3034
|
use strict; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
100
|
|
172
|
|
|
|
|
|
|
use constant { |
173
|
3
|
|
|
|
|
245
|
in_label => 'ingroup', |
174
|
|
|
|
|
|
|
out_label => 'outgroup', |
175
|
|
|
|
|
|
|
non_syn => 'non_synonymous', |
176
|
|
|
|
|
|
|
syn => 'synonymous', |
177
|
|
|
|
|
|
|
default_codon_table => 1, # Standard Codon table |
178
|
3
|
|
|
3
|
|
10
|
}; |
|
3
|
|
|
|
|
3
|
|
179
|
|
|
|
|
|
|
|
180
|
3
|
|
|
3
|
|
20862
|
use Bio::MolEvol::CodonModel; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
92
|
|
181
|
3
|
|
|
3
|
|
12
|
use List::Util qw(sum); |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
216
|
|
182
|
|
|
|
|
|
|
|
183
|
3
|
|
|
3
|
|
11
|
use base qw(Bio::Root::Root); |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
333
|
|
184
|
|
|
|
|
|
|
our $codon_table => default_codon_table; |
185
|
|
|
|
|
|
|
our $has_twotailed => 0; |
186
|
|
|
|
|
|
|
BEGIN { |
187
|
3
|
|
|
3
|
|
3
|
eval { require Text::NSP::Measures::2D::Fisher2::twotailed }; |
|
3
|
|
|
|
|
448
|
|
188
|
3
|
50
|
|
|
|
14
|
if( $@ ) { $has_twotailed = 0; } |
|
3
|
|
|
|
|
13439
|
|
189
|
0
|
|
|
|
|
0
|
else { $has_twotailed = 1; } |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 new |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Title : new |
200
|
|
|
|
|
|
|
Usage : my $obj = Bio::PopGen::Statistics->new(); |
201
|
|
|
|
|
|
|
Function: Builds a new Bio::PopGen::Statistics object |
202
|
|
|
|
|
|
|
Returns : an instance of Bio::PopGen::Statistics |
203
|
|
|
|
|
|
|
Args : none |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head2 fu_and_li_D |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Title : fu_and_li_D |
212
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup); |
213
|
|
|
|
|
|
|
OR |
214
|
|
|
|
|
|
|
my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations); |
215
|
|
|
|
|
|
|
Function: Fu and Li D statistic for a list of individuals |
216
|
|
|
|
|
|
|
given an outgroup and the number of external mutations |
217
|
|
|
|
|
|
|
(either provided or calculated from list of outgroup individuals) |
218
|
|
|
|
|
|
|
Returns : decimal |
219
|
|
|
|
|
|
|
Args : $individuals - array reference which contains ingroup individuals |
220
|
|
|
|
|
|
|
(L or derived classes) |
221
|
|
|
|
|
|
|
$extmutations - number of external mutations OR |
222
|
|
|
|
|
|
|
arrayref of outgroup individuals |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=cut |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub fu_and_li_D { |
227
|
7
|
|
|
7
|
1
|
21
|
my ($self,$ingroup,$outgroup) = @_; |
228
|
|
|
|
|
|
|
|
229
|
7
|
|
|
|
|
10
|
my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0); |
230
|
7
|
100
|
33
|
|
|
43
|
if( ref($ingroup) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
231
|
4
|
|
|
|
|
5
|
$n = scalar @$ingroup; |
232
|
|
|
|
|
|
|
# pi - all pairwise differences |
233
|
4
|
|
|
|
|
9
|
$seg_sites = $self->segregating_sites_count($ingroup); |
234
|
|
|
|
|
|
|
} elsif( ref($ingroup) && |
235
|
|
|
|
|
|
|
$ingroup->isa('Bio::PopGen::PopulationI')) { |
236
|
3
|
|
|
|
|
7
|
$n = $ingroup->get_number_individuals; |
237
|
3
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($ingroup); |
238
|
|
|
|
|
|
|
} else { |
239
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D"); |
240
|
0
|
|
|
|
|
0
|
return 0; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
7
|
50
|
|
|
|
16
|
if( $seg_sites <= 0 ) { |
244
|
0
|
|
|
|
|
0
|
$self->warn("mutation total was not > 0, cannot calculate a Fu and Li D"); |
245
|
0
|
|
|
|
|
0
|
return 0; |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
7
|
50
|
|
|
|
17
|
if( ! defined $outgroup ) { |
|
|
100
|
|
|
|
|
|
249
|
0
|
|
|
|
|
0
|
$self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations"); |
250
|
0
|
|
|
|
|
0
|
return 0; |
251
|
|
|
|
|
|
|
} elsif( ref($outgroup) ) { |
252
|
4
|
|
|
|
|
8
|
($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup); |
253
|
4
|
50
|
|
|
|
9
|
$ancestral = 0 unless defined $ancestral; |
254
|
|
|
|
|
|
|
} else { |
255
|
3
|
|
|
|
|
4
|
$ancestral = $outgroup; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
7
|
|
|
|
|
16
|
return $self->fu_and_li_D_counts($n,$seg_sites, |
259
|
|
|
|
|
|
|
$ancestral,$derived); |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head2 fu_and_li_D_counts |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
Title : fu_li_D_counts |
265
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_and_li_D_counts($samps,$sites, |
266
|
|
|
|
|
|
|
$external); |
267
|
|
|
|
|
|
|
Function: Fu and Li D statistic for the raw counts of the number |
268
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
269
|
|
|
|
|
|
|
Returns : decimal number |
270
|
|
|
|
|
|
|
Args : number of samples (N) |
271
|
|
|
|
|
|
|
number of segregating sites (n) |
272
|
|
|
|
|
|
|
number of external mutations (n_e) |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=cut |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub fu_and_li_D_counts { |
278
|
8
|
|
|
8
|
1
|
131
|
my ($self,$n,$seg_sites, $external_mut) = @_; |
279
|
8
|
|
|
|
|
7
|
my $a_n = 0; |
280
|
8
|
50
|
|
|
|
16
|
if( $n <= 3 ) { |
281
|
0
|
|
|
|
|
0
|
$self->warn("n is $n, too small, must be > 3\n"); |
282
|
0
|
|
|
|
|
0
|
return; |
283
|
|
|
|
|
|
|
} |
284
|
8
|
|
|
|
|
24
|
for(my $k= 1; $k < $n; $k++ ) { |
285
|
51
|
|
|
|
|
79
|
$a_n += ( 1 / $k ); |
286
|
|
|
|
|
|
|
} |
287
|
8
|
|
|
|
|
9
|
my $b = 0; |
288
|
8
|
|
|
|
|
17
|
for(my $k= 1; $k < $n; $k++ ) { |
289
|
51
|
|
|
|
|
84
|
$b += ( 1 / $k**2 ); |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
8
|
|
|
|
|
18
|
my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) / |
293
|
|
|
|
|
|
|
( ( $n - 1) * ( $n - 2 ) ) ); |
294
|
|
|
|
|
|
|
|
295
|
8
|
|
|
|
|
23
|
my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) * |
296
|
|
|
|
|
|
|
( $c - ( ( $n + 1) / |
297
|
|
|
|
|
|
|
( $n - 1) ) )); |
298
|
|
|
|
|
|
|
|
299
|
8
|
|
|
|
|
10
|
my $u = $a_n - 1 - $v; |
300
|
|
|
|
|
|
|
|
301
|
8
|
|
|
|
|
80
|
($seg_sites - $a_n * $external_mut) / |
302
|
|
|
|
|
|
|
sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites)); |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head2 fu_and_li_D_star |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
Title : fu_and_li_D_star |
310
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_an_li_D_star(\@individuals); |
311
|
|
|
|
|
|
|
Function: Fu and Li's D* statistic for a set of samples |
312
|
|
|
|
|
|
|
Without an outgroup |
313
|
|
|
|
|
|
|
Returns : decimal number |
314
|
|
|
|
|
|
|
Args : array ref of L objects |
315
|
|
|
|
|
|
|
OR |
316
|
|
|
|
|
|
|
L object |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
#' |
321
|
|
|
|
|
|
|
# fu_and_li_D* |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
sub fu_and_li_D_star { |
324
|
3
|
|
|
3
|
1
|
443
|
my ($self,$individuals) = @_; |
325
|
|
|
|
|
|
|
|
326
|
3
|
|
|
|
|
4
|
my ($seg_sites,$n,$singletons); |
327
|
3
|
100
|
33
|
|
|
22
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
328
|
2
|
|
|
|
|
3
|
$n = scalar @$individuals; |
329
|
2
|
|
|
|
|
5
|
$seg_sites = $self->segregating_sites_count($individuals); |
330
|
2
|
|
|
|
|
6
|
$singletons = $self->singleton_count($individuals); |
331
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
332
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI')) { |
333
|
1
|
|
|
|
|
1
|
my $pop = $individuals; |
334
|
1
|
|
|
|
|
3
|
$n = $pop->get_number_individuals; |
335
|
1
|
|
|
|
|
2
|
$seg_sites = $self->segregating_sites_count($pop); |
336
|
1
|
|
|
|
|
4
|
$singletons = $self->singleton_count($pop); |
337
|
|
|
|
|
|
|
} else { |
338
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star"); |
339
|
0
|
|
|
|
|
0
|
return 0; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
3
|
|
|
|
|
10
|
return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons); |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head2 fu_and_li_D_star_counts |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Title : fu_li_D_star_counts |
348
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites, |
349
|
|
|
|
|
|
|
$singletons); |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
Function: Fu and Li D statistic for the raw counts of the number |
352
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
353
|
|
|
|
|
|
|
Returns : decimal number |
354
|
|
|
|
|
|
|
Args : number of samples (N) |
355
|
|
|
|
|
|
|
number of segregating sites (n) |
356
|
|
|
|
|
|
|
singletons (n_s) |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=cut |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
sub fu_and_li_D_star_counts { |
362
|
4
|
|
|
4
|
1
|
5
|
my ($self,$n,$seg_sites, $singletons) = @_; |
363
|
4
|
|
|
|
|
4
|
my $a_n; |
364
|
4
|
|
|
|
|
14
|
for(my $k = 1; $k < $n; $k++ ) { |
365
|
35
|
|
|
|
|
50
|
$a_n += ( 1 / $k ); |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
|
368
|
4
|
|
|
|
|
6
|
my $a1 = $a_n + 1 / $n; |
369
|
|
|
|
|
|
|
|
370
|
4
|
|
|
|
|
4
|
my $b = 0; |
371
|
4
|
|
|
|
|
10
|
for(my $k= 1; $k < $n; $k++ ) { |
372
|
35
|
|
|
|
|
47
|
$b += ( 1 / $k**2 ); |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
4
|
|
|
|
|
12
|
my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) / |
376
|
|
|
|
|
|
|
( ( $n - 1) * ( $n - 2 ) ) ); |
377
|
|
|
|
|
|
|
|
378
|
4
|
|
|
|
|
13
|
my $d = $c + ($n -2) / ($n - 1)**2 + |
379
|
|
|
|
|
|
|
2 / ($n -1) * |
380
|
|
|
|
|
|
|
( 1.5 - ( (2*$a1 - 3) / ($n -2) ) - |
381
|
|
|
|
|
|
|
1 / $n ); |
382
|
|
|
|
|
|
|
|
383
|
4
|
|
|
|
|
17
|
my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) - |
384
|
|
|
|
|
|
|
(2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) ) / |
385
|
|
|
|
|
|
|
(($a_n**2) + $b); |
386
|
|
|
|
|
|
|
|
387
|
4
|
|
|
|
|
9
|
my $u_star = ( ($n/($n-1))* |
388
|
|
|
|
|
|
|
($a_n - ($n/ |
389
|
|
|
|
|
|
|
($n-1)))) - $v_star; |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
|
392
|
4
|
|
|
|
|
37
|
return (($n / ($n - 1)) * $seg_sites - |
393
|
|
|
|
|
|
|
$a_n * $singletons) / |
394
|
|
|
|
|
|
|
sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites)); |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head2 fu_and_li_F |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
Title : fu_and_li_F |
401
|
|
|
|
|
|
|
Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts); |
402
|
|
|
|
|
|
|
Function: Calculate Fu and Li's F on an ingroup with either the set of |
403
|
|
|
|
|
|
|
outgroup individuals, or the number of external mutations |
404
|
|
|
|
|
|
|
Returns : decimal number |
405
|
|
|
|
|
|
|
Args : array ref of L objects for the ingroup |
406
|
|
|
|
|
|
|
OR a L object |
407
|
|
|
|
|
|
|
number of external mutations OR list of individuals for the outgroup |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=cut |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
#' |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
sub fu_and_li_F { |
414
|
5
|
|
|
5
|
1
|
472
|
my ($self,$ingroup,$outgroup) = @_; |
415
|
5
|
|
|
|
|
3
|
my ($seg_sites,$pi,$n,$external,$internal); |
416
|
5
|
100
|
33
|
|
|
38
|
if( ref($ingroup) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
417
|
2
|
|
|
|
|
4
|
$n = scalar @$ingroup; |
418
|
|
|
|
|
|
|
# pi - all pairwise differences |
419
|
2
|
|
|
|
|
5
|
$pi = $self->pi($ingroup); |
420
|
2
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($ingroup); |
421
|
|
|
|
|
|
|
} elsif( ref($ingroup) && |
422
|
|
|
|
|
|
|
$ingroup->isa('Bio::PopGen::PopulationI')) { |
423
|
3
|
|
|
|
|
6
|
$n = $ingroup->get_number_individuals; |
424
|
3
|
|
|
|
|
6
|
$pi = $self->pi($ingroup); |
425
|
3
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($ingroup); |
426
|
|
|
|
|
|
|
} else { |
427
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F"); |
428
|
0
|
|
|
|
|
0
|
return 0; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
5
|
50
|
|
|
|
16
|
if( ! defined $outgroup ) { |
|
|
100
|
|
|
|
|
|
432
|
0
|
|
|
|
|
0
|
$self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations"); |
433
|
0
|
|
|
|
|
0
|
return 0; |
434
|
|
|
|
|
|
|
} elsif( ref($outgroup) ) { |
435
|
2
|
|
|
|
|
7
|
($external,$internal) = $self->derived_mutations($ingroup,$outgroup); |
436
|
|
|
|
|
|
|
} else { |
437
|
3
|
|
|
|
|
3
|
$external = $outgroup; |
438
|
|
|
|
|
|
|
} |
439
|
5
|
|
|
|
|
13
|
$self->fu_and_li_F_counts($n,$pi,$seg_sites,$external); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head2 fu_and_li_F_counts |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Title : fu_li_F_counts |
445
|
|
|
|
|
|
|
Usage : my $F = $statistics->fu_and_li_F_counts($samps,$pi, |
446
|
|
|
|
|
|
|
$sites, |
447
|
|
|
|
|
|
|
$external); |
448
|
|
|
|
|
|
|
Function: Fu and Li F statistic for the raw counts of the number |
449
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
450
|
|
|
|
|
|
|
Returns : decimal number |
451
|
|
|
|
|
|
|
Args : number of samples (N) |
452
|
|
|
|
|
|
|
average pairwise differences (pi) |
453
|
|
|
|
|
|
|
number of segregating sites (n) |
454
|
|
|
|
|
|
|
external mutations (n_e) |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=cut |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub fu_and_li_F_counts { |
460
|
6
|
|
|
6
|
1
|
12
|
my ($self,$n,$pi,$seg_sites, $external) = @_; |
461
|
6
|
|
|
|
|
5
|
my $a_n = 0; |
462
|
6
|
|
|
|
|
14
|
for(my $k= 1; $k < $n; $k++ ) { |
463
|
43
|
|
|
|
|
60
|
$a_n += ( 1 / $k ); |
464
|
|
|
|
|
|
|
} |
465
|
|
|
|
|
|
|
|
466
|
6
|
|
|
|
|
13
|
my $a1 = $a_n + (1 / $n ); |
467
|
|
|
|
|
|
|
|
468
|
6
|
|
|
|
|
7
|
my $b = 0; |
469
|
6
|
|
|
|
|
13
|
for(my $k= 1; $k < $n; $k++ ) { |
470
|
43
|
|
|
|
|
56
|
$b += ( 1 / $k**2 ); |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
|
473
|
6
|
|
|
|
|
17
|
my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) / |
474
|
|
|
|
|
|
|
( ( $n - 1) * ( $n - 2 ) ) ); |
475
|
|
|
|
|
|
|
|
476
|
6
|
|
|
|
|
23
|
my $v_F = ( $c + ( (2*(($n**2)+$n+3)) / |
477
|
|
|
|
|
|
|
( (9*$n)*($n-1) ) ) - |
478
|
|
|
|
|
|
|
(2/($n-1)) ) / ( ($a_n**2)+$b ); |
479
|
|
|
|
|
|
|
|
480
|
6
|
|
|
|
|
18
|
my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )- |
481
|
|
|
|
|
|
|
( 4*( ($n+1)/(($n-1)**2) ))* |
482
|
|
|
|
|
|
|
($a1 - ((2*$n)/($n+1))) ) / |
483
|
|
|
|
|
|
|
$a_n - $v_F; |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# warn("$v_F vf $u_F uf n = $n\n"); |
486
|
6
|
|
|
|
|
13
|
my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) + |
487
|
|
|
|
|
|
|
($v_F*($seg_sites**2)) ) ); |
488
|
|
|
|
|
|
|
|
489
|
6
|
|
|
|
|
47
|
return $F; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 fu_and_li_F_star |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
Title : fu_and_li_F_star |
495
|
|
|
|
|
|
|
Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup); |
496
|
|
|
|
|
|
|
Function: Calculate Fu and Li's F* on an ingroup without an outgroup |
497
|
|
|
|
|
|
|
It uses count of singleton alleles instead |
498
|
|
|
|
|
|
|
Returns : decimal number |
499
|
|
|
|
|
|
|
Args : array ref of L objects for the ingroup |
500
|
|
|
|
|
|
|
OR |
501
|
|
|
|
|
|
|
L object |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=cut |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
#' keep my emacs happy |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
sub fu_and_li_F_star { |
508
|
3
|
|
|
3
|
1
|
447
|
my ($self,$individuals) = @_; |
509
|
|
|
|
|
|
|
|
510
|
3
|
|
|
|
|
4
|
my ($seg_sites,$pi,$n,$singletons); |
511
|
3
|
100
|
33
|
|
|
22
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
512
|
2
|
|
|
|
|
3
|
$n = scalar @$individuals; |
513
|
|
|
|
|
|
|
# pi - all pairwise differences |
514
|
2
|
|
|
|
|
6
|
$pi = $self->pi($individuals); |
515
|
2
|
|
|
|
|
7
|
$seg_sites = $self->segregating_sites_count($individuals); |
516
|
2
|
|
|
|
|
7
|
$singletons = $self->singleton_count($individuals); |
517
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
518
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI')) { |
519
|
1
|
|
|
|
|
2
|
my $pop = $individuals; |
520
|
1
|
|
|
|
|
2
|
$n = $pop->get_number_individuals; |
521
|
1
|
|
|
|
|
3
|
$pi = $self->pi($pop); |
522
|
1
|
|
|
|
|
4
|
$seg_sites = $self->segregating_sites_count($pop); |
523
|
1
|
|
|
|
|
3
|
$singletons = $self->singleton_count($pop); |
524
|
|
|
|
|
|
|
} else { |
525
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star"); |
526
|
0
|
|
|
|
|
0
|
return 0; |
527
|
|
|
|
|
|
|
} |
528
|
3
|
|
|
|
|
8
|
return $self->fu_and_li_F_star_counts($n, |
529
|
|
|
|
|
|
|
$pi, |
530
|
|
|
|
|
|
|
$seg_sites, |
531
|
|
|
|
|
|
|
$singletons); |
532
|
|
|
|
|
|
|
} |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=head2 fu_and_li_F_star_counts |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
Title : fu_li_F_star_counts |
537
|
|
|
|
|
|
|
Usage : my $F = $statistics->fu_and_li_F_star_counts($samps, |
538
|
|
|
|
|
|
|
$pi,$sites, |
539
|
|
|
|
|
|
|
$singletons); |
540
|
|
|
|
|
|
|
Function: Fu and Li F statistic for the raw counts of the number |
541
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
542
|
|
|
|
|
|
|
Returns : decimal number |
543
|
|
|
|
|
|
|
Args : number of samples (N) |
544
|
|
|
|
|
|
|
average pairwise differences (pi) |
545
|
|
|
|
|
|
|
number of segregating sites (n) |
546
|
|
|
|
|
|
|
singleton mutations (n_s) |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=cut |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
sub fu_and_li_F_star_counts { |
552
|
4
|
|
|
4
|
1
|
7
|
my ($self,$n,$pi,$seg_sites, $singletons) = @_; |
553
|
4
|
50
|
|
|
|
30
|
if( $n <= 1 ) { |
554
|
0
|
|
|
|
|
0
|
$self->warn("N must be > 1\n"); |
555
|
0
|
|
|
|
|
0
|
return; |
556
|
|
|
|
|
|
|
} |
557
|
4
|
50
|
|
|
|
7
|
if( $n == 2) { |
558
|
0
|
|
|
|
|
0
|
return 0; |
559
|
|
|
|
|
|
|
} |
560
|
|
|
|
|
|
|
|
561
|
4
|
|
|
|
|
5
|
my $a_n = 0; |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
|
564
|
4
|
|
|
|
|
4
|
my $b = 0; |
565
|
4
|
|
|
|
|
12
|
for(my $k= 1; $k < $n; $k++ ) { |
566
|
35
|
|
|
|
|
34
|
$b += (1 / ($k**2)); |
567
|
35
|
|
|
|
|
43
|
$a_n += ( 1 / $k ); # Eq (2) |
568
|
|
|
|
|
|
|
} |
569
|
4
|
|
|
|
|
6
|
my $a1 = $a_n + (1 / $n ); |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# warn("a_n is $a_n a1 is $a1 n is $n b is $b\n"); |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# From Simonsen et al (1995) instead of Fu and Li 1993 |
574
|
4
|
|
|
|
|
20
|
my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/ |
575
|
|
|
|
|
|
|
(9 * ($n ** 2) * ( $n - 1))) + |
576
|
|
|
|
|
|
|
((2 * ($n - 1) * $a_n ) / $n ** 2) - |
577
|
|
|
|
|
|
|
(8 * $b / $n) ) / |
578
|
|
|
|
|
|
|
( ($a_n ** 2) + $b ); |
579
|
|
|
|
|
|
|
|
580
|
4
|
|
|
|
|
11
|
my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) / |
581
|
|
|
|
|
|
|
(3 * $n * ( $n - 1))) / $a_n) - $v_F_star; |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
# warn("vf* = $v_F_star uf* = $u_F_star n = $n\n"); |
584
|
4
|
|
|
|
|
9
|
my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) / |
585
|
|
|
|
|
|
|
sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2); |
586
|
4
|
|
|
|
|
31
|
return $F_star; |
587
|
|
|
|
|
|
|
} |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=head2 tajima_D |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
Title : tajima_D |
592
|
|
|
|
|
|
|
Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples); |
593
|
|
|
|
|
|
|
Function: Calculate Tajima's D on a set of samples |
594
|
|
|
|
|
|
|
Returns : decimal number |
595
|
|
|
|
|
|
|
Args : array ref of L objects |
596
|
|
|
|
|
|
|
OR |
597
|
|
|
|
|
|
|
L object |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=cut |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
#' |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
sub tajima_D { |
605
|
6
|
|
|
6
|
1
|
491
|
my ($self,$individuals) = @_; |
606
|
6
|
|
|
|
|
10
|
my ($seg_sites,$pi,$n); |
607
|
|
|
|
|
|
|
|
608
|
6
|
100
|
33
|
|
|
57
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
609
|
2
|
|
|
|
|
3
|
$n = scalar @$individuals; |
610
|
|
|
|
|
|
|
# pi - all pairwise differences |
611
|
2
|
|
|
|
|
5
|
$pi = $self->pi($individuals); |
612
|
2
|
|
|
|
|
7
|
$seg_sites = $self->segregating_sites_count($individuals); |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
615
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI')) { |
616
|
4
|
|
|
|
|
5
|
my $pop = $individuals; |
617
|
4
|
|
|
|
|
14
|
$n = $pop->get_number_individuals; |
618
|
4
|
|
|
|
|
16
|
$pi = $self->pi($pop); |
619
|
4
|
|
|
|
|
16
|
$seg_sites = $self->segregating_sites_count($pop); |
620
|
|
|
|
|
|
|
} else { |
621
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D"); |
622
|
0
|
|
|
|
|
0
|
return 0; |
623
|
|
|
|
|
|
|
} |
624
|
6
|
|
|
|
|
31
|
$self->tajima_D_counts($n,$seg_sites,$pi); |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=head2 tajima_D_counts |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
Title : tajima_D_counts |
630
|
|
|
|
|
|
|
Usage : my $D = $statistics->tajima_D_counts($samps,$sites,$pi); |
631
|
|
|
|
|
|
|
Function: Tajima's D statistic for the raw counts of the number |
632
|
|
|
|
|
|
|
of samples, sites, and avg pairwise distances (pi) |
633
|
|
|
|
|
|
|
Returns : decimal number |
634
|
|
|
|
|
|
|
Args : number of samples (N) |
635
|
|
|
|
|
|
|
number of segregating sites (n) |
636
|
|
|
|
|
|
|
average pairwise differences (pi) |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=cut |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
#' |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub tajima_D_counts { |
643
|
6
|
|
|
6
|
1
|
14
|
my ($self,$n,$seg_sites,$pi) = @_; |
644
|
6
|
|
|
|
|
5
|
my $a1 = 0; |
645
|
6
|
|
|
|
|
21
|
for(my $k= 1; $k < $n; $k++ ) { |
646
|
284
|
|
|
|
|
316
|
$a1 += ( 1 / $k ); |
647
|
|
|
|
|
|
|
} |
648
|
|
|
|
|
|
|
|
649
|
6
|
|
|
|
|
7
|
my $a2 = 0; |
650
|
6
|
|
|
|
|
30
|
for(my $k= 1; $k < $n; $k++ ) { |
651
|
284
|
|
|
|
|
335
|
$a2 += ( 1 / $k**2 ); |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
|
654
|
6
|
|
|
|
|
13
|
my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) ); |
655
|
6
|
|
|
|
|
14
|
my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) / |
656
|
|
|
|
|
|
|
( ( 9 * $n) * ( $n - 1) ); |
657
|
6
|
|
|
|
|
10
|
my $c1 = $b1 - ( 1 / $a1 ); |
658
|
6
|
|
|
|
|
43
|
my $c2 = $b2 - ( ( $n + 2 ) / |
659
|
|
|
|
|
|
|
( $a1 * $n))+( $a2 / $a1 ** 2); |
660
|
6
|
|
|
|
|
10
|
my $e1 = $c1 / $a1; |
661
|
6
|
|
|
|
|
8
|
my $e2 = $c2 / ( $a1**2 + $a2 ); |
662
|
|
|
|
|
|
|
|
663
|
6
|
|
|
|
|
13
|
my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1))); |
664
|
6
|
50
|
|
|
|
16
|
return if $denom == 0; |
665
|
6
|
|
|
|
|
7
|
my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom; |
666
|
6
|
|
|
|
|
92
|
return $D; |
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=head2 pi |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
Title : pi |
673
|
|
|
|
|
|
|
Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds) |
674
|
|
|
|
|
|
|
Function: Calculate pi (average number of pairwise differences) given |
675
|
|
|
|
|
|
|
a list of individuals which have the same number of markers |
676
|
|
|
|
|
|
|
(also called sites) as available from the get_Genotypes() |
677
|
|
|
|
|
|
|
call in L |
678
|
|
|
|
|
|
|
Returns : decimal number |
679
|
|
|
|
|
|
|
Args : Arg1= array ref of L objects |
680
|
|
|
|
|
|
|
which have markers/mutations. We expect all individuals to |
681
|
|
|
|
|
|
|
have a marker - we will deal with missing data as a special case. |
682
|
|
|
|
|
|
|
OR |
683
|
|
|
|
|
|
|
Arg1= L object. In the event that |
684
|
|
|
|
|
|
|
only allele frequency data is available, storing it in |
685
|
|
|
|
|
|
|
Population object will make this available. |
686
|
|
|
|
|
|
|
num sites [optional], an optional second argument (integer) |
687
|
|
|
|
|
|
|
which is the number of sites, then pi returned is pi/site. |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=cut |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
sub pi { |
692
|
21
|
|
|
21
|
1
|
50
|
my ($self,$individuals,$numsites) = @_; |
693
|
21
|
|
|
|
|
21
|
my (%data,%marker_total,@marker_names,$n); |
694
|
|
|
|
|
|
|
|
695
|
21
|
100
|
33
|
|
|
123
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# one possible argument is an arrayref of Bio::PopGen::IndividualI objs |
697
|
9
|
|
|
|
|
40
|
@marker_names = $individuals->[0]->get_marker_names; |
698
|
9
|
|
|
|
|
21
|
$n = scalar @$individuals; |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# Here we are calculating the allele frequencies |
701
|
9
|
|
|
|
|
12
|
foreach my $ind ( @$individuals ) { |
702
|
45
|
50
|
|
|
|
120
|
if( ! $ind->isa('Bio::PopGen::IndividualI') ) { |
703
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n"); |
704
|
0
|
|
|
|
|
0
|
return 0; |
705
|
|
|
|
|
|
|
} |
706
|
45
|
|
|
|
|
45
|
foreach my $m ( @marker_names ) { |
707
|
2125
|
|
|
|
|
2593
|
foreach my $allele (map { $_->get_Alleles} |
|
2125
|
|
|
|
|
2462
|
|
708
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
709
|
2125
|
|
|
|
|
1932
|
$data{$m}->{$allele}++; |
710
|
2125
|
|
|
|
|
2383
|
$marker_total{$m}++; |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
} |
714
|
|
|
|
|
|
|
# while( my ($marker,$count) = each %marker_total ) { |
715
|
|
|
|
|
|
|
# foreach my $c ( values %{$data{$marker}} ) { |
716
|
|
|
|
|
|
|
# $c /= $count; |
717
|
|
|
|
|
|
|
# } |
718
|
|
|
|
|
|
|
# } |
719
|
|
|
|
|
|
|
# %data will contain allele frequencies for each marker, allele |
720
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
721
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI') ) { |
722
|
12
|
|
|
|
|
33
|
my $pop = $individuals; |
723
|
12
|
|
|
|
|
28
|
$n = $pop->get_number_individuals; |
724
|
12
|
|
|
|
|
80
|
foreach my $marker( $pop->get_Markers ) { |
725
|
143
|
|
|
|
|
374
|
push @marker_names, $marker->name; |
726
|
|
|
|
|
|
|
#$data{$marker->name} = {$marker->get_Allele_Frequencies}; |
727
|
143
|
|
|
|
|
466
|
my @genotypes = $pop->get_Genotypes(-marker => $marker->name); |
728
|
143
|
|
|
|
|
1137
|
for my $al ( map { $_->get_Alleles} @genotypes ) { |
|
12444
|
|
|
|
|
14821
|
|
729
|
24628
|
|
|
|
|
25399
|
$data{$marker->name}->{$al}++; |
730
|
24628
|
|
|
|
|
26737
|
$marker_total{$marker->name}++; |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
|
} else { |
734
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi"); |
735
|
|
|
|
|
|
|
} |
736
|
|
|
|
|
|
|
# based on Kevin Thornton's code: |
737
|
|
|
|
|
|
|
# http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152 |
738
|
|
|
|
|
|
|
# For now we assume that all individuals have the same markers |
739
|
21
|
|
|
|
|
84
|
my ($diffcount,$totalcompare) = (0,0); |
740
|
21
|
|
|
|
|
21
|
my $pi = 0; |
741
|
21
|
|
|
|
|
62
|
while ( my ($marker,$markerdat) = each %data ) { |
742
|
568
|
|
|
|
|
399
|
my $sampsize = $marker_total{$marker}; |
743
|
568
|
|
|
|
|
344
|
my $ssh = 0; |
744
|
568
|
|
|
|
|
795
|
my @alleles = keys %$markerdat; |
745
|
568
|
50
|
|
|
|
670
|
if ( $sampsize > 1 ) { |
746
|
568
|
|
|
|
|
388
|
my $denom = $sampsize * ($sampsize - 1.0); |
747
|
568
|
|
|
|
|
433
|
foreach my $al ( @alleles ) { |
748
|
1118
|
|
|
|
|
1145
|
$ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom; |
749
|
|
|
|
|
|
|
} |
750
|
568
|
|
|
|
|
1077
|
$pi += 1.0 - $ssh; |
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
} |
753
|
21
|
|
|
|
|
239
|
$self->debug( "pi=$pi\n"); |
754
|
21
|
100
|
|
|
|
46
|
if( $numsites ) { |
755
|
2
|
|
|
|
|
14
|
return $pi / $numsites; |
756
|
|
|
|
|
|
|
} else { |
757
|
19
|
|
|
|
|
256
|
return $pi; |
758
|
|
|
|
|
|
|
} |
759
|
|
|
|
|
|
|
} |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=head2 theta |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
Title : theta |
765
|
|
|
|
|
|
|
Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites); |
766
|
|
|
|
|
|
|
Function: Calculates Watterson's theta from the sample size |
767
|
|
|
|
|
|
|
and the number of segregating sites. |
768
|
|
|
|
|
|
|
Providing the third parameter, total number of sites will |
769
|
|
|
|
|
|
|
return theta per site. |
770
|
|
|
|
|
|
|
This is also known as K-hat = K / a_n |
771
|
|
|
|
|
|
|
Returns : decimal number |
772
|
|
|
|
|
|
|
Args : sample size (integer), |
773
|
|
|
|
|
|
|
num segregating sites (integer) |
774
|
|
|
|
|
|
|
total sites (integer) [optional] (to calculate theta per site) |
775
|
|
|
|
|
|
|
OR |
776
|
|
|
|
|
|
|
provide an arrayref of the L objects |
777
|
|
|
|
|
|
|
total sites (integer) [optional] (to calculate theta per site) |
778
|
|
|
|
|
|
|
OR |
779
|
|
|
|
|
|
|
provide an L object |
780
|
|
|
|
|
|
|
total sites (integer)[optional] |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=cut |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
#' |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
sub theta { |
787
|
7
|
|
|
7
|
1
|
687
|
my $self = shift; |
788
|
7
|
|
|
|
|
10
|
my ( $n, $seg_sites,$totalsites) = @_; |
789
|
7
|
100
|
66
|
|
|
68
|
if( ref($n) =~ /ARRAY/i ) { |
|
|
100
|
|
|
|
|
|
790
|
2
|
|
|
|
|
2
|
my $samps = $n; |
791
|
2
|
|
|
|
|
1
|
$totalsites = $seg_sites; # only 2 arguments if one is an array |
792
|
2
|
|
|
|
|
2
|
my %data; |
793
|
2
|
|
|
|
|
5
|
my @marker_names = $samps->[0]->get_marker_names; |
794
|
|
|
|
|
|
|
# we need to calculate number of polymorphic sites |
795
|
2
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($samps); |
796
|
2
|
|
|
|
|
3
|
$n = scalar @$samps; |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
} elsif(ref($n) && |
799
|
|
|
|
|
|
|
$n->isa('Bio::PopGen::PopulationI') ) { |
800
|
|
|
|
|
|
|
# This will handle the case when we pass in a PopulationI object |
801
|
4
|
|
|
|
|
4
|
my $pop = $n; |
802
|
4
|
|
|
|
|
6
|
$totalsites = $seg_sites; # shift the arguments over by one |
803
|
4
|
|
|
|
|
14
|
$n = $pop->haploid_population->get_number_individuals; |
804
|
4
|
|
|
|
|
10
|
$seg_sites = $self->segregating_sites_count($pop); |
805
|
|
|
|
|
|
|
} |
806
|
7
|
|
|
|
|
8
|
my $a1 = 0; |
807
|
7
|
|
|
|
|
18
|
for(my $k= 1; $k < $n; $k++ ) { |
808
|
202
|
|
|
|
|
242
|
$a1 += ( 1 / $k ); |
809
|
|
|
|
|
|
|
} |
810
|
7
|
100
|
|
|
|
12
|
if( $totalsites ) { # 0 and undef are the same can't divide by them |
811
|
2
|
|
|
|
|
3
|
$seg_sites /= $totalsites; |
812
|
|
|
|
|
|
|
} |
813
|
7
|
50
|
|
|
|
14
|
if( $a1 == 0 ) { |
814
|
0
|
|
|
|
|
0
|
return 0; |
815
|
|
|
|
|
|
|
} |
816
|
7
|
|
|
|
|
60
|
return $seg_sites / $a1; |
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 singleton_count |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Title : singleton_count |
822
|
|
|
|
|
|
|
Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds) |
823
|
|
|
|
|
|
|
Function: Calculate the number of mutations/alleles which only occur once in |
824
|
|
|
|
|
|
|
a list of individuals for all sites/markers |
825
|
|
|
|
|
|
|
Returns : (integer) number of alleles which only occur once (integer) |
826
|
|
|
|
|
|
|
Args : arrayref of L objects |
827
|
|
|
|
|
|
|
OR |
828
|
|
|
|
|
|
|
L object |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=cut |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
sub singleton_count { |
833
|
7
|
|
|
7
|
1
|
454
|
my ($self,$individuals) = @_; |
834
|
|
|
|
|
|
|
|
835
|
7
|
|
|
|
|
5
|
my @inds; |
836
|
7
|
100
|
33
|
|
|
36
|
if( ref($individuals) =~ /ARRAY/ ) { |
|
|
50
|
|
|
|
|
|
837
|
5
|
|
|
|
|
9
|
@inds = @$individuals; |
838
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
839
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI') ) { |
840
|
2
|
|
|
|
|
2
|
my $pop = $individuals; |
841
|
2
|
|
|
|
|
4
|
@inds = $pop->get_Individuals(); |
842
|
2
|
50
|
|
|
|
3
|
unless( @inds ) { |
843
|
0
|
|
|
|
|
0
|
$self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies"); |
844
|
0
|
|
|
|
|
0
|
return 0; |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
} else { |
847
|
0
|
|
|
|
|
0
|
$self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects"); |
848
|
0
|
|
|
|
|
0
|
return 0; |
849
|
|
|
|
|
|
|
} |
850
|
|
|
|
|
|
|
# find number of sites where a particular allele is only seen once |
851
|
|
|
|
|
|
|
|
852
|
7
|
|
|
|
|
10
|
my ($singleton_allele_ct,%sites) = (0); |
853
|
|
|
|
|
|
|
# first collect all the alleles into a hash structure |
854
|
|
|
|
|
|
|
|
855
|
7
|
|
|
|
|
10
|
foreach my $n ( @inds ) { |
856
|
35
|
50
|
|
|
|
84
|
if( ! $n->isa('Bio::PopGen::IndividualI') ) { |
857
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n"); |
858
|
0
|
|
|
|
|
0
|
return 0; |
859
|
|
|
|
|
|
|
} |
860
|
35
|
|
|
|
|
44
|
foreach my $g ( $n->get_Genotypes ) { |
861
|
1600
|
|
|
|
|
1702
|
my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles); |
862
|
1600
|
|
|
|
|
1508
|
foreach my $allele (@alleles ) { |
863
|
1600
|
|
|
|
|
2106
|
$sites{$nm}->{$allele}++; |
864
|
|
|
|
|
|
|
} |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
} |
867
|
7
|
|
|
|
|
22
|
foreach my $site ( values %sites ) { # don't really care what the name is |
868
|
320
|
|
|
|
|
278
|
foreach my $allelect ( values %$site ) { # |
869
|
|
|
|
|
|
|
# find the sites which have an allele with only 1 copy |
870
|
636
|
100
|
|
|
|
724
|
$singleton_allele_ct++ if( $allelect == 1 ); |
871
|
|
|
|
|
|
|
} |
872
|
|
|
|
|
|
|
} |
873
|
7
|
|
|
|
|
67
|
return $singleton_allele_ct; |
874
|
|
|
|
|
|
|
} |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# Yes I know that singleton_count and segregating_sites_count are |
877
|
|
|
|
|
|
|
# basically processing the same data so calling them both is |
878
|
|
|
|
|
|
|
# redundant, something I want to fix later but want to make things |
879
|
|
|
|
|
|
|
# correct and simple first |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
=head2 segregating_sites_count |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
Title : segregating_sites_count |
884
|
|
|
|
|
|
|
Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count |
885
|
|
|
|
|
|
|
Function: Gets the number of segregating sites (number of polymorphic sites) |
886
|
|
|
|
|
|
|
Returns : (integer) number of segregating sites |
887
|
|
|
|
|
|
|
Args : arrayref of L objects |
888
|
|
|
|
|
|
|
OR |
889
|
|
|
|
|
|
|
L object |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
=cut |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
# perhaps we'll change this in the future |
894
|
|
|
|
|
|
|
# to return the actual segregating sites |
895
|
|
|
|
|
|
|
# so one can use this to pull in the names of those sites. |
896
|
|
|
|
|
|
|
# Would be trivial if it is useful. |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
sub segregating_sites_count { |
899
|
31
|
|
|
31
|
1
|
490
|
my ($self,$individuals) = @_; |
900
|
31
|
|
|
|
|
40
|
my $type = ref($individuals); |
901
|
31
|
|
|
|
|
29
|
my $seg_sites = 0; |
902
|
31
|
100
|
33
|
|
|
175
|
if( $type =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
903
|
15
|
|
|
|
|
14
|
my %sites; |
904
|
15
|
|
|
|
|
20
|
foreach my $n ( @$individuals ) { |
905
|
75
|
50
|
|
|
|
173
|
if( ! $n->isa('Bio::PopGen::IndividualI') ) { |
906
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n"); |
907
|
0
|
|
|
|
|
0
|
return 0; |
908
|
|
|
|
|
|
|
} |
909
|
75
|
|
|
|
|
105
|
foreach my $g ( $n->get_Genotypes ) { |
910
|
3225
|
|
|
|
|
3509
|
my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles); |
911
|
3225
|
|
|
|
|
3159
|
foreach my $allele (@alleles ) { |
912
|
3225
|
|
|
|
|
4114
|
$sites{$nm}->{$allele}++; |
913
|
|
|
|
|
|
|
} |
914
|
|
|
|
|
|
|
} |
915
|
|
|
|
|
|
|
} |
916
|
15
|
|
|
|
|
47
|
foreach my $site ( values %sites ) { # use values b/c we don't |
917
|
|
|
|
|
|
|
# really care what the name is |
918
|
|
|
|
|
|
|
# find the sites which >1 allele |
919
|
645
|
100
|
|
|
|
902
|
$seg_sites++ if( keys %$site > 1 ); |
920
|
|
|
|
|
|
|
} |
921
|
|
|
|
|
|
|
} elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) { |
922
|
16
|
|
|
|
|
43
|
foreach my $marker ( $individuals->haploid_population->get_Markers ) { |
923
|
163
|
|
|
|
|
217
|
my @alleles = $marker->get_Alleles; |
924
|
163
|
100
|
|
|
|
281
|
$seg_sites++ if ( scalar @alleles > 1 ); |
925
|
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
|
} else { |
927
|
0
|
|
|
|
|
0
|
$self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects"); |
928
|
0
|
|
|
|
|
0
|
return 0; |
929
|
|
|
|
|
|
|
} |
930
|
31
|
|
|
|
|
108
|
return $seg_sites; |
931
|
|
|
|
|
|
|
} |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=head2 heterozygosity |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
Title : heterozygosity |
937
|
|
|
|
|
|
|
Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1); |
938
|
|
|
|
|
|
|
Function: Calculate the heterozgosity for a sample set for a set of alleles |
939
|
|
|
|
|
|
|
Returns : decimal number |
940
|
|
|
|
|
|
|
Args : sample size (integer) |
941
|
|
|
|
|
|
|
frequency of one allele (fraction - must be less than 1) |
942
|
|
|
|
|
|
|
[optional] frequency of another allele - this is only needed |
943
|
|
|
|
|
|
|
in a non-binary allele system |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Note : p^2 + 2pq + q^2 |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=cut |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
sub heterozygosity { |
951
|
0
|
|
|
0
|
1
|
0
|
my ($self,$samp_size, $freq1,$freq2) = @_; |
952
|
0
|
0
|
|
|
|
0
|
if( ! $freq2 ) { $freq2 = 1 - $freq1 } |
|
0
|
|
|
|
|
0
|
|
953
|
0
|
0
|
0
|
|
|
0
|
if( $freq1 > 1 || $freq2 > 1 ) { |
954
|
0
|
|
|
|
|
0
|
$self->warn("heterozygosity expects frequencies to be less than 1"); |
955
|
|
|
|
|
|
|
} |
956
|
0
|
|
|
|
|
0
|
my $sum = ($freq1**2) + (($freq2)**2); |
957
|
0
|
|
|
|
|
0
|
my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ; |
958
|
0
|
|
|
|
|
0
|
return $h; |
959
|
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
=head2 derived_mutations |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
Title : derived_mutations |
965
|
|
|
|
|
|
|
Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup); |
966
|
|
|
|
|
|
|
Function: Calculate the number of alleles or (mutations) which are ancestral |
967
|
|
|
|
|
|
|
and the number which are derived (occurred only on the tips) |
968
|
|
|
|
|
|
|
Returns : array of 2 items - number of external and internal derived |
969
|
|
|
|
|
|
|
mutation |
970
|
|
|
|
|
|
|
Args : ingroup - Ls arrayref OR |
971
|
|
|
|
|
|
|
L |
972
|
|
|
|
|
|
|
outgroup- Ls arrayref OR |
973
|
|
|
|
|
|
|
L OR |
974
|
|
|
|
|
|
|
a single L |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
=cut |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
sub derived_mutations { |
979
|
10
|
|
|
10
|
1
|
13
|
my ($self,$ingroup,$outgroup) = @_; |
980
|
10
|
|
|
|
|
10
|
my (%indata,%outdata,@marker_names); |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
# basically we have to do some type checking |
983
|
|
|
|
|
|
|
# if that perl were typed... |
984
|
10
|
|
|
|
|
16
|
my ($itype,$otype) = (ref($ingroup),ref($outgroup)); |
985
|
|
|
|
|
|
|
|
986
|
10
|
50
|
|
|
|
15
|
return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums |
987
|
|
|
|
|
|
|
# are already the value we |
988
|
|
|
|
|
|
|
# are searching for |
989
|
|
|
|
|
|
|
# pick apart the ingroup |
990
|
|
|
|
|
|
|
# get the data |
991
|
10
|
100
|
33
|
|
|
63
|
if( ref($ingroup) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
992
|
4
|
50
|
33
|
|
|
19
|
if( ! ref($ingroup->[0]) || |
993
|
|
|
|
|
|
|
! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) { |
994
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations"); |
995
|
0
|
|
|
|
|
0
|
return 0; |
996
|
|
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
# we assume that all individuals have the same markers |
998
|
|
|
|
|
|
|
# i.e. that they are aligned |
999
|
4
|
|
|
|
|
8
|
@marker_names = $ingroup->[0]->get_marker_names; |
1000
|
4
|
|
|
|
|
6
|
for my $ind ( @$ingroup ) { |
1001
|
20
|
|
|
|
|
18
|
for my $m ( @marker_names ) { |
1002
|
100
|
|
|
|
|
112
|
for my $allele ( map { $_->get_Alleles } |
|
100
|
|
|
|
|
122
|
|
1003
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
1004
|
100
|
|
|
|
|
138
|
$indata{$m}->{$allele}++; |
1005
|
|
|
|
|
|
|
} |
1006
|
|
|
|
|
|
|
} |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
} elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) { |
1009
|
6
|
|
|
|
|
14
|
@marker_names = $ingroup->get_marker_names; |
1010
|
6
|
|
|
|
|
15
|
for my $ind ( $ingroup->haploid_population->get_Individuals() ) { |
1011
|
30
|
|
|
|
|
24
|
for my $m ( @marker_names ) { |
1012
|
150
|
|
|
|
|
176
|
for my $allele ( map { $_->get_Alleles} |
|
150
|
|
|
|
|
183
|
|
1013
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
1014
|
150
|
|
|
|
|
221
|
$indata{$m}->{$allele}++; |
1015
|
|
|
|
|
|
|
} |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
} |
1018
|
|
|
|
|
|
|
} else { |
1019
|
0
|
|
|
|
|
0
|
$self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations"); |
1020
|
0
|
|
|
|
|
0
|
return 0; |
1021
|
|
|
|
|
|
|
} |
1022
|
|
|
|
|
|
|
|
1023
|
10
|
100
|
|
|
|
29
|
if( $otype =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
1024
|
5
|
50
|
33
|
|
|
26
|
if( ! ref($outgroup->[0]) || |
1025
|
|
|
|
|
|
|
! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) { |
1026
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations"); |
1027
|
0
|
|
|
|
|
0
|
return 0; |
1028
|
|
|
|
|
|
|
} |
1029
|
5
|
|
|
|
|
6
|
for my $ind ( @$outgroup ) { |
1030
|
5
|
|
|
|
|
24
|
for my $m ( @marker_names ) { |
1031
|
25
|
|
|
|
|
32
|
for my $allele ( map { $_->get_Alleles } |
|
25
|
|
|
|
|
35
|
|
1032
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
1033
|
25
|
|
|
|
|
47
|
$outdata{$m}->{$allele}++; |
1034
|
|
|
|
|
|
|
} |
1035
|
|
|
|
|
|
|
} |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
} elsif( $otype->isa('Bio::PopGen::PopulationI') ) { |
1039
|
5
|
|
|
|
|
10
|
for my $ind ( $outgroup->haploid_population->get_Individuals() ) { |
1040
|
5
|
|
|
|
|
7
|
for my $m ( @marker_names ) { |
1041
|
25
|
|
|
|
|
36
|
for my $allele ( map { $_->get_Alleles} |
|
25
|
|
|
|
|
36
|
|
1042
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
1043
|
25
|
|
|
|
|
43
|
$outdata{$m}->{$allele}++; |
1044
|
|
|
|
|
|
|
} |
1045
|
|
|
|
|
|
|
} |
1046
|
|
|
|
|
|
|
} |
1047
|
|
|
|
|
|
|
} else { |
1048
|
0
|
|
|
|
|
0
|
$self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations"); |
1049
|
0
|
|
|
|
|
0
|
return 0; |
1050
|
|
|
|
|
|
|
} |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
# derived mutations are defined as |
1053
|
|
|
|
|
|
|
# |
1054
|
|
|
|
|
|
|
# ingroup (G A T) |
1055
|
|
|
|
|
|
|
# outgroup (A) |
1056
|
|
|
|
|
|
|
# derived mutations are G and T, A is the external mutation |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
# ingroup (A T) |
1059
|
|
|
|
|
|
|
# outgroup (C) |
1060
|
|
|
|
|
|
|
# derived mutations A,T no external/ancestral mutations |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
# ingroup (G A T) |
1063
|
|
|
|
|
|
|
# outgroup (A T) |
1064
|
|
|
|
|
|
|
# cannot determine |
1065
|
|
|
|
|
|
|
|
1066
|
10
|
|
|
|
|
14
|
my ($internal,$external); |
1067
|
10
|
|
|
|
|
13
|
foreach my $marker ( @marker_names ) { |
1068
|
50
|
|
|
|
|
33
|
my @outalleles = keys %{$outdata{$marker}}; |
|
50
|
|
|
|
|
85
|
|
1069
|
50
|
|
|
|
|
42
|
my @in_alleles = keys %{$indata{$marker}}; |
|
50
|
|
|
|
|
68
|
|
1070
|
50
|
100
|
66
|
|
|
145
|
next if( @outalleles > 1 || @in_alleles == 1); |
1071
|
40
|
|
|
|
|
33
|
for my $allele ( @in_alleles ) { |
1072
|
80
|
100
|
|
|
|
114
|
if( ! exists $outdata{$marker}->{$allele} ) { |
1073
|
40
|
100
|
|
|
|
45
|
if( $indata{$marker}->{$allele} == 1 ) { |
1074
|
10
|
|
|
|
|
10
|
$external++; |
1075
|
|
|
|
|
|
|
} else { |
1076
|
30
|
|
|
|
|
28
|
$internal++; |
1077
|
|
|
|
|
|
|
} |
1078
|
|
|
|
|
|
|
} |
1079
|
|
|
|
|
|
|
} |
1080
|
|
|
|
|
|
|
} |
1081
|
10
|
|
|
|
|
52
|
return ($external, $internal); |
1082
|
|
|
|
|
|
|
} |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
=head2 composite_LD |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
Title : composite_LD |
1088
|
|
|
|
|
|
|
Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population); |
1089
|
|
|
|
|
|
|
Function: Calculate the Linkage Disequilibrium |
1090
|
|
|
|
|
|
|
This is for calculating LD for unphased data. |
1091
|
|
|
|
|
|
|
Other methods will be appropriate for phased haplotype data. |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
Returns : Hash of Hashes - first key is site 1,second key is site 2 |
1094
|
|
|
|
|
|
|
and value is LD for those two sites. |
1095
|
|
|
|
|
|
|
my $LDarrayref = $matrix{$site1}->{$site2}; |
1096
|
|
|
|
|
|
|
my ($ldval, $chisquared) = @$LDarrayref; |
1097
|
|
|
|
|
|
|
Args : L or arrayref of |
1098
|
|
|
|
|
|
|
Ls |
1099
|
|
|
|
|
|
|
Reference: Weir B.S. (1996) "Genetic Data Analysis II", |
1100
|
|
|
|
|
|
|
Sinauer, Sunderlanm MA. |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=cut |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
sub composite_LD { |
1105
|
2
|
|
|
2
|
1
|
7
|
my ($self,$pop) = @_; |
1106
|
2
|
50
|
33
|
|
|
17
|
if( ref($pop) =~ /ARRAY/i ) { |
|
|
50
|
|
|
|
|
|
1107
|
0
|
0
|
0
|
|
|
0
|
if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) { |
1108
|
0
|
|
|
|
|
0
|
$pop = Bio::PopGen::Population->new(-individuals => @$pop); |
1109
|
|
|
|
|
|
|
} else { |
1110
|
0
|
|
|
|
|
0
|
$self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects"); |
1111
|
0
|
|
|
|
|
0
|
return (); |
1112
|
|
|
|
|
|
|
} |
1113
|
|
|
|
|
|
|
} elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) { |
1114
|
0
|
|
|
|
|
0
|
$self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects"); |
1115
|
0
|
|
|
|
|
0
|
return (); |
1116
|
|
|
|
|
|
|
} |
1117
|
|
|
|
|
|
|
|
1118
|
2
|
|
|
|
|
247
|
my @marker_names = $pop->get_marker_names; |
1119
|
2
|
|
|
|
|
5
|
my @inds = $pop->get_Individuals; |
1120
|
2
|
|
|
|
|
3
|
my $num_inds = scalar @inds; |
1121
|
2
|
|
|
|
|
1
|
my (%lookup); |
1122
|
|
|
|
|
|
|
# calculate allele frequencies for each marker from the population |
1123
|
|
|
|
|
|
|
# use the built-in get_Marker to get the allele freqs |
1124
|
|
|
|
|
|
|
# we still need to calculate the genotype frequencies |
1125
|
2
|
|
|
|
|
3
|
foreach my $marker_name ( @marker_names ) { |
1126
|
5
|
|
|
|
|
5
|
my(%allelef); |
1127
|
|
|
|
|
|
|
|
1128
|
5
|
|
|
|
|
5
|
foreach my $ind ( @inds ) { |
1129
|
76
|
|
|
|
|
124
|
my ($genotype) = $ind->get_Genotypes(-marker => $marker_name); |
1130
|
76
|
50
|
|
|
|
120
|
if( ! defined $genotype ) { |
1131
|
0
|
|
|
|
|
0
|
$self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n"); |
1132
|
0
|
|
|
|
|
0
|
next; |
1133
|
|
|
|
|
|
|
} |
1134
|
76
|
|
|
|
|
96
|
my @alleles = sort $genotype->get_Alleles; |
1135
|
76
|
100
|
|
|
|
117
|
next if( scalar @alleles != 2); |
1136
|
73
|
|
|
|
|
76
|
my $genostr = join(',', @alleles); |
1137
|
73
|
|
|
|
|
55
|
$allelef{$alleles[0]}++; |
1138
|
73
|
|
|
|
|
86
|
$allelef{$alleles[1]}++; |
1139
|
|
|
|
|
|
|
} |
1140
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
# we should check for cases where there > 2 alleles or |
1142
|
|
|
|
|
|
|
# only 1 allele and throw out those markers. |
1143
|
5
|
|
|
|
|
11
|
my @alleles = sort keys %allelef; |
1144
|
5
|
|
|
|
|
4
|
my $allele_count = scalar @alleles; |
1145
|
|
|
|
|
|
|
# test if site is polymorphic |
1146
|
5
|
50
|
|
|
|
9
|
if( $allele_count != 2) { |
1147
|
|
|
|
|
|
|
# only really warn if we're seeing multi-allele |
1148
|
0
|
0
|
|
|
|
0
|
$self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2; |
1149
|
0
|
|
|
|
|
0
|
next; # skip this marker |
1150
|
|
|
|
|
|
|
} |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
# Need to do something here to detect alleles which aren't |
1153
|
|
|
|
|
|
|
# a single character |
1154
|
5
|
50
|
33
|
|
|
19
|
if( length($alleles[0]) != 1 || |
1155
|
|
|
|
|
|
|
length($alleles[1]) != 1 ) { |
1156
|
0
|
|
|
|
|
0
|
$self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character"); |
1157
|
0
|
|
|
|
|
0
|
next; |
1158
|
|
|
|
|
|
|
} |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
# fix the call for allele 1 (A or B) and |
1161
|
|
|
|
|
|
|
# allele 2 (a or b) in terms of how we'll do the |
1162
|
|
|
|
|
|
|
# N square from Weir p.126 |
1163
|
5
|
|
|
|
|
16
|
$self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n"); |
1164
|
5
|
|
|
|
|
10
|
$lookup{$marker_name}->{'1'} = $alleles[0]; |
1165
|
5
|
|
|
|
|
11
|
$lookup{$marker_name}->{'2'} = $alleles[1]; |
1166
|
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
|
1168
|
2
|
|
|
|
|
5
|
@marker_names = sort keys %lookup; |
1169
|
2
|
|
|
|
|
2
|
my $site_count = scalar @marker_names; |
1170
|
|
|
|
|
|
|
# where the final data will be stored |
1171
|
2
|
|
|
|
|
3
|
my %stats_for_sites; |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
# standard way of generating pairwise combos |
1174
|
|
|
|
|
|
|
# LD is done by comparing all the pairwise site (marker) |
1175
|
|
|
|
|
|
|
# combinations and keeping track of the genotype and |
1176
|
|
|
|
|
|
|
# pairwise genotype (ie genotypes of the 2 sites) frequencies |
1177
|
2
|
|
|
|
|
4
|
for( my $i = 0; $i < $site_count - 1; $i++ ) { |
1178
|
3
|
|
|
|
|
4
|
my $site1 = $marker_names[$i]; |
1179
|
|
|
|
|
|
|
|
1180
|
3
|
|
|
|
|
6
|
for( my $j = $i+1; $j < $site_count ; $j++) { |
1181
|
4
|
|
|
|
|
3
|
my (%genotypes, %total_genotype_count,$total_pairwisegeno_count, |
1182
|
|
|
|
|
|
|
%pairwise_genotypes); |
1183
|
|
|
|
|
|
|
|
1184
|
4
|
|
|
|
|
4
|
my $site2 = $marker_names[$j]; |
1185
|
4
|
|
|
|
|
8
|
my (%allele_count,%allele_freqs) = (0,0); |
1186
|
4
|
|
|
|
|
5
|
foreach my $ind ( @inds ) { |
1187
|
|
|
|
|
|
|
# build string of genotype at site 1 |
1188
|
53
|
|
|
|
|
82
|
my ($genotype1) = $ind->get_Genotypes(-marker => $site1); |
1189
|
53
|
|
|
|
|
75
|
my @alleles1 = sort $genotype1->get_Alleles; |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
# if an individual has only one available allele |
1192
|
|
|
|
|
|
|
# (has a blank or N for one of the chromosomes) |
1193
|
|
|
|
|
|
|
# we don't want to use it in our calculation |
1194
|
|
|
|
|
|
|
|
1195
|
53
|
100
|
|
|
|
78
|
next unless( scalar @alleles1 == 2); |
1196
|
52
|
|
|
|
|
58
|
my $genostr1 = join(',', @alleles1); |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
# build string of genotype at site 2 |
1199
|
52
|
|
|
|
|
70
|
my ($genotype2) = $ind->get_Genotypes(-marker => $site2); |
1200
|
52
|
|
|
|
|
72
|
my @alleles2 = sort $genotype2->get_Alleles; |
1201
|
52
|
|
|
|
|
62
|
my $genostr2 = join(',', @alleles2); |
1202
|
|
|
|
|
|
|
|
1203
|
52
|
100
|
|
|
|
61
|
next unless( scalar @alleles2 == 2); |
1204
|
50
|
|
|
|
|
48
|
for (@alleles1) { |
1205
|
100
|
|
|
|
|
74
|
$allele_count{$site1}++; |
1206
|
100
|
|
|
|
|
84
|
$allele_freqs{$site1}->{$_}++; |
1207
|
|
|
|
|
|
|
} |
1208
|
50
|
|
|
|
|
44
|
$genotypes{$site1}->{$genostr1}++; |
1209
|
50
|
|
|
|
|
25
|
$total_genotype_count{$site1}++; |
1210
|
|
|
|
|
|
|
|
1211
|
50
|
|
|
|
|
42
|
for (@alleles2) { |
1212
|
100
|
|
|
|
|
67
|
$allele_count{$site2}++; |
1213
|
100
|
|
|
|
|
76
|
$allele_freqs{$site2}->{$_}++; |
1214
|
|
|
|
|
|
|
} |
1215
|
50
|
|
|
|
|
37
|
$genotypes{$site2}->{$genostr2}++; |
1216
|
50
|
|
|
|
|
33
|
$total_genotype_count{$site2}++; |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
# We are using the $site1,$site2 to signify |
1219
|
|
|
|
|
|
|
# a unique key |
1220
|
50
|
|
|
|
|
63
|
$pairwise_genotypes{"$genostr1,$genostr2"}++; |
1221
|
|
|
|
|
|
|
# some individuals |
1222
|
50
|
|
|
|
|
57
|
$total_pairwisegeno_count++; |
1223
|
|
|
|
|
|
|
} |
1224
|
4
|
|
|
|
|
8
|
for my $site ( %allele_freqs ) { |
1225
|
16
|
|
|
|
|
9
|
for my $al ( keys %{ $allele_freqs{$site} } ) { |
|
16
|
|
|
|
|
30
|
|
1226
|
16
|
|
|
|
|
20
|
$allele_freqs{$site}->{$al} /= $allele_count{$site}; |
1227
|
|
|
|
|
|
|
} |
1228
|
|
|
|
|
|
|
} |
1229
|
4
|
|
|
|
|
3
|
my $n = $total_pairwisegeno_count; # number of pairs of comparisons |
1230
|
|
|
|
|
|
|
# 'A' and 'B' are two loci or in our case site1 and site2 |
1231
|
4
|
|
|
|
|
6
|
my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele |
1232
|
4
|
|
|
|
|
8
|
my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele |
1233
|
4
|
|
|
|
|
3
|
my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele |
1234
|
4
|
|
|
|
|
4
|
my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele |
1235
|
|
|
|
|
|
|
# AABB |
1236
|
4
|
|
|
|
|
5
|
my $N1genostr = join(",",( $allele1_site1, $allele1_site1, |
1237
|
|
|
|
|
|
|
$allele1_site2, $allele1_site2)); |
1238
|
4
|
|
|
|
|
12
|
$self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n"); |
1239
|
|
|
|
|
|
|
# AABb |
1240
|
4
|
|
|
|
|
6
|
my $N2genostr = join(",",( $allele1_site1, $allele1_site1, |
1241
|
|
|
|
|
|
|
$allele1_site2, $allele2_site2)); |
1242
|
4
|
|
|
|
|
9
|
$self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n"); |
1243
|
|
|
|
|
|
|
# AaBB |
1244
|
4
|
|
|
|
|
6
|
my $N4genostr = join(",",( $allele1_site1, $allele2_site1, |
1245
|
|
|
|
|
|
|
$allele1_site2, $allele1_site2)); |
1246
|
4
|
|
|
|
|
9
|
$self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n"); |
1247
|
|
|
|
|
|
|
# AaBb |
1248
|
4
|
|
|
|
|
5
|
my $N5genostr = join(",",( $allele1_site1, $allele2_site1, |
1249
|
|
|
|
|
|
|
$allele1_site2, $allele2_site2)); |
1250
|
4
|
|
|
|
|
10
|
$self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n"); |
1251
|
|
|
|
|
|
|
# count of AABB in |
1252
|
4
|
|
50
|
|
|
7
|
my $n1 = $pairwise_genotypes{$N1genostr} || 0; |
1253
|
|
|
|
|
|
|
# count of AABb in |
1254
|
4
|
|
100
|
|
|
10
|
my $n2 = $pairwise_genotypes{$N2genostr} || 0; |
1255
|
|
|
|
|
|
|
# count of AaBB in |
1256
|
4
|
|
100
|
|
|
9
|
my $n4 = $pairwise_genotypes{$N4genostr} || 0; |
1257
|
|
|
|
|
|
|
# count of AaBb in |
1258
|
4
|
|
100
|
|
|
16
|
my $n5 = $pairwise_genotypes{$N5genostr} || 0; |
1259
|
|
|
|
|
|
|
|
1260
|
4
|
|
|
|
|
5
|
my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1)); |
1261
|
4
|
|
|
|
|
3
|
my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2)); |
1262
|
4
|
|
50
|
|
|
9
|
my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n; |
1263
|
4
|
|
50
|
|
|
10
|
my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n; |
1264
|
4
|
|
50
|
|
|
6
|
my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq |
1265
|
4
|
|
|
|
|
4
|
my $p_a = 1 - $p_A; |
1266
|
|
|
|
|
|
|
|
1267
|
4
|
|
50
|
|
|
8
|
my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq |
1268
|
4
|
|
|
|
|
2
|
my $p_b = 1 - $p_B; |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
# variance of allele frequencies |
1271
|
4
|
|
|
|
|
6
|
my $pi_A = $p_A * $p_a; |
1272
|
4
|
|
|
|
|
4
|
my $pi_B = $p_B * $p_b; |
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
# hardy weinberg |
1275
|
4
|
|
|
|
|
5
|
my $D_A = $p_AA - $p_A**2; |
1276
|
4
|
|
|
|
|
5
|
my $D_B = $p_BB - $p_B**2; |
1277
|
4
|
|
|
|
|
6
|
my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5; |
1278
|
4
|
|
|
|
|
18
|
$self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n"); |
1279
|
|
|
|
|
|
|
|
1280
|
4
|
|
|
|
|
7
|
my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B ); |
1281
|
4
|
|
|
|
|
36
|
$self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n"); |
1282
|
4
|
|
|
|
|
36
|
$self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n", |
1283
|
|
|
|
|
|
|
$n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B)); |
1284
|
|
|
|
|
|
|
|
1285
|
4
|
|
|
|
|
5
|
my $chisquared; |
1286
|
4
|
|
|
|
|
4
|
eval { $chisquared = ( $n * ($delta_AB**2) ) / |
|
4
|
|
|
|
|
7
|
|
1287
|
|
|
|
|
|
|
( ( $pi_A + $D_A) * ( $pi_B + $D_B) ); |
1288
|
|
|
|
|
|
|
}; |
1289
|
4
|
50
|
|
|
|
6
|
if( $@ ) { |
1290
|
0
|
|
|
|
|
0
|
$self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n"); |
1291
|
0
|
|
|
|
|
0
|
next; |
1292
|
|
|
|
|
|
|
} |
1293
|
|
|
|
|
|
|
# this will be an upper triangular matrix |
1294
|
4
|
|
|
|
|
37
|
$stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared]; |
1295
|
|
|
|
|
|
|
} |
1296
|
|
|
|
|
|
|
} |
1297
|
2
|
|
|
|
|
12
|
return %stats_for_sites; |
1298
|
|
|
|
|
|
|
} |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
=head2 mcdonald_kreitman |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
Title : mcdonald_kreitman |
1303
|
|
|
|
|
|
|
Usage : $Fstat = mcdonald_kreitman($ingroup, $outgroup); |
1304
|
|
|
|
|
|
|
Function: Calculates McDonald-Kreitman statistic based on a set of ingroup |
1305
|
|
|
|
|
|
|
individuals and an outgroup by computing the number of |
1306
|
|
|
|
|
|
|
differences at synonymous and non-synonymous sites |
1307
|
|
|
|
|
|
|
for intraspecific comparisons and with the outgroup |
1308
|
|
|
|
|
|
|
Returns : 2x2 table, followed by a hash reference indicating any |
1309
|
|
|
|
|
|
|
warning messages about the status of the alleles or codons |
1310
|
|
|
|
|
|
|
Args : -ingroup => L object or |
1311
|
|
|
|
|
|
|
arrayref of Ls |
1312
|
|
|
|
|
|
|
-outgroup => L object or |
1313
|
|
|
|
|
|
|
arrayef of Ls |
1314
|
|
|
|
|
|
|
-polarized => Boolean, to indicate if this should be |
1315
|
|
|
|
|
|
|
a polarized test. Must provide two individuals |
1316
|
|
|
|
|
|
|
as outgroups. |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
=cut |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
sub mcdonald_kreitman { |
1321
|
6
|
|
|
6
|
1
|
4983
|
my ($self,@args) = @_; |
1322
|
6
|
|
|
|
|
35
|
my ($ingroup, $outgroup,$polarized) = |
1323
|
|
|
|
|
|
|
$self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args); |
1324
|
6
|
|
|
|
|
30
|
my $verbose = $self->verbose; |
1325
|
6
|
|
|
|
|
9
|
my $outgroup_count; |
1326
|
6
|
|
|
|
|
8
|
my $gapchar = '\-'; |
1327
|
6
|
50
|
|
|
|
30
|
if( ref($outgroup) =~ /ARRAY/i ) { |
|
|
0
|
|
|
|
|
|
1328
|
6
|
|
|
|
|
10
|
$outgroup_count = scalar @$outgroup; |
1329
|
|
|
|
|
|
|
} elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) { |
1330
|
0
|
|
|
|
|
0
|
$outgroup_count = $outgroup->get_number_individuals; |
1331
|
|
|
|
|
|
|
} else { |
1332
|
0
|
|
|
|
|
0
|
$self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI"); |
1333
|
|
|
|
|
|
|
} |
1334
|
|
|
|
|
|
|
|
1335
|
6
|
100
|
|
|
|
24
|
if( $polarized ) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1336
|
2
|
50
|
|
|
|
8
|
if( $outgroup_count < 2 ) { |
1337
|
0
|
|
|
|
|
0
|
$self->throw("Need 2 outgroups with polarized option\n"); |
1338
|
|
|
|
|
|
|
} |
1339
|
|
|
|
|
|
|
} elsif( $outgroup_count > 1 ) { |
1340
|
0
|
|
|
|
|
0
|
$self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count )); |
1341
|
|
|
|
|
|
|
} elsif( $outgroup_count == 0 ) { |
1342
|
0
|
|
|
|
|
0
|
$self->throw("No outgroup sequence provided"); |
1343
|
|
|
|
|
|
|
} |
1344
|
|
|
|
|
|
|
|
1345
|
6
|
|
|
|
|
38
|
my $codon_path = Bio::MolEvol::CodonModel->codon_path; |
1346
|
|
|
|
|
|
|
|
1347
|
6
|
|
|
|
|
524
|
my (%marker_names,%unique,@inds); |
1348
|
6
|
|
|
|
|
12
|
for my $p ( $ingroup, $outgroup) { |
1349
|
12
|
50
|
|
|
|
42
|
if( ref($p) =~ /ARRAY/i ) { |
1350
|
12
|
|
|
|
|
27
|
push @inds, @$p; |
1351
|
|
|
|
|
|
|
} else { |
1352
|
0
|
|
|
|
|
0
|
push @inds, $p->get_Individuals; |
1353
|
|
|
|
|
|
|
} |
1354
|
|
|
|
|
|
|
} |
1355
|
6
|
|
|
|
|
12
|
for my $i ( @inds ) { |
1356
|
35
|
50
|
|
|
|
89
|
if( $unique{$i->unique_id}++ ) { |
1357
|
0
|
|
|
|
|
0
|
$self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n"); |
1358
|
|
|
|
|
|
|
} |
1359
|
35
|
|
|
|
|
61
|
for my $n ( $i->get_marker_names ) { |
1360
|
14126
|
|
|
|
|
9605
|
$marker_names{$n}++; |
1361
|
|
|
|
|
|
|
} |
1362
|
|
|
|
|
|
|
} |
1363
|
|
|
|
|
|
|
|
1364
|
6
|
|
|
|
|
335
|
my @marker_names = keys %marker_names; |
1365
|
6
|
50
|
|
|
|
81
|
if( $marker_names[0] =~ /^(Site|Codon)/ ) { |
1366
|
|
|
|
|
|
|
# sort by site or codon number and do it in |
1367
|
|
|
|
|
|
|
# a schwartzian transformation baby! |
1368
|
0
|
|
|
|
|
0
|
@marker_names = map { $_->[1] } |
1369
|
0
|
|
|
|
|
0
|
sort { $a->[0] <=> $b->[0] } |
1370
|
0
|
|
|
|
|
0
|
map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names; |
|
0
|
|
|
|
|
0
|
|
1371
|
|
|
|
|
|
|
} |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
|
1374
|
6
|
|
|
|
|
12
|
my $num_inds = scalar @inds; |
1375
|
6
|
|
|
|
|
21
|
my %vals = ( 'ingroup' => $ingroup, |
1376
|
|
|
|
|
|
|
'outgroup' => $outgroup, |
1377
|
|
|
|
|
|
|
); |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
# Make the Codon Table type a parameter! |
1380
|
6
|
|
|
|
|
45
|
my $table = Bio::Tools::CodonTable->new(-id => $codon_table); |
1381
|
6
|
|
|
|
|
15
|
my @vt = qw(outgroup ingroup); |
1382
|
6
|
|
|
|
|
5
|
my %changes; |
1383
|
|
|
|
|
|
|
my %status; |
1384
|
6
|
|
|
|
|
19
|
my %two_by_two = ( 'fixed_N' => 0, |
1385
|
|
|
|
|
|
|
'fixed_S' => 0, |
1386
|
|
|
|
|
|
|
'poly_N' => 0, |
1387
|
|
|
|
|
|
|
'poly_S' => 0); |
1388
|
|
|
|
|
|
|
|
1389
|
6
|
|
|
|
|
12
|
for my $codon ( @marker_names ) { |
1390
|
2436
|
|
|
|
|
1700
|
my (%codonvals); |
1391
|
|
|
|
|
|
|
my %all_alleles; |
1392
|
2436
|
|
|
|
|
2119
|
for my $t ( @vt ) { |
1393
|
4872
|
|
|
|
|
3461
|
my $outcount = 1; |
1394
|
4872
|
|
|
|
|
3395
|
for my $ind ( @{$vals{$t}} ) { |
|
4872
|
|
|
|
|
5995
|
|
1395
|
14126
|
|
|
|
|
21196
|
my @alleles = $ind->get_Genotypes($codon)->get_Alleles; |
1396
|
14126
|
50
|
|
|
|
18456
|
if( @alleles > 2 ) { |
1397
|
0
|
|
|
|
|
0
|
warn("Codon $codon saw ", scalar @alleles, " alleles for ind ", $ind->unique_id, "\n"); |
1398
|
0
|
|
|
|
|
0
|
die; |
1399
|
|
|
|
|
|
|
} else { |
1400
|
14126
|
|
|
|
|
10579
|
my ($allele) = shift @alleles; |
1401
|
14126
|
|
|
|
|
20793
|
$all_alleles{$ind->unique_id} = $allele; |
1402
|
14126
|
|
|
|
|
20428
|
my $AA = $table->translate($allele); |
1403
|
14126
|
50
|
66
|
|
|
53198
|
next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i); |
|
|
|
66
|
|
|
|
|
1404
|
|
|
|
|
|
|
|
1405
|
8971
|
|
|
|
|
6552
|
my $label = $t; |
1406
|
8971
|
100
|
|
|
|
10452
|
if( $t eq 'outgroup' ) { |
1407
|
3244
|
|
|
|
|
3478
|
$label = $t.$outcount++; |
1408
|
|
|
|
|
|
|
} |
1409
|
8971
|
|
|
|
|
10793
|
$codonvals{$label}->{$allele}++; |
1410
|
8971
|
|
|
|
|
14880
|
$codonvals{all}->{$allele}++; |
1411
|
|
|
|
|
|
|
} |
1412
|
|
|
|
|
|
|
} |
1413
|
|
|
|
|
|
|
} |
1414
|
2436
|
|
|
|
|
1874
|
my $total = sum ( values %{$codonvals{'ingroup'}} ); |
|
2436
|
|
|
|
|
5497
|
|
1415
|
2436
|
100
|
100
|
|
|
8190
|
next if( $total && $total < 2 ); # skip sites with < alleles |
1416
|
|
|
|
|
|
|
# process all the seen alleles (codons) |
1417
|
|
|
|
|
|
|
# this is a vertical slide through the alignment |
1418
|
1626
|
100
|
|
|
|
1094
|
if( keys %{$codonvals{all}} <= 1 ) { |
|
1626
|
|
|
|
|
4748
|
|
1419
|
|
|
|
|
|
|
# no changes or no VALID codons - monomorphic |
1420
|
|
|
|
|
|
|
} else { |
1421
|
|
|
|
|
|
|
# grab only the first outgroup codon (what to do with rest?) |
1422
|
278
|
|
|
|
|
173
|
my ($outcodon) = keys %{$codonvals{'outgroup1'}}; |
|
278
|
|
|
|
|
376
|
|
1423
|
278
|
50
|
|
|
|
397
|
if( ! $outcodon ) { |
1424
|
0
|
|
|
|
|
0
|
$status{"no outgroup codon $codon"}++; |
1425
|
0
|
|
|
|
|
0
|
next; |
1426
|
|
|
|
|
|
|
} |
1427
|
278
|
|
|
|
|
385
|
my $out_AA = $table->translate($outcodon); |
1428
|
278
|
|
|
|
|
260
|
my ($outcodon2) = keys %{$codonvals{'outgroup2'}}; |
|
278
|
|
|
|
|
441
|
|
1429
|
278
|
50
|
100
|
|
|
1263
|
if( ($polarized && ($outcodon ne $outcodon2)) || |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1430
|
|
|
|
|
|
|
$out_AA eq 'X' || $out_AA eq '*' ) { |
1431
|
|
|
|
|
|
|
# skip if outgroup codons are different |
1432
|
|
|
|
|
|
|
# (when polarized option is on) |
1433
|
|
|
|
|
|
|
# or skip if the outcodon is STOP or 'NNN' |
1434
|
90
|
50
|
|
|
|
125
|
if( $verbose > 0 ) { |
1435
|
0
|
|
|
|
|
0
|
$self->debug("skipping $out_AA and $outcodon $outcodon2\n"); |
1436
|
|
|
|
|
|
|
} |
1437
|
90
|
|
|
|
|
73
|
$status{'outgroup codons different'}++; |
1438
|
90
|
|
|
|
|
279
|
next; |
1439
|
|
|
|
|
|
|
} |
1440
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
# check if ingroup is actually different from outgroup - |
1442
|
|
|
|
|
|
|
# if there are the same number of alleles when considering |
1443
|
|
|
|
|
|
|
# ALL or just the ingroup, then there is nothing new seen |
1444
|
|
|
|
|
|
|
# in the outgroup so it must be a shared allele (codon) |
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
# so we just count how many total alleles were seen |
1447
|
|
|
|
|
|
|
# if this is the same as the number of alleles seen for just |
1448
|
|
|
|
|
|
|
# the ingroup then the outgroup presents no new information |
1449
|
|
|
|
|
|
|
|
1450
|
188
|
|
|
|
|
127
|
my @ingroup_codons = keys %{$codonvals{'ingroup'}}; |
|
188
|
|
|
|
|
303
|
|
1451
|
188
|
|
|
|
|
234
|
my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon}; |
1452
|
|
|
|
|
|
|
|
1453
|
188
|
50
|
|
|
|
232
|
if( $verbose > 0 ) { |
1454
|
|
|
|
|
|
|
$self->debug("alleles are in: ", join(",", @ingroup_codons), |
1455
|
0
|
|
|
|
|
0
|
" out: ", join(",", keys %{$codonvals{outgroup1}}), |
|
0
|
|
|
|
|
0
|
|
1456
|
|
|
|
|
|
|
" diff_from_out=$diff_from_out\n"); |
1457
|
|
|
|
|
|
|
|
1458
|
0
|
|
|
|
|
0
|
for my $ind ( sort keys %all_alleles ) { |
1459
|
0
|
|
|
|
|
0
|
$self->debug( "$ind\t$all_alleles{$ind}\n"); |
1460
|
|
|
|
|
|
|
} |
1461
|
|
|
|
|
|
|
} |
1462
|
|
|
|
|
|
|
# are all the ingroup alleles the same and diferent from outgroup? |
1463
|
|
|
|
|
|
|
# fixed differences between species |
1464
|
188
|
100
|
|
|
|
216
|
if( $diff_from_out ) { |
1465
|
99
|
100
|
|
|
|
129
|
if( scalar @ingroup_codons == 1 ) { |
1466
|
|
|
|
|
|
|
# fixed differences |
1467
|
92
|
50
|
|
|
|
385
|
if( $outcodon =~ /^$gapchar/ ) { |
|
|
50
|
|
|
|
|
|
1468
|
0
|
|
|
|
|
0
|
$status{'outgroup codons with gaps'}++; |
1469
|
0
|
|
|
|
|
0
|
next; |
1470
|
|
|
|
|
|
|
} elsif( $ingroup_codons[0] =~ /$gapchar/) { |
1471
|
0
|
|
|
|
|
0
|
$status{'ingroup codons with gaps'}++; |
1472
|
0
|
|
|
|
|
0
|
next; |
1473
|
|
|
|
|
|
|
} |
1474
|
92
|
|
|
|
|
166
|
my $path = $codon_path->{uc $ingroup_codons[0].$outcodon}; |
1475
|
92
|
|
|
|
|
102
|
$two_by_two{fixed_N} += $path->[0]; |
1476
|
92
|
|
|
|
|
86
|
$two_by_two{fixed_S} += $path->[1]; |
1477
|
92
|
50
|
|
|
|
376
|
if( $verbose > 0 ) { |
1478
|
0
|
|
|
|
|
0
|
$self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n"); |
1479
|
0
|
|
|
|
|
0
|
$self->debug("path is ",join(",",@$path),"\n"); |
1480
|
|
|
|
|
|
|
$self->debug |
1481
|
|
|
|
|
|
|
(sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA, |
1482
|
0
|
|
|
|
|
0
|
@$path, map { $two_by_two{$_} } |
|
0
|
|
|
|
|
0
|
|
1483
|
|
|
|
|
|
|
qw(fixed_N fixed_S poly_N poly_S))); |
1484
|
|
|
|
|
|
|
} |
1485
|
|
|
|
|
|
|
} else { |
1486
|
|
|
|
|
|
|
# polymorphic and all are different from outgroup |
1487
|
|
|
|
|
|
|
# Here we find the minimum number of NS subst |
1488
|
7
|
|
|
|
|
10
|
my ($Ndiff,$Sdiff) = (3,0); # most different path |
1489
|
7
|
|
|
|
|
8
|
for my $c ( @ingroup_codons ) { |
1490
|
14
|
50
|
33
|
|
|
79
|
next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/); |
1491
|
14
|
|
|
|
|
24
|
my $path = $codon_path->{uc $c.$outcodon}; |
1492
|
14
|
|
|
|
|
16
|
my ($tNdiff,$tSdiff) = @$path; |
1493
|
14
|
100
|
100
|
|
|
46
|
if( $path->[0] < $Ndiff || |
|
|
|
66
|
|
|
|
|
1494
|
|
|
|
|
|
|
($tNdiff == $Ndiff && |
1495
|
|
|
|
|
|
|
$tSdiff <= $Sdiff)) { |
1496
|
11
|
|
|
|
|
15
|
($Ndiff,$Sdiff) = ($tNdiff,$tSdiff); |
1497
|
|
|
|
|
|
|
} |
1498
|
|
|
|
|
|
|
} |
1499
|
7
|
|
|
|
|
10
|
$two_by_two{fixed_N} += $Ndiff; |
1500
|
7
|
|
|
|
|
7
|
$two_by_two{fixed_S} += $Sdiff; |
1501
|
7
|
50
|
|
|
|
11
|
if( @ingroup_codons > 2 ) { |
1502
|
0
|
|
|
|
|
0
|
$status{"more than 2 ingroup codons $codon"}++; |
1503
|
0
|
|
|
|
|
0
|
warn("more than 2 ingroup codons (@ingroup_codons)\n"); |
1504
|
|
|
|
|
|
|
} else { |
1505
|
7
|
|
|
|
|
16
|
my $path = $codon_path->{uc join('',@ingroup_codons)}; |
1506
|
|
|
|
|
|
|
|
1507
|
7
|
|
|
|
|
10
|
$two_by_two{poly_N} += $path->[0]; |
1508
|
7
|
|
|
|
|
6
|
$two_by_two{poly_S} += $path->[1]; |
1509
|
7
|
50
|
|
|
|
33
|
if( $verbose > 0 ) { |
1510
|
0
|
|
|
|
|
0
|
$self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S))); |
|
0
|
|
|
|
|
0
|
|
1511
|
|
|
|
|
|
|
} |
1512
|
|
|
|
|
|
|
} |
1513
|
|
|
|
|
|
|
} |
1514
|
|
|
|
|
|
|
} else { |
1515
|
89
|
|
|
|
|
98
|
my %unq = map { $_ => 1 } @ingroup_codons; |
|
181
|
|
|
|
|
273
|
|
1516
|
89
|
|
|
|
|
97
|
delete $unq{$outcodon}; |
1517
|
89
|
|
|
|
|
113
|
my @unique_codons = keys %unq; |
1518
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
# calc path for diff add to poly |
1520
|
|
|
|
|
|
|
# Here we find the minimum number of subst bw |
1521
|
|
|
|
|
|
|
# codons |
1522
|
89
|
|
|
|
|
84
|
my ($Ndiff,$Sdiff) = (3,0); # most different path |
1523
|
89
|
|
|
|
|
84
|
for my $c ( @unique_codons ) { |
1524
|
92
|
|
|
|
|
159
|
my $path = $codon_path->{uc $c.$outcodon }; |
1525
|
92
|
50
|
|
|
|
138
|
if( ! defined $path ) { |
1526
|
0
|
|
|
|
|
0
|
die " cannot get path for ", $c.$outcodon, "\n"; |
1527
|
|
|
|
|
|
|
} |
1528
|
92
|
|
|
|
|
87
|
my ($tNdiff,$tSdiff) = @$path; |
1529
|
92
|
50
|
33
|
|
|
163
|
if( $path->[0] < $Ndiff || |
|
|
|
66
|
|
|
|
|
1530
|
|
|
|
|
|
|
($tNdiff == $Ndiff && |
1531
|
|
|
|
|
|
|
$tSdiff <= $Sdiff)) { |
1532
|
92
|
|
|
|
|
127
|
($Ndiff,$Sdiff) = ($tNdiff,$tSdiff); |
1533
|
|
|
|
|
|
|
} |
1534
|
|
|
|
|
|
|
} |
1535
|
|
|
|
|
|
|
|
1536
|
89
|
100
|
|
|
|
139
|
if( @unique_codons == 2 ) { |
1537
|
3
|
|
|
|
|
8
|
my $path = $codon_path->{uc join('',@unique_codons)}; |
1538
|
3
|
50
|
|
|
|
8
|
if( ! defined $path ) { |
1539
|
0
|
|
|
|
|
0
|
$self->throw("no path for @unique_codons\n"); |
1540
|
|
|
|
|
|
|
} |
1541
|
3
|
|
|
|
|
5
|
$Ndiff += $path->[0]; |
1542
|
3
|
|
|
|
|
3
|
$Sdiff += $path->[1]; |
1543
|
|
|
|
|
|
|
} |
1544
|
89
|
|
|
|
|
91
|
$two_by_two{poly_N} += $Ndiff; |
1545
|
89
|
|
|
|
|
77
|
$two_by_two{poly_S} += $Sdiff; |
1546
|
89
|
50
|
|
|
|
374
|
if( $verbose > 0 ) { |
1547
|
|
|
|
|
|
|
$self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA, |
1548
|
0
|
|
|
|
|
0
|
$Ndiff, $Sdiff, map { $two_by_two{$_} } |
|
0
|
|
|
|
|
0
|
|
1549
|
|
|
|
|
|
|
qw(fixed_N fixed_S poly_N poly_S))); |
1550
|
|
|
|
|
|
|
} |
1551
|
|
|
|
|
|
|
} |
1552
|
|
|
|
|
|
|
} |
1553
|
|
|
|
|
|
|
} |
1554
|
|
|
|
|
|
|
return ( $two_by_two{'poly_N'}, |
1555
|
|
|
|
|
|
|
$two_by_two{'fixed_N'}, |
1556
|
|
|
|
|
|
|
$two_by_two{'poly_S'}, |
1557
|
6
|
|
|
|
|
83
|
$two_by_two{'fixed_S'}, |
1558
|
|
|
|
|
|
|
{%status}); |
1559
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
} |
1561
|
|
|
|
|
|
|
|
1562
|
|
|
|
|
|
|
*MK = \&mcdonald_kreitman; |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
=head2 mcdonald_kreitman_counts |
1566
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
Title : mcdonald_kreitman_counts |
1568
|
|
|
|
|
|
|
Usage : my $MK = $statistics->mcdonald_kreitman_counts( |
1569
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
N_poly -> integer of count of non-syn polymorphism |
1571
|
|
|
|
|
|
|
N_fix -> integer of count of non-syn fixed substitutions |
1572
|
|
|
|
|
|
|
S_poly -> integer of count of syn polymorphism |
1573
|
|
|
|
|
|
|
S_fix -> integer of count of syn fixed substitutions |
1574
|
|
|
|
|
|
|
); |
1575
|
|
|
|
|
|
|
Function: |
1576
|
|
|
|
|
|
|
Returns : decimal number |
1577
|
|
|
|
|
|
|
Args : |
1578
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
=cut |
1580
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
sub mcdonald_kreitman_counts { |
1583
|
0
|
|
|
0
|
1
|
|
my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_; |
1584
|
0
|
0
|
|
|
|
|
if( $has_twotailed ) { |
1585
|
0
|
|
|
|
|
|
return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic |
1586
|
|
|
|
|
|
|
(n11=>$Npoly, |
1587
|
|
|
|
|
|
|
n1p=>$Npoly+$Spoly, |
1588
|
|
|
|
|
|
|
np1=>$Npoly+$Nfix, |
1589
|
|
|
|
|
|
|
npp=>$Npoly+$Nfix+$Spoly+$Sfix); |
1590
|
|
|
|
|
|
|
} else { |
1591
|
0
|
|
|
|
|
|
$self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed"); |
1592
|
0
|
|
|
|
|
|
return 0; |
1593
|
|
|
|
|
|
|
} |
1594
|
|
|
|
|
|
|
} |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
1; |