| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# BioPerl module for Bio::PopGen::Statistics |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Cared for by Jason Stajich |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Copyright Jason Stajich |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::PopGen::Statistics - Population Genetics statistical tests |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Bio::PopGen::Statistics; |
|
21
|
|
|
|
|
|
|
use Bio::AlignIO; |
|
22
|
|
|
|
|
|
|
use Bio::PopGen::IO; |
|
23
|
|
|
|
|
|
|
use Bio::PopGen::Simulation::Coalescent; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $tree = $sim->next_tree; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$sim->add_Mutations($tree,20); |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my $stats = Bio::PopGen::Statistics->new(); |
|
32
|
|
|
|
|
|
|
my $individuals = [ $tree->get_leaf_nodes]; |
|
33
|
|
|
|
|
|
|
my $pi = $stats->pi($individuals); |
|
34
|
|
|
|
|
|
|
my $D = $stats->tajima_D($individuals); |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Alternatively to do this on input data from |
|
37
|
|
|
|
|
|
|
# See the tests in t/PopGen.t for more examples |
|
38
|
|
|
|
|
|
|
my $parser = Bio::PopGen::IO->new(-format => 'prettybase', |
|
39
|
|
|
|
|
|
|
-file => 't/data/popstats.prettybase'); |
|
40
|
|
|
|
|
|
|
my $pop = $parser->next_population; |
|
41
|
|
|
|
|
|
|
# Note that you can also call the stats as a class method if you like |
|
42
|
|
|
|
|
|
|
# the only reason to instantiate it (as above) is if you want |
|
43
|
|
|
|
|
|
|
# to set the verbosity for debugging |
|
44
|
|
|
|
|
|
|
$pi = Bio::PopGen::Statistics->pi($pop); |
|
45
|
|
|
|
|
|
|
$theta = Bio::PopGen::Statistics->theta($pop); |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Pi and Theta also take additional arguments, |
|
48
|
|
|
|
|
|
|
# see the documentation for more information |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
use Bio::PopGen::Utilities; |
|
51
|
|
|
|
|
|
|
use Bio::AlignIO; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $in = Bio::AlignIO->new(-file => 't/data/t7.aln', |
|
54
|
|
|
|
|
|
|
-format => 'clustalw'); |
|
55
|
|
|
|
|
|
|
my $aln = $in->next_aln; |
|
56
|
|
|
|
|
|
|
# get a population, each sequence is an individual and |
|
57
|
|
|
|
|
|
|
# for the default case, every site which is not monomorphic |
|
58
|
|
|
|
|
|
|
# is a 'marker'. Each individual will have a 'genotype' for the |
|
59
|
|
|
|
|
|
|
# site which will be the specific base in the alignment at that |
|
60
|
|
|
|
|
|
|
# site |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln); |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
This object is intended to provide implementations some standard |
|
68
|
|
|
|
|
|
|
population genetics statistics about alleles in populations. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
This module was previously named Bio::Tree::Statistics. |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
This object is a place to accumulate routines for calculating various |
|
73
|
|
|
|
|
|
|
statistics from the coalescent simulation, marker/allele, or from |
|
74
|
|
|
|
|
|
|
aligned sequence data given that you can calculate alleles, number of |
|
75
|
|
|
|
|
|
|
segregating sites. |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
Currently implemented: |
|
78
|
|
|
|
|
|
|
Fu and Li's D (fu_and_li_D) |
|
79
|
|
|
|
|
|
|
Fu and Li's D* (fu_and_li_D_star) |
|
80
|
|
|
|
|
|
|
Fu and Li's F (fu_and_li_F) |
|
81
|
|
|
|
|
|
|
Fu and Li's F* (fu_and_li_F_star) |
|
82
|
|
|
|
|
|
|
Tajima's D (tajima_D) |
|
83
|
|
|
|
|
|
|
Watterson's theta (theta) |
|
84
|
|
|
|
|
|
|
pi (pi) - number of pairwise differences |
|
85
|
|
|
|
|
|
|
composite_LD (composite_LD) |
|
86
|
|
|
|
|
|
|
McDonald-Kreitman (mcdonald_kreitman or MK) |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Count based methods also exist in case you have already calculated the |
|
89
|
|
|
|
|
|
|
key statistics (seg sites, num individuals, etc) and just want to |
|
90
|
|
|
|
|
|
|
compute the statistic. |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
In all cases where a the method expects an arrayref of |
|
93
|
|
|
|
|
|
|
L objects and L |
|
94
|
|
|
|
|
|
|
object will also work. |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head2 REFERENCES |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of |
|
99
|
|
|
|
|
|
|
Mutations." Genetics 133:693-709. |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples |
|
102
|
|
|
|
|
|
|
from a Population." Genetics 143:557-570. |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
McDonald J, Kreitman M. |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Tajima F. (1989) "Statistical method for testing the neutral mutation |
|
107
|
|
|
|
|
|
|
hypothesis by DNA polymorphism." Genetics 123:585-595. |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head2 CITING THIS WORK |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Please see this reference for use of this implementation. |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73. |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
If you use these Bio::PopGen modules please cite the Bioperl |
|
117
|
|
|
|
|
|
|
publication (see FAQ) and the above reference. |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
125
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
|
126
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
129
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 Support |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
I |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
138
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
139
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
140
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
145
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via |
|
146
|
|
|
|
|
|
|
the web: |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head1 AUTHOR - Jason Stajich, Matthew Hahn |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Email jason-at-bioperl-dot-org |
|
153
|
|
|
|
|
|
|
Email matthew-dot-hahn-at-duke-dot-edu |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
McDonald-Kreitman implementation based on work by Alisha Holloway at |
|
156
|
|
|
|
|
|
|
UC Davis. |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 APPENDIX |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
|
162
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Let the code begin... |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
package Bio::PopGen::Statistics; |
|
171
|
3
|
|
|
3
|
|
3034
|
use strict; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
100
|
|
|
172
|
|
|
|
|
|
|
use constant { |
|
173
|
3
|
|
|
|
|
245
|
in_label => 'ingroup', |
|
174
|
|
|
|
|
|
|
out_label => 'outgroup', |
|
175
|
|
|
|
|
|
|
non_syn => 'non_synonymous', |
|
176
|
|
|
|
|
|
|
syn => 'synonymous', |
|
177
|
|
|
|
|
|
|
default_codon_table => 1, # Standard Codon table |
|
178
|
3
|
|
|
3
|
|
10
|
}; |
|
|
3
|
|
|
|
|
3
|
|
|
179
|
|
|
|
|
|
|
|
|
180
|
3
|
|
|
3
|
|
20862
|
use Bio::MolEvol::CodonModel; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
92
|
|
|
181
|
3
|
|
|
3
|
|
12
|
use List::Util qw(sum); |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
216
|
|
|
182
|
|
|
|
|
|
|
|
|
183
|
3
|
|
|
3
|
|
11
|
use base qw(Bio::Root::Root); |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
333
|
|
|
184
|
|
|
|
|
|
|
our $codon_table => default_codon_table; |
|
185
|
|
|
|
|
|
|
our $has_twotailed => 0; |
|
186
|
|
|
|
|
|
|
BEGIN { |
|
187
|
3
|
|
|
3
|
|
3
|
eval { require Text::NSP::Measures::2D::Fisher2::twotailed }; |
|
|
3
|
|
|
|
|
448
|
|
|
188
|
3
|
50
|
|
|
|
14
|
if( $@ ) { $has_twotailed = 0; } |
|
|
3
|
|
|
|
|
13439
|
|
|
189
|
0
|
|
|
|
|
0
|
else { $has_twotailed = 1; } |
|
190
|
|
|
|
|
|
|
} |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 new |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Title : new |
|
200
|
|
|
|
|
|
|
Usage : my $obj = Bio::PopGen::Statistics->new(); |
|
201
|
|
|
|
|
|
|
Function: Builds a new Bio::PopGen::Statistics object |
|
202
|
|
|
|
|
|
|
Returns : an instance of Bio::PopGen::Statistics |
|
203
|
|
|
|
|
|
|
Args : none |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head2 fu_and_li_D |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Title : fu_and_li_D |
|
212
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup); |
|
213
|
|
|
|
|
|
|
OR |
|
214
|
|
|
|
|
|
|
my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations); |
|
215
|
|
|
|
|
|
|
Function: Fu and Li D statistic for a list of individuals |
|
216
|
|
|
|
|
|
|
given an outgroup and the number of external mutations |
|
217
|
|
|
|
|
|
|
(either provided or calculated from list of outgroup individuals) |
|
218
|
|
|
|
|
|
|
Returns : decimal |
|
219
|
|
|
|
|
|
|
Args : $individuals - array reference which contains ingroup individuals |
|
220
|
|
|
|
|
|
|
(L or derived classes) |
|
221
|
|
|
|
|
|
|
$extmutations - number of external mutations OR |
|
222
|
|
|
|
|
|
|
arrayref of outgroup individuals |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=cut |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub fu_and_li_D { |
|
227
|
7
|
|
|
7
|
1
|
21
|
my ($self,$ingroup,$outgroup) = @_; |
|
228
|
|
|
|
|
|
|
|
|
229
|
7
|
|
|
|
|
10
|
my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0); |
|
230
|
7
|
100
|
33
|
|
|
43
|
if( ref($ingroup) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
231
|
4
|
|
|
|
|
5
|
$n = scalar @$ingroup; |
|
232
|
|
|
|
|
|
|
# pi - all pairwise differences |
|
233
|
4
|
|
|
|
|
9
|
$seg_sites = $self->segregating_sites_count($ingroup); |
|
234
|
|
|
|
|
|
|
} elsif( ref($ingroup) && |
|
235
|
|
|
|
|
|
|
$ingroup->isa('Bio::PopGen::PopulationI')) { |
|
236
|
3
|
|
|
|
|
7
|
$n = $ingroup->get_number_individuals; |
|
237
|
3
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($ingroup); |
|
238
|
|
|
|
|
|
|
} else { |
|
239
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D"); |
|
240
|
0
|
|
|
|
|
0
|
return 0; |
|
241
|
|
|
|
|
|
|
} |
|
242
|
|
|
|
|
|
|
|
|
243
|
7
|
50
|
|
|
|
16
|
if( $seg_sites <= 0 ) { |
|
244
|
0
|
|
|
|
|
0
|
$self->warn("mutation total was not > 0, cannot calculate a Fu and Li D"); |
|
245
|
0
|
|
|
|
|
0
|
return 0; |
|
246
|
|
|
|
|
|
|
} |
|
247
|
|
|
|
|
|
|
|
|
248
|
7
|
50
|
|
|
|
17
|
if( ! defined $outgroup ) { |
|
|
|
100
|
|
|
|
|
|
|
249
|
0
|
|
|
|
|
0
|
$self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations"); |
|
250
|
0
|
|
|
|
|
0
|
return 0; |
|
251
|
|
|
|
|
|
|
} elsif( ref($outgroup) ) { |
|
252
|
4
|
|
|
|
|
8
|
($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup); |
|
253
|
4
|
50
|
|
|
|
9
|
$ancestral = 0 unless defined $ancestral; |
|
254
|
|
|
|
|
|
|
} else { |
|
255
|
3
|
|
|
|
|
4
|
$ancestral = $outgroup; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
|
|
|
|
|
|
|
|
258
|
7
|
|
|
|
|
16
|
return $self->fu_and_li_D_counts($n,$seg_sites, |
|
259
|
|
|
|
|
|
|
$ancestral,$derived); |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head2 fu_and_li_D_counts |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
Title : fu_li_D_counts |
|
265
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_and_li_D_counts($samps,$sites, |
|
266
|
|
|
|
|
|
|
$external); |
|
267
|
|
|
|
|
|
|
Function: Fu and Li D statistic for the raw counts of the number |
|
268
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
|
269
|
|
|
|
|
|
|
Returns : decimal number |
|
270
|
|
|
|
|
|
|
Args : number of samples (N) |
|
271
|
|
|
|
|
|
|
number of segregating sites (n) |
|
272
|
|
|
|
|
|
|
number of external mutations (n_e) |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=cut |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub fu_and_li_D_counts { |
|
278
|
8
|
|
|
8
|
1
|
131
|
my ($self,$n,$seg_sites, $external_mut) = @_; |
|
279
|
8
|
|
|
|
|
7
|
my $a_n = 0; |
|
280
|
8
|
50
|
|
|
|
16
|
if( $n <= 3 ) { |
|
281
|
0
|
|
|
|
|
0
|
$self->warn("n is $n, too small, must be > 3\n"); |
|
282
|
0
|
|
|
|
|
0
|
return; |
|
283
|
|
|
|
|
|
|
} |
|
284
|
8
|
|
|
|
|
24
|
for(my $k= 1; $k < $n; $k++ ) { |
|
285
|
51
|
|
|
|
|
79
|
$a_n += ( 1 / $k ); |
|
286
|
|
|
|
|
|
|
} |
|
287
|
8
|
|
|
|
|
9
|
my $b = 0; |
|
288
|
8
|
|
|
|
|
17
|
for(my $k= 1; $k < $n; $k++ ) { |
|
289
|
51
|
|
|
|
|
84
|
$b += ( 1 / $k**2 ); |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
|
|
292
|
8
|
|
|
|
|
18
|
my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) / |
|
293
|
|
|
|
|
|
|
( ( $n - 1) * ( $n - 2 ) ) ); |
|
294
|
|
|
|
|
|
|
|
|
295
|
8
|
|
|
|
|
23
|
my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) * |
|
296
|
|
|
|
|
|
|
( $c - ( ( $n + 1) / |
|
297
|
|
|
|
|
|
|
( $n - 1) ) )); |
|
298
|
|
|
|
|
|
|
|
|
299
|
8
|
|
|
|
|
10
|
my $u = $a_n - 1 - $v; |
|
300
|
|
|
|
|
|
|
|
|
301
|
8
|
|
|
|
|
80
|
($seg_sites - $a_n * $external_mut) / |
|
302
|
|
|
|
|
|
|
sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites)); |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
} |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head2 fu_and_li_D_star |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
Title : fu_and_li_D_star |
|
310
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_an_li_D_star(\@individuals); |
|
311
|
|
|
|
|
|
|
Function: Fu and Li's D* statistic for a set of samples |
|
312
|
|
|
|
|
|
|
Without an outgroup |
|
313
|
|
|
|
|
|
|
Returns : decimal number |
|
314
|
|
|
|
|
|
|
Args : array ref of L objects |
|
315
|
|
|
|
|
|
|
OR |
|
316
|
|
|
|
|
|
|
L object |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
#' |
|
321
|
|
|
|
|
|
|
# fu_and_li_D* |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
sub fu_and_li_D_star { |
|
324
|
3
|
|
|
3
|
1
|
443
|
my ($self,$individuals) = @_; |
|
325
|
|
|
|
|
|
|
|
|
326
|
3
|
|
|
|
|
4
|
my ($seg_sites,$n,$singletons); |
|
327
|
3
|
100
|
33
|
|
|
22
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
328
|
2
|
|
|
|
|
3
|
$n = scalar @$individuals; |
|
329
|
2
|
|
|
|
|
5
|
$seg_sites = $self->segregating_sites_count($individuals); |
|
330
|
2
|
|
|
|
|
6
|
$singletons = $self->singleton_count($individuals); |
|
331
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
|
332
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI')) { |
|
333
|
1
|
|
|
|
|
1
|
my $pop = $individuals; |
|
334
|
1
|
|
|
|
|
3
|
$n = $pop->get_number_individuals; |
|
335
|
1
|
|
|
|
|
2
|
$seg_sites = $self->segregating_sites_count($pop); |
|
336
|
1
|
|
|
|
|
4
|
$singletons = $self->singleton_count($pop); |
|
337
|
|
|
|
|
|
|
} else { |
|
338
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star"); |
|
339
|
0
|
|
|
|
|
0
|
return 0; |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
|
|
342
|
3
|
|
|
|
|
10
|
return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons); |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head2 fu_and_li_D_star_counts |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Title : fu_li_D_star_counts |
|
348
|
|
|
|
|
|
|
Usage : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites, |
|
349
|
|
|
|
|
|
|
$singletons); |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
Function: Fu and Li D statistic for the raw counts of the number |
|
352
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
|
353
|
|
|
|
|
|
|
Returns : decimal number |
|
354
|
|
|
|
|
|
|
Args : number of samples (N) |
|
355
|
|
|
|
|
|
|
number of segregating sites (n) |
|
356
|
|
|
|
|
|
|
singletons (n_s) |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=cut |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
sub fu_and_li_D_star_counts { |
|
362
|
4
|
|
|
4
|
1
|
5
|
my ($self,$n,$seg_sites, $singletons) = @_; |
|
363
|
4
|
|
|
|
|
4
|
my $a_n; |
|
364
|
4
|
|
|
|
|
14
|
for(my $k = 1; $k < $n; $k++ ) { |
|
365
|
35
|
|
|
|
|
50
|
$a_n += ( 1 / $k ); |
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
|
|
368
|
4
|
|
|
|
|
6
|
my $a1 = $a_n + 1 / $n; |
|
369
|
|
|
|
|
|
|
|
|
370
|
4
|
|
|
|
|
4
|
my $b = 0; |
|
371
|
4
|
|
|
|
|
10
|
for(my $k= 1; $k < $n; $k++ ) { |
|
372
|
35
|
|
|
|
|
47
|
$b += ( 1 / $k**2 ); |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
4
|
|
|
|
|
12
|
my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) / |
|
376
|
|
|
|
|
|
|
( ( $n - 1) * ( $n - 2 ) ) ); |
|
377
|
|
|
|
|
|
|
|
|
378
|
4
|
|
|
|
|
13
|
my $d = $c + ($n -2) / ($n - 1)**2 + |
|
379
|
|
|
|
|
|
|
2 / ($n -1) * |
|
380
|
|
|
|
|
|
|
( 1.5 - ( (2*$a1 - 3) / ($n -2) ) - |
|
381
|
|
|
|
|
|
|
1 / $n ); |
|
382
|
|
|
|
|
|
|
|
|
383
|
4
|
|
|
|
|
17
|
my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) - |
|
384
|
|
|
|
|
|
|
(2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) ) / |
|
385
|
|
|
|
|
|
|
(($a_n**2) + $b); |
|
386
|
|
|
|
|
|
|
|
|
387
|
4
|
|
|
|
|
9
|
my $u_star = ( ($n/($n-1))* |
|
388
|
|
|
|
|
|
|
($a_n - ($n/ |
|
389
|
|
|
|
|
|
|
($n-1)))) - $v_star; |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
|
|
392
|
4
|
|
|
|
|
37
|
return (($n / ($n - 1)) * $seg_sites - |
|
393
|
|
|
|
|
|
|
$a_n * $singletons) / |
|
394
|
|
|
|
|
|
|
sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites)); |
|
395
|
|
|
|
|
|
|
} |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head2 fu_and_li_F |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
Title : fu_and_li_F |
|
401
|
|
|
|
|
|
|
Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts); |
|
402
|
|
|
|
|
|
|
Function: Calculate Fu and Li's F on an ingroup with either the set of |
|
403
|
|
|
|
|
|
|
outgroup individuals, or the number of external mutations |
|
404
|
|
|
|
|
|
|
Returns : decimal number |
|
405
|
|
|
|
|
|
|
Args : array ref of L objects for the ingroup |
|
406
|
|
|
|
|
|
|
OR a L object |
|
407
|
|
|
|
|
|
|
number of external mutations OR list of individuals for the outgroup |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=cut |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
#' |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
sub fu_and_li_F { |
|
414
|
5
|
|
|
5
|
1
|
472
|
my ($self,$ingroup,$outgroup) = @_; |
|
415
|
5
|
|
|
|
|
3
|
my ($seg_sites,$pi,$n,$external,$internal); |
|
416
|
5
|
100
|
33
|
|
|
38
|
if( ref($ingroup) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
417
|
2
|
|
|
|
|
4
|
$n = scalar @$ingroup; |
|
418
|
|
|
|
|
|
|
# pi - all pairwise differences |
|
419
|
2
|
|
|
|
|
5
|
$pi = $self->pi($ingroup); |
|
420
|
2
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($ingroup); |
|
421
|
|
|
|
|
|
|
} elsif( ref($ingroup) && |
|
422
|
|
|
|
|
|
|
$ingroup->isa('Bio::PopGen::PopulationI')) { |
|
423
|
3
|
|
|
|
|
6
|
$n = $ingroup->get_number_individuals; |
|
424
|
3
|
|
|
|
|
6
|
$pi = $self->pi($ingroup); |
|
425
|
3
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($ingroup); |
|
426
|
|
|
|
|
|
|
} else { |
|
427
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F"); |
|
428
|
0
|
|
|
|
|
0
|
return 0; |
|
429
|
|
|
|
|
|
|
} |
|
430
|
|
|
|
|
|
|
|
|
431
|
5
|
50
|
|
|
|
16
|
if( ! defined $outgroup ) { |
|
|
|
100
|
|
|
|
|
|
|
432
|
0
|
|
|
|
|
0
|
$self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations"); |
|
433
|
0
|
|
|
|
|
0
|
return 0; |
|
434
|
|
|
|
|
|
|
} elsif( ref($outgroup) ) { |
|
435
|
2
|
|
|
|
|
7
|
($external,$internal) = $self->derived_mutations($ingroup,$outgroup); |
|
436
|
|
|
|
|
|
|
} else { |
|
437
|
3
|
|
|
|
|
3
|
$external = $outgroup; |
|
438
|
|
|
|
|
|
|
} |
|
439
|
5
|
|
|
|
|
13
|
$self->fu_and_li_F_counts($n,$pi,$seg_sites,$external); |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head2 fu_and_li_F_counts |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Title : fu_li_F_counts |
|
445
|
|
|
|
|
|
|
Usage : my $F = $statistics->fu_and_li_F_counts($samps,$pi, |
|
446
|
|
|
|
|
|
|
$sites, |
|
447
|
|
|
|
|
|
|
$external); |
|
448
|
|
|
|
|
|
|
Function: Fu and Li F statistic for the raw counts of the number |
|
449
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
|
450
|
|
|
|
|
|
|
Returns : decimal number |
|
451
|
|
|
|
|
|
|
Args : number of samples (N) |
|
452
|
|
|
|
|
|
|
average pairwise differences (pi) |
|
453
|
|
|
|
|
|
|
number of segregating sites (n) |
|
454
|
|
|
|
|
|
|
external mutations (n_e) |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=cut |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub fu_and_li_F_counts { |
|
460
|
6
|
|
|
6
|
1
|
12
|
my ($self,$n,$pi,$seg_sites, $external) = @_; |
|
461
|
6
|
|
|
|
|
5
|
my $a_n = 0; |
|
462
|
6
|
|
|
|
|
14
|
for(my $k= 1; $k < $n; $k++ ) { |
|
463
|
43
|
|
|
|
|
60
|
$a_n += ( 1 / $k ); |
|
464
|
|
|
|
|
|
|
} |
|
465
|
|
|
|
|
|
|
|
|
466
|
6
|
|
|
|
|
13
|
my $a1 = $a_n + (1 / $n ); |
|
467
|
|
|
|
|
|
|
|
|
468
|
6
|
|
|
|
|
7
|
my $b = 0; |
|
469
|
6
|
|
|
|
|
13
|
for(my $k= 1; $k < $n; $k++ ) { |
|
470
|
43
|
|
|
|
|
56
|
$b += ( 1 / $k**2 ); |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
|
|
473
|
6
|
|
|
|
|
17
|
my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) / |
|
474
|
|
|
|
|
|
|
( ( $n - 1) * ( $n - 2 ) ) ); |
|
475
|
|
|
|
|
|
|
|
|
476
|
6
|
|
|
|
|
23
|
my $v_F = ( $c + ( (2*(($n**2)+$n+3)) / |
|
477
|
|
|
|
|
|
|
( (9*$n)*($n-1) ) ) - |
|
478
|
|
|
|
|
|
|
(2/($n-1)) ) / ( ($a_n**2)+$b ); |
|
479
|
|
|
|
|
|
|
|
|
480
|
6
|
|
|
|
|
18
|
my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )- |
|
481
|
|
|
|
|
|
|
( 4*( ($n+1)/(($n-1)**2) ))* |
|
482
|
|
|
|
|
|
|
($a1 - ((2*$n)/($n+1))) ) / |
|
483
|
|
|
|
|
|
|
$a_n - $v_F; |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# warn("$v_F vf $u_F uf n = $n\n"); |
|
486
|
6
|
|
|
|
|
13
|
my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) + |
|
487
|
|
|
|
|
|
|
($v_F*($seg_sites**2)) ) ); |
|
488
|
|
|
|
|
|
|
|
|
489
|
6
|
|
|
|
|
47
|
return $F; |
|
490
|
|
|
|
|
|
|
} |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 fu_and_li_F_star |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
Title : fu_and_li_F_star |
|
495
|
|
|
|
|
|
|
Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup); |
|
496
|
|
|
|
|
|
|
Function: Calculate Fu and Li's F* on an ingroup without an outgroup |
|
497
|
|
|
|
|
|
|
It uses count of singleton alleles instead |
|
498
|
|
|
|
|
|
|
Returns : decimal number |
|
499
|
|
|
|
|
|
|
Args : array ref of L objects for the ingroup |
|
500
|
|
|
|
|
|
|
OR |
|
501
|
|
|
|
|
|
|
L object |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=cut |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
#' keep my emacs happy |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
sub fu_and_li_F_star { |
|
508
|
3
|
|
|
3
|
1
|
447
|
my ($self,$individuals) = @_; |
|
509
|
|
|
|
|
|
|
|
|
510
|
3
|
|
|
|
|
4
|
my ($seg_sites,$pi,$n,$singletons); |
|
511
|
3
|
100
|
33
|
|
|
22
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
512
|
2
|
|
|
|
|
3
|
$n = scalar @$individuals; |
|
513
|
|
|
|
|
|
|
# pi - all pairwise differences |
|
514
|
2
|
|
|
|
|
6
|
$pi = $self->pi($individuals); |
|
515
|
2
|
|
|
|
|
7
|
$seg_sites = $self->segregating_sites_count($individuals); |
|
516
|
2
|
|
|
|
|
7
|
$singletons = $self->singleton_count($individuals); |
|
517
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
|
518
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI')) { |
|
519
|
1
|
|
|
|
|
2
|
my $pop = $individuals; |
|
520
|
1
|
|
|
|
|
2
|
$n = $pop->get_number_individuals; |
|
521
|
1
|
|
|
|
|
3
|
$pi = $self->pi($pop); |
|
522
|
1
|
|
|
|
|
4
|
$seg_sites = $self->segregating_sites_count($pop); |
|
523
|
1
|
|
|
|
|
3
|
$singletons = $self->singleton_count($pop); |
|
524
|
|
|
|
|
|
|
} else { |
|
525
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star"); |
|
526
|
0
|
|
|
|
|
0
|
return 0; |
|
527
|
|
|
|
|
|
|
} |
|
528
|
3
|
|
|
|
|
8
|
return $self->fu_and_li_F_star_counts($n, |
|
529
|
|
|
|
|
|
|
$pi, |
|
530
|
|
|
|
|
|
|
$seg_sites, |
|
531
|
|
|
|
|
|
|
$singletons); |
|
532
|
|
|
|
|
|
|
} |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=head2 fu_and_li_F_star_counts |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
Title : fu_li_F_star_counts |
|
537
|
|
|
|
|
|
|
Usage : my $F = $statistics->fu_and_li_F_star_counts($samps, |
|
538
|
|
|
|
|
|
|
$pi,$sites, |
|
539
|
|
|
|
|
|
|
$singletons); |
|
540
|
|
|
|
|
|
|
Function: Fu and Li F statistic for the raw counts of the number |
|
541
|
|
|
|
|
|
|
of samples, sites, external and internal mutations |
|
542
|
|
|
|
|
|
|
Returns : decimal number |
|
543
|
|
|
|
|
|
|
Args : number of samples (N) |
|
544
|
|
|
|
|
|
|
average pairwise differences (pi) |
|
545
|
|
|
|
|
|
|
number of segregating sites (n) |
|
546
|
|
|
|
|
|
|
singleton mutations (n_s) |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=cut |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
sub fu_and_li_F_star_counts { |
|
552
|
4
|
|
|
4
|
1
|
7
|
my ($self,$n,$pi,$seg_sites, $singletons) = @_; |
|
553
|
4
|
50
|
|
|
|
30
|
if( $n <= 1 ) { |
|
554
|
0
|
|
|
|
|
0
|
$self->warn("N must be > 1\n"); |
|
555
|
0
|
|
|
|
|
0
|
return; |
|
556
|
|
|
|
|
|
|
} |
|
557
|
4
|
50
|
|
|
|
7
|
if( $n == 2) { |
|
558
|
0
|
|
|
|
|
0
|
return 0; |
|
559
|
|
|
|
|
|
|
} |
|
560
|
|
|
|
|
|
|
|
|
561
|
4
|
|
|
|
|
5
|
my $a_n = 0; |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
|
|
564
|
4
|
|
|
|
|
4
|
my $b = 0; |
|
565
|
4
|
|
|
|
|
12
|
for(my $k= 1; $k < $n; $k++ ) { |
|
566
|
35
|
|
|
|
|
34
|
$b += (1 / ($k**2)); |
|
567
|
35
|
|
|
|
|
43
|
$a_n += ( 1 / $k ); # Eq (2) |
|
568
|
|
|
|
|
|
|
} |
|
569
|
4
|
|
|
|
|
6
|
my $a1 = $a_n + (1 / $n ); |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# warn("a_n is $a_n a1 is $a1 n is $n b is $b\n"); |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# From Simonsen et al (1995) instead of Fu and Li 1993 |
|
574
|
4
|
|
|
|
|
20
|
my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/ |
|
575
|
|
|
|
|
|
|
(9 * ($n ** 2) * ( $n - 1))) + |
|
576
|
|
|
|
|
|
|
((2 * ($n - 1) * $a_n ) / $n ** 2) - |
|
577
|
|
|
|
|
|
|
(8 * $b / $n) ) / |
|
578
|
|
|
|
|
|
|
( ($a_n ** 2) + $b ); |
|
579
|
|
|
|
|
|
|
|
|
580
|
4
|
|
|
|
|
11
|
my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) / |
|
581
|
|
|
|
|
|
|
(3 * $n * ( $n - 1))) / $a_n) - $v_F_star; |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
# warn("vf* = $v_F_star uf* = $u_F_star n = $n\n"); |
|
584
|
4
|
|
|
|
|
9
|
my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) / |
|
585
|
|
|
|
|
|
|
sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2); |
|
586
|
4
|
|
|
|
|
31
|
return $F_star; |
|
587
|
|
|
|
|
|
|
} |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=head2 tajima_D |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
Title : tajima_D |
|
592
|
|
|
|
|
|
|
Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples); |
|
593
|
|
|
|
|
|
|
Function: Calculate Tajima's D on a set of samples |
|
594
|
|
|
|
|
|
|
Returns : decimal number |
|
595
|
|
|
|
|
|
|
Args : array ref of L objects |
|
596
|
|
|
|
|
|
|
OR |
|
597
|
|
|
|
|
|
|
L object |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=cut |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
#' |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
sub tajima_D { |
|
605
|
6
|
|
|
6
|
1
|
491
|
my ($self,$individuals) = @_; |
|
606
|
6
|
|
|
|
|
10
|
my ($seg_sites,$pi,$n); |
|
607
|
|
|
|
|
|
|
|
|
608
|
6
|
100
|
33
|
|
|
57
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
609
|
2
|
|
|
|
|
3
|
$n = scalar @$individuals; |
|
610
|
|
|
|
|
|
|
# pi - all pairwise differences |
|
611
|
2
|
|
|
|
|
5
|
$pi = $self->pi($individuals); |
|
612
|
2
|
|
|
|
|
7
|
$seg_sites = $self->segregating_sites_count($individuals); |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
|
615
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI')) { |
|
616
|
4
|
|
|
|
|
5
|
my $pop = $individuals; |
|
617
|
4
|
|
|
|
|
14
|
$n = $pop->get_number_individuals; |
|
618
|
4
|
|
|
|
|
16
|
$pi = $self->pi($pop); |
|
619
|
4
|
|
|
|
|
16
|
$seg_sites = $self->segregating_sites_count($pop); |
|
620
|
|
|
|
|
|
|
} else { |
|
621
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D"); |
|
622
|
0
|
|
|
|
|
0
|
return 0; |
|
623
|
|
|
|
|
|
|
} |
|
624
|
6
|
|
|
|
|
31
|
$self->tajima_D_counts($n,$seg_sites,$pi); |
|
625
|
|
|
|
|
|
|
} |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=head2 tajima_D_counts |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
Title : tajima_D_counts |
|
630
|
|
|
|
|
|
|
Usage : my $D = $statistics->tajima_D_counts($samps,$sites,$pi); |
|
631
|
|
|
|
|
|
|
Function: Tajima's D statistic for the raw counts of the number |
|
632
|
|
|
|
|
|
|
of samples, sites, and avg pairwise distances (pi) |
|
633
|
|
|
|
|
|
|
Returns : decimal number |
|
634
|
|
|
|
|
|
|
Args : number of samples (N) |
|
635
|
|
|
|
|
|
|
number of segregating sites (n) |
|
636
|
|
|
|
|
|
|
average pairwise differences (pi) |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=cut |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
#' |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub tajima_D_counts { |
|
643
|
6
|
|
|
6
|
1
|
14
|
my ($self,$n,$seg_sites,$pi) = @_; |
|
644
|
6
|
|
|
|
|
5
|
my $a1 = 0; |
|
645
|
6
|
|
|
|
|
21
|
for(my $k= 1; $k < $n; $k++ ) { |
|
646
|
284
|
|
|
|
|
316
|
$a1 += ( 1 / $k ); |
|
647
|
|
|
|
|
|
|
} |
|
648
|
|
|
|
|
|
|
|
|
649
|
6
|
|
|
|
|
7
|
my $a2 = 0; |
|
650
|
6
|
|
|
|
|
30
|
for(my $k= 1; $k < $n; $k++ ) { |
|
651
|
284
|
|
|
|
|
335
|
$a2 += ( 1 / $k**2 ); |
|
652
|
|
|
|
|
|
|
} |
|
653
|
|
|
|
|
|
|
|
|
654
|
6
|
|
|
|
|
13
|
my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) ); |
|
655
|
6
|
|
|
|
|
14
|
my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) / |
|
656
|
|
|
|
|
|
|
( ( 9 * $n) * ( $n - 1) ); |
|
657
|
6
|
|
|
|
|
10
|
my $c1 = $b1 - ( 1 / $a1 ); |
|
658
|
6
|
|
|
|
|
43
|
my $c2 = $b2 - ( ( $n + 2 ) / |
|
659
|
|
|
|
|
|
|
( $a1 * $n))+( $a2 / $a1 ** 2); |
|
660
|
6
|
|
|
|
|
10
|
my $e1 = $c1 / $a1; |
|
661
|
6
|
|
|
|
|
8
|
my $e2 = $c2 / ( $a1**2 + $a2 ); |
|
662
|
|
|
|
|
|
|
|
|
663
|
6
|
|
|
|
|
13
|
my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1))); |
|
664
|
6
|
50
|
|
|
|
16
|
return if $denom == 0; |
|
665
|
6
|
|
|
|
|
7
|
my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom; |
|
666
|
6
|
|
|
|
|
92
|
return $D; |
|
667
|
|
|
|
|
|
|
} |
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=head2 pi |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
Title : pi |
|
673
|
|
|
|
|
|
|
Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds) |
|
674
|
|
|
|
|
|
|
Function: Calculate pi (average number of pairwise differences) given |
|
675
|
|
|
|
|
|
|
a list of individuals which have the same number of markers |
|
676
|
|
|
|
|
|
|
(also called sites) as available from the get_Genotypes() |
|
677
|
|
|
|
|
|
|
call in L |
|
678
|
|
|
|
|
|
|
Returns : decimal number |
|
679
|
|
|
|
|
|
|
Args : Arg1= array ref of L objects |
|
680
|
|
|
|
|
|
|
which have markers/mutations. We expect all individuals to |
|
681
|
|
|
|
|
|
|
have a marker - we will deal with missing data as a special case. |
|
682
|
|
|
|
|
|
|
OR |
|
683
|
|
|
|
|
|
|
Arg1= L object. In the event that |
|
684
|
|
|
|
|
|
|
only allele frequency data is available, storing it in |
|
685
|
|
|
|
|
|
|
Population object will make this available. |
|
686
|
|
|
|
|
|
|
num sites [optional], an optional second argument (integer) |
|
687
|
|
|
|
|
|
|
which is the number of sites, then pi returned is pi/site. |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=cut |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
sub pi { |
|
692
|
21
|
|
|
21
|
1
|
50
|
my ($self,$individuals,$numsites) = @_; |
|
693
|
21
|
|
|
|
|
21
|
my (%data,%marker_total,@marker_names,$n); |
|
694
|
|
|
|
|
|
|
|
|
695
|
21
|
100
|
33
|
|
|
123
|
if( ref($individuals) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# one possible argument is an arrayref of Bio::PopGen::IndividualI objs |
|
697
|
9
|
|
|
|
|
40
|
@marker_names = $individuals->[0]->get_marker_names; |
|
698
|
9
|
|
|
|
|
21
|
$n = scalar @$individuals; |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# Here we are calculating the allele frequencies |
|
701
|
9
|
|
|
|
|
12
|
foreach my $ind ( @$individuals ) { |
|
702
|
45
|
50
|
|
|
|
120
|
if( ! $ind->isa('Bio::PopGen::IndividualI') ) { |
|
703
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n"); |
|
704
|
0
|
|
|
|
|
0
|
return 0; |
|
705
|
|
|
|
|
|
|
} |
|
706
|
45
|
|
|
|
|
45
|
foreach my $m ( @marker_names ) { |
|
707
|
2125
|
|
|
|
|
2593
|
foreach my $allele (map { $_->get_Alleles} |
|
|
2125
|
|
|
|
|
2462
|
|
|
708
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
|
709
|
2125
|
|
|
|
|
1932
|
$data{$m}->{$allele}++; |
|
710
|
2125
|
|
|
|
|
2383
|
$marker_total{$m}++; |
|
711
|
|
|
|
|
|
|
} |
|
712
|
|
|
|
|
|
|
} |
|
713
|
|
|
|
|
|
|
} |
|
714
|
|
|
|
|
|
|
# while( my ($marker,$count) = each %marker_total ) { |
|
715
|
|
|
|
|
|
|
# foreach my $c ( values %{$data{$marker}} ) { |
|
716
|
|
|
|
|
|
|
# $c /= $count; |
|
717
|
|
|
|
|
|
|
# } |
|
718
|
|
|
|
|
|
|
# } |
|
719
|
|
|
|
|
|
|
# %data will contain allele frequencies for each marker, allele |
|
720
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
|
721
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI') ) { |
|
722
|
12
|
|
|
|
|
33
|
my $pop = $individuals; |
|
723
|
12
|
|
|
|
|
28
|
$n = $pop->get_number_individuals; |
|
724
|
12
|
|
|
|
|
80
|
foreach my $marker( $pop->get_Markers ) { |
|
725
|
143
|
|
|
|
|
374
|
push @marker_names, $marker->name; |
|
726
|
|
|
|
|
|
|
#$data{$marker->name} = {$marker->get_Allele_Frequencies}; |
|
727
|
143
|
|
|
|
|
466
|
my @genotypes = $pop->get_Genotypes(-marker => $marker->name); |
|
728
|
143
|
|
|
|
|
1137
|
for my $al ( map { $_->get_Alleles} @genotypes ) { |
|
|
12444
|
|
|
|
|
14821
|
|
|
729
|
24628
|
|
|
|
|
25399
|
$data{$marker->name}->{$al}++; |
|
730
|
24628
|
|
|
|
|
26737
|
$marker_total{$marker->name}++; |
|
731
|
|
|
|
|
|
|
} |
|
732
|
|
|
|
|
|
|
} |
|
733
|
|
|
|
|
|
|
} else { |
|
734
|
0
|
|
|
|
|
0
|
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi"); |
|
735
|
|
|
|
|
|
|
} |
|
736
|
|
|
|
|
|
|
# based on Kevin Thornton's code: |
|
737
|
|
|
|
|
|
|
# http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152 |
|
738
|
|
|
|
|
|
|
# For now we assume that all individuals have the same markers |
|
739
|
21
|
|
|
|
|
84
|
my ($diffcount,$totalcompare) = (0,0); |
|
740
|
21
|
|
|
|
|
21
|
my $pi = 0; |
|
741
|
21
|
|
|
|
|
62
|
while ( my ($marker,$markerdat) = each %data ) { |
|
742
|
568
|
|
|
|
|
399
|
my $sampsize = $marker_total{$marker}; |
|
743
|
568
|
|
|
|
|
344
|
my $ssh = 0; |
|
744
|
568
|
|
|
|
|
795
|
my @alleles = keys %$markerdat; |
|
745
|
568
|
50
|
|
|
|
670
|
if ( $sampsize > 1 ) { |
|
746
|
568
|
|
|
|
|
388
|
my $denom = $sampsize * ($sampsize - 1.0); |
|
747
|
568
|
|
|
|
|
433
|
foreach my $al ( @alleles ) { |
|
748
|
1118
|
|
|
|
|
1145
|
$ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom; |
|
749
|
|
|
|
|
|
|
} |
|
750
|
568
|
|
|
|
|
1077
|
$pi += 1.0 - $ssh; |
|
751
|
|
|
|
|
|
|
} |
|
752
|
|
|
|
|
|
|
} |
|
753
|
21
|
|
|
|
|
239
|
$self->debug( "pi=$pi\n"); |
|
754
|
21
|
100
|
|
|
|
46
|
if( $numsites ) { |
|
755
|
2
|
|
|
|
|
14
|
return $pi / $numsites; |
|
756
|
|
|
|
|
|
|
} else { |
|
757
|
19
|
|
|
|
|
256
|
return $pi; |
|
758
|
|
|
|
|
|
|
} |
|
759
|
|
|
|
|
|
|
} |
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=head2 theta |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
Title : theta |
|
765
|
|
|
|
|
|
|
Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites); |
|
766
|
|
|
|
|
|
|
Function: Calculates Watterson's theta from the sample size |
|
767
|
|
|
|
|
|
|
and the number of segregating sites. |
|
768
|
|
|
|
|
|
|
Providing the third parameter, total number of sites will |
|
769
|
|
|
|
|
|
|
return theta per site. |
|
770
|
|
|
|
|
|
|
This is also known as K-hat = K / a_n |
|
771
|
|
|
|
|
|
|
Returns : decimal number |
|
772
|
|
|
|
|
|
|
Args : sample size (integer), |
|
773
|
|
|
|
|
|
|
num segregating sites (integer) |
|
774
|
|
|
|
|
|
|
total sites (integer) [optional] (to calculate theta per site) |
|
775
|
|
|
|
|
|
|
OR |
|
776
|
|
|
|
|
|
|
provide an arrayref of the L objects |
|
777
|
|
|
|
|
|
|
total sites (integer) [optional] (to calculate theta per site) |
|
778
|
|
|
|
|
|
|
OR |
|
779
|
|
|
|
|
|
|
provide an L object |
|
780
|
|
|
|
|
|
|
total sites (integer)[optional] |
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=cut |
|
783
|
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
#' |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
sub theta { |
|
787
|
7
|
|
|
7
|
1
|
687
|
my $self = shift; |
|
788
|
7
|
|
|
|
|
10
|
my ( $n, $seg_sites,$totalsites) = @_; |
|
789
|
7
|
100
|
66
|
|
|
68
|
if( ref($n) =~ /ARRAY/i ) { |
|
|
|
100
|
|
|
|
|
|
|
790
|
2
|
|
|
|
|
2
|
my $samps = $n; |
|
791
|
2
|
|
|
|
|
1
|
$totalsites = $seg_sites; # only 2 arguments if one is an array |
|
792
|
2
|
|
|
|
|
2
|
my %data; |
|
793
|
2
|
|
|
|
|
5
|
my @marker_names = $samps->[0]->get_marker_names; |
|
794
|
|
|
|
|
|
|
# we need to calculate number of polymorphic sites |
|
795
|
2
|
|
|
|
|
6
|
$seg_sites = $self->segregating_sites_count($samps); |
|
796
|
2
|
|
|
|
|
3
|
$n = scalar @$samps; |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
} elsif(ref($n) && |
|
799
|
|
|
|
|
|
|
$n->isa('Bio::PopGen::PopulationI') ) { |
|
800
|
|
|
|
|
|
|
# This will handle the case when we pass in a PopulationI object |
|
801
|
4
|
|
|
|
|
4
|
my $pop = $n; |
|
802
|
4
|
|
|
|
|
6
|
$totalsites = $seg_sites; # shift the arguments over by one |
|
803
|
4
|
|
|
|
|
14
|
$n = $pop->haploid_population->get_number_individuals; |
|
804
|
4
|
|
|
|
|
10
|
$seg_sites = $self->segregating_sites_count($pop); |
|
805
|
|
|
|
|
|
|
} |
|
806
|
7
|
|
|
|
|
8
|
my $a1 = 0; |
|
807
|
7
|
|
|
|
|
18
|
for(my $k= 1; $k < $n; $k++ ) { |
|
808
|
202
|
|
|
|
|
242
|
$a1 += ( 1 / $k ); |
|
809
|
|
|
|
|
|
|
} |
|
810
|
7
|
100
|
|
|
|
12
|
if( $totalsites ) { # 0 and undef are the same can't divide by them |
|
811
|
2
|
|
|
|
|
3
|
$seg_sites /= $totalsites; |
|
812
|
|
|
|
|
|
|
} |
|
813
|
7
|
50
|
|
|
|
14
|
if( $a1 == 0 ) { |
|
814
|
0
|
|
|
|
|
0
|
return 0; |
|
815
|
|
|
|
|
|
|
} |
|
816
|
7
|
|
|
|
|
60
|
return $seg_sites / $a1; |
|
817
|
|
|
|
|
|
|
} |
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 singleton_count |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Title : singleton_count |
|
822
|
|
|
|
|
|
|
Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds) |
|
823
|
|
|
|
|
|
|
Function: Calculate the number of mutations/alleles which only occur once in |
|
824
|
|
|
|
|
|
|
a list of individuals for all sites/markers |
|
825
|
|
|
|
|
|
|
Returns : (integer) number of alleles which only occur once (integer) |
|
826
|
|
|
|
|
|
|
Args : arrayref of L objects |
|
827
|
|
|
|
|
|
|
OR |
|
828
|
|
|
|
|
|
|
L object |
|
829
|
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=cut |
|
831
|
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
sub singleton_count { |
|
833
|
7
|
|
|
7
|
1
|
454
|
my ($self,$individuals) = @_; |
|
834
|
|
|
|
|
|
|
|
|
835
|
7
|
|
|
|
|
5
|
my @inds; |
|
836
|
7
|
100
|
33
|
|
|
36
|
if( ref($individuals) =~ /ARRAY/ ) { |
|
|
|
50
|
|
|
|
|
|
|
837
|
5
|
|
|
|
|
9
|
@inds = @$individuals; |
|
838
|
|
|
|
|
|
|
} elsif( ref($individuals) && |
|
839
|
|
|
|
|
|
|
$individuals->isa('Bio::PopGen::PopulationI') ) { |
|
840
|
2
|
|
|
|
|
2
|
my $pop = $individuals; |
|
841
|
2
|
|
|
|
|
4
|
@inds = $pop->get_Individuals(); |
|
842
|
2
|
50
|
|
|
|
3
|
unless( @inds ) { |
|
843
|
0
|
|
|
|
|
0
|
$self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies"); |
|
844
|
0
|
|
|
|
|
0
|
return 0; |
|
845
|
|
|
|
|
|
|
} |
|
846
|
|
|
|
|
|
|
} else { |
|
847
|
0
|
|
|
|
|
0
|
$self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects"); |
|
848
|
0
|
|
|
|
|
0
|
return 0; |
|
849
|
|
|
|
|
|
|
} |
|
850
|
|
|
|
|
|
|
# find number of sites where a particular allele is only seen once |
|
851
|
|
|
|
|
|
|
|
|
852
|
7
|
|
|
|
|
10
|
my ($singleton_allele_ct,%sites) = (0); |
|
853
|
|
|
|
|
|
|
# first collect all the alleles into a hash structure |
|
854
|
|
|
|
|
|
|
|
|
855
|
7
|
|
|
|
|
10
|
foreach my $n ( @inds ) { |
|
856
|
35
|
50
|
|
|
|
84
|
if( ! $n->isa('Bio::PopGen::IndividualI') ) { |
|
857
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n"); |
|
858
|
0
|
|
|
|
|
0
|
return 0; |
|
859
|
|
|
|
|
|
|
} |
|
860
|
35
|
|
|
|
|
44
|
foreach my $g ( $n->get_Genotypes ) { |
|
861
|
1600
|
|
|
|
|
1702
|
my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles); |
|
862
|
1600
|
|
|
|
|
1508
|
foreach my $allele (@alleles ) { |
|
863
|
1600
|
|
|
|
|
2106
|
$sites{$nm}->{$allele}++; |
|
864
|
|
|
|
|
|
|
} |
|
865
|
|
|
|
|
|
|
} |
|
866
|
|
|
|
|
|
|
} |
|
867
|
7
|
|
|
|
|
22
|
foreach my $site ( values %sites ) { # don't really care what the name is |
|
868
|
320
|
|
|
|
|
278
|
foreach my $allelect ( values %$site ) { # |
|
869
|
|
|
|
|
|
|
# find the sites which have an allele with only 1 copy |
|
870
|
636
|
100
|
|
|
|
724
|
$singleton_allele_ct++ if( $allelect == 1 ); |
|
871
|
|
|
|
|
|
|
} |
|
872
|
|
|
|
|
|
|
} |
|
873
|
7
|
|
|
|
|
67
|
return $singleton_allele_ct; |
|
874
|
|
|
|
|
|
|
} |
|
875
|
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# Yes I know that singleton_count and segregating_sites_count are |
|
877
|
|
|
|
|
|
|
# basically processing the same data so calling them both is |
|
878
|
|
|
|
|
|
|
# redundant, something I want to fix later but want to make things |
|
879
|
|
|
|
|
|
|
# correct and simple first |
|
880
|
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
=head2 segregating_sites_count |
|
882
|
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
Title : segregating_sites_count |
|
884
|
|
|
|
|
|
|
Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count |
|
885
|
|
|
|
|
|
|
Function: Gets the number of segregating sites (number of polymorphic sites) |
|
886
|
|
|
|
|
|
|
Returns : (integer) number of segregating sites |
|
887
|
|
|
|
|
|
|
Args : arrayref of L objects |
|
888
|
|
|
|
|
|
|
OR |
|
889
|
|
|
|
|
|
|
L object |
|
890
|
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
=cut |
|
892
|
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
# perhaps we'll change this in the future |
|
894
|
|
|
|
|
|
|
# to return the actual segregating sites |
|
895
|
|
|
|
|
|
|
# so one can use this to pull in the names of those sites. |
|
896
|
|
|
|
|
|
|
# Would be trivial if it is useful. |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
sub segregating_sites_count { |
|
899
|
31
|
|
|
31
|
1
|
490
|
my ($self,$individuals) = @_; |
|
900
|
31
|
|
|
|
|
40
|
my $type = ref($individuals); |
|
901
|
31
|
|
|
|
|
29
|
my $seg_sites = 0; |
|
902
|
31
|
100
|
33
|
|
|
175
|
if( $type =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
903
|
15
|
|
|
|
|
14
|
my %sites; |
|
904
|
15
|
|
|
|
|
20
|
foreach my $n ( @$individuals ) { |
|
905
|
75
|
50
|
|
|
|
173
|
if( ! $n->isa('Bio::PopGen::IndividualI') ) { |
|
906
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n"); |
|
907
|
0
|
|
|
|
|
0
|
return 0; |
|
908
|
|
|
|
|
|
|
} |
|
909
|
75
|
|
|
|
|
105
|
foreach my $g ( $n->get_Genotypes ) { |
|
910
|
3225
|
|
|
|
|
3509
|
my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles); |
|
911
|
3225
|
|
|
|
|
3159
|
foreach my $allele (@alleles ) { |
|
912
|
3225
|
|
|
|
|
4114
|
$sites{$nm}->{$allele}++; |
|
913
|
|
|
|
|
|
|
} |
|
914
|
|
|
|
|
|
|
} |
|
915
|
|
|
|
|
|
|
} |
|
916
|
15
|
|
|
|
|
47
|
foreach my $site ( values %sites ) { # use values b/c we don't |
|
917
|
|
|
|
|
|
|
# really care what the name is |
|
918
|
|
|
|
|
|
|
# find the sites which >1 allele |
|
919
|
645
|
100
|
|
|
|
902
|
$seg_sites++ if( keys %$site > 1 ); |
|
920
|
|
|
|
|
|
|
} |
|
921
|
|
|
|
|
|
|
} elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) { |
|
922
|
16
|
|
|
|
|
43
|
foreach my $marker ( $individuals->haploid_population->get_Markers ) { |
|
923
|
163
|
|
|
|
|
217
|
my @alleles = $marker->get_Alleles; |
|
924
|
163
|
100
|
|
|
|
281
|
$seg_sites++ if ( scalar @alleles > 1 ); |
|
925
|
|
|
|
|
|
|
} |
|
926
|
|
|
|
|
|
|
} else { |
|
927
|
0
|
|
|
|
|
0
|
$self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects"); |
|
928
|
0
|
|
|
|
|
0
|
return 0; |
|
929
|
|
|
|
|
|
|
} |
|
930
|
31
|
|
|
|
|
108
|
return $seg_sites; |
|
931
|
|
|
|
|
|
|
} |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=head2 heterozygosity |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
Title : heterozygosity |
|
937
|
|
|
|
|
|
|
Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1); |
|
938
|
|
|
|
|
|
|
Function: Calculate the heterozgosity for a sample set for a set of alleles |
|
939
|
|
|
|
|
|
|
Returns : decimal number |
|
940
|
|
|
|
|
|
|
Args : sample size (integer) |
|
941
|
|
|
|
|
|
|
frequency of one allele (fraction - must be less than 1) |
|
942
|
|
|
|
|
|
|
[optional] frequency of another allele - this is only needed |
|
943
|
|
|
|
|
|
|
in a non-binary allele system |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Note : p^2 + 2pq + q^2 |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=cut |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
sub heterozygosity { |
|
951
|
0
|
|
|
0
|
1
|
0
|
my ($self,$samp_size, $freq1,$freq2) = @_; |
|
952
|
0
|
0
|
|
|
|
0
|
if( ! $freq2 ) { $freq2 = 1 - $freq1 } |
|
|
0
|
|
|
|
|
0
|
|
|
953
|
0
|
0
|
0
|
|
|
0
|
if( $freq1 > 1 || $freq2 > 1 ) { |
|
954
|
0
|
|
|
|
|
0
|
$self->warn("heterozygosity expects frequencies to be less than 1"); |
|
955
|
|
|
|
|
|
|
} |
|
956
|
0
|
|
|
|
|
0
|
my $sum = ($freq1**2) + (($freq2)**2); |
|
957
|
0
|
|
|
|
|
0
|
my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ; |
|
958
|
0
|
|
|
|
|
0
|
return $h; |
|
959
|
|
|
|
|
|
|
} |
|
960
|
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
=head2 derived_mutations |
|
963
|
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
Title : derived_mutations |
|
965
|
|
|
|
|
|
|
Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup); |
|
966
|
|
|
|
|
|
|
Function: Calculate the number of alleles or (mutations) which are ancestral |
|
967
|
|
|
|
|
|
|
and the number which are derived (occurred only on the tips) |
|
968
|
|
|
|
|
|
|
Returns : array of 2 items - number of external and internal derived |
|
969
|
|
|
|
|
|
|
mutation |
|
970
|
|
|
|
|
|
|
Args : ingroup - Ls arrayref OR |
|
971
|
|
|
|
|
|
|
L |
|
972
|
|
|
|
|
|
|
outgroup- Ls arrayref OR |
|
973
|
|
|
|
|
|
|
L OR |
|
974
|
|
|
|
|
|
|
a single L |
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
=cut |
|
977
|
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
sub derived_mutations { |
|
979
|
10
|
|
|
10
|
1
|
13
|
my ($self,$ingroup,$outgroup) = @_; |
|
980
|
10
|
|
|
|
|
10
|
my (%indata,%outdata,@marker_names); |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
# basically we have to do some type checking |
|
983
|
|
|
|
|
|
|
# if that perl were typed... |
|
984
|
10
|
|
|
|
|
16
|
my ($itype,$otype) = (ref($ingroup),ref($outgroup)); |
|
985
|
|
|
|
|
|
|
|
|
986
|
10
|
50
|
|
|
|
15
|
return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums |
|
987
|
|
|
|
|
|
|
# are already the value we |
|
988
|
|
|
|
|
|
|
# are searching for |
|
989
|
|
|
|
|
|
|
# pick apart the ingroup |
|
990
|
|
|
|
|
|
|
# get the data |
|
991
|
10
|
100
|
33
|
|
|
63
|
if( ref($ingroup) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
992
|
4
|
50
|
33
|
|
|
19
|
if( ! ref($ingroup->[0]) || |
|
993
|
|
|
|
|
|
|
! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) { |
|
994
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations"); |
|
995
|
0
|
|
|
|
|
0
|
return 0; |
|
996
|
|
|
|
|
|
|
} |
|
997
|
|
|
|
|
|
|
# we assume that all individuals have the same markers |
|
998
|
|
|
|
|
|
|
# i.e. that they are aligned |
|
999
|
4
|
|
|
|
|
8
|
@marker_names = $ingroup->[0]->get_marker_names; |
|
1000
|
4
|
|
|
|
|
6
|
for my $ind ( @$ingroup ) { |
|
1001
|
20
|
|
|
|
|
18
|
for my $m ( @marker_names ) { |
|
1002
|
100
|
|
|
|
|
112
|
for my $allele ( map { $_->get_Alleles } |
|
|
100
|
|
|
|
|
122
|
|
|
1003
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
|
1004
|
100
|
|
|
|
|
138
|
$indata{$m}->{$allele}++; |
|
1005
|
|
|
|
|
|
|
} |
|
1006
|
|
|
|
|
|
|
} |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
} elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) { |
|
1009
|
6
|
|
|
|
|
14
|
@marker_names = $ingroup->get_marker_names; |
|
1010
|
6
|
|
|
|
|
15
|
for my $ind ( $ingroup->haploid_population->get_Individuals() ) { |
|
1011
|
30
|
|
|
|
|
24
|
for my $m ( @marker_names ) { |
|
1012
|
150
|
|
|
|
|
176
|
for my $allele ( map { $_->get_Alleles} |
|
|
150
|
|
|
|
|
183
|
|
|
1013
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
|
1014
|
150
|
|
|
|
|
221
|
$indata{$m}->{$allele}++; |
|
1015
|
|
|
|
|
|
|
} |
|
1016
|
|
|
|
|
|
|
} |
|
1017
|
|
|
|
|
|
|
} |
|
1018
|
|
|
|
|
|
|
} else { |
|
1019
|
0
|
|
|
|
|
0
|
$self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations"); |
|
1020
|
0
|
|
|
|
|
0
|
return 0; |
|
1021
|
|
|
|
|
|
|
} |
|
1022
|
|
|
|
|
|
|
|
|
1023
|
10
|
100
|
|
|
|
29
|
if( $otype =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
1024
|
5
|
50
|
33
|
|
|
26
|
if( ! ref($outgroup->[0]) || |
|
1025
|
|
|
|
|
|
|
! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) { |
|
1026
|
0
|
|
|
|
|
0
|
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations"); |
|
1027
|
0
|
|
|
|
|
0
|
return 0; |
|
1028
|
|
|
|
|
|
|
} |
|
1029
|
5
|
|
|
|
|
6
|
for my $ind ( @$outgroup ) { |
|
1030
|
5
|
|
|
|
|
24
|
for my $m ( @marker_names ) { |
|
1031
|
25
|
|
|
|
|
32
|
for my $allele ( map { $_->get_Alleles } |
|
|
25
|
|
|
|
|
35
|
|
|
1032
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
|
1033
|
25
|
|
|
|
|
47
|
$outdata{$m}->{$allele}++; |
|
1034
|
|
|
|
|
|
|
} |
|
1035
|
|
|
|
|
|
|
} |
|
1036
|
|
|
|
|
|
|
} |
|
1037
|
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
} elsif( $otype->isa('Bio::PopGen::PopulationI') ) { |
|
1039
|
5
|
|
|
|
|
10
|
for my $ind ( $outgroup->haploid_population->get_Individuals() ) { |
|
1040
|
5
|
|
|
|
|
7
|
for my $m ( @marker_names ) { |
|
1041
|
25
|
|
|
|
|
36
|
for my $allele ( map { $_->get_Alleles} |
|
|
25
|
|
|
|
|
36
|
|
|
1042
|
|
|
|
|
|
|
$ind->get_Genotypes($m) ) { |
|
1043
|
25
|
|
|
|
|
43
|
$outdata{$m}->{$allele}++; |
|
1044
|
|
|
|
|
|
|
} |
|
1045
|
|
|
|
|
|
|
} |
|
1046
|
|
|
|
|
|
|
} |
|
1047
|
|
|
|
|
|
|
} else { |
|
1048
|
0
|
|
|
|
|
0
|
$self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations"); |
|
1049
|
0
|
|
|
|
|
0
|
return 0; |
|
1050
|
|
|
|
|
|
|
} |
|
1051
|
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
# derived mutations are defined as |
|
1053
|
|
|
|
|
|
|
# |
|
1054
|
|
|
|
|
|
|
# ingroup (G A T) |
|
1055
|
|
|
|
|
|
|
# outgroup (A) |
|
1056
|
|
|
|
|
|
|
# derived mutations are G and T, A is the external mutation |
|
1057
|
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
# ingroup (A T) |
|
1059
|
|
|
|
|
|
|
# outgroup (C) |
|
1060
|
|
|
|
|
|
|
# derived mutations A,T no external/ancestral mutations |
|
1061
|
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
# ingroup (G A T) |
|
1063
|
|
|
|
|
|
|
# outgroup (A T) |
|
1064
|
|
|
|
|
|
|
# cannot determine |
|
1065
|
|
|
|
|
|
|
|
|
1066
|
10
|
|
|
|
|
14
|
my ($internal,$external); |
|
1067
|
10
|
|
|
|
|
13
|
foreach my $marker ( @marker_names ) { |
|
1068
|
50
|
|
|
|
|
33
|
my @outalleles = keys %{$outdata{$marker}}; |
|
|
50
|
|
|
|
|
85
|
|
|
1069
|
50
|
|
|
|
|
42
|
my @in_alleles = keys %{$indata{$marker}}; |
|
|
50
|
|
|
|
|
68
|
|
|
1070
|
50
|
100
|
66
|
|
|
145
|
next if( @outalleles > 1 || @in_alleles == 1); |
|
1071
|
40
|
|
|
|
|
33
|
for my $allele ( @in_alleles ) { |
|
1072
|
80
|
100
|
|
|
|
114
|
if( ! exists $outdata{$marker}->{$allele} ) { |
|
1073
|
40
|
100
|
|
|
|
45
|
if( $indata{$marker}->{$allele} == 1 ) { |
|
1074
|
10
|
|
|
|
|
10
|
$external++; |
|
1075
|
|
|
|
|
|
|
} else { |
|
1076
|
30
|
|
|
|
|
28
|
$internal++; |
|
1077
|
|
|
|
|
|
|
} |
|
1078
|
|
|
|
|
|
|
} |
|
1079
|
|
|
|
|
|
|
} |
|
1080
|
|
|
|
|
|
|
} |
|
1081
|
10
|
|
|
|
|
52
|
return ($external, $internal); |
|
1082
|
|
|
|
|
|
|
} |
|
1083
|
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
=head2 composite_LD |
|
1086
|
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
Title : composite_LD |
|
1088
|
|
|
|
|
|
|
Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population); |
|
1089
|
|
|
|
|
|
|
Function: Calculate the Linkage Disequilibrium |
|
1090
|
|
|
|
|
|
|
This is for calculating LD for unphased data. |
|
1091
|
|
|
|
|
|
|
Other methods will be appropriate for phased haplotype data. |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
Returns : Hash of Hashes - first key is site 1,second key is site 2 |
|
1094
|
|
|
|
|
|
|
and value is LD for those two sites. |
|
1095
|
|
|
|
|
|
|
my $LDarrayref = $matrix{$site1}->{$site2}; |
|
1096
|
|
|
|
|
|
|
my ($ldval, $chisquared) = @$LDarrayref; |
|
1097
|
|
|
|
|
|
|
Args : L or arrayref of |
|
1098
|
|
|
|
|
|
|
Ls |
|
1099
|
|
|
|
|
|
|
Reference: Weir B.S. (1996) "Genetic Data Analysis II", |
|
1100
|
|
|
|
|
|
|
Sinauer, Sunderlanm MA. |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=cut |
|
1103
|
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
sub composite_LD { |
|
1105
|
2
|
|
|
2
|
1
|
7
|
my ($self,$pop) = @_; |
|
1106
|
2
|
50
|
33
|
|
|
17
|
if( ref($pop) =~ /ARRAY/i ) { |
|
|
|
50
|
|
|
|
|
|
|
1107
|
0
|
0
|
0
|
|
|
0
|
if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) { |
|
1108
|
0
|
|
|
|
|
0
|
$pop = Bio::PopGen::Population->new(-individuals => @$pop); |
|
1109
|
|
|
|
|
|
|
} else { |
|
1110
|
0
|
|
|
|
|
0
|
$self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects"); |
|
1111
|
0
|
|
|
|
|
0
|
return (); |
|
1112
|
|
|
|
|
|
|
} |
|
1113
|
|
|
|
|
|
|
} elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) { |
|
1114
|
0
|
|
|
|
|
0
|
$self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects"); |
|
1115
|
0
|
|
|
|
|
0
|
return (); |
|
1116
|
|
|
|
|
|
|
} |
|
1117
|
|
|
|
|
|
|
|
|
1118
|
2
|
|
|
|
|
247
|
my @marker_names = $pop->get_marker_names; |
|
1119
|
2
|
|
|
|
|
5
|
my @inds = $pop->get_Individuals; |
|
1120
|
2
|
|
|
|
|
3
|
my $num_inds = scalar @inds; |
|
1121
|
2
|
|
|
|
|
1
|
my (%lookup); |
|
1122
|
|
|
|
|
|
|
# calculate allele frequencies for each marker from the population |
|
1123
|
|
|
|
|
|
|
# use the built-in get_Marker to get the allele freqs |
|
1124
|
|
|
|
|
|
|
# we still need to calculate the genotype frequencies |
|
1125
|
2
|
|
|
|
|
3
|
foreach my $marker_name ( @marker_names ) { |
|
1126
|
5
|
|
|
|
|
5
|
my(%allelef); |
|
1127
|
|
|
|
|
|
|
|
|
1128
|
5
|
|
|
|
|
5
|
foreach my $ind ( @inds ) { |
|
1129
|
76
|
|
|
|
|
124
|
my ($genotype) = $ind->get_Genotypes(-marker => $marker_name); |
|
1130
|
76
|
50
|
|
|
|
120
|
if( ! defined $genotype ) { |
|
1131
|
0
|
|
|
|
|
0
|
$self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n"); |
|
1132
|
0
|
|
|
|
|
0
|
next; |
|
1133
|
|
|
|
|
|
|
} |
|
1134
|
76
|
|
|
|
|
96
|
my @alleles = sort $genotype->get_Alleles; |
|
1135
|
76
|
100
|
|
|
|
117
|
next if( scalar @alleles != 2); |
|
1136
|
73
|
|
|
|
|
76
|
my $genostr = join(',', @alleles); |
|
1137
|
73
|
|
|
|
|
55
|
$allelef{$alleles[0]}++; |
|
1138
|
73
|
|
|
|
|
86
|
$allelef{$alleles[1]}++; |
|
1139
|
|
|
|
|
|
|
} |
|
1140
|
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
# we should check for cases where there > 2 alleles or |
|
1142
|
|
|
|
|
|
|
# only 1 allele and throw out those markers. |
|
1143
|
5
|
|
|
|
|
11
|
my @alleles = sort keys %allelef; |
|
1144
|
5
|
|
|
|
|
4
|
my $allele_count = scalar @alleles; |
|
1145
|
|
|
|
|
|
|
# test if site is polymorphic |
|
1146
|
5
|
50
|
|
|
|
9
|
if( $allele_count != 2) { |
|
1147
|
|
|
|
|
|
|
# only really warn if we're seeing multi-allele |
|
1148
|
0
|
0
|
|
|
|
0
|
$self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2; |
|
1149
|
0
|
|
|
|
|
0
|
next; # skip this marker |
|
1150
|
|
|
|
|
|
|
} |
|
1151
|
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
# Need to do something here to detect alleles which aren't |
|
1153
|
|
|
|
|
|
|
# a single character |
|
1154
|
5
|
50
|
33
|
|
|
19
|
if( length($alleles[0]) != 1 || |
|
1155
|
|
|
|
|
|
|
length($alleles[1]) != 1 ) { |
|
1156
|
0
|
|
|
|
|
0
|
$self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character"); |
|
1157
|
0
|
|
|
|
|
0
|
next; |
|
1158
|
|
|
|
|
|
|
} |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
# fix the call for allele 1 (A or B) and |
|
1161
|
|
|
|
|
|
|
# allele 2 (a or b) in terms of how we'll do the |
|
1162
|
|
|
|
|
|
|
# N square from Weir p.126 |
|
1163
|
5
|
|
|
|
|
16
|
$self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n"); |
|
1164
|
5
|
|
|
|
|
10
|
$lookup{$marker_name}->{'1'} = $alleles[0]; |
|
1165
|
5
|
|
|
|
|
11
|
$lookup{$marker_name}->{'2'} = $alleles[1]; |
|
1166
|
|
|
|
|
|
|
} |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
2
|
|
|
|
|
5
|
@marker_names = sort keys %lookup; |
|
1169
|
2
|
|
|
|
|
2
|
my $site_count = scalar @marker_names; |
|
1170
|
|
|
|
|
|
|
# where the final data will be stored |
|
1171
|
2
|
|
|
|
|
3
|
my %stats_for_sites; |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
# standard way of generating pairwise combos |
|
1174
|
|
|
|
|
|
|
# LD is done by comparing all the pairwise site (marker) |
|
1175
|
|
|
|
|
|
|
# combinations and keeping track of the genotype and |
|
1176
|
|
|
|
|
|
|
# pairwise genotype (ie genotypes of the 2 sites) frequencies |
|
1177
|
2
|
|
|
|
|
4
|
for( my $i = 0; $i < $site_count - 1; $i++ ) { |
|
1178
|
3
|
|
|
|
|
4
|
my $site1 = $marker_names[$i]; |
|
1179
|
|
|
|
|
|
|
|
|
1180
|
3
|
|
|
|
|
6
|
for( my $j = $i+1; $j < $site_count ; $j++) { |
|
1181
|
4
|
|
|
|
|
3
|
my (%genotypes, %total_genotype_count,$total_pairwisegeno_count, |
|
1182
|
|
|
|
|
|
|
%pairwise_genotypes); |
|
1183
|
|
|
|
|
|
|
|
|
1184
|
4
|
|
|
|
|
4
|
my $site2 = $marker_names[$j]; |
|
1185
|
4
|
|
|
|
|
8
|
my (%allele_count,%allele_freqs) = (0,0); |
|
1186
|
4
|
|
|
|
|
5
|
foreach my $ind ( @inds ) { |
|
1187
|
|
|
|
|
|
|
# build string of genotype at site 1 |
|
1188
|
53
|
|
|
|
|
82
|
my ($genotype1) = $ind->get_Genotypes(-marker => $site1); |
|
1189
|
53
|
|
|
|
|
75
|
my @alleles1 = sort $genotype1->get_Alleles; |
|
1190
|
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
# if an individual has only one available allele |
|
1192
|
|
|
|
|
|
|
# (has a blank or N for one of the chromosomes) |
|
1193
|
|
|
|
|
|
|
# we don't want to use it in our calculation |
|
1194
|
|
|
|
|
|
|
|
|
1195
|
53
|
100
|
|
|
|
78
|
next unless( scalar @alleles1 == 2); |
|
1196
|
52
|
|
|
|
|
58
|
my $genostr1 = join(',', @alleles1); |
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
# build string of genotype at site 2 |
|
1199
|
52
|
|
|
|
|
70
|
my ($genotype2) = $ind->get_Genotypes(-marker => $site2); |
|
1200
|
52
|
|
|
|
|
72
|
my @alleles2 = sort $genotype2->get_Alleles; |
|
1201
|
52
|
|
|
|
|
62
|
my $genostr2 = join(',', @alleles2); |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
52
|
100
|
|
|
|
61
|
next unless( scalar @alleles2 == 2); |
|
1204
|
50
|
|
|
|
|
48
|
for (@alleles1) { |
|
1205
|
100
|
|
|
|
|
74
|
$allele_count{$site1}++; |
|
1206
|
100
|
|
|
|
|
84
|
$allele_freqs{$site1}->{$_}++; |
|
1207
|
|
|
|
|
|
|
} |
|
1208
|
50
|
|
|
|
|
44
|
$genotypes{$site1}->{$genostr1}++; |
|
1209
|
50
|
|
|
|
|
25
|
$total_genotype_count{$site1}++; |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
50
|
|
|
|
|
42
|
for (@alleles2) { |
|
1212
|
100
|
|
|
|
|
67
|
$allele_count{$site2}++; |
|
1213
|
100
|
|
|
|
|
76
|
$allele_freqs{$site2}->{$_}++; |
|
1214
|
|
|
|
|
|
|
} |
|
1215
|
50
|
|
|
|
|
37
|
$genotypes{$site2}->{$genostr2}++; |
|
1216
|
50
|
|
|
|
|
33
|
$total_genotype_count{$site2}++; |
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
# We are using the $site1,$site2 to signify |
|
1219
|
|
|
|
|
|
|
# a unique key |
|
1220
|
50
|
|
|
|
|
63
|
$pairwise_genotypes{"$genostr1,$genostr2"}++; |
|
1221
|
|
|
|
|
|
|
# some individuals |
|
1222
|
50
|
|
|
|
|
57
|
$total_pairwisegeno_count++; |
|
1223
|
|
|
|
|
|
|
} |
|
1224
|
4
|
|
|
|
|
8
|
for my $site ( %allele_freqs ) { |
|
1225
|
16
|
|
|
|
|
9
|
for my $al ( keys %{ $allele_freqs{$site} } ) { |
|
|
16
|
|
|
|
|
30
|
|
|
1226
|
16
|
|
|
|
|
20
|
$allele_freqs{$site}->{$al} /= $allele_count{$site}; |
|
1227
|
|
|
|
|
|
|
} |
|
1228
|
|
|
|
|
|
|
} |
|
1229
|
4
|
|
|
|
|
3
|
my $n = $total_pairwisegeno_count; # number of pairs of comparisons |
|
1230
|
|
|
|
|
|
|
# 'A' and 'B' are two loci or in our case site1 and site2 |
|
1231
|
4
|
|
|
|
|
6
|
my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele |
|
1232
|
4
|
|
|
|
|
8
|
my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele |
|
1233
|
4
|
|
|
|
|
3
|
my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele |
|
1234
|
4
|
|
|
|
|
4
|
my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele |
|
1235
|
|
|
|
|
|
|
# AABB |
|
1236
|
4
|
|
|
|
|
5
|
my $N1genostr = join(",",( $allele1_site1, $allele1_site1, |
|
1237
|
|
|
|
|
|
|
$allele1_site2, $allele1_site2)); |
|
1238
|
4
|
|
|
|
|
12
|
$self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n"); |
|
1239
|
|
|
|
|
|
|
# AABb |
|
1240
|
4
|
|
|
|
|
6
|
my $N2genostr = join(",",( $allele1_site1, $allele1_site1, |
|
1241
|
|
|
|
|
|
|
$allele1_site2, $allele2_site2)); |
|
1242
|
4
|
|
|
|
|
9
|
$self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n"); |
|
1243
|
|
|
|
|
|
|
# AaBB |
|
1244
|
4
|
|
|
|
|
6
|
my $N4genostr = join(",",( $allele1_site1, $allele2_site1, |
|
1245
|
|
|
|
|
|
|
$allele1_site2, $allele1_site2)); |
|
1246
|
4
|
|
|
|
|
9
|
$self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n"); |
|
1247
|
|
|
|
|
|
|
# AaBb |
|
1248
|
4
|
|
|
|
|
5
|
my $N5genostr = join(",",( $allele1_site1, $allele2_site1, |
|
1249
|
|
|
|
|
|
|
$allele1_site2, $allele2_site2)); |
|
1250
|
4
|
|
|
|
|
10
|
$self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n"); |
|
1251
|
|
|
|
|
|
|
# count of AABB in |
|
1252
|
4
|
|
50
|
|
|
7
|
my $n1 = $pairwise_genotypes{$N1genostr} || 0; |
|
1253
|
|
|
|
|
|
|
# count of AABb in |
|
1254
|
4
|
|
100
|
|
|
10
|
my $n2 = $pairwise_genotypes{$N2genostr} || 0; |
|
1255
|
|
|
|
|
|
|
# count of AaBB in |
|
1256
|
4
|
|
100
|
|
|
9
|
my $n4 = $pairwise_genotypes{$N4genostr} || 0; |
|
1257
|
|
|
|
|
|
|
# count of AaBb in |
|
1258
|
4
|
|
100
|
|
|
16
|
my $n5 = $pairwise_genotypes{$N5genostr} || 0; |
|
1259
|
|
|
|
|
|
|
|
|
1260
|
4
|
|
|
|
|
5
|
my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1)); |
|
1261
|
4
|
|
|
|
|
3
|
my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2)); |
|
1262
|
4
|
|
50
|
|
|
9
|
my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n; |
|
1263
|
4
|
|
50
|
|
|
10
|
my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n; |
|
1264
|
4
|
|
50
|
|
|
6
|
my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq |
|
1265
|
4
|
|
|
|
|
4
|
my $p_a = 1 - $p_A; |
|
1266
|
|
|
|
|
|
|
|
|
1267
|
4
|
|
50
|
|
|
8
|
my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq |
|
1268
|
4
|
|
|
|
|
2
|
my $p_b = 1 - $p_B; |
|
1269
|
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
# variance of allele frequencies |
|
1271
|
4
|
|
|
|
|
6
|
my $pi_A = $p_A * $p_a; |
|
1272
|
4
|
|
|
|
|
4
|
my $pi_B = $p_B * $p_b; |
|
1273
|
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
# hardy weinberg |
|
1275
|
4
|
|
|
|
|
5
|
my $D_A = $p_AA - $p_A**2; |
|
1276
|
4
|
|
|
|
|
5
|
my $D_B = $p_BB - $p_B**2; |
|
1277
|
4
|
|
|
|
|
6
|
my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5; |
|
1278
|
4
|
|
|
|
|
18
|
$self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n"); |
|
1279
|
|
|
|
|
|
|
|
|
1280
|
4
|
|
|
|
|
7
|
my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B ); |
|
1281
|
4
|
|
|
|
|
36
|
$self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n"); |
|
1282
|
4
|
|
|
|
|
36
|
$self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n", |
|
1283
|
|
|
|
|
|
|
$n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B)); |
|
1284
|
|
|
|
|
|
|
|
|
1285
|
4
|
|
|
|
|
5
|
my $chisquared; |
|
1286
|
4
|
|
|
|
|
4
|
eval { $chisquared = ( $n * ($delta_AB**2) ) / |
|
|
4
|
|
|
|
|
7
|
|
|
1287
|
|
|
|
|
|
|
( ( $pi_A + $D_A) * ( $pi_B + $D_B) ); |
|
1288
|
|
|
|
|
|
|
}; |
|
1289
|
4
|
50
|
|
|
|
6
|
if( $@ ) { |
|
1290
|
0
|
|
|
|
|
0
|
$self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n"); |
|
1291
|
0
|
|
|
|
|
0
|
next; |
|
1292
|
|
|
|
|
|
|
} |
|
1293
|
|
|
|
|
|
|
# this will be an upper triangular matrix |
|
1294
|
4
|
|
|
|
|
37
|
$stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared]; |
|
1295
|
|
|
|
|
|
|
} |
|
1296
|
|
|
|
|
|
|
} |
|
1297
|
2
|
|
|
|
|
12
|
return %stats_for_sites; |
|
1298
|
|
|
|
|
|
|
} |
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
=head2 mcdonald_kreitman |
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
Title : mcdonald_kreitman |
|
1303
|
|
|
|
|
|
|
Usage : $Fstat = mcdonald_kreitman($ingroup, $outgroup); |
|
1304
|
|
|
|
|
|
|
Function: Calculates McDonald-Kreitman statistic based on a set of ingroup |
|
1305
|
|
|
|
|
|
|
individuals and an outgroup by computing the number of |
|
1306
|
|
|
|
|
|
|
differences at synonymous and non-synonymous sites |
|
1307
|
|
|
|
|
|
|
for intraspecific comparisons and with the outgroup |
|
1308
|
|
|
|
|
|
|
Returns : 2x2 table, followed by a hash reference indicating any |
|
1309
|
|
|
|
|
|
|
warning messages about the status of the alleles or codons |
|
1310
|
|
|
|
|
|
|
Args : -ingroup => L object or |
|
1311
|
|
|
|
|
|
|
arrayref of Ls |
|
1312
|
|
|
|
|
|
|
-outgroup => L object or |
|
1313
|
|
|
|
|
|
|
arrayef of Ls |
|
1314
|
|
|
|
|
|
|
-polarized => Boolean, to indicate if this should be |
|
1315
|
|
|
|
|
|
|
a polarized test. Must provide two individuals |
|
1316
|
|
|
|
|
|
|
as outgroups. |
|
1317
|
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
=cut |
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
sub mcdonald_kreitman { |
|
1321
|
6
|
|
|
6
|
1
|
4983
|
my ($self,@args) = @_; |
|
1322
|
6
|
|
|
|
|
35
|
my ($ingroup, $outgroup,$polarized) = |
|
1323
|
|
|
|
|
|
|
$self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args); |
|
1324
|
6
|
|
|
|
|
30
|
my $verbose = $self->verbose; |
|
1325
|
6
|
|
|
|
|
9
|
my $outgroup_count; |
|
1326
|
6
|
|
|
|
|
8
|
my $gapchar = '\-'; |
|
1327
|
6
|
50
|
|
|
|
30
|
if( ref($outgroup) =~ /ARRAY/i ) { |
|
|
|
0
|
|
|
|
|
|
|
1328
|
6
|
|
|
|
|
10
|
$outgroup_count = scalar @$outgroup; |
|
1329
|
|
|
|
|
|
|
} elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) { |
|
1330
|
0
|
|
|
|
|
0
|
$outgroup_count = $outgroup->get_number_individuals; |
|
1331
|
|
|
|
|
|
|
} else { |
|
1332
|
0
|
|
|
|
|
0
|
$self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI"); |
|
1333
|
|
|
|
|
|
|
} |
|
1334
|
|
|
|
|
|
|
|
|
1335
|
6
|
100
|
|
|
|
24
|
if( $polarized ) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1336
|
2
|
50
|
|
|
|
8
|
if( $outgroup_count < 2 ) { |
|
1337
|
0
|
|
|
|
|
0
|
$self->throw("Need 2 outgroups with polarized option\n"); |
|
1338
|
|
|
|
|
|
|
} |
|
1339
|
|
|
|
|
|
|
} elsif( $outgroup_count > 1 ) { |
|
1340
|
0
|
|
|
|
|
0
|
$self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count )); |
|
1341
|
|
|
|
|
|
|
} elsif( $outgroup_count == 0 ) { |
|
1342
|
0
|
|
|
|
|
0
|
$self->throw("No outgroup sequence provided"); |
|
1343
|
|
|
|
|
|
|
} |
|
1344
|
|
|
|
|
|
|
|
|
1345
|
6
|
|
|
|
|
38
|
my $codon_path = Bio::MolEvol::CodonModel->codon_path; |
|
1346
|
|
|
|
|
|
|
|
|
1347
|
6
|
|
|
|
|
524
|
my (%marker_names,%unique,@inds); |
|
1348
|
6
|
|
|
|
|
12
|
for my $p ( $ingroup, $outgroup) { |
|
1349
|
12
|
50
|
|
|
|
42
|
if( ref($p) =~ /ARRAY/i ) { |
|
1350
|
12
|
|
|
|
|
27
|
push @inds, @$p; |
|
1351
|
|
|
|
|
|
|
} else { |
|
1352
|
0
|
|
|
|
|
0
|
push @inds, $p->get_Individuals; |
|
1353
|
|
|
|
|
|
|
} |
|
1354
|
|
|
|
|
|
|
} |
|
1355
|
6
|
|
|
|
|
12
|
for my $i ( @inds ) { |
|
1356
|
35
|
50
|
|
|
|
89
|
if( $unique{$i->unique_id}++ ) { |
|
1357
|
0
|
|
|
|
|
0
|
$self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n"); |
|
1358
|
|
|
|
|
|
|
} |
|
1359
|
35
|
|
|
|
|
61
|
for my $n ( $i->get_marker_names ) { |
|
1360
|
14126
|
|
|
|
|
9605
|
$marker_names{$n}++; |
|
1361
|
|
|
|
|
|
|
} |
|
1362
|
|
|
|
|
|
|
} |
|
1363
|
|
|
|
|
|
|
|
|
1364
|
6
|
|
|
|
|
335
|
my @marker_names = keys %marker_names; |
|
1365
|
6
|
50
|
|
|
|
81
|
if( $marker_names[0] =~ /^(Site|Codon)/ ) { |
|
1366
|
|
|
|
|
|
|
# sort by site or codon number and do it in |
|
1367
|
|
|
|
|
|
|
# a schwartzian transformation baby! |
|
1368
|
0
|
|
|
|
|
0
|
@marker_names = map { $_->[1] } |
|
1369
|
0
|
|
|
|
|
0
|
sort { $a->[0] <=> $b->[0] } |
|
1370
|
0
|
|
|
|
|
0
|
map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names; |
|
|
0
|
|
|
|
|
0
|
|
|
1371
|
|
|
|
|
|
|
} |
|
1372
|
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
|
|
1374
|
6
|
|
|
|
|
12
|
my $num_inds = scalar @inds; |
|
1375
|
6
|
|
|
|
|
21
|
my %vals = ( 'ingroup' => $ingroup, |
|
1376
|
|
|
|
|
|
|
'outgroup' => $outgroup, |
|
1377
|
|
|
|
|
|
|
); |
|
1378
|
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
# Make the Codon Table type a parameter! |
|
1380
|
6
|
|
|
|
|
45
|
my $table = Bio::Tools::CodonTable->new(-id => $codon_table); |
|
1381
|
6
|
|
|
|
|
15
|
my @vt = qw(outgroup ingroup); |
|
1382
|
6
|
|
|
|
|
5
|
my %changes; |
|
1383
|
|
|
|
|
|
|
my %status; |
|
1384
|
6
|
|
|
|
|
19
|
my %two_by_two = ( 'fixed_N' => 0, |
|
1385
|
|
|
|
|
|
|
'fixed_S' => 0, |
|
1386
|
|
|
|
|
|
|
'poly_N' => 0, |
|
1387
|
|
|
|
|
|
|
'poly_S' => 0); |
|
1388
|
|
|
|
|
|
|
|
|
1389
|
6
|
|
|
|
|
12
|
for my $codon ( @marker_names ) { |
|
1390
|
2436
|
|
|
|
|
1700
|
my (%codonvals); |
|
1391
|
|
|
|
|
|
|
my %all_alleles; |
|
1392
|
2436
|
|
|
|
|
2119
|
for my $t ( @vt ) { |
|
1393
|
4872
|
|
|
|
|
3461
|
my $outcount = 1; |
|
1394
|
4872
|
|
|
|
|
3395
|
for my $ind ( @{$vals{$t}} ) { |
|
|
4872
|
|
|
|
|
5995
|
|
|
1395
|
14126
|
|
|
|
|
21196
|
my @alleles = $ind->get_Genotypes($codon)->get_Alleles; |
|
1396
|
14126
|
50
|
|
|
|
18456
|
if( @alleles > 2 ) { |
|
1397
|
0
|
|
|
|
|
0
|
warn("Codon $codon saw ", scalar @alleles, " alleles for ind ", $ind->unique_id, "\n"); |
|
1398
|
0
|
|
|
|
|
0
|
die; |
|
1399
|
|
|
|
|
|
|
} else { |
|
1400
|
14126
|
|
|
|
|
10579
|
my ($allele) = shift @alleles; |
|
1401
|
14126
|
|
|
|
|
20793
|
$all_alleles{$ind->unique_id} = $allele; |
|
1402
|
14126
|
|
|
|
|
20428
|
my $AA = $table->translate($allele); |
|
1403
|
14126
|
50
|
66
|
|
|
53198
|
next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i); |
|
|
|
|
66
|
|
|
|
|
|
1404
|
|
|
|
|
|
|
|
|
1405
|
8971
|
|
|
|
|
6552
|
my $label = $t; |
|
1406
|
8971
|
100
|
|
|
|
10452
|
if( $t eq 'outgroup' ) { |
|
1407
|
3244
|
|
|
|
|
3478
|
$label = $t.$outcount++; |
|
1408
|
|
|
|
|
|
|
} |
|
1409
|
8971
|
|
|
|
|
10793
|
$codonvals{$label}->{$allele}++; |
|
1410
|
8971
|
|
|
|
|
14880
|
$codonvals{all}->{$allele}++; |
|
1411
|
|
|
|
|
|
|
} |
|
1412
|
|
|
|
|
|
|
} |
|
1413
|
|
|
|
|
|
|
} |
|
1414
|
2436
|
|
|
|
|
1874
|
my $total = sum ( values %{$codonvals{'ingroup'}} ); |
|
|
2436
|
|
|
|
|
5497
|
|
|
1415
|
2436
|
100
|
100
|
|
|
8190
|
next if( $total && $total < 2 ); # skip sites with < alleles |
|
1416
|
|
|
|
|
|
|
# process all the seen alleles (codons) |
|
1417
|
|
|
|
|
|
|
# this is a vertical slide through the alignment |
|
1418
|
1626
|
100
|
|
|
|
1094
|
if( keys %{$codonvals{all}} <= 1 ) { |
|
|
1626
|
|
|
|
|
4748
|
|
|
1419
|
|
|
|
|
|
|
# no changes or no VALID codons - monomorphic |
|
1420
|
|
|
|
|
|
|
} else { |
|
1421
|
|
|
|
|
|
|
# grab only the first outgroup codon (what to do with rest?) |
|
1422
|
278
|
|
|
|
|
173
|
my ($outcodon) = keys %{$codonvals{'outgroup1'}}; |
|
|
278
|
|
|
|
|
376
|
|
|
1423
|
278
|
50
|
|
|
|
397
|
if( ! $outcodon ) { |
|
1424
|
0
|
|
|
|
|
0
|
$status{"no outgroup codon $codon"}++; |
|
1425
|
0
|
|
|
|
|
0
|
next; |
|
1426
|
|
|
|
|
|
|
} |
|
1427
|
278
|
|
|
|
|
385
|
my $out_AA = $table->translate($outcodon); |
|
1428
|
278
|
|
|
|
|
260
|
my ($outcodon2) = keys %{$codonvals{'outgroup2'}}; |
|
|
278
|
|
|
|
|
441
|
|
|
1429
|
278
|
50
|
100
|
|
|
1263
|
if( ($polarized && ($outcodon ne $outcodon2)) || |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
$out_AA eq 'X' || $out_AA eq '*' ) { |
|
1431
|
|
|
|
|
|
|
# skip if outgroup codons are different |
|
1432
|
|
|
|
|
|
|
# (when polarized option is on) |
|
1433
|
|
|
|
|
|
|
# or skip if the outcodon is STOP or 'NNN' |
|
1434
|
90
|
50
|
|
|
|
125
|
if( $verbose > 0 ) { |
|
1435
|
0
|
|
|
|
|
0
|
$self->debug("skipping $out_AA and $outcodon $outcodon2\n"); |
|
1436
|
|
|
|
|
|
|
} |
|
1437
|
90
|
|
|
|
|
73
|
$status{'outgroup codons different'}++; |
|
1438
|
90
|
|
|
|
|
279
|
next; |
|
1439
|
|
|
|
|
|
|
} |
|
1440
|
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
# check if ingroup is actually different from outgroup - |
|
1442
|
|
|
|
|
|
|
# if there are the same number of alleles when considering |
|
1443
|
|
|
|
|
|
|
# ALL or just the ingroup, then there is nothing new seen |
|
1444
|
|
|
|
|
|
|
# in the outgroup so it must be a shared allele (codon) |
|
1445
|
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
# so we just count how many total alleles were seen |
|
1447
|
|
|
|
|
|
|
# if this is the same as the number of alleles seen for just |
|
1448
|
|
|
|
|
|
|
# the ingroup then the outgroup presents no new information |
|
1449
|
|
|
|
|
|
|
|
|
1450
|
188
|
|
|
|
|
127
|
my @ingroup_codons = keys %{$codonvals{'ingroup'}}; |
|
|
188
|
|
|
|
|
303
|
|
|
1451
|
188
|
|
|
|
|
234
|
my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon}; |
|
1452
|
|
|
|
|
|
|
|
|
1453
|
188
|
50
|
|
|
|
232
|
if( $verbose > 0 ) { |
|
1454
|
|
|
|
|
|
|
$self->debug("alleles are in: ", join(",", @ingroup_codons), |
|
1455
|
0
|
|
|
|
|
0
|
" out: ", join(",", keys %{$codonvals{outgroup1}}), |
|
|
0
|
|
|
|
|
0
|
|
|
1456
|
|
|
|
|
|
|
" diff_from_out=$diff_from_out\n"); |
|
1457
|
|
|
|
|
|
|
|
|
1458
|
0
|
|
|
|
|
0
|
for my $ind ( sort keys %all_alleles ) { |
|
1459
|
0
|
|
|
|
|
0
|
$self->debug( "$ind\t$all_alleles{$ind}\n"); |
|
1460
|
|
|
|
|
|
|
} |
|
1461
|
|
|
|
|
|
|
} |
|
1462
|
|
|
|
|
|
|
# are all the ingroup alleles the same and diferent from outgroup? |
|
1463
|
|
|
|
|
|
|
# fixed differences between species |
|
1464
|
188
|
100
|
|
|
|
216
|
if( $diff_from_out ) { |
|
1465
|
99
|
100
|
|
|
|
129
|
if( scalar @ingroup_codons == 1 ) { |
|
1466
|
|
|
|
|
|
|
# fixed differences |
|
1467
|
92
|
50
|
|
|
|
385
|
if( $outcodon =~ /^$gapchar/ ) { |
|
|
|
50
|
|
|
|
|
|
|
1468
|
0
|
|
|
|
|
0
|
$status{'outgroup codons with gaps'}++; |
|
1469
|
0
|
|
|
|
|
0
|
next; |
|
1470
|
|
|
|
|
|
|
} elsif( $ingroup_codons[0] =~ /$gapchar/) { |
|
1471
|
0
|
|
|
|
|
0
|
$status{'ingroup codons with gaps'}++; |
|
1472
|
0
|
|
|
|
|
0
|
next; |
|
1473
|
|
|
|
|
|
|
} |
|
1474
|
92
|
|
|
|
|
166
|
my $path = $codon_path->{uc $ingroup_codons[0].$outcodon}; |
|
1475
|
92
|
|
|
|
|
102
|
$two_by_two{fixed_N} += $path->[0]; |
|
1476
|
92
|
|
|
|
|
86
|
$two_by_two{fixed_S} += $path->[1]; |
|
1477
|
92
|
50
|
|
|
|
376
|
if( $verbose > 0 ) { |
|
1478
|
0
|
|
|
|
|
0
|
$self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n"); |
|
1479
|
0
|
|
|
|
|
0
|
$self->debug("path is ",join(",",@$path),"\n"); |
|
1480
|
|
|
|
|
|
|
$self->debug |
|
1481
|
|
|
|
|
|
|
(sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA, |
|
1482
|
0
|
|
|
|
|
0
|
@$path, map { $two_by_two{$_} } |
|
|
0
|
|
|
|
|
0
|
|
|
1483
|
|
|
|
|
|
|
qw(fixed_N fixed_S poly_N poly_S))); |
|
1484
|
|
|
|
|
|
|
} |
|
1485
|
|
|
|
|
|
|
} else { |
|
1486
|
|
|
|
|
|
|
# polymorphic and all are different from outgroup |
|
1487
|
|
|
|
|
|
|
# Here we find the minimum number of NS subst |
|
1488
|
7
|
|
|
|
|
10
|
my ($Ndiff,$Sdiff) = (3,0); # most different path |
|
1489
|
7
|
|
|
|
|
8
|
for my $c ( @ingroup_codons ) { |
|
1490
|
14
|
50
|
33
|
|
|
79
|
next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/); |
|
1491
|
14
|
|
|
|
|
24
|
my $path = $codon_path->{uc $c.$outcodon}; |
|
1492
|
14
|
|
|
|
|
16
|
my ($tNdiff,$tSdiff) = @$path; |
|
1493
|
14
|
100
|
100
|
|
|
46
|
if( $path->[0] < $Ndiff || |
|
|
|
|
66
|
|
|
|
|
|
1494
|
|
|
|
|
|
|
($tNdiff == $Ndiff && |
|
1495
|
|
|
|
|
|
|
$tSdiff <= $Sdiff)) { |
|
1496
|
11
|
|
|
|
|
15
|
($Ndiff,$Sdiff) = ($tNdiff,$tSdiff); |
|
1497
|
|
|
|
|
|
|
} |
|
1498
|
|
|
|
|
|
|
} |
|
1499
|
7
|
|
|
|
|
10
|
$two_by_two{fixed_N} += $Ndiff; |
|
1500
|
7
|
|
|
|
|
7
|
$two_by_two{fixed_S} += $Sdiff; |
|
1501
|
7
|
50
|
|
|
|
11
|
if( @ingroup_codons > 2 ) { |
|
1502
|
0
|
|
|
|
|
0
|
$status{"more than 2 ingroup codons $codon"}++; |
|
1503
|
0
|
|
|
|
|
0
|
warn("more than 2 ingroup codons (@ingroup_codons)\n"); |
|
1504
|
|
|
|
|
|
|
} else { |
|
1505
|
7
|
|
|
|
|
16
|
my $path = $codon_path->{uc join('',@ingroup_codons)}; |
|
1506
|
|
|
|
|
|
|
|
|
1507
|
7
|
|
|
|
|
10
|
$two_by_two{poly_N} += $path->[0]; |
|
1508
|
7
|
|
|
|
|
6
|
$two_by_two{poly_S} += $path->[1]; |
|
1509
|
7
|
50
|
|
|
|
33
|
if( $verbose > 0 ) { |
|
1510
|
0
|
|
|
|
|
0
|
$self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S))); |
|
|
0
|
|
|
|
|
0
|
|
|
1511
|
|
|
|
|
|
|
} |
|
1512
|
|
|
|
|
|
|
} |
|
1513
|
|
|
|
|
|
|
} |
|
1514
|
|
|
|
|
|
|
} else { |
|
1515
|
89
|
|
|
|
|
98
|
my %unq = map { $_ => 1 } @ingroup_codons; |
|
|
181
|
|
|
|
|
273
|
|
|
1516
|
89
|
|
|
|
|
97
|
delete $unq{$outcodon}; |
|
1517
|
89
|
|
|
|
|
113
|
my @unique_codons = keys %unq; |
|
1518
|
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
# calc path for diff add to poly |
|
1520
|
|
|
|
|
|
|
# Here we find the minimum number of subst bw |
|
1521
|
|
|
|
|
|
|
# codons |
|
1522
|
89
|
|
|
|
|
84
|
my ($Ndiff,$Sdiff) = (3,0); # most different path |
|
1523
|
89
|
|
|
|
|
84
|
for my $c ( @unique_codons ) { |
|
1524
|
92
|
|
|
|
|
159
|
my $path = $codon_path->{uc $c.$outcodon }; |
|
1525
|
92
|
50
|
|
|
|
138
|
if( ! defined $path ) { |
|
1526
|
0
|
|
|
|
|
0
|
die " cannot get path for ", $c.$outcodon, "\n"; |
|
1527
|
|
|
|
|
|
|
} |
|
1528
|
92
|
|
|
|
|
87
|
my ($tNdiff,$tSdiff) = @$path; |
|
1529
|
92
|
50
|
33
|
|
|
163
|
if( $path->[0] < $Ndiff || |
|
|
|
|
66
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
($tNdiff == $Ndiff && |
|
1531
|
|
|
|
|
|
|
$tSdiff <= $Sdiff)) { |
|
1532
|
92
|
|
|
|
|
127
|
($Ndiff,$Sdiff) = ($tNdiff,$tSdiff); |
|
1533
|
|
|
|
|
|
|
} |
|
1534
|
|
|
|
|
|
|
} |
|
1535
|
|
|
|
|
|
|
|
|
1536
|
89
|
100
|
|
|
|
139
|
if( @unique_codons == 2 ) { |
|
1537
|
3
|
|
|
|
|
8
|
my $path = $codon_path->{uc join('',@unique_codons)}; |
|
1538
|
3
|
50
|
|
|
|
8
|
if( ! defined $path ) { |
|
1539
|
0
|
|
|
|
|
0
|
$self->throw("no path for @unique_codons\n"); |
|
1540
|
|
|
|
|
|
|
} |
|
1541
|
3
|
|
|
|
|
5
|
$Ndiff += $path->[0]; |
|
1542
|
3
|
|
|
|
|
3
|
$Sdiff += $path->[1]; |
|
1543
|
|
|
|
|
|
|
} |
|
1544
|
89
|
|
|
|
|
91
|
$two_by_two{poly_N} += $Ndiff; |
|
1545
|
89
|
|
|
|
|
77
|
$two_by_two{poly_S} += $Sdiff; |
|
1546
|
89
|
50
|
|
|
|
374
|
if( $verbose > 0 ) { |
|
1547
|
|
|
|
|
|
|
$self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA, |
|
1548
|
0
|
|
|
|
|
0
|
$Ndiff, $Sdiff, map { $two_by_two{$_} } |
|
|
0
|
|
|
|
|
0
|
|
|
1549
|
|
|
|
|
|
|
qw(fixed_N fixed_S poly_N poly_S))); |
|
1550
|
|
|
|
|
|
|
} |
|
1551
|
|
|
|
|
|
|
} |
|
1552
|
|
|
|
|
|
|
} |
|
1553
|
|
|
|
|
|
|
} |
|
1554
|
|
|
|
|
|
|
return ( $two_by_two{'poly_N'}, |
|
1555
|
|
|
|
|
|
|
$two_by_two{'fixed_N'}, |
|
1556
|
|
|
|
|
|
|
$two_by_two{'poly_S'}, |
|
1557
|
6
|
|
|
|
|
83
|
$two_by_two{'fixed_S'}, |
|
1558
|
|
|
|
|
|
|
{%status}); |
|
1559
|
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
} |
|
1561
|
|
|
|
|
|
|
|
|
1562
|
|
|
|
|
|
|
*MK = \&mcdonald_kreitman; |
|
1563
|
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
=head2 mcdonald_kreitman_counts |
|
1566
|
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
Title : mcdonald_kreitman_counts |
|
1568
|
|
|
|
|
|
|
Usage : my $MK = $statistics->mcdonald_kreitman_counts( |
|
1569
|
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
N_poly -> integer of count of non-syn polymorphism |
|
1571
|
|
|
|
|
|
|
N_fix -> integer of count of non-syn fixed substitutions |
|
1572
|
|
|
|
|
|
|
S_poly -> integer of count of syn polymorphism |
|
1573
|
|
|
|
|
|
|
S_fix -> integer of count of syn fixed substitutions |
|
1574
|
|
|
|
|
|
|
); |
|
1575
|
|
|
|
|
|
|
Function: |
|
1576
|
|
|
|
|
|
|
Returns : decimal number |
|
1577
|
|
|
|
|
|
|
Args : |
|
1578
|
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
=cut |
|
1580
|
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
sub mcdonald_kreitman_counts { |
|
1583
|
0
|
|
|
0
|
1
|
|
my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_; |
|
1584
|
0
|
0
|
|
|
|
|
if( $has_twotailed ) { |
|
1585
|
0
|
|
|
|
|
|
return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic |
|
1586
|
|
|
|
|
|
|
(n11=>$Npoly, |
|
1587
|
|
|
|
|
|
|
n1p=>$Npoly+$Spoly, |
|
1588
|
|
|
|
|
|
|
np1=>$Npoly+$Nfix, |
|
1589
|
|
|
|
|
|
|
npp=>$Npoly+$Nfix+$Spoly+$Sfix); |
|
1590
|
|
|
|
|
|
|
} else { |
|
1591
|
0
|
|
|
|
|
|
$self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed"); |
|
1592
|
0
|
|
|
|
|
|
return 0; |
|
1593
|
|
|
|
|
|
|
} |
|
1594
|
|
|
|
|
|
|
} |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
1; |