File Coverage

Bio/PopGen/PopStats.pm

Criterion	Covered	Total	%
statement	67	80	83.7
branch	8	14	57.1
condition	6	10	60.0
subroutine	5	5	100.0
pod	3	3	100.0
total	89	112	79.4

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::PopGen::PopStats
3							#
4							# Please direct questions and support issues to <bioperl-l@bioperl.org>
5							#
6							# Cared for by Jason Stajich
7							#
8							# Copyright Jason Stajich
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::PopGen::PopStats - A collection of methods for calculating
17							statistics about a population or sets of populations
18
19							=head1 SYNOPSIS
20
21							use Bio::PopGen::PopStats;
22							my $stats = Bio::PopGen::PopStats->new(); # add -haploid => 1
23							# to process haploid data
24
25							=head1 DESCRIPTION
26
27							Calculate various population structure statistics, most notably Wright's Fst.
28
29							=head1 FEEDBACK
30
31							=head2 Mailing Lists
32
33							User feedback is an integral part of the evolution of this and other
34							Bioperl modules. Send your comments and suggestions preferably to
35							the Bioperl mailing list. Your participation is much appreciated.
36
37							bioperl-l@bioperl.org - General discussion
38							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39
40							=head2 Support
41
42							Please direct usage questions or support issues to the mailing list:
43
44							I<bioperl-l@bioperl.org>
45
46							rather than to the module maintainer directly. Many experienced and
47							responsive experts will be able to look at the problem and quickly
48							address it. Please include a thorough description of the problem
49							with code and data examples if at all possible.
50
51							=head2 Reporting Bugs
52
53							Report bugs to the Bioperl bug tracking system to help us keep track
54							of the bugs and their resolution. Bug reports can be submitted via
55							the web:
56
57							https://github.com/bioperl/bioperl-live/issues
58
59							=head1 AUTHOR - Jason Stajich
60
61							Email jason-at-bioperl.org
62
63							=head1 CONTRIBUTORS
64
65							Matthew Hahn, matthew.hahn-at-duke.edu
66
67							=head1 APPENDIX
68
69							The rest of the documentation details each of the object methods.
70							Internal methods are usually preceded with a _
71
72							=cut
73
74
75							# Let the code begin...
76
77
78							package Bio::PopGen::PopStats;
79	1			1		701	use strict;
	1					2
	1					25
80
81							# Object preamble - inherits from Bio::Root::Root
82
83
84
85	1			1		4	use base qw(Bio::Root::Root);
	1					1
	1					658
86
87							=head2 new
88
89							Title : new
90							Usage : my $obj = Bio::PopGen::PopStats->new();
91							Function: Builds a new Bio::PopGen::PopStats object
92							Returns : an instance of Bio::PopGen::PopStats
93							Args : -haploid => 1 (if want to use haploid calculations)
94
95
96							=cut
97
98							sub new {
99	2			2	1	21	my($class,@args) = @_;
100
101	2					11	my $self = $class->SUPER::new(@args);
102	2					8	my ($haploid) = $self->_rearrange([qw(HAPLOID)],@args);
103	2	50				5	if( $haploid ) { $self->haploid_status(1) }
	2					6
104	2					6	return $self;
105							}
106
107
108							=head2 haploid_status
109
110							Title : haploid_status
111							Usage : $obj->haploid_status($newval)
112							Function: Boolean value for whether or not to do haploid
113							or diploid calculations, where appropriate
114							Returns : Boolean
115							Args : on set, new boolean value (optional)
116
117
118							=cut
119
120							sub haploid_status{
121	174			174	1	168	my $self = shift;
122	174	100				257	return $self->{'haploid_status'} = shift if @_;
123	172					244	return $self->{'haploid_status'};
124							}
125
126
127							# Implementation provided by Matthew Hahn, massaged by Jason Stajich
128
129							=head2 Fst
130
131							Title : Fst
132							Usage : my $fst = $stats->Fst(\@populations,\@markernames)
133							Function: Calculate Wright's Fst based on a set of sub-populations
134							and specific markers
135							Returns : Fst value (a value between 0 and 1)
136							Args : Arrayref of populations to process
137							Arrayref of marker names to process
138							Note : Based on diploid method in Weir BS, Genetics Data Analysis II, 1996
139							page 178.
140
141							=cut
142
143							#' make emacs happy here
144							sub Fst {
145	8			8	1	3555	my ($self,$populations,$markernames) = @_;
146
147	8	50	33			111	if( ! defined $populations ||
		50	33
148							ref($populations) !~ /ARRAY/i ) {
149	0					0	$self->warn("Must provide a valid arrayref for populations");
150	0					0	return;
151							} elsif( ! defined $markernames ||
152							ref($markernames) !~ /ARRAY/i ) {
153	0					0	$self->warn("Must provide a valid arrayref for marker names");
154	0					0	return;
155							}
156	8					14	my $num_sub_pops = scalar @$populations;
157
158	8	50				18	if( $num_sub_pops < 2 ) {
159	0					0	$self->warn("Must provide at least 2 populations for this test, you provided $num_sub_pops");
160	0					0	return;
161							}
162
163							# This code assumes that pop 1 contains at least one of all the
164							# alleles - need to do some more work to ensure that the complete
165							# set of alleles is seen.
166	8					9	my $Fst;
167	8					12	my ($TS_sub1,$TS_sub2);
168
169	8					13	foreach my $marker ( @$markernames ) {
170							# Get all the alleles from all the genotypes in all subpopulations
171	84					87	my %allAlleles;
172	84					111	foreach my $allele ( map { $_->get_Alleles() }
	1440					1859
173	196					316	map { $_->get_Genotypes($marker) } @$populations ){
174	1440					1318	$allAlleles{$allele}++;
175							}
176	84					268	my @alleles = keys %allAlleles;
177
178	84					102	foreach my $allele_name ( @alleles ) {
179	172					164	my $avg_samp_size = 0; # n-bar
180	172					150	my $avg_allele_freq = 0; # p-tilda-A-dot
181
182	172					154	my $total_samples_squared = 0; #
183	172					161	my $sum_heterozygote = 0;
184
185	172					146	my @marker_freqs;
186
187							# Walk through each population, get the calculated allele frequencies
188							# for the marker, do some bookkeeping
189
190
191	172					206	foreach my $pop ( @$populations ) {
192	405					633	my $s = $pop->get_number_individuals($marker);
193
194	405					403	$avg_samp_size += $s;
195	405					435	$total_samples_squared += $s**2;
196
197	405					585	my $markerobj = $pop->get_Marker($marker);
198	405	50				555	if( ! defined $markerobj ) {
199	0					0	$self->warn("Could not derive Marker for $marker ".
200							"from population ". $pop->name);
201	0					0	return;
202							}
203
204	405					548	my $freq_homozygotes =
205							$pop->get_Frequency_Homozygotes($marker,$allele_name);
206	405					684	my %af = $markerobj->get_Allele_Frequencies();
207	405		100			808	my $all_freq = ( ($af{$allele_name} || 0));
208
209	405					500	$avg_allele_freq += $s * $all_freq;
210	405					507	$sum_heterozygote += (2 * $s)*( $all_freq - $freq_homozygotes);
211
212	405					665	push @marker_freqs, \%af;
213							}
214	172					173	my $total_samples = $avg_samp_size; # sum of n over i sub-populations
215	172					173	$avg_samp_size /= $num_sub_pops;
216	172					156	$avg_allele_freq /= $total_samples;
217
218							# n-sub-c
219	172					208	my $adj_samp_size = ( 1/ ($num_sub_pops - 1)) *
220							( $total_samples - ( $total_samples_squared/$total_samples));
221
222	172					180	my $variance = 0; # s-squared-sub-A
223	172					157	my $sum_variance = 0;
224	172					164	my $i = 0; # we have cached the marker info
225	172					194	foreach my $pop ( @$populations ) {
226	405					605	my $s = $pop->get_number_individuals($marker);
227	405					338	my %af = %{$marker_freqs[$i++]};
	405					919
228	405		100			1147	$sum_variance += $s * (( ($af{$allele_name} || 0) -
229							$avg_allele_freq)**2);
230							}
231	172					229	$variance = ( 1 / (( $num_sub_pops-1)*$avg_samp_size))*$sum_variance;
232
233							# H-tilda-A-dot
234	172					189	my $freq_heterozygote = ($sum_heterozygote / $total_samples);
235
236	172	50				218	if( $self->haploid_status ) {
237							# Haploid calculations
238
239	172					305	my $T_sub1 = $variance -
240							( ( 1/($avg_samp_size-1))*
241							( ($avg_allele_freq*(1-$avg_allele_freq))-
242							( (($num_sub_pops-1)/$num_sub_pops)*$variance)));
243	172					296	my $T_sub2 = ( (($adj_samp_size-1)/($avg_samp_size-1))*
244							$avg_allele_freq*(1-$avg_allele_freq) ) +
245							( 1 + ( (($num_sub_pops-1)*
246							($avg_samp_size-$adj_samp_size))/
247							($avg_samp_size - 1))) *
248							($variance/$num_sub_pops);
249
250
251							#to get total Fst from all alleles (if more than two) or all
252							#loci (if more than one), we need to calculate $T_sub1 and
253							#$T_sub2 for all alleles for all loci, sum, and then divide
254							#again to get Fst.
255	172					186	$TS_sub1 += $T_sub1;
256	172					459	$TS_sub2 += $T_sub2;
257
258							} else {
259	0					0	my $S_sub1 = $variance - ( (1/($avg_samp_size-1))*
260							( ($avg_allele_freq*
261							(1-$avg_allele_freq)) -
262							((($num_sub_pops-1)/$num_sub_pops)*
263							$variance)-0.25*$freq_heterozygote ) );
264	0					0	my $S_sub2 = ($avg_allele_freq*(1-$avg_allele_freq)) -
265							( ($avg_samp_size/($num_sub_pops*($avg_samp_size-1)))*
266							( ((($num_sub_pops*($avg_samp_size- $adj_samp_size))/
267							$avg_samp_size)*$avg_allele_freq*
268							(1-$avg_allele_freq)) -
269							( (1/$avg_samp_size)* (($avg_samp_size-1)+
270							($num_sub_pops-1)*
271							($avg_samp_size-
272							$adj_samp_size) )*$variance ) -
273							( (($num_sub_pops*($avg_samp_size-$adj_samp_size))/
274							(4*$avg_samp_size*$adj_samp_size))*
275							$freq_heterozygote ) ) );
276
277	0					0	my $S_sub3 = ($adj_samp_size/(2*$avg_samp_size))*
278							$freq_heterozygote;
279
280							#Again, to get the average over many alleles or many loci,
281							#we will have to run the above for each and then sum the $S
282							#variables and recalculate the F statistics
283	0					0	$TS_sub1 += $S_sub1;
284	0					0	$TS_sub2 += $S_sub2;
285							}
286							}
287							}
288							# $Fst_diploid = $S_sub1/$S_sub2;
289							#my $Fit_diploid = 1 - ($S_sub3/$S_sub2);
290							#my $Fis_diploid = ($Fit_diploid-$Fst_diploid)/(1-$Fst_diploid);
291	8					12	$Fst = $TS_sub1 / $TS_sub2;
292
293	8					21	return $Fst;
294							}
295
296							1;