File Coverage

Bio/Align/DNAStatistics.pm

Criterion	Covered	Total	%
statement	515	635	81.1
branch	94	152	61.8
condition	25	52	48.0
subroutine	39	45	86.6
pod	20	30	66.6
total	693	914	75.8

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::Align::DNAStatistics
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Jason Stajich
7							#
8							# Copyright Jason Stajich
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
17
18							=head1 SYNOPSIS
19
20							use Bio::AlignIO;
21							use Bio::Align::DNAStatistics;
22
23							my $stats = Bio::Align::DNAStatistics->new();
24							my $alignin = Bio::AlignIO->new(-format => 'emboss',
25							-file => 't/data/insulin.water');
26							my $aln = $alignin->next_aln;
27							my $jcmatrix = $stats->distance(-align => $aln,
28							-method => 'Jukes-Cantor');
29
30							print $jcmatrix->print_matrix;
31							## and for measurements of synonymous /nonsynonymous substitutions ##
32
33							my $in = Bio::AlignIO->new(-format => 'fasta',
34							-file => 't/data/nei_gojobori_test.aln');
35							my $alnobj = $in->next_aln;
36							my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
37							my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
38							print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
39							for (sort keys %{$results->[0]} ){
40							next if /Seq/;
41							printf("%-9s %.4f \n",$_ , $results->[0]{$_});
42							}
43
44							my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
45							for my $an (@$results2){
46							print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
47							for (sort keys %$an ){
48							next if /Seq/;
49							printf("%-9s %.4f \n",$_ , $an->{$_});
50							}
51							print "\n\n";
52							}
53
54							my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
55							for (sort keys %$result3 ){
56							next if /Seq/;
57							printf("%-9s %.4f \n",$_ , $result3->{$_});
58							}
59
60							=head1 DESCRIPTION
61
62							This object contains routines for calculating various statistics and
63							distances for DNA alignments. The routines are not well tested and do
64							contain errors at this point. Work is underway to correct them, but
65							do not expect this code to give you the right answer currently! Use
66							dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
67							distances.
68
69
70							Several different distance method calculations are supported. Listed
71							in brackets are the pattern which will match
72
73							=over 3
74
75							=item *
76
77							JukesCantor [jc\|jukes\|jukescantor\|jukes-cantor]
78
79							=item *
80
81							Uncorrected [jcuncor\|uncorrected]
82
83							=item *
84
85							F81 [f81\|felsenstein]
86
87							=item *
88
89							Kimura [k2\|k2p\|k80\|kimura]
90
91							=item *
92
93							Tamura [t92\|tamura\|tamura92]
94
95							=item *
96
97							F84 [f84\|felsenstein84]
98
99							=item *
100
101							TajimaNei [tajimanei\|tajima\-nei]
102
103							=item *
104
105							JinNei [jinnei\|jin\-nei] (not implemented)
106
107							=back
108
109							There are also three methods to calculate the ratio of synonymous to
110							non-synonymous mutations. All are implementations of the Nei-Gojobori
111							evolutionary pathway method and use the Jukes-Cantor method of
112							nucleotide substitution. This method works well so long as the
113							nucleotide frequencies are roughly equal and there is no significant
114							transition/transversion bias. In order to use these methods there are
115							several pre-requisites for the alignment.
116
117							=over 3
118
119							=item 1
120
121							DNA alignment must be based on protein alignment. Use the subroutine
122							L to achieve this.
123
124							=item 2
125
126							Therefore alignment gaps must be in multiples of 3 (representing an aa
127							deletion/insertion) and at present must be indicated by a '-' symbol.
128
129							=item 3
130
131							Alignment must be solely of coding region and be in reading frame 0 to
132							achieve meaningful results
133
134							=item 4
135
136							Alignment must therefore be a multiple of 3 nucleotides long.
137
138							=item 5
139
140							All sequences must be the same length (including gaps). This should be
141							the case anyway if the sequences have been automatically aligned using
142							a program like Clustal.
143
144							=item 6
145
146							Only the standard codon alphabet is supported at present.
147
148							=back
149
150							calc_KaKs_pair() calculates a number of statistics for a named pair of
151							sequences in the alignment.
152
153							calc_all_KaKs_pairs() calculates these statistics for all pairwise
154							comparisons in an MSA. The statistics returned are:
155
156							=over 3
157
158							=item *
159
160							S_d - Number of synonymous mutations between the 2 sequences.
161
162							=item *
163
164							N_d - Number of non-synonymous mutations between the 2 sequences.
165
166							=item *
167
168							S - Mean number of synonymous sites in both sequences.
169
170							=item *
171
172							N - mean number of synonymous sites in both sequences.
173
174							=item *
175
176							P_s - proportion of synonymous differences in both sequences given by
177							P_s = S_d/S.
178
179							=item *
180
181							P_n - proportion of non-synonymous differences in both sequences given
182							by P_n = S_n/S.
183
184							=item *
185
186							D_s - estimation of synonymous mutations per synonymous site (by
187							Jukes-Cantor).
188
189							=item *
190
191							D_n - estimation of non-synonymous mutations per non-synonymous site (by
192							Jukes-Cantor).
193
194							=item *
195
196							D_n_var - estimation of variance of D_n .
197
198							=item *
199
200							D_s_var - estimation of variance of S_n.
201
202							=item *
203
204							z_value - calculation of z value.Positive value indicates D_n E D_s,
205							negative value indicates D_s E D_n.
206
207							=back
208
209							The statistics returned by calc_average_KaKs are:
210
211							=over 3
212
213							=item *
214
215							D_s - Average number of synonymous mutations/synonymous site.
216
217							=item *
218
219							D_n - Average number of non-synonymous mutations/non-synonymous site.
220
221							=item *
222
223							D_s_var - Estimated variance of Ds from bootstrapped alignments.
224
225							=item *
226
227							D_n_var - Estimated variance of Dn from bootstrapped alignments.
228
229							=item *
230
231							z_score - calculation of z value. Positive value indicates D_n ED_s,
232							negative values vice versa.
233
234							=back
235
236							The design of the code is based around the explanation of the
237							Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
238							Phylogenetics" by Nei and Kumar, published by Oxford University
239							Press. The methods have been tested using the worked example 4.1 in
240							the book, and reproduce those results. If people like having this sort
241							of analysis in BioPerl other methods for estimating Ds and Dn can be
242							provided later.
243
244							Much of the DNA distance code is based on implementations in EMBOSS
245							(Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
246							al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
247							Mitchison.
248
249							=head1 REFERENCES
250
251							=over 3
252
253							=item *
254
255							D_JukesCantor
256
257							"Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
258							Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
259							Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
260							1969, pp. 21-132.
261
262							=item *
263
264							D_Tamura
265
266							K Tamura, Mol. Biol. Evol. 1992, 9, 678.
267
268							=item *
269
270							D_Kimura
271
272							M Kimura, J. Mol. Evol., 1980, 16, 111.
273
274							=item *
275
276							JinNei
277
278							Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
279
280							=item *
281
282							D_TajimaNei
283
284							Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
285
286							=back
287
288							=head1 FEEDBACK
289
290							=head2 Mailing Lists
291
292							User feedback is an integral part of the evolution of this and other
293							Bioperl modules. Send your comments and suggestions preferably to
294							the Bioperl mailing list. Your participation is much appreciated.
295
296							bioperl-l@bioperl.org - General discussion
297							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
298
299							=head2 Support
300
301							Please direct usage questions or support issues to the mailing list:
302
303							I
304
305							rather than to the module maintainer directly. Many experienced and
306							reponsive experts will be able look at the problem and quickly
307							address it. Please include a thorough description of the problem
308							with code and data examples if at all possible.
309
310							=head2 Reporting Bugs
311
312							Report bugs to the Bioperl bug tracking system to help us keep track
313							of the bugs and their resolution. Bug reports can be submitted via the
314							web:
315
316							https://github.com/bioperl/bioperl-live/issues
317
318							=head1 AUTHOR - Jason Stajich
319
320							Email jason-AT-bioperl.org
321
322							=head1 CONTRIBUTORS
323
324							Richard Adams, richard.adams@ed.ac.uk
325
326							=head1 APPENDIX
327
328							The rest of the documentation details each of the object methods.
329							Internal methods are usually preceded with a _
330
331							=cut
332
333
334							# Let the code begin...
335
336
337							package Bio::Align::DNAStatistics;
338	4					390	use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
339							$GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
340	4			4		1225	$CODONS %synchanges $synsites $Precision $GCChhars);
	4					5
341	4			4		14	use strict;
	4					5
	4					70
342	4			4		1006	use Bio::Align::PairwiseStatistics;
	4					10
	4					114
343	4			4		2153	use Bio::Matrix::PhylipDist;
	4					8
	4					108
344	4			4		717	use Bio::Tools::IUPAC;
	4					6
	4					483
345
346							BEGIN {
347	4			4		10	$GapChars = '[\.\-]';
348	4					6	$GCChhars = '[GCS]';
349	4					9	@Nucleotides = qw(A G T C);
350	4					5	$SeqCount = 2;
351	4					4	$Precision = 5;
352
353							# these values come from EMBOSS distmat implementation
354	4					25	%NucleotideIndexes = ( 'A' => 0,
355							'T' => 1,
356							'C' => 2,
357							'G' => 3,
358
359							'AT' => 0,
360							'AC' => 1,
361							'AG' => 2,
362							'CT' => 3,
363							'GT' => 4,
364							'CG' => 5,
365
366							# these are wrong now
367							# 'S' => [ 1, 3],
368							# 'W' => [ 0, 4],
369							# 'Y' => [ 2, 3],
370							# 'R' => [ 0, 1],
371							# 'M' => [ 0, 3],
372							# 'K' => [ 1, 2],
373							# 'B' => [ 1, 2, 3],
374							# 'H' => [ 0, 2, 3],
375							# 'V' => [ 0, 1, 3],
376							# 'D' => [ 0, 1, 2],
377							);
378
379	4					5	$DefaultGapPenalty = 0;
380							# could put ambiguities here?
381	4					26	%DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
382							'T' => [ 'A', 'G'],
383							'C' => [ 'A', 'G'],
384							'G' => [ 'C', 'T'],
385							},
386							'Transitions' => { 'A' => [ 'G' ],
387							'G' => [ 'A' ],
388							'C' => [ 'T' ],
389							'T' => [ 'C' ],
390							},
391							);
392	4					96	%DistanceMethods = ( 'jc\|jukes\|jukescantor\|jukes\-cantor' => 'JukesCantor',
393							'jcuncor\|uncorrected' => 'Uncorrected',
394							'f81\|felsenstein81' => 'F81',
395							'k2\|k2p\|k80\|kimura' => 'Kimura',
396							't92\|tamura\|tamura92' => 'Tamura',
397							'f84\|felsenstein84' => 'F84',
398							'tajimanei\|tajima\-nei' => 'TajimaNei',
399							'jinnei\|jin\-nei' => 'JinNei');
400
401							}
402	4			4		20	use base qw(Bio::Root::Root Bio::Align::StatisticsI);
	4					7
	4					24249
403
404							## generate look up hashes for Nei_Gojobori methods##
405							$CODONS = get_codons();
406							my @t = split '', "FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
407							#create look up hash of number of possible synonymous mutations per codon
408							$synsites = get_syn_sites();
409							#create reference look up hash of single basechanges in codons
410							%synchanges = get_syn_changes();
411
412
413
414							=head2 new
415
416							Title : new
417							Usage : my $obj = Bio::Align::DNAStatistics->new();
418							Function: Builds a new Bio::Align::DNAStatistics object
419							Returns : Bio::Align::DNAStatistics
420							Args : none
421
422
423							=cut
424
425							sub new {
426	2			2	1	783	my ($class,@args) = @_;
427	2					14	my $self = $class->SUPER::new(@args);
428
429	2					12	$self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
430
431	2					5	return $self;
432							}
433
434
435							=head2 distance
436
437							Title : distance
438							Usage : my $distance_mat = $stats->distance(-align => $aln,
439							-method => $method);
440							Function: Calculates a distance matrix for all pairwise distances of
441							sequences in an alignment.
442							Returns : L object
443							Args : -align => Bio::Align::AlignI object
444							-method => String specifying specific distance method
445							(implementing class may assume a default)
446							See also: L
447
448							=cut
449
450							sub distance{
451	13			13	1	79	my ($self,@args) = @_;
452	13					49	my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
453	13	50	33			90	if( ! defined $aln \|\| ! ref ($aln) \|\| ! $aln->isa('Bio::Align::AlignI') ) {
			33
454	0					0	$self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
455							}
456	13		50			26	$method \|\|= 'JukesCantor';
457	13					44	foreach my $m ( keys %DistanceMethods ) {
458	67	100	66			1524	if(defined $m && $method =~ /$m/i ) {
459	13					32	my $mtd = "D_$DistanceMethods{$m}";
460	13					49	return $self->$mtd($aln);
461							}
462							}
463	0					0	$self->warn("Unrecognized distance method $method must be one of [".
464							join(',',$self->available_distance_methods())."]");
465	0					0	return;
466							}
467
468							=head2 available_distance_methods
469
470							Title : available_distance_methods
471							Usage : my @methods = $stats->available_distance_methods();
472							Function: Enumerates the possible distance methods
473							Returns : Array of strings
474							Args : none
475
476
477							=cut
478
479							sub available_distance_methods{
480	0			0	1	0	my ($self,@args) = @_;
481	0					0	return values %DistanceMethods;
482							}
483
484							=head2 D - distance methods
485
486
487							=cut
488
489
490							=head2 D_JukesCantor
491
492							Title : D_JukesCantor
493							Usage : my $d = $stat->D_JukesCantor($aln)
494							Function: Calculates D (pairwise distance) between 2 sequences in an
495							alignment using the Jukes-Cantor 1 parameter model.
496							Returns : L
497							Args : L of DNA sequences
498							double - gap penalty
499
500
501							=cut
502
503							sub D_JukesCantor{
504	2			2	1	4	my ($self,$aln,$gappenalty) = @_;
505	2	50				10	return 0 unless $self->_check_arg($aln);
506	2	50				8	$gappenalty = $DefaultGapPenalty unless defined $gappenalty;
507							# ambiguities ignored at this point
508	2					3	my (@seqs,@names,@values,%dist);
509	2					2	my $seqct = 0;
510	2					5	foreach my $seq ( $aln->each_seq) {
511	4					10	push @names, $seq->display_id;
512	4					10	push @seqs, uc $seq->seq();
513	4					7	$seqct++;
514							}
515	2					6	my $precisionstr = "%.$Precision"."f";
516	2					8	for(my $i = 0; $i < $seqct-1; $i++ ) {
517							# (diagonals) distance is 0 for same sequence
518	2					9	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
519	2					24	$values[$i][$i] = sprintf($precisionstr,0);
520
521	2					8	for( my $j = $i+1; $j < $seqct; $j++ ) {
522	2					8	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
523							$seqs[$j]);
524							# just want diagonals
525	2					8	my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
526							$matrix->[2]->[2] + $matrix->[3]->[3] );
527	2					7	my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
528	2					9	my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
529							# fwd and rev lookup
530	2					9	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
531	2					7	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
532	2					18	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
533							# (diagonals) distance is 0 for same sequence
534	2					5	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
535	2					19	$values[$j][$j] = sprintf($precisionstr,0);
536							}
537							}
538	2					20	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
539							-matrix => \%dist,
540							-names => \@names,
541							-values => \@values);
542							}
543
544							=head2 D_F81
545
546							Title : D_F81
547							Usage : my $d = $stat->D_F81($aln)
548							Function: Calculates D (pairwise distance) between 2 sequences in an
549							alignment using the Felsenstein 1981 distance model.
550							Relaxes the assumption of equal base frequencies that is
551							in JC.
552							Returns : L
553							Args : L of DNA sequences
554
555
556							=cut
557
558							sub D_F81{
559	2			2	1	5	my ($self,$aln,$gappenalty) = @_;
560	2	50				8	return 0 unless $self->_check_arg($aln);
561	2	50				7	$gappenalty = $DefaultGapPenalty unless defined $gappenalty;
562							# ambiguities ignored at this point
563	2					2	my (@seqs,@names,@values,%dist);
564	2					3	my $seqct = 0;
565	2					6	foreach my $seq ( $aln->each_seq) {
566	4					9	push @names, $seq->display_id;;
567	4					11	push @seqs, uc $seq->seq();
568	4					4	$seqct++;
569							}
570	2					6	my $precisionstr = "%.$Precision"."f";
571	2					9	for(my $i = 0; $i < $seqct-1; $i++ ) {
572							# (diagonals) distance is 0 for same sequence
573	2					9	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
574	2					23	$values[$i][$i] = sprintf($precisionstr,0);
575
576	2					8	for( my $j = $i+1; $j < $seqct; $j++ ) {
577
578	2					8	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
579							$seqs[$j]);
580							# just want diagonals
581	2					8	my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
582							$matrix->[2]->[2] + $matrix->[3]->[3] );
583	2					7	my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
584	2					9	my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
585							# fwd and rev lookup
586	2					7	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
587	2					5	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
588	2					15	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
589							# (diagonals) distance is 0 for same sequence
590	2					7	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
591	2					15	$values[$j][$j] = sprintf($precisionstr,0);
592							}
593							}
594	2					21	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
595							-matrix => \%dist,
596							-names => \@names,
597							-values => \@values);
598							}
599
600							=head2 D_Uncorrected
601
602							Title : D_Uncorrected
603							Usage : my $d = $stats->D_Uncorrected($aln)
604							Function: Calculate a distance D, no correction for multiple substitutions
605							is used. In rare cases where sequences may not overlap, 'NA' is
606							substituted for the distance.
607							Returns : L
608							Args : L (DNA Alignment)
609							[optional] gap penalty
610
611							=cut
612
613							sub D_Uncorrected {
614	3			3	1	7	my ($self,$aln,$gappenalty) = @_;
615	3	50				10	$gappenalty = $DefaultGapPenalty unless defined $gappenalty;
616	3	50				8	return 0 unless $self->_check_arg($aln);
617							# ambiguities ignored at this point
618	3					5	my (@seqs,@names,@values,%dist);
619	3					4	my $seqct = 0;
620	3					5	foreach my $seq ( $aln->each_seq) {
621	10					17	push @names, $seq->display_id;
622	10					40	push @seqs, uc $seq->seq();
623	10					12	$seqct++;
624							}
625
626	3					7	my $precisionstr = "%.$Precision"."f";
627	3					8	my $len = $aln->length;
628	3					10	for( my $i = 0; $i < $seqct-1; $i++ ) {
629							# (diagonals) distance is 0 for same sequence
630	7					15	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
631	7					42	$values[$i][$i] = sprintf($precisionstr,0);
632
633	7					15	for( my $j = $i+1; $j < $seqct; $j++ ) {
634	13					25	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
635							$seqs[$j]);
636	13					26	my $m = ( $matrix->[0]->[0] +
637							$matrix->[1]->[1] +
638							$matrix->[2]->[2] +
639							$matrix->[3]->[3] );
640	13					16	my $denom = ( $len - $gaps + ( $gaps * $gappenalty));
641
642	13	100				33	$self->warn("No distance calculated between $names[$i] and $names[$j], inserting -1")
643							unless $denom;
644
645	12	100				21	my $D = $denom ? 1 - ( $m / $denom) : -1;
646							# fwd and rev lookup
647	12					22	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
648	12					20	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
649	12	100				61	$values[$j][$i] = $values[$i][$j] = $denom ? sprintf($precisionstr,$D)
650							: sprintf("%-*s", $Precision + 2, $D);
651							# (diagonals) distance is 0 for same sequence
652	12					19	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
653	12					66	$values[$j][$j] = sprintf($precisionstr,0);
654							}
655							}
656	2					21	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
657							-matrix => \%dist,
658							-names => \@names,
659							-values => \@values);
660							}
661
662
663							# M Kimura, J. Mol. Evol., 1980, 16, 111.
664
665							=head2 D_Kimura
666
667							Title : D_Kimura
668							Usage : my $d = $stat->D_Kimura($aln)
669							Function: Calculates D (pairwise distance) between all pairs of sequences
670							in an alignment using the Kimura 2 parameter model.
671							Returns : L
672							Args : L of DNA sequences
673
674
675							=cut
676
677							sub D_Kimura {
678	2			2	1	5	my ($self,$aln) = @_;
679	2	50				5	return 0 unless $self->_check_arg($aln);
680							# ambiguities ignored at this point
681	2					2	my (@names,@values,%dist);
682	2					3	my $seqct = 0;
683	2					5	foreach my $seq ( $aln->each_seq) {
684	4					8	push @names, $seq->display_id;
685	4					7	$seqct++;
686							}
687
688	2					6	my $precisionstr = "%.$Precision"."f";
689
690	2					8	for( my $i = 0; $i < $seqct-1; $i++ ) {
691							# (diagonals) distance is 0 for same sequence
692	2					7	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
693	2					16	$values[$i][$i] = sprintf($precisionstr,0);
694
695	2					8	for( my $j = $i+1; $j < $seqct; $j++ ) {
696	2					11	my $pairwise = $aln->select_noncont($i+1,$j+1);
697	2					7	my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
698	2	50				6	unless( $L ) {
699	0					0	$L = 1;
700							}
701	2					9	my $P = $self->transitions($pairwise) / $L;
702	2					9	my $Q = $self->transversions($pairwise) / $L;
703	2					6	my $K = 0;
704	2					7	my $denom = ( 1 - (2 * $P) - $Q);
705	2	50				6	if( $denom == 0 ) {
706	0					0	$self->throw("cannot find distance for ",$i+1,
707							",",$j+1," $P, $Q\n");
708							}
709	2					5	my $a = 1 / ( 1 - (2 * $P) - $Q);
710	2					4	my $b = 1 / ( 1 - 2 * $Q );
711	2	50	33			11	if( $a < 0 \|\| $b < 0 ) {
712	0					0	$K = -1;
713							} else{
714	2					9	$K = (1/2) * log ( $a ) + (1/4) * log($b);
715							}
716							# fwd and rev lookup
717	2					8	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
718	2					6	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
719	2					21	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
720							# (diagonals) distance is 0 for same sequence
721	2					8	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
722	2					17	$values[$j][$j] = sprintf($precisionstr,0);
723							}
724							}
725	2					15	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
726							-matrix => \%dist,
727							-names => \@names,
728							-values => \@values);
729							}
730
731
732							=head2 D_Kimura_variance
733
734							Title : D_Kimura
735							Usage : my $d = $stat->D_Kimura_variance($aln)
736							Function: Calculates D (pairwise distance) between all pairs of sequences
737							in an alignment using the Kimura 2 parameter model.
738							Returns : array of 2 L,
739							the first is the Kimura distance and the second is
740							a matrix of variance V(K)
741							Args : L of DNA sequences
742
743
744							=cut
745
746							sub D_Kimura_variance {
747	0			0	1	0	my ($self,$aln) = @_;
748	0	0				0	return 0 unless $self->_check_arg($aln);
749							# ambiguities ignored at this point
750	0					0	my (@names,@values,%dist,@var);
751	0					0	my $seqct = 0;
752	0					0	foreach my $seq ( $aln->each_seq) {
753	0					0	push @names, $seq->display_id;
754	0					0	$seqct++;
755							}
756
757	0					0	my $precisionstr = "%.$Precision"."f";
758
759	0					0	for( my $i = 0; $i < $seqct-1; $i++ ) {
760							# (diagonals) distance is 0 for same sequence
761	0					0	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
762	0					0	$values[$i][$i] = sprintf($precisionstr,0);
763
764	0					0	for( my $j = $i+1; $j < $seqct; $j++ ) {
765	0					0	my $pairwise = $aln->select_noncont($i+1,$j+1);
766	0					0	my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
767	0	0				0	unless( $L ) {
768	0					0	$L = 1;
769							}
770	0					0	my $P = $self->transitions($pairwise) / $L;
771	0					0	my $Q = $self->transversions($pairwise) / $L;
772	0					0	my ($a,$b,$K,$var_k);
773	0					0	my $a_denom = ( 1 - (2 * $P) - $Q);
774	0					0	my $b_denom = 1 - 2 * $Q;
775	0	0	0			0	unless( $a_denom > 0 && $b_denom > 0 ) {
776	0					0	$a = 1;
777	0					0	$b = 1;
778	0					0	$K = -1;
779	0					0	$var_k = -1;
780							} else {
781	0					0	$a = 1 / $a_denom;
782	0					0	$b = 1 / $b_denom;
783	0					0	$K = (1/2) * log ( $a ) + (1/4) * log($b);
784							# from Wu and Li 1985 which in turn is from Kimura 1980
785	0					0	my $c = ( $a - $b ) / 2;
786	0					0	my $d = ( $a + $b ) / 2;
787	0					0	$var_k = ( $a*2 $P + $d*2 $Q - ( $a * $P + $d * $Q)**2 ) / $L;
788							}
789
790							# fwd and rev lookup
791	0					0	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
792	0					0	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
793	0					0	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
794							# (diagonals) distance is 0 for same sequence
795	0					0	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
796	0					0	$values[$j]->[$j] = sprintf($precisionstr,0);
797
798	0					0	$var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
799	0					0	$var[$j]->[$j] = $values[$j]->[$j];
800							}
801							}
802	0					0	return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
803							-matrix => \%dist,
804							-names => \@names,
805							-values => \@values),
806							Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
807							-matrix => \%dist,
808							-names => \@names,
809							-values => \@var)
810							);
811							}
812
813
814							# K Tamura, Mol. Biol. Evol. 1992, 9, 678.
815
816							=head2 D_Tamura
817
818							Title : D_Tamura
819							Usage : Calculates D (pairwise distance) between 2 sequences in an
820							alignment using Tamura 1992 distance model.
821							Returns : L
822							Args : L of DNA sequences
823
824
825							=cut
826
827							sub D_Tamura {
828	2			2	1	3	my ($self,$aln) = @_;
829	2	50				6	return 0 unless $self->_check_arg($aln);
830							# ambiguities ignored at this point
831	2					3	my (@seqs,@names,@values,%dist,$i,$j);
832	2					3	my $seqct = 0;
833	2					4	my $length = $aln->length;
834	2					5	foreach my $seq ( $aln->each_seq) {
835	4					7	push @names, $seq->display_id;;
836	4					8	push @seqs, uc $seq->seq();
837	4					6	$seqct++;
838							}
839
840	2					5	my $precisionstr = "%.$Precision"."f";
841	2					2	my (@gap,@gc,@trans,@tranv,@score);
842	2					4	$i = 0;
843	2					4	for my $t1 ( @seqs ) {
844	4					3	$j = 0;
845	4					4	for my $t2 ( @seqs ) {
846	8					12	$gap[$i][$j] = 0;
847	8					13	for( my $k = 0; $k < $length; $k++ ) {
848	1532					1522	my ($c1,$c2) = ( substr($seqs[$i],$k,1),
849							substr($seqs[$j],$k,1) );
850	1532	100	100			6457	if( $c1 =~ /^$GapChars$/ \|\|
		100
851							$c2 =~ /^$GapChars$/ ) {
852	120					167	$gap[$i][$j]++;
853							} elsif( $c2 =~ /^$GCChhars$/i ) {
854	960					1470	$gc[$i][$j]++;
855							}
856							}
857	8					13	$gc[$i][$j] = ( $gc[$i][$j] /
858							($length - $gap[$i][$j]) );
859	8					9	$j++;
860							}
861	4					4	$i++;
862							}
863
864	2					8	for( $i = 0; $i < $seqct-1; $i++ ) {
865							# (diagonals) distance is 0 for same sequence
866	2					10	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
867	2					19	$values[$i][$i] = sprintf($precisionstr,0);
868
869	2					7	for( $j = $i+1; $j < $seqct; $j++ ) {
870
871	2					10	my $pairwise = $aln->select_noncont($i+1,$j+1);
872	2					6	my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
873	2					6	my $P = $self->transitions($pairwise) / $L;
874	2					6	my $Q = $self->transversions($pairwise) / $L;
875	2					9	my $C = $gc[$i][$j] + $gc[$j][$i]-
876							( 2 * $gc[$i][$j] * $gc[$j][$i] );
877	2	50				5	if( $P ) {
878	2					5	$P = $P / $C;
879							}
880	2					11	my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
881							# fwd and rev lookup
882	2					6	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
883	2					5	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
884	2					17	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
885							# (diagonals) distance is 0 for same sequence
886	2					4	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
887	2					12	$values[$j][$j] = sprintf($precisionstr,0);
888							}
889							}
890	2					13	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
891							-matrix => \%dist,
892							-names => \@names,
893							-values => \@values);
894
895							}
896
897							=head2 D_F84
898
899							Title : D_F84
900							Usage : my $d = $stat->D_F84($aln)
901							Function: Calculates D (pairwise distance) between 2 sequences in an
902							alignment using the Felsenstein 1984 distance model.
903							Returns : L
904							Args : L of DNA sequences
905							[optional] double - gap penalty
906
907							=cut
908
909							sub D_F84 {
910	0			0	1	0	my ($self,$aln,$gappenalty) = @_;
911	0	0				0	return 0 unless $self->_check_arg($aln);
912	0					0	$self->throw_not_implemented();
913							# ambiguities ignored at this point
914	0					0	my (@seqs,@names,@values,%dist);
915	0					0	my $seqct = 0;
916	0					0	foreach my $seq ( $aln->each_seq) {
917							# if there is no name,
918	0					0	my $id = $seq->display_id;
919	0	0	0			0	if( ! length($id) \|\| # deal with empty names
920							$id =~ /^\s+$/ ) {
921	0					0	$id = $seqct+1;
922							}
923	0					0	push @names, $id;
924	0					0	push @seqs, uc $seq->seq();
925	0					0	$seqct++;
926							}
927
928	0					0	my $precisionstr = "%.$Precision"."f";
929
930	0					0	for( my $i = 0; $i < $seqct-1; $i++ ) {
931							# (diagonals) distance is 0 for same sequence
932	0					0	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
933	0					0	$values[$i][$i] = sprintf($precisionstr,0);
934
935	0					0	for( my $j = $i+1; $j < $seqct; $j++ ) {
936							}
937							}
938							}
939
940							# Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
941							# Tajima-Nei correction used for multiple substitutions in the calc
942							# of the distance matrix. Nucleic acids only.
943							#
944							# D = p-distance = 1 - (matches/(posns_scored + gaps)
945							#
946							# distance = -b * ln(1-D/b)
947							#
948
949							=head2 D_TajimaNei
950
951							Title : D_TajimaNei
952							Usage : my $d = $stat->D_TajimaNei($aln)
953							Function: Calculates D (pairwise distance) between 2 sequences in an
954							alignment using the TajimaNei 1984 distance model.
955							Returns : L
956							Args : Bio::Align::AlignI of DNA sequences
957
958
959							=cut
960
961							sub D_TajimaNei{
962	2			2	1	4	my ($self,$aln) = @_;
963	2	50				5	return 0 unless $self->_check_arg($aln);
964							# ambiguities ignored at this point
965	2					4	my (@seqs,@names,@values,%dist);
966	2					2	my $seqct = 0;
967	2					4	foreach my $seq ( $aln->each_seq) {
968							# if there is no name,
969	4					9	push @names, $seq->display_id;
970	4					10	push @seqs, uc $seq->seq();
971	4					5	$seqct++;
972							}
973	2					5	my $precisionstr = "%.$Precision"."f";
974	2					2	my ($i,$j,$bs);
975							# pairwise
976	2					7	for( $i =0; $i < $seqct -1; $i++ ) {
977	2					8	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
978	2					15	$values[$i][$i] = sprintf($precisionstr,0);
979
980	2					8	for ( $j = $i+1; $j <$seqct;$j++ ) {
981	2					8	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
982							$seqs[$j]);
983	2					9	my $pairwise = $aln->select_noncont($i+1,$j+1);
984	2					5	my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
985	2					3	my $fij2 = 0;
986	2					9	for( $bs = 0; $bs < 4; $bs++ ) {
987	8					7	my $fi = 0;
988	8					7	map {$fi += $matrix->[$bs]->[$_] } 0..3;
	32					32
989	8					7	my $fj = 0;
990							# summation
991	8					8	map { $fj += $matrix->[$_]->[$bs] } 0..3;
	32					26
992	8	50	33			29	my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
993	8					17	$fij2 += $fij**2;
994							}
995
996	2					2	my ($pair,$h) = (0,0);
997	2					6	for( $bs = 0; $bs < 3; $bs++ ) {
998	6					11	for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
999	12					20	my $fij = $pfreq->[$pair++] / $slen;
1000	12	100				17	if( $fij ) {
1001
1002	10					8	my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
1003
1004	10					10	map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
	40					35
1005	10					8	map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
	40					32
1006	10					9	map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
	40					32
1007	10					7	map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
	40					30
1008
1009	10	50				16	if( $fij ) {
1010	10					20	$h += ( ($fij**2) / 2 ) /
1011							( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
1012							( ( $ci2 + $cj2 ) / (2 * $slen) )
1013							);
1014							}
1015	10					64	$self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
1016							}
1017							}
1018							}
1019							# just want diagonals which are matches (A matched A, C -> C)
1020
1021	2					5	my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
1022							$matrix->[2]->[2] + $matrix->[3]->[3] );
1023	2					4	my $D = 1 - ( $m / $slen);
1024	2					2	my $d;
1025	2	50				5	if( $h == 0 ) {
1026	0					0	$d = -1;
1027							} else {
1028	2					4	my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
1029	2					3	my $c = 1- $D/ $b;
1030
1031	2	50				4	if( $c < 0 ) {
1032	0					0	$d = -1;
1033							} else {
1034	2					6	$d = (-1 * $b) * log ( $c);
1035							}
1036							}
1037							# fwd and rev lookup
1038	2					6	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
1039	2					7	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
1040	2					14	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1041
1042							# (diagonals) distance is 0 for same sequence
1043	2					5	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
1044	2					11	$values[$j][$j] = sprintf($precisionstr,0);
1045							}
1046							}
1047	2					13	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1048							-matrix => \%dist,
1049							-names => \@names,
1050							-values => \@values);
1051
1052							}
1053
1054							# Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1055
1056							=head2 D_JinNei
1057
1058							Title : D_JinNei
1059							Usage : my $d = $stat->D_JinNei($aln)
1060							Function: Calculates D (pairwise distance) between 2 sequences in an
1061							alignment using the Jin-Nei 1990 distance model.
1062							Returns : L
1063							Args : L of DNA sequences
1064
1065
1066							=cut
1067
1068							sub D_JinNei{
1069	0			0	1	0	my ($self,@args) = @_;
1070	0					0	$self->warn("JinNei implementation not completed");
1071	0					0	return;
1072							}
1073
1074							=head2 transversions
1075
1076							Title : transversions
1077							Usage : my $transversions = $stats->transversion($aln);
1078							Function: Calculates the number of transversions between two sequences in
1079							an alignment
1080							Returns : integer
1081							Args : Bio::Align::AlignI
1082
1083
1084							=cut
1085
1086							sub transversions{
1087	6			6	1	12	my ($self,$aln) = @_;
1088	6					13	return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1089							}
1090
1091							=head2 transitions
1092
1093							Title : transitions
1094							Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1095							Function: Calculates the number of transitions in a given DNA alignment
1096							Returns : integer representing the number of transitions
1097							Args : Bio::Align::AlignI object
1098
1099
1100							=cut
1101
1102							sub transitions{
1103	6			6	1	8	my ($self,$aln) = @_;
1104	6					17	return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1105							}
1106
1107
1108							sub _trans_count_helper {
1109	12			12		12	my ($self,$aln,$type) = @_;
1110	12	50				15	return 0 unless( $self->_check_arg($aln) );
1111	12	50				27	if( ! $aln->is_flush ) { $self->throw("must be flush") }
	0					0
1112	12					10	my (@tcount);
1113	12					18	my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1114							uc $aln->get_seq_by_pos(2)->seq() );
1115	12					17	my $alen = $aln->length;
1116	12					23	for (my $i = 0;$i<$alen; $i++ ) {
1117	2298					1740	my ($c1,$c2) = ( substr($first,$i,1),
1118							substr($second,$i,1) );
1119	2298	100				3326	if( $c1 ne $c2 ) {
1120	480					268	foreach my $nt ( @{$type->{$c1}} ) {
	480					542
1121	477	100				715	if( $nt eq $c2) {
1122	120					183	$tcount[$i]++;
1123							}
1124							}
1125							}
1126							}
1127	12					11	my $sum = 0;
1128	12	100				19	map { if( $_) { $sum += $_} } @tcount;
	2025					2109
	120					175
1129	12					41	return $sum;
1130							}
1131
1132							# this will generate a matrix which records across the row, the number
1133							# of DNA subst
1134							#
1135							sub _build_nt_matrix {
1136	19			19		19	my ($self,$seqa,$seqb) = @_;
1137
1138
1139	19					71	my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1140							[ qw(0 0 0 0) ],
1141							[ qw(0 0 0 0) ],
1142							[ qw(0 0 0 0) ] ];
1143	19					15	my $gaps = 0; # number of gaps
1144	19					32	my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1145	19					16	my $len_a = length($seqa);
1146	19					30	for( my $i = 0; $i < $len_a; $i++) {
1147	1974					1806	my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1148	1974					1238	$ti =~ tr/U/T/;
1149	1974					1114	$tj =~ tr/U/T/;
1150
1151	1974	100				3354	if( $ti =~ /^$GapChars$/) { $gaps++; next; }
	151					88
	151					205
1152	1823	100				2875	if( $tj =~ /^$GapChars$/) { $gaps++; next }
	318					200
	318					416
1153
1154	1505					1113	my $ti_index = $NucleotideIndexes{$ti};
1155	1505					884	my $tj_index = $NucleotideIndexes{$tj};
1156
1157	1505	50				1501	if( ! defined $ti_index ) {
1158	0					0	$self->warn("ti_index not defined for $ti\n");
1159	0					0	next;
1160							}
1161
1162	1505					984	$basect_matrix->[$ti_index]->[$tj_index]++;
1163
1164	1505	100				2730	if( $ti ne $tj ) {
1165	159					366	$pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1166							}
1167							}
1168	19					35	return ($basect_matrix,$pfreq,$gaps);
1169							}
1170
1171							sub _check_ambiguity_nucleotide {
1172	0			0		0	my ($base1,$base2) = @_;
1173	0					0	my %iub = Bio::Tools::IUPAC->iupac_iub();
1174	0					0	my @amb1 = @{ $iub{uc($base1)} };
	0					0
1175	0					0	my @amb2 = @{ $iub{uc($base2)} };
	0					0
1176	0					0	my ($pmatch) = (0);
1177	0					0	for my $amb ( @amb1 ) {
1178	0	0				0	if( grep { $amb eq $_ } @amb2 ) {
	0					0
1179	0					0	$pmatch = 1;
1180	0					0	last;
1181							}
1182							}
1183	0	0				0	if( $pmatch ) {
1184	0					0	return (1 / scalar @amb1) * (1 / scalar @amb2);
1185							} else {
1186	0					0	return 0;
1187							}
1188							}
1189
1190
1191							sub _check_arg {
1192	25			25		22	my($self,$aln ) = @_;
1193	25	50	33			163	if( ! defined $aln \|\| ! $aln->isa('Bio::Align::AlignI') ) {
		50
1194	0					0	$self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1195	0					0	return 0;
1196							} elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1197	0					0	$self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1198	0					0	return 0;
1199							}
1200	25					60	return 1;
1201							}
1202
1203							=head2 Data Methods
1204
1205							=cut
1206
1207							=head2 pairwise_stats
1208
1209							Title : pairwise_stats
1210							Usage : $obj->pairwise_stats($newval)
1211							Function:
1212							Returns : value of pairwise_stats
1213							Args : newvalue (optional)
1214
1215
1216							=cut
1217
1218							sub pairwise_stats{
1219	18			18	1	23	my ($self,$value) = @_;
1220	18	100				36	if( defined $value) {
1221	2					4	$self->{'_pairwise_stats'} = $value;
1222							}
1223	18					75	return $self->{'_pairwise_stats'};
1224
1225							}
1226
1227							=head2 calc_KaKs_pair
1228
1229							Title : calc_KaKs_pair
1230							Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1231							$name1, $name2).
1232							Function : calculates Nei-Gojobori statistics for pairwise
1233							comparison.
1234							Args : A Bio::Align::AlignI compliant object such as a
1235							Bio::SimpleAlign object, and 2 sequence name strings.
1236							Returns : a reference to a hash of statistics with keys as
1237							listed in Description.
1238
1239							=cut
1240
1241							sub calc_KaKs_pair {
1242	1			1	1	256	my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1243	1	50				4	$self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1244							if @_!= 4;
1245	1	50				4	$self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1246	1					5	my @seqs = (
1247							#{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1248							#{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1249							{id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1250							{id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1251							) ;
1252	1	50				4	if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1253	0					0	$self->throw(" aligned sequences must be of equal length!");
1254							}
1255	1					3	my $results = [];
1256	1					7	$self->_get_av_ds_dn(\@seqs, $results);
1257	1					3	return $results;
1258
1259							}
1260
1261							=head2 calc_all_KaKs_pairs
1262
1263							Title : calc_all_KaKs_pairs
1264							Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1265							Function : Calculates Nei_gojobori statistics for all pairwise
1266							combinations in sequence.
1267							Arguments: A Bio::Align::ALignI compliant object such as
1268							a Bio::SimpleAlign object.
1269							Returns : A reference to an array of hashes of statistics of
1270							all pairwise comparisons in the alignment.
1271
1272							=cut
1273
1274
1275
1276							sub calc_all_KaKs_pairs {
1277							#returns a multi_element_array with all pairwise comparisons
1278	1			1	1	690	my ($self,$aln) = @_;
1279	1	50				6	$self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1280	1					1	my @seqs;
1281	1					3	for my $seq ($aln->each_seq) {
1282	3					7	push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1283							}
1284	1					1	my $results ;
1285	1					3	$results = $self->_get_av_ds_dn(\@seqs, $results);
1286	1					3	return $results;
1287							}
1288
1289							=head2 calc_average_KaKs
1290
1291							Title : calc_average_KaKs.
1292							Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1293							Function : calculates Nei_Gojobori stats for average of all
1294							sequences in the alignment.
1295							Args : A Bio::Align::AlignI compliant object such as a
1296							Bio::SimpleAlign object, number of bootstrap iterations
1297							(default 1000).
1298							Returns : A reference to a hash of statistics as listed in Description.
1299
1300							=cut
1301
1302							sub calc_average_KaKs {
1303							#calculates global value for sequences in alignment using bootstrapping
1304							#this is quite slow (~10 seconds per 3 X 200nt seqs);
1305	1			1	1	677	my ($self, $aln, $bootstrap_rpt) = @_;
1306	1		50			5	$bootstrap_rpt \|\|= 1000;
1307	1	50				5	$self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1308	1					1	my @seqs;
1309	1					3	for my $seq ($aln->each_seq) {
1310	3					7	push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1311							}
1312	1					2	my $results ;
1313	1					3	my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1314							#print "ds = $ds_orig, dn = $dn_orig\n";
1315	1					3	$results = {D_s => $ds_orig, D_n => $dn_orig};
1316	1					4	$self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1317	1					10	return $results;
1318							}
1319
1320							############## primary internal subs for alignment comparisons ########################
1321
1322							sub _run_bootstrap {
1323							### generates sampled sequences, calculates Ds and Dn values,
1324							### then calculates variance of sampled sequences and add results to results hash
1325							###
1326	1			1		1	my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1327	1					2	my @seqs = @$seq_ref;
1328	1					1	my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1329	1					2	my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1330
1331							#1st make alternative array of codons;
1332	1					1	my $c = 0;
1333	1					4	while ($c < length $seqs[0]{'seq'}) {
1334	57					52	for (0..$#seqs) {
1335	171					89	push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
	171					211
1336							}
1337	57					67	$c+=3;
1338							}
1339
1340	1					2	for (1..$bootstrap_rpt) {
1341	100					119	my $sampled = _resample (\@btstrp_aoa);
1342	100					143	my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1343	100					81	push @{$bootstrap_values{'ds'}}, $ds;
	100					122
1344	100					77	push @{$bootstrap_values{'dn'}}, $dn;
	100					234
1345							}
1346
1347	1					5	$results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1348	1					3	$results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1349							$results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1350	1					16	sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1351							#print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1352							#print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1353							#print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1354							}
1355
1356							sub _resample {
1357	100			100		67	my $ref = shift;
1358	100					74	my $codon_num = scalar (@{$ref->[0]});
	100					95
1359	100					62	my @altered;
1360	100					132	for (0..$codon_num -1) { #for each codon
1361	5700					3769	my $rand = int (rand ($codon_num));
1362	5700					5037	for (0..$#$ref) {
1363	17100					9095	push @{$altered[$_]}, $ref->[$_][$rand];
	17100					18628
1364							}
1365							}
1366	100					89	my @stringed = map {join '', @$_}@altered;
	300					931
1367	100					81	my @return;
1368							#now out in random name to keep other subs happy
1369	100					100	for (@stringed) {
1370	300					487	push @return, {id=>'1', seq=> $_};
1371							}
1372	100					708	return \@return;
1373							}
1374
1375							sub _get_av_ds_dn {
1376							# takes array of hashes of sequence strings and ids #
1377	103			103		81	my $self = shift;
1378	103					76	my $seq_ref = shift;
1379	103	100				162	my $result = shift if @_;
1380	103					245	my @caller = caller(1);
1381	103					1231	my @seqarray = @$seq_ref;
1382	103					61	my $bootstrap_score_list;
1383							#for a multiple alignment considers all pairwise combinations#
1384	103					177	my %dsfor_average = (ds => [], dn => []);
1385	103					176	for (my $i = 0; $i < scalar @seqarray; $i++) {
1386	308					532	for (my $j = $i +1; $j
1387							# print "comparing $i and $j\n";
1388	307	50				494	if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1389	0					0	$self->warn(" aligned sequences must be of equal length!");
1390	0					0	next;
1391							}
1392
1393	307					392	my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1394	307					345	my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1395							# print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1396	307					399	my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1397							#get averages
1398	307					333	my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1399	307					337	my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1400
1401							#calculate ps and pn (p54)
1402	307					239	my $syn_prop = $syn_count / $av_s_site;
1403	307					214	my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1404
1405							#now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1406	307					415	my $d_syn = $self->jk($syn_prop);
1407	307					302	my $d_nc = $self->jk($nc_prop);
1408
1409							#JK calculation must succeed for continuation of calculation
1410							#ret_value = -1 if error
1411	307	50	33			879	next unless $d_nc >=0 && $d_syn >=0;
1412
1413
1414	307					201	push @{$dsfor_average{'ds'}}, $d_syn;
	307					377
1415	307					204	push @{$dsfor_average{'dn'}}, $d_nc;
	307					227
1416
1417							#if not doing bootstrap, calculate the pairwise comparisin stats
1418	307	100	100			1489	if ($caller[3] =~ /calc_KaKs_pair/ \|\| $caller[3] =~ /calc_all_KaKs_pairs/) {
1419							#now calculate variances assuming large sample
1420	4					9	my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1421	4					6	my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1422							#now calculate z_value
1423							#print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1424							#my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1425	4	50				9	my $z = ($d_syn_var + $d_nc_var) ?
1426							($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1427							# print "z is $z\n";
1428							push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1429							S_d => $syn_count, N_d =>$non_syn_count,
1430							P_s => $syn_prop, P_n=>$nc_prop,
1431	4					6	D_s => @{$dsfor_average{'ds'}}[-1],
1432	4					27	D_n => @{$dsfor_average{'dn'}}[-1],
1433							D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1434							Seq1 => $seqarray[$i]{'id'},
1435	4					5	Seq2 => $seqarray[$j]{'id'},
1436							z_score => $z,
1437							};
1438	4	50	33			18	$self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
			33
1439							if ($syn_count < 10 \|\| $non_syn_count < 10 ) && $self->verbose > -1 ;
1440							}#endif
1441							}
1442							}
1443
1444							#warn of failure if no results hashes are present
1445							#will fail if Jukes Cantor has failed for all pairwise combinations
1446							#$self->warn("calculation failed!") if scalar @$result ==0;
1447
1448							#return results unless bootstrapping
1449	103	100	100			339	return $result if $caller[3]=~ /calc_all_KaKs/ \|\| $caller[3] =~ /calc_KaKs_pair/;
1450							#else if getting average for bootstrap
1451	101					164	return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1452							}
1453
1454
1455							sub jk {
1456	614			614	0	466	my ($self, $p) = @_;
1457	614	50				684	if ($p > 0.75) {
1458	0					0	$self->warn( " Jukes Cantor won't work -too divergent!");
1459	0					0	return -1;
1460							}
1461	614					885	return -1 * (3/4) * (log(1 - (4/3) * $p));
1462							}
1463
1464							#works for large value of n (50?100?)
1465							sub jk_var {
1466	8			8	0	6	my ($p, $n) = @_;
1467	8					16	return (9 * $p * (1 -$p))/(((3 - 4 $p) 2) $n);
1468							}
1469
1470
1471							# compares 2 sequences to find the number of synonymous/non
1472							# synonymous mutations between them
1473
1474							sub analyse_mutations {
1475	307			307	0	227	my ($seq1, $seq2) = @_;
1476	307					1727	my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1477							[2,1]], # depend on which is the same
1478							1=>[[0,2],
1479							[2,0]],
1480							2=>[[0,1],
1481							[1,0]],
1482							},
1483							3=> [ [0,1,2], # all need to be altered
1484							[1,0,2],
1485							[0,2,1],
1486							[1,2,0],
1487							[2,0,1],
1488							[2,1,0] ],
1489							);
1490	307					292	my $TOTAL = 0; # total synonymous changes
1491	307					215	my $TOTAL_n = 0; # total non-synonymous changes
1492	307					176	my $gap_cnt = 0;
1493
1494	307					186	my %input;
1495	307					198	my $seqlen = length($seq1);
1496	307					421	for (my $j=0; $j< $seqlen; $j+=3) {
1497	17499					14822	$input{'cod1'} = substr($seq1, $j,3);
1498	17499					11413	$input{'cod2'} = substr($seq2, $j,3);
1499
1500							#ignore codon if beeing compared with gaps!
1501	17499	50	33			45090	if ($input{'cod1'} =~ /\-/ \|\| $input{'cod2'} =~ /\-/){
1502	0					0	$gap_cnt += 3; #just increments once if there is a pair of gaps
1503	0					0	next;
1504							}
1505
1506	17499					16899	my ($diff_cnt, $same) = count_diffs(\%input);
1507
1508							#ignore if codons are identical
1509	17499	100				28261	next if $diff_cnt == 0 ;
1510	5039	100				4636	if ($diff_cnt == 1) {
		50
		0
1511	3973					3954	$TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1512	3973					6324	$TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1513							#print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1514							}
1515							elsif ($diff_cnt ==2) {
1516	1066					700	my $s_cnt = 0;
1517	1066					567	my $n_cnt = 0;
1518	1066					661	my $tot_muts = 4;
1519							#will stay 4 unless there are stop codons at intervening point
1520	1066					587	OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
	1066					1212
1521	2132					1376	my $altered = $input{'cod1'};
1522	2132					1191	my $prev= $altered;
1523							# print "$prev -> (", $t[$CODONS->{$altered}], ")";
1524	2132					1574	for my $mut_i (@$perm) { #index of codon mutated
1525	4264					3012	substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1526	4264	50				4333	if ($t[$CODONS->{$altered}] eq '*') {
1527	0					0	$tot_muts -=2;
1528							#print "changes to stop codon!!\n";
1529	0					0	next OUTER;
1530							}
1531							else {
1532	4264					3386	$s_cnt += $synchanges{$prev}{$altered};
1533							# print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1534							}
1535	4264					3798	$prev = $altered;
1536							}
1537							# print "\n";
1538							}
1539	1066	50				1254	if ($tot_muts != 0) {
1540	1066					990	$TOTAL += ($s_cnt/($tot_muts/2));
1541	1066					1782	$TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1542							}
1543
1544							}
1545							elsif ($diff_cnt ==3 ) {
1546	0					0	my $s_cnt = 0;
1547	0					0	my $n_cnt = 0;
1548	0					0	my $tot_muts = 18; #potential number of mutations
1549	0					0	OUTER: for my $perm (@{$mutator{'3'}}) {
	0					0
1550	0					0	my $altered = $input{'cod1'};
1551	0					0	my $prev= $altered;
1552							# print "$prev -> (", $t[$CODONS->{$altered}], ")";
1553	0					0	for my $mut_i (@$perm) { #index of codon mutated
1554	0					0	substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1555	0	0				0	if ($t[$CODONS->{$altered}] eq '*') {
1556	0					0	$tot_muts -=3;
1557							# print "changes to stop codon!!\n";
1558	0					0	next OUTER;
1559
1560							}
1561							else {
1562	0					0	$s_cnt += $synchanges{$prev}{$altered};
1563							# print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1564							}
1565	0					0	$prev = $altered;
1566							}
1567							# print "\n";
1568
1569							}#end OUTER loop
1570							#calculate number of synonymous/non synonymous mutations for that codon
1571							# and add to total
1572	0	0				0	if ($tot_muts != 0) {
1573	0					0	$TOTAL += ($s_cnt / ($tot_muts /3));
1574	0					0	$TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1575							}
1576							} #endif $diffcnt = 3
1577							} #end of sequencetraversal
1578	307					1115	return ($TOTAL, $TOTAL_n, $gap_cnt);
1579							}
1580
1581
1582							sub count_diffs {
1583							#counts the number of nucleotide differences between 2 codons
1584							# returns this value plus the codon index of which nucleotide is the same when 2
1585							#nucleotides are different. This is so analyse_mutations() knows which nucleotides
1586							# to change.
1587	17499			17499	0	10465	my $ref = shift;
1588	17499					9526	my $cnt = 0;
1589	17499					9831	my $same= undef;
1590							#just for 2 differences
1591	17499					15522	for (0..2) {
1592	52497	100				55731	if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1593	6105					4402	$cnt++;
1594							} else {
1595	46392					33271	$same = $_;
1596							}
1597							}
1598	17499					15047	return ($cnt, $same);
1599							}
1600
1601							=head2 get_syn_changes
1602
1603							Title : get_syn_changes
1604							Usage : Bio::Align::DNAStatitics->get_syn_changes
1605							Function: Generate a hashref of all pairwise combinations of codns
1606							differing by 1
1607							Returns : Symetic matrix using hashes
1608							First key is codon
1609							and each codon points to a hashref of codons
1610							the values of which describe type of change.
1611							my $type = $hash{$codon1}->{$codon2};
1612							values are :
1613							1 synonymous
1614							0 non-syn
1615							-1 either codon is a stop codon
1616							Args : none
1617
1618							=cut
1619
1620							sub get_syn_changes {
1621							#hash of all pairwise combinations of codons differing by 1
1622							# 1 = syn, 0 = non-syn, -1 = stop
1623	4			4	1	6	my %results;
1624	4					11	my @codons = _make_codons ();
1625	4					14	my $arr_len = scalar @codons;
1626	4					21	for (my $i = 0; $i < $arr_len -1; $i++) {
1627	252					333	my $cod1 = $codons[$i];
1628	252					344	for (my $j = $i +1; $j < $arr_len; $j++) {
1629	8064					4487	my $diff_cnt = 0;
1630	8064					6057	for my $pos(0..2) {
1631	24192	100				32743	$diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1632							}
1633	8064	100				13211	next if $diff_cnt !=1;
1634
1635							#synon change
1636	1152	100	100			3380	if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
		100
1637	276					272	$results{$cod1}{$codons[$j]} =1;
1638	276					457	$results{$codons[$j]}{$cod1} = 1;
1639							}
1640							#stop codon
1641							elsif ($t[$CODONS->{$cod1}] eq '' or $t[$CODONS->{$codons[$j]}] eq '') {
1642	92					110	$results{$cod1}{$codons[$j]} = -1;
1643	92					167	$results{$codons[$j]}{$cod1} = -1;
1644							}
1645							# nc change
1646							else {
1647	784					874	$results{$cod1}{$codons[$j]} = 0;
1648	784					1460	$results{$codons[$j]}{$cod1} = 0;
1649							}
1650							}
1651							}
1652	4					158	return %results;
1653							}
1654
1655							=head2 dnds_pattern_number
1656
1657							Title : dnds_pattern_number
1658							Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1659							Function: Counts the number of codons with no gaps in the MSA
1660							Returns : Number of codons with no gaps ('patterns' in PAML notation)
1661							Args : A Bio::Align::AlignI compliant object such as a
1662							Bio::SimpleAlign object.
1663
1664							=cut
1665
1666							sub dnds_pattern_number{
1667	0			0	1	0	my ($self, $aln) = @_;
1668	0					0	return ($aln->remove_gaps->length)/3;
1669							}
1670
1671							sub count_syn_sites {
1672							#counts the number of possible synonymous changes for sequence
1673	615			615	0	854	my ($seq, $synsite) = @_;
1674	615	100				763	__PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1675	614					383	my $S = 0;
1676	614					756	for (my $i = 0; $i< length($seq); $i+=3) {
1677	34998					22341	my $cod = substr($seq, $i, 3);
1678	34998	50				35320	next if $cod =~ /\-/; #deal with alignment gaps
1679	34998					43519	$S += $synsite->{$cod}{'s'};
1680							}
1681							#print "S is $S\n";
1682	614					595	return $S;
1683							}
1684
1685
1686
1687							sub get_syn_sites {
1688							#sub to generate lookup hash for the number of synonymous changes per codon
1689	4			4	0	9	my @nucs = qw(T C A G);
1690	4					8	my %raw_results;
1691	4					6	for my $i (@nucs) {
1692	16					16	for my $j (@nucs) {
1693	64					55	for my $k (@nucs) {
1694							# for each possible codon
1695	256					251	my $cod = "$i$j$k";
1696	256					211	my $aa = $t[$CODONS->{$cod}];
1697							#calculate number of synonymous mutations vs non syn mutations
1698	256					203	for my $i (qw(0 1 2)){
1699	768					515	my $s = 0;
1700	768					424	my $n = 3;
1701	768					607	for my $nuc (qw(A T C G)) {
1702	3072	100				3875	next if substr ($cod, $i,1) eq $nuc;
1703	2304					1417	my $test = $cod;
1704	2304					1500	substr($test, $i, 1) = $nuc ;
1705	2304	100				2964	if ($t[$CODONS->{$test}] eq $aa) {
1706	552					353	$s++;
1707							}
1708	2304	100				3231	if ($t[$CODONS->{$test}] eq '*') {
1709	108					100	$n--;
1710							}
1711							}
1712	768					1661	$raw_results{$cod}[$i] = {'s' => $s ,
1713							'n' => $n };
1714							}
1715
1716							} #end analysis of single codon
1717							}
1718							} #end analysis of all codons
1719	4					10	my %final_results;
1720
1721	4					97	for my $cod (sort keys %raw_results) {
1722	256					167	my $t = 0;
1723	256					138	map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
	768					935
	256					263
1724	256					504	$final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1725							}
1726	4					159	return \%final_results;
1727							}
1728
1729							sub _make_codons {
1730							#makes all codon combinations, returns array of them
1731	8			8		20	my @nucs = qw(T C A G);
1732	8					10	my @codons;
1733	8					34	for my $i (@nucs) {
1734	32					35	for my $j (@nucs) {
1735	128					85	for my $k (@nucs) {
1736	512					595	push @codons, "$i$j$k";
1737							}
1738							}
1739							}
1740	8					137	return @codons;
1741							}
1742
1743							sub get_codons {
1744							#generates codon translation look up table#
1745	4			4	0	8	my $x = 0;
1746	4					9	my $CODONS = {};
1747	4					11	for my $codon (_make_codons) {
1748	256					271	$CODONS->{$codon} = $x;
1749	256					178	$x++;
1750							}
1751	4					21	return $CODONS;
1752							}
1753
1754							#########stats subs, can go in another module? Here for speed. ###
1755							sub mean {
1756	204			204	0	140	my $ref = shift;
1757	204					143	my $el_num = scalar @$ref;
1758	204					117	my $tot = 0;
1759	204					165	map{$tot += $_}@$ref;
	806					655
1760	204					385	return ($tot/$el_num);
1761							}
1762
1763							sub variance {
1764	2			2	0	3	my $ref = shift;
1765	2					2	my $mean = mean($ref);
1766	2					1	my $sum_of_squares = 0;
1767	2					4	map{$sum_of_squares += ($_ - $mean) **2}@$ref;
	200					145
1768	2					5	return $sum_of_squares;
1769							}
1770
1771							sub sampling_variance {
1772	2			2	0	3	my $ref = shift;
1773	2					5	return variance($ref) / (scalar @$ref -1);
1774							}
1775
1776							1;