File Coverage

Bio/Align/DNAStatistics.pm

Criterion	Covered	Total	%
statement	515	635	81.1
branch	94	152	61.8
condition	25	52	48.0
subroutine	39	45	86.6
pod	20	30	66.6
total	693	914	75.8

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::Align::DNAStatistics
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Jason Stajich
7							#
8							# Copyright Jason Stajich
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
17
18							=head1 SYNOPSIS
19
20							use Bio::AlignIO;
21							use Bio::Align::DNAStatistics;
22
23							my $stats = Bio::Align::DNAStatistics->new();
24							my $alignin = Bio::AlignIO->new(-format => 'emboss',
25							-file => 't/data/insulin.water');
26							my $aln = $alignin->next_aln;
27							my $jcmatrix = $stats->distance(-align => $aln,
28							-method => 'Jukes-Cantor');
29
30							print $jcmatrix->print_matrix;
31							## and for measurements of synonymous /nonsynonymous substitutions ##
32
33							my $in = Bio::AlignIO->new(-format => 'fasta',
34							-file => 't/data/nei_gojobori_test.aln');
35							my $alnobj = $in->next_aln;
36							my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
37							my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
38							print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
39							for (sort keys %{$results->[0]} ){
40							next if /Seq/;
41							printf("%-9s %.4f \n",$_ , $results->[0]{$_});
42							}
43
44							my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
45							for my $an (@$results2){
46							print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
47							for (sort keys %$an ){
48							next if /Seq/;
49							printf("%-9s %.4f \n",$_ , $an->{$_});
50							}
51							print "\n\n";
52							}
53
54							my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
55							for (sort keys %$result3 ){
56							next if /Seq/;
57							printf("%-9s %.4f \n",$_ , $result3->{$_});
58							}
59
60							=head1 DESCRIPTION
61
62							This object contains routines for calculating various statistics and
63							distances for DNA alignments. The routines are not well tested and do
64							contain errors at this point. Work is underway to correct them, but
65							do not expect this code to give you the right answer currently! Use
66							dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
67							distances.
68
69
70							Several different distance method calculations are supported. Listed
71							in brackets are the pattern which will match
72
73							=over 3
74
75							=item *
76
77							JukesCantor [jc\|jukes\|jukescantor\|jukes-cantor]
78
79							=item *
80
81							Uncorrected [jcuncor\|uncorrected]
82
83							=item *
84
85							F81 [f81\|felsenstein]
86
87							=item *
88
89							Kimura [k2\|k2p\|k80\|kimura]
90
91							=item *
92
93							Tamura [t92\|tamura\|tamura92]
94
95							=item *
96
97							F84 [f84\|felsenstein84]
98
99							=item *
100
101							TajimaNei [tajimanei\|tajima\-nei]
102
103							=item *
104
105							JinNei [jinnei\|jin\-nei] (not implemented)
106
107							=back
108
109							There are also three methods to calculate the ratio of synonymous to
110							non-synonymous mutations. All are implementations of the Nei-Gojobori
111							evolutionary pathway method and use the Jukes-Cantor method of
112							nucleotide substitution. This method works well so long as the
113							nucleotide frequencies are roughly equal and there is no significant
114							transition/transversion bias. In order to use these methods there are
115							several pre-requisites for the alignment.
116
117							=over 3
118
119							=item 1
120
121							DNA alignment must be based on protein alignment. Use the subroutine
122							L to achieve this.
123
124							=item 2
125
126							Therefore alignment gaps must be in multiples of 3 (representing an aa
127							deletion/insertion) and at present must be indicated by a '-' symbol.
128
129							=item 3
130
131							Alignment must be solely of coding region and be in reading frame 0 to
132							achieve meaningful results
133
134							=item 4
135
136							Alignment must therefore be a multiple of 3 nucleotides long.
137
138							=item 5
139
140							All sequences must be the same length (including gaps). This should be
141							the case anyway if the sequences have been automatically aligned using
142							a program like Clustal.
143
144							=item 6
145
146							Only the standard codon alphabet is supported at present.
147
148							=back
149
150							calc_KaKs_pair() calculates a number of statistics for a named pair of
151							sequences in the alignment.
152
153							calc_all_KaKs_pairs() calculates these statistics for all pairwise
154							comparisons in an MSA. The statistics returned are:
155
156							=over 3
157
158							=item *
159
160							S_d - Number of synonymous mutations between the 2 sequences.
161
162							=item *
163
164							N_d - Number of non-synonymous mutations between the 2 sequences.
165
166							=item *
167
168							S - Mean number of synonymous sites in both sequences.
169
170							=item *
171
172							N - mean number of synonymous sites in both sequences.
173
174							=item *
175
176							P_s - proportion of synonymous differences in both sequences given by
177							P_s = S_d/S.
178
179							=item *
180
181							P_n - proportion of non-synonymous differences in both sequences given
182							by P_n = S_n/S.
183
184							=item *
185
186							D_s - estimation of synonymous mutations per synonymous site (by
187							Jukes-Cantor).
188
189							=item *
190
191							D_n - estimation of non-synonymous mutations per non-synonymous site (by
192							Jukes-Cantor).
193
194							=item *
195
196							D_n_var - estimation of variance of D_n .
197
198							=item *
199
200							D_s_var - estimation of variance of S_n.
201
202							=item *
203
204							z_value - calculation of z value.Positive value indicates D_n E D_s,
205							negative value indicates D_s E D_n.
206
207							=back
208
209							The statistics returned by calc_average_KaKs are:
210
211							=over 3
212
213							=item *
214
215							D_s - Average number of synonymous mutations/synonymous site.
216
217							=item *
218
219							D_n - Average number of non-synonymous mutations/non-synonymous site.
220
221							=item *
222
223							D_s_var - Estimated variance of Ds from bootstrapped alignments.
224
225							=item *
226
227							D_n_var - Estimated variance of Dn from bootstrapped alignments.
228
229							=item *
230
231							z_score - calculation of z value. Positive value indicates D_n ED_s,
232							negative values vice versa.
233
234							=back
235
236							The design of the code is based around the explanation of the
237							Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
238							Phylogenetics" by Nei and Kumar, published by Oxford University
239							Press. The methods have been tested using the worked example 4.1 in
240							the book, and reproduce those results. If people like having this sort
241							of analysis in BioPerl other methods for estimating Ds and Dn can be
242							provided later.
243
244							Much of the DNA distance code is based on implementations in EMBOSS
245							(Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
246							al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
247							Mitchison.
248
249							=head1 REFERENCES
250
251							=over 3
252
253							=item *
254
255							D_JukesCantor
256
257							"Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
258							Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
259							Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
260							1969, pp. 21-132.
261
262							=item *
263
264							D_Tamura
265
266							K Tamura, Mol. Biol. Evol. 1992, 9, 678.
267
268							=item *
269
270							D_Kimura
271
272							M Kimura, J. Mol. Evol., 1980, 16, 111.
273
274							=item *
275
276							JinNei
277
278							Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
279
280							=item *
281
282							D_TajimaNei
283
284							Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
285
286							=back
287
288							=head1 FEEDBACK
289
290							=head2 Mailing Lists
291
292							User feedback is an integral part of the evolution of this and other
293							Bioperl modules. Send your comments and suggestions preferably to
294							the Bioperl mailing list. Your participation is much appreciated.
295
296							bioperl-l@bioperl.org - General discussion
297							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
298
299							=head2 Support
300
301							Please direct usage questions or support issues to the mailing list:
302
303							I
304
305							rather than to the module maintainer directly. Many experienced and
306							reponsive experts will be able look at the problem and quickly
307							address it. Please include a thorough description of the problem
308							with code and data examples if at all possible.
309
310							=head2 Reporting Bugs
311
312							Report bugs to the Bioperl bug tracking system to help us keep track
313							of the bugs and their resolution. Bug reports can be submitted via the
314							web:
315
316							https://github.com/bioperl/bioperl-live/issues
317
318							=head1 AUTHOR - Jason Stajich
319
320							Email jason-AT-bioperl.org
321
322							=head1 CONTRIBUTORS
323
324							Richard Adams, richard.adams@ed.ac.uk
325
326							=head1 APPENDIX
327
328							The rest of the documentation details each of the object methods.
329							Internal methods are usually preceded with a _
330
331							=cut
332
333
334							# Let the code begin...
335
336
337							package Bio::Align::DNAStatistics;
338	4					361	use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
339							$GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
340	4			4		1166	$CODONS %synchanges $synsites $Precision $GCChhars);
	4					7
341	4			4		22	use strict;
	4					4
	4					74
342	4			4		907	use Bio::Align::PairwiseStatistics;
	4					7
	4					98
343	4			4		1013	use Bio::Matrix::PhylipDist;
	4					9
	4					107
344	4			4		580	use Bio::Tools::IUPAC;
	4					6
	4					453
345
346							BEGIN {
347	4			4		15	$GapChars = '[\.\-]';
348	4					4	$GCChhars = '[GCS]';
349	4					18	@Nucleotides = qw(A G T C);
350	4					6	$SeqCount = 2;
351	4					5	$Precision = 5;
352
353							# these values come from EMBOSS distmat implementation
354	4					19	%NucleotideIndexes = ( 'A' => 0,
355							'T' => 1,
356							'C' => 2,
357							'G' => 3,
358
359							'AT' => 0,
360							'AC' => 1,
361							'AG' => 2,
362							'CT' => 3,
363							'GT' => 4,
364							'CG' => 5,
365
366							# these are wrong now
367							# 'S' => [ 1, 3],
368							# 'W' => [ 0, 4],
369							# 'Y' => [ 2, 3],
370							# 'R' => [ 0, 1],
371							# 'M' => [ 0, 3],
372							# 'K' => [ 1, 2],
373							# 'B' => [ 1, 2, 3],
374							# 'H' => [ 0, 2, 3],
375							# 'V' => [ 0, 1, 3],
376							# 'D' => [ 0, 1, 2],
377							);
378
379	4					6	$DefaultGapPenalty = 0;
380							# could put ambiguities here?
381	4					31	%DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
382							'T' => [ 'A', 'G'],
383							'C' => [ 'A', 'G'],
384							'G' => [ 'C', 'T'],
385							},
386							'Transitions' => { 'A' => [ 'G' ],
387							'G' => [ 'A' ],
388							'C' => [ 'T' ],
389							'T' => [ 'C' ],
390							},
391							);
392	4					91	%DistanceMethods = ( 'jc\|jukes\|jukescantor\|jukes\-cantor' => 'JukesCantor',
393							'jcuncor\|uncorrected' => 'Uncorrected',
394							'f81\|felsenstein81' => 'F81',
395							'k2\|k2p\|k80\|kimura' => 'Kimura',
396							't92\|tamura\|tamura92' => 'Tamura',
397							'f84\|felsenstein84' => 'F84',
398							'tajimanei\|tajima\-nei' => 'TajimaNei',
399							'jinnei\|jin\-nei' => 'JinNei');
400
401							}
402	4			4		22	use base qw(Bio::Root::Root Bio::Align::StatisticsI);
	4					11
	4					21812
403
404							## generate look up hashes for Nei_Gojobori methods##
405							$CODONS = get_codons();
406							my @t = split '', "FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
407							#create look up hash of number of possible synonymous mutations per codon
408							$synsites = get_syn_sites();
409							#create reference look up hash of single basechanges in codons
410							%synchanges = get_syn_changes();
411
412
413
414							=head2 new
415
416							Title : new
417							Usage : my $obj = Bio::Align::DNAStatistics->new();
418							Function: Builds a new Bio::Align::DNAStatistics object
419							Returns : Bio::Align::DNAStatistics
420							Args : none
421
422
423							=cut
424
425							sub new {
426	2			2	1	617	my ($class,@args) = @_;
427	2					11	my $self = $class->SUPER::new(@args);
428
429	2					11	$self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
430
431	2					8	return $self;
432							}
433
434
435							=head2 distance
436
437							Title : distance
438							Usage : my $distance_mat = $stats->distance(-align => $aln,
439							-method => $method);
440							Function: Calculates a distance matrix for all pairwise distances of
441							sequences in an alignment.
442							Returns : L object
443							Args : -align => Bio::Align::AlignI object
444							-method => String specifying specific distance method
445							(implementing class may assume a default)
446							See also: L
447
448							=cut
449
450							sub distance{
451	13			13	1	100	my ($self,@args) = @_;
452	13					61	my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
453	13	50	33			112	if( ! defined $aln \|\| ! ref ($aln) \|\| ! $aln->isa('Bio::Align::AlignI') ) {
			33
454	0					0	$self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
455							}
456	13		50			33	$method \|\|= 'JukesCantor';
457	13					54	foreach my $m ( keys %DistanceMethods ) {
458	58	100	66			1455	if(defined $m && $method =~ /$m/i ) {
459	13					43	my $mtd = "D_$DistanceMethods{$m}";
460	13					71	return $self->$mtd($aln);
461							}
462							}
463	0					0	$self->warn("Unrecognized distance method $method must be one of [".
464							join(',',$self->available_distance_methods())."]");
465	0					0	return;
466							}
467
468							=head2 available_distance_methods
469
470							Title : available_distance_methods
471							Usage : my @methods = $stats->available_distance_methods();
472							Function: Enumerates the possible distance methods
473							Returns : Array of strings
474							Args : none
475
476
477							=cut
478
479							sub available_distance_methods{
480	0			0	1	0	my ($self,@args) = @_;
481	0					0	return values %DistanceMethods;
482							}
483
484							=head2 D - distance methods
485
486
487							=cut
488
489
490							=head2 D_JukesCantor
491
492							Title : D_JukesCantor
493							Usage : my $d = $stat->D_JukesCantor($aln)
494							Function: Calculates D (pairwise distance) between 2 sequences in an
495							alignment using the Jukes-Cantor 1 parameter model.
496							Returns : L
497							Args : L of DNA sequences
498							double - gap penalty
499
500
501							=cut
502
503							sub D_JukesCantor{
504	2			2	1	7	my ($self,$aln,$gappenalty) = @_;
505	2	50				8	return 0 unless $self->_check_arg($aln);
506	2	50				9	$gappenalty = $DefaultGapPenalty unless defined $gappenalty;
507							# ambiguities ignored at this point
508	2					4	my (@seqs,@names,@values,%dist);
509	2					4	my $seqct = 0;
510	2					18	foreach my $seq ( $aln->each_seq) {
511	4					10	push @names, $seq->display_id;
512	4					11	push @seqs, uc $seq->seq();
513	4					21	$seqct++;
514							}
515	2					8	my $precisionstr = "%.$Precision"."f";
516	2					9	for(my $i = 0; $i < $seqct-1; $i++ ) {
517							# (diagonals) distance is 0 for same sequence
518	2					11	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
519	2					25	$values[$i][$i] = sprintf($precisionstr,0);
520
521	2					11	for( my $j = $i+1; $j < $seqct; $j++ ) {
522	2					7	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
523							$seqs[$j]);
524							# just want diagonals
525	2					12	my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
526							$matrix->[2]->[2] + $matrix->[3]->[3] );
527	2					10	my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
528	2					11	my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
529							# fwd and rev lookup
530	2					9	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
531	2					9	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
532	2					20	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
533							# (diagonals) distance is 0 for same sequence
534	2					7	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
535	2					18	$values[$j][$j] = sprintf($precisionstr,0);
536							}
537							}
538	2					20	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
539							-matrix => \%dist,
540							-names => \@names,
541							-values => \@values);
542							}
543
544							=head2 D_F81
545
546							Title : D_F81
547							Usage : my $d = $stat->D_F81($aln)
548							Function: Calculates D (pairwise distance) between 2 sequences in an
549							alignment using the Felsenstein 1981 distance model.
550							Relaxes the assumption of equal base frequencies that is
551							in JC.
552							Returns : L
553							Args : L of DNA sequences
554
555
556							=cut
557
558							sub D_F81{
559	2			2	1	7	my ($self,$aln,$gappenalty) = @_;
560	2	50				8	return 0 unless $self->_check_arg($aln);
561	2	50				8	$gappenalty = $DefaultGapPenalty unless defined $gappenalty;
562							# ambiguities ignored at this point
563	2					6	my (@seqs,@names,@values,%dist);
564	2					5	my $seqct = 0;
565	2					6	foreach my $seq ( $aln->each_seq) {
566	4					10	push @names, $seq->display_id;;
567	4					13	push @seqs, uc $seq->seq();
568	4					7	$seqct++;
569							}
570	2					7	my $precisionstr = "%.$Precision"."f";
571	2					8	for(my $i = 0; $i < $seqct-1; $i++ ) {
572							# (diagonals) distance is 0 for same sequence
573	2					10	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
574	2					23	$values[$i][$i] = sprintf($precisionstr,0);
575
576	2					9	for( my $j = $i+1; $j < $seqct; $j++ ) {
577
578	2					18	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
579							$seqs[$j]);
580							# just want diagonals
581	2					11	my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
582							$matrix->[2]->[2] + $matrix->[3]->[3] );
583	2					10	my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
584	2					12	my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
585							# fwd and rev lookup
586	2					9	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
587	2					7	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
588	2					18	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
589							# (diagonals) distance is 0 for same sequence
590	2					8	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
591	2					17	$values[$j][$j] = sprintf($precisionstr,0);
592							}
593							}
594	2					19	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
595							-matrix => \%dist,
596							-names => \@names,
597							-values => \@values);
598							}
599
600							=head2 D_Uncorrected
601
602							Title : D_Uncorrected
603							Usage : my $d = $stats->D_Uncorrected($aln)
604							Function: Calculate a distance D, no correction for multiple substitutions
605							is used. In rare cases where sequences may not overlap, 'NA' is
606							substituted for the distance.
607							Returns : L
608							Args : L (DNA Alignment)
609							[optional] gap penalty
610
611							=cut
612
613							sub D_Uncorrected {
614	3			3	1	10	my ($self,$aln,$gappenalty) = @_;
615	3	50				8	$gappenalty = $DefaultGapPenalty unless defined $gappenalty;
616	3	50				8	return 0 unless $self->_check_arg($aln);
617							# ambiguities ignored at this point
618	3					5	my (@seqs,@names,@values,%dist);
619	3					6	my $seqct = 0;
620	3					9	foreach my $seq ( $aln->each_seq) {
621	10					18	push @names, $seq->display_id;
622	10					16	push @seqs, uc $seq->seq();
623	10					14	$seqct++;
624							}
625
626	3					8	my $precisionstr = "%.$Precision"."f";
627	3					8	my $len = $aln->length;
628	3					13	for( my $i = 0; $i < $seqct-1; $i++ ) {
629							# (diagonals) distance is 0 for same sequence
630	7					18	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
631	7					34	$values[$i][$i] = sprintf($precisionstr,0);
632
633	7					19	for( my $j = $i+1; $j < $seqct; $j++ ) {
634	13					30	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
635							$seqs[$j]);
636	13					34	my $m = ( $matrix->[0]->[0] +
637							$matrix->[1]->[1] +
638							$matrix->[2]->[2] +
639							$matrix->[3]->[3] );
640	13					22	my $denom = ( $len - $gaps + ( $gaps * $gappenalty));
641
642	13	100				36	$self->warn("No distance calculated between $names[$i] and $names[$j], inserting -1")
643							unless $denom;
644
645	12	100				27	my $D = $denom ? 1 - ( $m / $denom) : -1;
646							# fwd and rev lookup
647	12					28	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
648	12					21	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
649	12	100				64	$values[$j][$i] = $values[$i][$j] = $denom ? sprintf($precisionstr,$D)
650							: sprintf("%-*s", $Precision + 2, $D);
651							# (diagonals) distance is 0 for same sequence
652	12					26	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
653	12					62	$values[$j][$j] = sprintf($precisionstr,0);
654							}
655							}
656	2					17	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
657							-matrix => \%dist,
658							-names => \@names,
659							-values => \@values);
660							}
661
662
663							# M Kimura, J. Mol. Evol., 1980, 16, 111.
664
665							=head2 D_Kimura
666
667							Title : D_Kimura
668							Usage : my $d = $stat->D_Kimura($aln)
669							Function: Calculates D (pairwise distance) between all pairs of sequences
670							in an alignment using the Kimura 2 parameter model.
671							Returns : L
672							Args : L of DNA sequences
673
674
675							=cut
676
677							sub D_Kimura {
678	2			2	1	6	my ($self,$aln) = @_;
679	2	50				5	return 0 unless $self->_check_arg($aln);
680							# ambiguities ignored at this point
681	2					5	my (@names,@values,%dist);
682	2					4	my $seqct = 0;
683	2					7	foreach my $seq ( $aln->each_seq) {
684	4					9	push @names, $seq->display_id;
685	4					6	$seqct++;
686							}
687
688	2					7	my $precisionstr = "%.$Precision"."f";
689
690	2					9	for( my $i = 0; $i < $seqct-1; $i++ ) {
691							# (diagonals) distance is 0 for same sequence
692	2					8	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
693	2					15	$values[$i][$i] = sprintf($precisionstr,0);
694
695	2					11	for( my $j = $i+1; $j < $seqct; $j++ ) {
696	2					11	my $pairwise = $aln->select_noncont($i+1,$j+1);
697	2					10	my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
698	2	50				8	unless( $L ) {
699	0					0	$L = 1;
700							}
701	2					14	my $P = $self->transitions($pairwise) / $L;
702	2					10	my $Q = $self->transversions($pairwise) / $L;
703	2					7	my $K = 0;
704	2					8	my $denom = ( 1 - (2 * $P) - $Q);
705	2	50				8	if( $denom == 0 ) {
706	0					0	$self->throw("cannot find distance for ",$i+1,
707							",",$j+1," $P, $Q\n");
708							}
709	2					7	my $a = 1 / ( 1 - (2 * $P) - $Q);
710	2					6	my $b = 1 / ( 1 - 2 * $Q );
711	2	50	33			10	if( $a < 0 \|\| $b < 0 ) {
712	0					0	$K = -1;
713							} else{
714	2					11	$K = (1/2) * log ( $a ) + (1/4) * log($b);
715							}
716							# fwd and rev lookup
717	2					11	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
718	2					8	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
719	2					27	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
720							# (diagonals) distance is 0 for same sequence
721	2					7	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
722	2					23	$values[$j][$j] = sprintf($precisionstr,0);
723							}
724							}
725	2					17	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
726							-matrix => \%dist,
727							-names => \@names,
728							-values => \@values);
729							}
730
731
732							=head2 D_Kimura_variance
733
734							Title : D_Kimura
735							Usage : my $d = $stat->D_Kimura_variance($aln)
736							Function: Calculates D (pairwise distance) between all pairs of sequences
737							in an alignment using the Kimura 2 parameter model.
738							Returns : array of 2 L,
739							the first is the Kimura distance and the second is
740							a matrix of variance V(K)
741							Args : L of DNA sequences
742
743
744							=cut
745
746							sub D_Kimura_variance {
747	0			0	1	0	my ($self,$aln) = @_;
748	0	0				0	return 0 unless $self->_check_arg($aln);
749							# ambiguities ignored at this point
750	0					0	my (@names,@values,%dist,@var);
751	0					0	my $seqct = 0;
752	0					0	foreach my $seq ( $aln->each_seq) {
753	0					0	push @names, $seq->display_id;
754	0					0	$seqct++;
755							}
756
757	0					0	my $precisionstr = "%.$Precision"."f";
758
759	0					0	for( my $i = 0; $i < $seqct-1; $i++ ) {
760							# (diagonals) distance is 0 for same sequence
761	0					0	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
762	0					0	$values[$i][$i] = sprintf($precisionstr,0);
763
764	0					0	for( my $j = $i+1; $j < $seqct; $j++ ) {
765	0					0	my $pairwise = $aln->select_noncont($i+1,$j+1);
766	0					0	my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
767	0	0				0	unless( $L ) {
768	0					0	$L = 1;
769							}
770	0					0	my $P = $self->transitions($pairwise) / $L;
771	0					0	my $Q = $self->transversions($pairwise) / $L;
772	0					0	my ($a,$b,$K,$var_k);
773	0					0	my $a_denom = ( 1 - (2 * $P) - $Q);
774	0					0	my $b_denom = 1 - 2 * $Q;
775	0	0	0			0	unless( $a_denom > 0 && $b_denom > 0 ) {
776	0					0	$a = 1;
777	0					0	$b = 1;
778	0					0	$K = -1;
779	0					0	$var_k = -1;
780							} else {
781	0					0	$a = 1 / $a_denom;
782	0					0	$b = 1 / $b_denom;
783	0					0	$K = (1/2) * log ( $a ) + (1/4) * log($b);
784							# from Wu and Li 1985 which in turn is from Kimura 1980
785	0					0	my $c = ( $a - $b ) / 2;
786	0					0	my $d = ( $a + $b ) / 2;
787	0					0	$var_k = ( $a*2 $P + $d*2 $Q - ( $a * $P + $d * $Q)**2 ) / $L;
788							}
789
790							# fwd and rev lookup
791	0					0	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
792	0					0	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
793	0					0	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
794							# (diagonals) distance is 0 for same sequence
795	0					0	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
796	0					0	$values[$j]->[$j] = sprintf($precisionstr,0);
797
798	0					0	$var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
799	0					0	$var[$j]->[$j] = $values[$j]->[$j];
800							}
801							}
802	0					0	return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
803							-matrix => \%dist,
804							-names => \@names,
805							-values => \@values),
806							Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
807							-matrix => \%dist,
808							-names => \@names,
809							-values => \@var)
810							);
811							}
812
813
814							# K Tamura, Mol. Biol. Evol. 1992, 9, 678.
815
816							=head2 D_Tamura
817
818							Title : D_Tamura
819							Usage : Calculates D (pairwise distance) between 2 sequences in an
820							alignment using Tamura 1992 distance model.
821							Returns : L
822							Args : L of DNA sequences
823
824
825							=cut
826
827							sub D_Tamura {
828	2			2	1	8	my ($self,$aln) = @_;
829	2	50				6	return 0 unless $self->_check_arg($aln);
830							# ambiguities ignored at this point
831	2					6	my (@seqs,@names,@values,%dist,$i,$j);
832	2					3	my $seqct = 0;
833	2					6	my $length = $aln->length;
834	2					7	foreach my $seq ( $aln->each_seq) {
835	4					10	push @names, $seq->display_id;;
836	4					10	push @seqs, uc $seq->seq();
837	4					7	$seqct++;
838							}
839
840	2					6	my $precisionstr = "%.$Precision"."f";
841	2					4	my (@gap,@gc,@trans,@tranv,@score);
842	2					3	$i = 0;
843	2					4	for my $t1 ( @seqs ) {
844	4					7	$j = 0;
845	4					6	for my $t2 ( @seqs ) {
846	8					12	$gap[$i][$j] = 0;
847	8					17	for( my $k = 0; $k < $length; $k++ ) {
848	1532					2373	my ($c1,$c2) = ( substr($seqs[$i],$k,1),
849							substr($seqs[$j],$k,1) );
850	1532	100	100			6414	if( $c1 =~ /^$GapChars$/ \|\|
		100
851							$c2 =~ /^$GapChars$/ ) {
852	120					231	$gap[$i][$j]++;
853							} elsif( $c2 =~ /^$GCChhars$/i ) {
854	960					1728	$gc[$i][$j]++;
855							}
856							}
857	8					17	$gc[$i][$j] = ( $gc[$i][$j] /
858							($length - $gap[$i][$j]) );
859	8					14	$j++;
860							}
861	4					14	$i++;
862							}
863
864	2					14	for( $i = 0; $i < $seqct-1; $i++ ) {
865							# (diagonals) distance is 0 for same sequence
866	2					11	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
867	2					27	$values[$i][$i] = sprintf($precisionstr,0);
868
869	2					8	for( $j = $i+1; $j < $seqct; $j++ ) {
870
871	2					18	my $pairwise = $aln->select_noncont($i+1,$j+1);
872	2					8	my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
873	2					11	my $P = $self->transitions($pairwise) / $L;
874	2					12	my $Q = $self->transversions($pairwise) / $L;
875	2					13	my $C = $gc[$i][$j] + $gc[$j][$i]-
876							( 2 * $gc[$i][$j] * $gc[$j][$i] );
877	2	50				7	if( $P ) {
878	2					7	$P = $P / $C;
879							}
880	2					18	my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
881							# fwd and rev lookup
882	2					12	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
883	2					11	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
884	2					21	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
885							# (diagonals) distance is 0 for same sequence
886	2					7	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
887	2					19	$values[$j][$j] = sprintf($precisionstr,0);
888							}
889							}
890	2					17	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
891							-matrix => \%dist,
892							-names => \@names,
893							-values => \@values);
894
895							}
896
897							=head2 D_F84
898
899							Title : D_F84
900							Usage : my $d = $stat->D_F84($aln)
901							Function: Calculates D (pairwise distance) between 2 sequences in an
902							alignment using the Felsenstein 1984 distance model.
903							Returns : L
904							Args : L of DNA sequences
905							[optional] double - gap penalty
906
907							=cut
908
909							sub D_F84 {
910	0			0	1	0	my ($self,$aln,$gappenalty) = @_;
911	0	0				0	return 0 unless $self->_check_arg($aln);
912	0					0	$self->throw_not_implemented();
913							# ambiguities ignored at this point
914	0					0	my (@seqs,@names,@values,%dist);
915	0					0	my $seqct = 0;
916	0					0	foreach my $seq ( $aln->each_seq) {
917							# if there is no name,
918	0					0	my $id = $seq->display_id;
919	0	0	0			0	if( ! length($id) \|\| # deal with empty names
920							$id =~ /^\s+$/ ) {
921	0					0	$id = $seqct+1;
922							}
923	0					0	push @names, $id;
924	0					0	push @seqs, uc $seq->seq();
925	0					0	$seqct++;
926							}
927
928	0					0	my $precisionstr = "%.$Precision"."f";
929
930	0					0	for( my $i = 0; $i < $seqct-1; $i++ ) {
931							# (diagonals) distance is 0 for same sequence
932	0					0	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
933	0					0	$values[$i][$i] = sprintf($precisionstr,0);
934
935	0					0	for( my $j = $i+1; $j < $seqct; $j++ ) {
936							}
937							}
938							}
939
940							# Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
941							# Tajima-Nei correction used for multiple substitutions in the calc
942							# of the distance matrix. Nucleic acids only.
943							#
944							# D = p-distance = 1 - (matches/(posns_scored + gaps)
945							#
946							# distance = -b * ln(1-D/b)
947							#
948
949							=head2 D_TajimaNei
950
951							Title : D_TajimaNei
952							Usage : my $d = $stat->D_TajimaNei($aln)
953							Function: Calculates D (pairwise distance) between 2 sequences in an
954							alignment using the TajimaNei 1984 distance model.
955							Returns : L
956							Args : Bio::Align::AlignI of DNA sequences
957
958
959							=cut
960
961							sub D_TajimaNei{
962	2			2	1	8	my ($self,$aln) = @_;
963	2	50				8	return 0 unless $self->_check_arg($aln);
964							# ambiguities ignored at this point
965	2					5	my (@seqs,@names,@values,%dist);
966	2					5	my $seqct = 0;
967	2					6	foreach my $seq ( $aln->each_seq) {
968							# if there is no name,
969	4					9	push @names, $seq->display_id;
970	4					10	push @seqs, uc $seq->seq();
971	4					8	$seqct++;
972							}
973	2					6	my $precisionstr = "%.$Precision"."f";
974	2					3	my ($i,$j,$bs);
975							# pairwise
976	2					7	for( $i =0; $i < $seqct -1; $i++ ) {
977	2					11	$dist{$names[$i]}->{$names[$i]} = [$i,$i];
978	2					21	$values[$i][$i] = sprintf($precisionstr,0);
979
980	2					9	for ( $j = $i+1; $j <$seqct;$j++ ) {
981	2					9	my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
982							$seqs[$j]);
983	2					14	my $pairwise = $aln->select_noncont($i+1,$j+1);
984	2					7	my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
985	2					6	my $fij2 = 0;
986	2					7	for( $bs = 0; $bs < 4; $bs++ ) {
987	8					9	my $fi = 0;
988	8					11	map {$fi += $matrix->[$bs]->[$_] } 0..3;
	32					52
989	8					9	my $fj = 0;
990							# summation
991	8					11	map { $fj += $matrix->[$_]->[$bs] } 0..3;
	32					36
992	8	50	33			27	my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
993	8					17	$fij2 += $fij**2;
994							}
995
996	2					5	my ($pair,$h) = (0,0);
997	2					8	for( $bs = 0; $bs < 3; $bs++ ) {
998	6					14	for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
999	12					18	my $fij = $pfreq->[$pair++] / $slen;
1000	12	100				19	if( $fij ) {
1001
1002	10					16	my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
1003
1004	10					16	map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
	40					53
1005	10					15	map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
	40					48
1006	10					15	map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
	40					48
1007	10					10	map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
	40					46
1008
1009	10	50				14	if( $fij ) {
1010	10					24	$h += ( ($fij**2) / 2 ) /
1011							( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
1012							( ( $ci2 + $cj2 ) / (2 * $slen) )
1013							);
1014							}
1015	10					72	$self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
1016							}
1017							}
1018							}
1019							# just want diagonals which are matches (A matched A, C -> C)
1020
1021	2					8	my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
1022							$matrix->[2]->[2] + $matrix->[3]->[3] );
1023	2					5	my $D = 1 - ( $m / $slen);
1024	2					7	my $d;
1025	2	50				9	if( $h == 0 ) {
1026	0					0	$d = -1;
1027							} else {
1028	2					7	my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
1029	2					6	my $c = 1- $D/ $b;
1030
1031	2	50				5	if( $c < 0 ) {
1032	0					0	$d = -1;
1033							} else {
1034	2					7	$d = (-1 * $b) * log ( $c);
1035							}
1036							}
1037							# fwd and rev lookup
1038	2					9	$dist{$names[$i]}->{$names[$j]} = [$i,$j];
1039	2					8	$dist{$names[$j]}->{$names[$i]} = [$i,$j];
1040	2					17	$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1041
1042							# (diagonals) distance is 0 for same sequence
1043	2					8	$dist{$names[$j]}->{$names[$j]} = [$j,$j];
1044	2					19	$values[$j][$j] = sprintf($precisionstr,0);
1045							}
1046							}
1047	2					16	return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1048							-matrix => \%dist,
1049							-names => \@names,
1050							-values => \@values);
1051
1052							}
1053
1054							# Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1055
1056							=head2 D_JinNei
1057
1058							Title : D_JinNei
1059							Usage : my $d = $stat->D_JinNei($aln)
1060							Function: Calculates D (pairwise distance) between 2 sequences in an
1061							alignment using the Jin-Nei 1990 distance model.
1062							Returns : L
1063							Args : L of DNA sequences
1064
1065
1066							=cut
1067
1068							sub D_JinNei{
1069	0			0	1	0	my ($self,@args) = @_;
1070	0					0	$self->warn("JinNei implementation not completed");
1071	0					0	return;
1072							}
1073
1074							=head2 transversions
1075
1076							Title : transversions
1077							Usage : my $transversions = $stats->transversion($aln);
1078							Function: Calculates the number of transversions between two sequences in
1079							an alignment
1080							Returns : integer
1081							Args : Bio::Align::AlignI
1082
1083
1084							=cut
1085
1086							sub transversions{
1087	6			6	1	480	my ($self,$aln) = @_;
1088	6					24	return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1089							}
1090
1091							=head2 transitions
1092
1093							Title : transitions
1094							Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1095							Function: Calculates the number of transitions in a given DNA alignment
1096							Returns : integer representing the number of transitions
1097							Args : Bio::Align::AlignI object
1098
1099
1100							=cut
1101
1102							sub transitions{
1103	6			6	1	19	my ($self,$aln) = @_;
1104	6					40	return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1105							}
1106
1107
1108							sub _trans_count_helper {
1109	12			12		28	my ($self,$aln,$type) = @_;
1110	12	50				30	return 0 unless( $self->_check_arg($aln) );
1111	12	50				41	if( ! $aln->is_flush ) { $self->throw("must be flush") }
	0					0
1112	12					19	my (@tcount);
1113	12					25	my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1114							uc $aln->get_seq_by_pos(2)->seq() );
1115	12					32	my $alen = $aln->length;
1116	12					26	for (my $i = 0;$i<$alen; $i++ ) {
1117	2298					2697	my ($c1,$c2) = ( substr($first,$i,1),
1118							substr($second,$i,1) );
1119	2298	100				3263	if( $c1 ne $c2 ) {
1120	480					707	foreach my $nt ( @{$type->{$c1}} ) {
	480					677
1121	477	100				695	if( $nt eq $c2) {
1122	120					209	$tcount[$i]++;
1123							}
1124							}
1125							}
1126							}
1127	12					18	my $sum = 0;
1128	12	100				27	map { if( $_) { $sum += $_} } @tcount;
	2025					2286
	120					183
1129	12					60	return $sum;
1130							}
1131
1132							# this will generate a matrix which records across the row, the number
1133							# of DNA subst
1134							#
1135							sub _build_nt_matrix {
1136	19			19		35	my ($self,$seqa,$seqb) = @_;
1137
1138
1139	19					85	my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1140							[ qw(0 0 0 0) ],
1141							[ qw(0 0 0 0) ],
1142							[ qw(0 0 0 0) ] ];
1143	19					24	my $gaps = 0; # number of gaps
1144	19					38	my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1145	19					28	my $len_a = length($seqa);
1146	19					40	for( my $i = 0; $i < $len_a; $i++) {
1147	1974					2649	my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1148	1974					1863	$ti =~ tr/U/T/;
1149	1974					1704	$tj =~ tr/U/T/;
1150
1151	1974	100				3912	if( $ti =~ /^$GapChars$/) { $gaps++; next; }
	151					138
	151					207
1152	1823	100				3330	if( $tj =~ /^$GapChars$/) { $gaps++; next }
	318					260
	318					485
1153
1154	1505					1642	my $ti_index = $NucleotideIndexes{$ti};
1155	1505					1343	my $tj_index = $NucleotideIndexes{$tj};
1156
1157	1505	50				1811	if( ! defined $ti_index ) {
1158	0					0	$self->warn("ti_index not defined for $ti\n");
1159	0					0	next;
1160							}
1161
1162	1505					1433	$basect_matrix->[$ti_index]->[$tj_index]++;
1163
1164	1505	100				2705	if( $ti ne $tj ) {
1165	159					435	$pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1166							}
1167							}
1168	19					47	return ($basect_matrix,$pfreq,$gaps);
1169							}
1170
1171							sub _check_ambiguity_nucleotide {
1172	0			0		0	my ($base1,$base2) = @_;
1173	0					0	my %iub = Bio::Tools::IUPAC->iupac_iub();
1174	0					0	my @amb1 = @{ $iub{uc($base1)} };
	0					0
1175	0					0	my @amb2 = @{ $iub{uc($base2)} };
	0					0
1176	0					0	my ($pmatch) = (0);
1177	0					0	for my $amb ( @amb1 ) {
1178	0	0				0	if( grep { $amb eq $_ } @amb2 ) {
	0					0
1179	0					0	$pmatch = 1;
1180	0					0	last;
1181							}
1182							}
1183	0	0				0	if( $pmatch ) {
1184	0					0	return (1 / scalar @amb1) * (1 / scalar @amb2);
1185							} else {
1186	0					0	return 0;
1187							}
1188							}
1189
1190
1191							sub _check_arg {
1192	25			25		43	my($self,$aln ) = @_;
1193	25	50	33			177	if( ! defined $aln \|\| ! $aln->isa('Bio::Align::AlignI') ) {
		50
1194	0					0	$self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1195	0					0	return 0;
1196							} elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1197	0					0	$self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1198	0					0	return 0;
1199							}
1200	25					68	return 1;
1201							}
1202
1203							=head2 Data Methods
1204
1205							=cut
1206
1207							=head2 pairwise_stats
1208
1209							Title : pairwise_stats
1210							Usage : $obj->pairwise_stats($newval)
1211							Function:
1212							Returns : value of pairwise_stats
1213							Args : newvalue (optional)
1214
1215
1216							=cut
1217
1218							sub pairwise_stats{
1219	18			18	1	48	my ($self,$value) = @_;
1220	18	100				48	if( defined $value) {
1221	2					5	$self->{'_pairwise_stats'} = $value;
1222							}
1223	18					109	return $self->{'_pairwise_stats'};
1224
1225							}
1226
1227							=head2 calc_KaKs_pair
1228
1229							Title : calc_KaKs_pair
1230							Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1231							$name1, $name2).
1232							Function : calculates Nei-Gojobori statistics for pairwise
1233							comparison.
1234							Args : A Bio::Align::AlignI compliant object such as a
1235							Bio::SimpleAlign object, and 2 sequence name strings.
1236							Returns : a reference to a hash of statistics with keys as
1237							listed in Description.
1238
1239							=cut
1240
1241							sub calc_KaKs_pair {
1242	1			1	1	466	my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1243	1	50				5	$self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1244							if @_!= 4;
1245	1	50				5	$self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1246	1					6	my @seqs = (
1247							#{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1248							#{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1249							{id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1250							{id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1251							) ;
1252	1	50				5	if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1253	0					0	$self->throw(" aligned sequences must be of equal length!");
1254							}
1255	1					2	my $results = [];
1256	1					5	$self->_get_av_ds_dn(\@seqs, $results);
1257	1					5	return $results;
1258
1259							}
1260
1261							=head2 calc_all_KaKs_pairs
1262
1263							Title : calc_all_KaKs_pairs
1264							Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1265							Function : Calculates Nei_gojobori statistics for all pairwise
1266							combinations in sequence.
1267							Arguments: A Bio::Align::ALignI compliant object such as
1268							a Bio::SimpleAlign object.
1269							Returns : A reference to an array of hashes of statistics of
1270							all pairwise comparisons in the alignment.
1271
1272							=cut
1273
1274
1275
1276							sub calc_all_KaKs_pairs {
1277							#returns a multi_element_array with all pairwise comparisons
1278	1			1	1	1162	my ($self,$aln) = @_;
1279	1	50				10	$self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1280	1					3	my @seqs;
1281	1					6	for my $seq ($aln->each_seq) {
1282	3					14	push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1283							}
1284	1					5	my $results ;
1285	1					4	$results = $self->_get_av_ds_dn(\@seqs, $results);
1286	1					7	return $results;
1287							}
1288
1289							=head2 calc_average_KaKs
1290
1291							Title : calc_average_KaKs.
1292							Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1293							Function : calculates Nei_Gojobori stats for average of all
1294							sequences in the alignment.
1295							Args : A Bio::Align::AlignI compliant object such as a
1296							Bio::SimpleAlign object, number of bootstrap iterations
1297							(default 1000).
1298							Returns : A reference to a hash of statistics as listed in Description.
1299
1300							=cut
1301
1302							sub calc_average_KaKs {
1303							#calculates global value for sequences in alignment using bootstrapping
1304							#this is quite slow (~10 seconds per 3 X 200nt seqs);
1305	1			1	1	1025	my ($self, $aln, $bootstrap_rpt) = @_;
1306	1		50			4	$bootstrap_rpt \|\|= 1000;
1307	1	50				7	$self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1308	1					1	my @seqs;
1309	1					5	for my $seq ($aln->each_seq) {
1310	3					6	push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1311							}
1312	1					2	my $results ;
1313	1					3	my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1314							#print "ds = $ds_orig, dn = $dn_orig\n";
1315	1					4	$results = {D_s => $ds_orig, D_n => $dn_orig};
1316	1					5	$self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1317	1					14	return $results;
1318							}
1319
1320							############## primary internal subs for alignment comparisons ########################
1321
1322							sub _run_bootstrap {
1323							### generates sampled sequences, calculates Ds and Dn values,
1324							### then calculates variance of sampled sequences and add results to results hash
1325							###
1326	1			1		2	my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1327	1					2	my @seqs = @$seq_ref;
1328	1					2	my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1329	1					4	my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1330
1331							#1st make alternative array of codons;
1332	1					1	my $c = 0;
1333	1					4	while ($c < length $seqs[0]{'seq'}) {
1334	57					68	for (0..$#seqs) {
1335	171					149	push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
	171					575
1336							}
1337	57					81	$c+=3;
1338							}
1339
1340	1					3	for (1..$bootstrap_rpt) {
1341	100					207	my $sampled = _resample (\@btstrp_aoa);
1342	100					219	my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1343	100					129	push @{$bootstrap_values{'ds'}}, $ds;
	100					183
1344	100					109	push @{$bootstrap_values{'dn'}}, $dn;
	100					319
1345							}
1346
1347	1					6	$results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1348	1					3	$results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1349							$results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1350	1					18	sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1351							#print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1352							#print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1353							#print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1354							}
1355
1356							sub _resample {
1357	100			100		107	my $ref = shift;
1358	100					94	my $codon_num = scalar (@{$ref->[0]});
	100					145
1359	100					102	my @altered;
1360	100					171	for (0..$codon_num -1) { #for each codon
1361	5700					6348	my $rand = int (rand ($codon_num));
1362	5700					6685	for (0..$#$ref) {
1363	17100					14813	push @{$altered[$_]}, $ref->[$_][$rand];
	17100					26744
1364							}
1365							}
1366	100					134	my @stringed = map {join '', @$_}@altered;
	300					1224
1367	100					139	my @return;
1368							#now out in random name to keep other subs happy
1369	100					155	for (@stringed) {
1370	300					664	push @return, {id=>'1', seq=> $_};
1371							}
1372	100					820	return \@return;
1373							}
1374
1375							sub _get_av_ds_dn {
1376							# takes array of hashes of sequence strings and ids #
1377	103			103		125	my $self = shift;
1378	103					129	my $seq_ref = shift;
1379	103	100				190	my $result = shift if @_;
1380	103					341	my @caller = caller(1);
1381	103					1812	my @seqarray = @$seq_ref;
1382	103					111	my $bootstrap_score_list;
1383							#for a multiple alignment considers all pairwise combinations#
1384	103					225	my %dsfor_average = (ds => [], dn => []);
1385	103					203	for (my $i = 0; $i < scalar @seqarray; $i++) {
1386	308					589	for (my $j = $i +1; $j
1387							# print "comparing $i and $j\n";
1388	307	50				571	if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1389	0					0	$self->warn(" aligned sequences must be of equal length!");
1390	0					0	next;
1391							}
1392
1393	307					458	my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1394	307					504	my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1395							# print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1396	307					526	my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1397							#get averages
1398	307					484	my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1399	307					561	my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1400
1401							#calculate ps and pn (p54)
1402	307					325	my $syn_prop = $syn_count / $av_s_site;
1403	307					317	my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1404
1405							#now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1406	307					564	my $d_syn = $self->jk($syn_prop);
1407	307					451	my $d_nc = $self->jk($nc_prop);
1408
1409							#JK calculation must succeed for continuation of calculation
1410							#ret_value = -1 if error
1411	307	50	33			830	next unless $d_nc >=0 && $d_syn >=0;
1412
1413
1414	307					318	push @{$dsfor_average{'ds'}}, $d_syn;
	307					538
1415	307					298	push @{$dsfor_average{'dn'}}, $d_nc;
	307					444
1416
1417							#if not doing bootstrap, calculate the pairwise comparisin stats
1418	307	100	100			1420	if ($caller[3] =~ /calc_KaKs_pair/ \|\| $caller[3] =~ /calc_all_KaKs_pairs/) {
1419							#now calculate variances assuming large sample
1420	4					10	my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1421	4					7	my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1422							#now calculate z_value
1423							#print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1424							#my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1425	4	50				11	my $z = ($d_syn_var + $d_nc_var) ?
1426							($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1427							# print "z is $z\n";
1428							push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1429							S_d => $syn_count, N_d =>$non_syn_count,
1430							P_s => $syn_prop, P_n=>$nc_prop,
1431	4					5	D_s => @{$dsfor_average{'ds'}}[-1],
1432	4					60	D_n => @{$dsfor_average{'dn'}}[-1],
1433							D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1434							Seq1 => $seqarray[$i]{'id'},
1435	4					7	Seq2 => $seqarray[$j]{'id'},
1436							z_score => $z,
1437							};
1438	4	50	33			24	$self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
			33
1439							if ($syn_count < 10 \|\| $non_syn_count < 10 ) && $self->verbose > -1 ;
1440							}#endif
1441							}
1442							}
1443
1444							#warn of failure if no results hashes are present
1445							#will fail if Jukes Cantor has failed for all pairwise combinations
1446							#$self->warn("calculation failed!") if scalar @$result ==0;
1447
1448							#return results unless bootstrapping
1449	103	100	100			345	return $result if $caller[3]=~ /calc_all_KaKs/ \|\| $caller[3] =~ /calc_KaKs_pair/;
1450							#else if getting average for bootstrap
1451	101					226	return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1452							}
1453
1454
1455							sub jk {
1456	614			614	0	669	my ($self, $p) = @_;
1457	614	50				826	if ($p > 0.75) {
1458	0					0	$self->warn( " Jukes Cantor won't work -too divergent!");
1459	0					0	return -1;
1460							}
1461	614					1081	return -1 * (3/4) * (log(1 - (4/3) * $p));
1462							}
1463
1464							#works for large value of n (50?100?)
1465							sub jk_var {
1466	8			8	0	11	my ($p, $n) = @_;
1467	8					20	return (9 * $p * (1 -$p))/(((3 - 4 $p) 2) $n);
1468							}
1469
1470
1471							# compares 2 sequences to find the number of synonymous/non
1472							# synonymous mutations between them
1473
1474							sub analyse_mutations {
1475	307			307	0	406	my ($seq1, $seq2) = @_;
1476	307					1978	my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1477							[2,1]], # depend on which is the same
1478							1=>[[0,2],
1479							[2,0]],
1480							2=>[[0,1],
1481							[1,0]],
1482							},
1483							3=> [ [0,1,2], # all need to be altered
1484							[1,0,2],
1485							[0,2,1],
1486							[1,2,0],
1487							[2,0,1],
1488							[2,1,0] ],
1489							);
1490	307					513	my $TOTAL = 0; # total synonymous changes
1491	307					320	my $TOTAL_n = 0; # total non-synonymous changes
1492	307					289	my $gap_cnt = 0;
1493
1494	307					329	my %input;
1495	307					304	my $seqlen = length($seq1);
1496	307					461	for (my $j=0; $j< $seqlen; $j+=3) {
1497	17499					20785	$input{'cod1'} = substr($seq1, $j,3);
1498	17499					18369	$input{'cod2'} = substr($seq2, $j,3);
1499
1500							#ignore codon if beeing compared with gaps!
1501	17499	50	33			46608	if ($input{'cod1'} =~ /\-/ \|\| $input{'cod2'} =~ /\-/){
1502	0					0	$gap_cnt += 3; #just increments once if there is a pair of gaps
1503	0					0	next;
1504							}
1505
1506	17499					21958	my ($diff_cnt, $same) = count_diffs(\%input);
1507
1508							#ignore if codons are identical
1509	17499	100				30834	next if $diff_cnt == 0 ;
1510	4912	100				5538	if ($diff_cnt == 1) {
		50
		0
1511	3921					5725	$TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1512	3921					7172	$TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1513							#print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1514							}
1515							elsif ($diff_cnt ==2) {
1516	991					920	my $s_cnt = 0;
1517	991					908	my $n_cnt = 0;
1518	991					874	my $tot_muts = 4;
1519							#will stay 4 unless there are stop codons at intervening point
1520	991					863	OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
	991					1536
1521	1982					2137	my $altered = $input{'cod1'};
1522	1982					2056	my $prev= $altered;
1523							# print "$prev -> (", $t[$CODONS->{$altered}], ")";
1524	1982					2042	for my $mut_i (@$perm) { #index of codon mutated
1525	3964					4405	substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1526	3964	50				5180	if ($t[$CODONS->{$altered}] eq '*') {
1527	0					0	$tot_muts -=2;
1528							#print "changes to stop codon!!\n";
1529	0					0	next OUTER;
1530							}
1531							else {
1532	3964					4717	$s_cnt += $synchanges{$prev}{$altered};
1533							# print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1534							}
1535	3964					4522	$prev = $altered;
1536							}
1537							# print "\n";
1538							}
1539	991	50				1525	if ($tot_muts != 0) {
1540	991					1339	$TOTAL += ($s_cnt/($tot_muts/2));
1541	991					1827	$TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1542							}
1543
1544							}
1545							elsif ($diff_cnt ==3 ) {
1546	0					0	my $s_cnt = 0;
1547	0					0	my $n_cnt = 0;
1548	0					0	my $tot_muts = 18; #potential number of mutations
1549	0					0	OUTER: for my $perm (@{$mutator{'3'}}) {
	0					0
1550	0					0	my $altered = $input{'cod1'};
1551	0					0	my $prev= $altered;
1552							# print "$prev -> (", $t[$CODONS->{$altered}], ")";
1553	0					0	for my $mut_i (@$perm) { #index of codon mutated
1554	0					0	substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1555	0	0				0	if ($t[$CODONS->{$altered}] eq '*') {
1556	0					0	$tot_muts -=3;
1557							# print "changes to stop codon!!\n";
1558	0					0	next OUTER;
1559
1560							}
1561							else {
1562	0					0	$s_cnt += $synchanges{$prev}{$altered};
1563							# print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1564							}
1565	0					0	$prev = $altered;
1566							}
1567							# print "\n";
1568
1569							}#end OUTER loop
1570							#calculate number of synonymous/non synonymous mutations for that codon
1571							# and add to total
1572	0	0				0	if ($tot_muts != 0) {
1573	0					0	$TOTAL += ($s_cnt / ($tot_muts /3));
1574	0					0	$TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1575							}
1576							} #endif $diffcnt = 3
1577							} #end of sequencetraversal
1578	307					1680	return ($TOTAL, $TOTAL_n, $gap_cnt);
1579							}
1580
1581
1582							sub count_diffs {
1583							#counts the number of nucleotide differences between 2 codons
1584							# returns this value plus the codon index of which nucleotide is the same when 2
1585							#nucleotides are different. This is so analyse_mutations() knows which nucleotides
1586							# to change.
1587	17499			17499	0	15403	my $ref = shift;
1588	17499					16060	my $cnt = 0;
1589	17499					15474	my $same= undef;
1590							#just for 2 differences
1591	17499					19637	for (0..2) {
1592	52497	100				70389	if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1593	5903					6321	$cnt++;
1594							} else {
1595	46594					48570	$same = $_;
1596							}
1597							}
1598	17499					24912	return ($cnt, $same);
1599							}
1600
1601							=head2 get_syn_changes
1602
1603							Title : get_syn_changes
1604							Usage : Bio::Align::DNAStatitics->get_syn_changes
1605							Function: Generate a hashref of all pairwise combinations of codns
1606							differing by 1
1607							Returns : Symetic matrix using hashes
1608							First key is codon
1609							and each codon points to a hashref of codons
1610							the values of which describe type of change.
1611							my $type = $hash{$codon1}->{$codon2};
1612							values are :
1613							1 synonymous
1614							0 non-syn
1615							-1 either codon is a stop codon
1616							Args : none
1617
1618							=cut
1619
1620							sub get_syn_changes {
1621							#hash of all pairwise combinations of codons differing by 1
1622							# 1 = syn, 0 = non-syn, -1 = stop
1623	4			4	1	6	my %results;
1624	4					9	my @codons = _make_codons ();
1625	4					125	my $arr_len = scalar @codons;
1626	4					19	for (my $i = 0; $i < $arr_len -1; $i++) {
1627	252					302	my $cod1 = $codons[$i];
1628	252					330	for (my $j = $i +1; $j < $arr_len; $j++) {
1629	8064					6724	my $diff_cnt = 0;
1630	8064					7668	for my $pos(0..2) {
1631	24192	100				32696	$diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1632							}
1633	8064	100				11895	next if $diff_cnt !=1;
1634
1635							#synon change
1636	1152	100	100			2909	if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
		100
1637	276					347	$results{$cod1}{$codons[$j]} =1;
1638	276					455	$results{$codons[$j]}{$cod1} = 1;
1639							}
1640							#stop codon
1641							elsif ($t[$CODONS->{$cod1}] eq '' or $t[$CODONS->{$codons[$j]}] eq '') {
1642	92					117	$results{$cod1}{$codons[$j]} = -1;
1643	92					151	$results{$codons[$j]}{$cod1} = -1;
1644							}
1645							# nc change
1646							else {
1647	784					1029	$results{$cod1}{$codons[$j]} = 0;
1648	784					1377	$results{$codons[$j]}{$cod1} = 0;
1649							}
1650							}
1651							}
1652	4					142	return %results;
1653							}
1654
1655							=head2 dnds_pattern_number
1656
1657							Title : dnds_pattern_number
1658							Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1659							Function: Counts the number of codons with no gaps in the MSA
1660							Returns : Number of codons with no gaps ('patterns' in PAML notation)
1661							Args : A Bio::Align::AlignI compliant object such as a
1662							Bio::SimpleAlign object.
1663
1664							=cut
1665
1666							sub dnds_pattern_number{
1667	0			0	1	0	my ($self, $aln) = @_;
1668	0					0	return ($aln->remove_gaps->length)/3;
1669							}
1670
1671							sub count_syn_sites {
1672							#counts the number of possible synonymous changes for sequence
1673	615			615	0	1123	my ($seq, $synsite) = @_;
1674	615	100				910	__PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1675	614					588	my $S = 0;
1676	614					810	for (my $i = 0; $i< length($seq); $i+=3) {
1677	34998					34394	my $cod = substr($seq, $i, 3);
1678	34998	50				42249	next if $cod =~ /\-/; #deal with alignment gaps
1679	34998					48649	$S += $synsite->{$cod}{'s'};
1680							}
1681							#print "S is $S\n";
1682	614					835	return $S;
1683							}
1684
1685
1686
1687							sub get_syn_sites {
1688							#sub to generate lookup hash for the number of synonymous changes per codon
1689	4			4	0	10	my @nucs = qw(T C A G);
1690	4					5	my %raw_results;
1691	4					8	for my $i (@nucs) {
1692	16					16	for my $j (@nucs) {
1693	64					64	for my $k (@nucs) {
1694							# for each possible codon
1695	256					322	my $cod = "$i$j$k";
1696	256					256	my $aa = $t[$CODONS->{$cod}];
1697							#calculate number of synonymous mutations vs non syn mutations
1698	256					259	for my $i (qw(0 1 2)){
1699	768					675	my $s = 0;
1700	768					620	my $n = 3;
1701	768					730	for my $nuc (qw(A T C G)) {
1702	3072	100				3812	next if substr ($cod, $i,1) eq $nuc;
1703	2304					1895	my $test = $cod;
1704	2304					2007	substr($test, $i, 1) = $nuc ;
1705	2304	100				2769	if ($t[$CODONS->{$test}] eq $aa) {
1706	552					457	$s++;
1707							}
1708	2304	100				2976	if ($t[$CODONS->{$test}] eq '*') {
1709	108					105	$n--;
1710							}
1711							}
1712	768					1500	$raw_results{$cod}[$i] = {'s' => $s ,
1713							'n' => $n };
1714							}
1715
1716							} #end analysis of single codon
1717							}
1718							} #end analysis of all codons
1719	4					9	my %final_results;
1720
1721	4					95	for my $cod (sort keys %raw_results) {
1722	256					233	my $t = 0;
1723	256					190	map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
	768					1100
	256					288
1724	256					522	$final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1725							}
1726	4					138	return \%final_results;
1727							}
1728
1729							sub _make_codons {
1730							#makes all codon combinations, returns array of them
1731	8			8		22	my @nucs = qw(T C A G);
1732	8					9	my @codons;
1733	8					16	for my $i (@nucs) {
1734	32					36	for my $j (@nucs) {
1735	128					117	for my $k (@nucs) {
1736	512					665	push @codons, "$i$j$k";
1737							}
1738							}
1739							}
1740	8					103	return @codons;
1741							}
1742
1743							sub get_codons {
1744							#generates codon translation look up table#
1745	4			4	0	9	my $x = 0;
1746	4					9	my $CODONS = {};
1747	4					7	for my $codon (_make_codons) {
1748	256					324	$CODONS->{$codon} = $x;
1749	256					212	$x++;
1750							}
1751	4					18	return $CODONS;
1752							}
1753
1754							#########stats subs, can go in another module? Here for speed. ###
1755							sub mean {
1756	204			204	0	232	my $ref = shift;
1757	204					209	my $el_num = scalar @$ref;
1758	204					179	my $tot = 0;
1759	204					295	map{$tot += $_}@$ref;
	806					1030
1760	204					577	return ($tot/$el_num);
1761							}
1762
1763							sub variance {
1764	2			2	0	2	my $ref = shift;
1765	2					3	my $mean = mean($ref);
1766	2					3	my $sum_of_squares = 0;
1767	2					3	map{$sum_of_squares += ($_ - $mean) **2}@$ref;
	200					207
1768	2					8	return $sum_of_squares;
1769							}
1770
1771							sub sampling_variance {
1772	2			2	0	2	my $ref = shift;
1773	2					5	return variance($ref) / (scalar @$ref -1);
1774							}
1775
1776							1;