File Coverage

blib/lib/Bio/CUA/CUB/Calculator.pm

Criterion	Covered	Total	%
statement	99	239	41.4
branch	37	114	32.4
condition	7	39	17.9
subroutine	12	22	54.5
pod	11	14	78.5
total	166	428	38.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::CUA::CUB::Calculator;
2
3							=pod
4
5							=head1 NAME
6
7							Bio::CUA::CUB::Calculator -- A module to calculate codon usage bias
8							(CUB) indice for protein-coding sequences
9
10							=head1 SYNOPSIS
11
12							use Bio::CUA::CUB::Calculator;
13
14							my $calc = Bio::CUA::CUB::Calculator->new(
15							-codon_table => 1,
16							-tAI_values => 'tai.out' # from Bio::CUA::CUB::Builder
17							);
18
19							# calculate tAI for each sequence
20							my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa");
21							or
22							my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa", -format => 'fasta');
23
24							while(my $seq = $io->next_seq)
25							{
26							my $tai = $calc->tai($seq);
27							printf("%10s: %.7f\n", $seq->id, $tai);
28							}
29
30							=head1 DESCRIPTION
31
32							Codon usage bias (CUB) can be represented at two levels, codon and
33							sequence. The latter is often computed as the geometric means of the
34							sequence's codons. This module caculates CUB metrics at sequence
35							level.
36
37							Supported CUB metrics include CAI (codon adaptation index), tAI (tRNA
38							adaptation index), Fop (Frequency of optimal codons), ENC (Effective
39							Number of Codons) and their variants. See the methods below for
40							details.
41
42							=cut
43
44	2			2		5124	use 5.006;
	2					8
	2					135
45	2			2		16	use strict;
	2					4
	2					88
46	2			2		12	use warnings;
	2					3
	2					88
47	2			2		13	use parent qw/Bio::CUA::CUB/;
	2					4
	2					18
48	2			2		129	use Bio::CUA::CodonTable;
	2					3
	2					6091
49
50							=head1 METHODS
51
52							=head2 new
53
54							Title : new
55							Usage : my $calc=Bio::CUA::CUB::Calculator->new(@args);
56							Function: initialize the calculator
57							Returns : an object of this class
58							Args : a hash with following acceptable keys:
59
60							B:
61
62							=over
63
64							=item C<-codon_table>
65
66							the genetic code table applied for following sequence analyses. It
67							can be specified by an integer (genetic code table id), an object of
68							L, or a map-file. See the method
69							L for details.
70
71							=back
72
73							B
74
75							=over
76
77							=item C<-optimal_codons>
78
79							a file contains all the optimal codons, one codon per line. Or a
80							hashref with keys being the optimal codons
81
82							=back
83
84							B
85
86							=over
87
88							=item C<-CAI_values>
89
90							a file containing CAI values for each codon, excluding 3
91							stop codons, so 61 lines with each line containing a codon and its
92							value separated by space or tab.
93							or
94							a hashref with each key being a codon and each value being CAI index
95							for the codon.
96
97							=back
98
99							B
100
101							=over
102
103							=item C<-tAI_values>
104
105							similar to C<-CAI_values>, a file or a hash containing tAI value
106							for each codon.
107
108							=back
109
110							B
111
112							=over
113
114							=item C<-base_background>
115
116							optional.
117							an arrayref containing base frequency of 4 bases (in the order
118							A,T,C, and G) derived from background data such as introns.
119							Or one of the following values: 'seq', 'seq3', which will lead to
120							estimating base frequencies from each analyzed sequence in whol or
121							its 3rd codon position, respectively.
122
123							It can also be specified for each analyzed sequence with the methods
124							L and L
125
126							=back
127
128							=cut
129
130							sub new
131							{
132	1			1	1	5312	my ($caller, @args) = @_;
133
134							# option -codon_table is processed in this parent class
135	1					22	my $self = $caller->SUPER::new(@args);
136
137							# process all the parameters
138	1					10	my $hashRef = $self->_array_to_hash(\@args);
139	1					9	while(my ($tag, $val) = each %$hashRef)
140							{
141							# tag 'codon_table' is now processed by parent package
142	3	50				45	if($tag =~ /^optimal/o) # optimal codons
		100
		100
		50
143							{
144							# a hash using codons as keys
145	0					0	my $optimalCodons = ref($val) eq 'HASH'?
146	0	0				0	{map { $_ => 1 } keys(%$val)} : $self->_parse_file($val,1);
147	0					0	$self->{'_optimal_codons'} = $optimalCodons;
148							}elsif($tag =~ /^cai/o) # CAI values
149							{
150							# a hash like codon => CAI_value
151	1	50				30	my $caiValues = ref($val) eq 'HASH'?
152							$val : $self->_parse_file($val,2);
153	1					11	$self->{'_cai_values'} = $caiValues;
154							}elsif($tag =~ /^tai/o) # tAI values
155							{
156							# a hash like codon => tAI_value
157	1	50				11	my $taiValues = ref($val) eq 'HASH'?
158							$val : $self->_parse_file($val,2);
159	1					14	$self->{'_tai_values'} = $taiValues;
160							}elsif($tag =~ /^base/o) # background base composition
161							{
162	0	0	0			0	if(ref($val) eq 'ARRAY' or $val =~ /^seq/)
163							{
164	0					0	$self->{'_base_comp'} = $val;
165							}else
166							{
167	0					0	$self->throw("Unknown value '$val' for parameter",
168							"-base_background");
169							}
170							}else
171							{
172							# Unknown parameter '$tag', ignored
173							}
174							}
175
176	1					25	$self->no_atg(1); # exclude ATG in tAI calculation
177							# check the input values
178							# 1. make sure all the sense codons have CAI or tAI values if
179							# provided
180
181	1					12	return $self;
182							}
183
184							=head1 sequence input
185
186							all the following methods accept one of the following formats as
187							sequence input
188
189							=over
190
191							=item 1
192
193							string of nucleotide sequence with length of 3N,
194
195							=item 2
196
197							sequence object which has a method I to get the sequence string,
198
199							=item 3
200
201							a sequence file in fasta format
202
203							=item 4
204
205							reference to a codon count hash, like
206							$codons = {
207							AGC => 50,
208							GTC => 124,
209							... ...
210							}.
211
212							=back
213
214							=head2 cai
215
216							Title : cai
217							Usage : $caiValue = $self->cai($seq);
218							Function: calculate the CAI value for the sequence
219							Returns : a number, or undef if failed
220							Args : see L
221							Note: codons without synonymous competitors are excluded in
222							calculation.
223
224							=cut
225
226							sub cai
227							{
228	13			13	1	104	my ($self, $seq) = @_;
229	13					75	$self->_xai($seq, 'CAI');
230							}
231
232							# the real calculator of tAI or CAI as both have the same formula
233							sub _xai
234							{
235	26			26		63	my ($self, $seq, $type) = @_;
236
237	26					108	my $name;
238							my $xaiHash;
239	26	100				251	if($type =~ /cai/i)
		50
240							{
241	13					31	$name = 'CAI';
242	13					42	$xaiHash = $self->{"_cai_values"};
243							}elsif($type =~ /tai/i)
244							{
245	13					29	$name = 'tAI';
246	13					44	$xaiHash = $self->{"_tai_values"};
247							}else
248							{
249	0					0	$self->throw("Unknown adaptation index type '$type'");
250							}
251	26	50				95	unless($xaiHash)
252							{
253	0					0	$self->warn("$name values for codons were not provided for",
254							"this analyzer, so can not calculate $name for sequences");
255	0					0	return undef;
256							}
257
258	26	50				163	my $codonList = $self->get_codon_list($seq) or return;
259
260	26					84	my $xai = 0;
261	26					49	my $seqLen = 0; # this excludes some unsuitable codons
262							# get non-degenerative codons which are excluded in CAI
263	26					315	my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
	52					317
264	26					140	my @senseCodons = $self->codon_table->all_sense_codons;
265	26					179	foreach my $codon (@senseCodons)
266							{
267	1586	100				4002	next unless($codonList->{$codon}); # no observation of this codon
268							# excluding non-degenerate codons for CAI calculation
269	1560	100	100			4702	next if($nonDegCodons{$codon} and $type =~ /cai/i);
270	1534	100				3308	unless(exists $xaiHash->{$codon})
271							{
272	13	0	33			258	$self->warn("Codon '$codon' is ignored")
		0
273							if($self->debug and ($self->no_atg? ($codon ne 'ATG') : 1));
274	13					58	next;
275							}
276	1521					1840	my $cnt = $codonList->{$codon};
277							# to overcome real number overflow, use log
278	1521					5596	$xai += $cnt*log($xaiHash->{$codon});
279	1521					3017	$seqLen += $cnt;
280							}
281
282	26	50				153	return undef unless($xai); # no codons with CAI/tAI
283
284	26					163	$xai = exp($xai/$seqLen);
285	26					770	return $xai;
286							}
287
288							=head2 fop
289
290							Title : fop
291							Usage : $fopValue = $self->fop($seq[,$withNonDegenerate]);
292							Function: calculate the fraction of optimal codons in the sequence
293							Returns : a number, or undef if failed
294							Args : for sequence see L.
295							if optional argument '$withNonDegenerate' is true, then
296							non-degenerate codons (those do not have synonymous partners) are
297							included in calculation. Default is excluding these codons.
298
299							=cut
300
301							sub fop
302							{
303	0			0	1	0	my ($self, $seq, $withNonDeg) = @_;
304
305	0	0				0	my $optimalCodons = $self->{'_optimal_codons'} or
306							$self->throw("No optimal codons associated with $self");
307
308	0	0				0	my $codonList = $self->get_codon_list($seq) or return;
309							# get non-degenerate codons
310	0					0	my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
	0					0
311
312
313	0					0	my $optCnt = 0; # optimal codons
314	0					0	my $total = 0;
315	0					0	while(my ($codon, $cnt) = each %$codonList)
316							{
317							# excluding non-degenerate codons if necessary
318	0	0	0			0	next if(!$withNonDeg and $nonDegCodons{$codon});
319	0	0				0	$optCnt += $cnt if($optimalCodons->{$codon});
320	0					0	$total += $cnt;
321							}
322
323	0		0			0	return $optCnt/($total \|\| 1);
324							}
325
326							=head2 tai
327
328							Title : tai
329							Usage : $taiValue = $self->tai($seq);
330							Function: calculate the tAI value for the sequence
331							Returns : a number, or undef if failed
332							Args : for sequence see L.
333
334							Note: codons which do not have tAI values are ignored from input
335							sequence
336
337							=cut
338
339							sub tai
340							{
341	13			13	1	118	my ($self, $seq) = @_;
342	13					71	$self->_xai($seq, 'tAI');
343							}
344
345							# an alias
346							sub tAI
347							{
348	0			0	0	0	my ($self, $seq) = @_;
349	0					0	$self->_xai($seq, 'tAI');
350							}
351
352							=head2 enc
353
354							Title : enc
355							Usage : $encValue = $self->enc($seq,[$minTotal]);
356							Function: calculate ENC for the sequence using the original method
357							I
358							Returns : a number, or undef if failed
359							Args : for sequence see L.
360							Optional argument I specifies minimal count
361							for an amino acid; if observed count is smaller than this count, this
362							amino acid's F will not be calculated but inferred. Deafult is 5.
363
364							Note: when the F of a redundancy group is unavailable due to lack of
365							sufficient data, it will be estimated from other groups following Wright's
366							method, that is, F3=(F2+F4)/2, and for others, F=1/r where r is the
367							degeneracy degree of that group.
368
369							=cut
370							sub enc
371							{
372	13			13	1	107	my ($self, $seq, $minTotal) = @_;
373	13					67	$self->_enc_factory($seq, $minTotal, 'mean');
374							}
375
376							=head2 enc_r
377
378							Title : enc_r
379							Usage : $encValue = $self->enc_r($seq,[$minTotal]);
380							Function: similar to the method L, except that missing F values
381							are estimated in a different way.
382							Returns : a number, or undef if failed
383							Args : for sequence see L.
384							Optional argument I specifies minimal count
385							for an amino acid; if observed count is smaller than this count, this
386							amino acid's F will not be calculated but inferred. Deafult is 5.
387
388							Note: for missing Fx of degeneracy class 'x', we first estimated the
389							ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
390							classes with known F values. Then Fx is obtained by solving the simple
391							equation.
392
393							=cut
394
395							sub enc_r
396							{
397	13			13	1	169	my ($self, $seq, $minTotal) = @_;
398	13					52	$self->_enc_factory($seq, $minTotal, 'equal_ratio');
399							}
400
401							=head2 encp
402
403							Title : encp
404							Usage : $encpValue = $self->encp($seq,[$minTotal,[$A,$T,$C,$G]]);
405							Function: calculate ENC for the sequence using the updated method
406							by Novembre I<2002, MBE>, which corrects the background nucleotide
407							composition.
408							Returns : a number, or undef if failed
409							Args : for sequence see L.
410
411							Optional argument I specifies minimal count
412							for an amino acid; if observed count is smaller than this count, this
413							amino acid's F will not be calculated but inferred. Deafult is 5.
414
415							another optional argument gives the background nucleotide composition
416							in the order of A,T,C,G in an array, if not provided, it will use the
417							default one provided when calling the method L. If stil
418							unavailable, error occurs.
419
420							=cut
421
422							sub encp
423							{
424	0			0	1	0	my ($self, $seq, $minTotal, $baseComp) = @_;
425	0					0	$self->_enc_factory($seq, $minTotal, 'mean', 1, $baseComp);
426							}
427
428							=head2 encp_r
429
430							Title : encp_r
431							Usage : $encpValue =
432							$self->encp_r($seq,[$minTotal,[$A,$T,$C,$G]]);
433							Function: similar to the method L, except that missing F values
434							are estimated using a different way.
435							Returns : a number, or undef if failed
436							Args : for sequence see L.
437
438							Optional argument I specifies minimal count
439							for an amino acid; if observed count is smaller than this count, this
440							amino acid's F will not be calculated but inferred. Deafult is 5.
441
442							another optional argument gives the background nucleotide composition
443							in the order of A,T,C,G in an array, if not provided, it will use the
444							default one provided when calling the method L. If stil
445							unavailable, error occurs.
446
447							Note: for missing Fx of degeneracy class 'x', we first estimated the
448							ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
449							classes with known F values. Then Fx is obtained by solving the simple
450							equation.
451
452							=cut
453
454							sub encp_r
455							{
456	0			0	1	0	my ($self, $seq, $minTotal, $baseComp) = @_;
457	0					0	$self->_enc_factory($seq, $minTotal, 'equal_ratio', 1, $baseComp);
458							}
459
460							# real function calculate different versions of ENC
461							# parameters explanation
462							# seq: sequence string, sequence object, sequence file, or hash
463							# reference to codon list
464							# correctBaseComp: if true, correct background base composition using
465							# Novembre's method
466							# F_EstimateMethod: how to estimate average F for a certain redundancy
467							# class if that class does not have observed data so can't be
468							# calculated; 'mean' is for Wright's method, and 'equal_ratio' for
469							# Zhenguo's method. The latter assumes a similar (1/F[r])/r for each
470							# redundancy class with redundancy degree 'r'
471							# baseComposition: optional, a reference to an array containing
472							# background nucleotide composition. If provided, it overides the
473							# values set when method L was called.
474							sub _enc_factory
475							{
476	26			26		62	my ($self, $seq, $minTotal, $F_EstimateMethod, $correctBaseComp, $baseComposition) = @_;
477
478	26	50				112	$minTotal = 5 unless(defined $minTotal); # the minumum count of residule for a given amino
479							# acid for it to be included in F calculation
480
481							# a hash ref, codon => counts
482	26	50				142	my $codonList = $self->get_codon_list($seq) or return;
483	26	50	33			647	my $seqId = (ref($seq) and $seq->can('id'))? $seq->id : '';
484
485							# determine expected codon frequency if necessary
486	26					77	my $expectedCodonFreq;
487							# determine base compositions now
488	26	50				94	if($correctBaseComp)
489							{
490	0	0				0	if(!defined($baseComposition)) # not provided for this sequence
491							{
492	0					0	my $defaultBaseComp = $self->base_composition;
493	0	0				0	unless($defaultBaseComp)
494							{
495	0					0	$self->warn("No default base composition for seq",
496							" '$seqId', so no GC-corrected ENC");
497	0					0	return undef;
498							}
499	0	0				0	if($defaultBaseComp eq 'seq')
		0
500							{
501	0					0	$baseComposition =
502							$self->estimate_base_composition($codonList);
503							}elsif($defaultBaseComp eq 'seq3')
504							{
505	0					0	$baseComposition =
506							$self->estimate_base_composition($codonList,3);
507							}else
508							{
509	0					0	$baseComposition = $defaultBaseComp;
510							}
511							} # otherwise sequence-specific base-composition is provided
512							# here
513
514							# codon frequency may not be estimated due to invalid
515							# compositions
516							$expectedCodonFreq =
517	0	0				0	$self->expect_codon_freq($baseComposition)
518							if($baseComposition);
519	0	0				0	return undef unless($expectedCodonFreq);
520							}
521
522
523							# now let us calculate F for each redundancy class
524							# determined by codon table, containing all amino acid classes
525	26					200	my $AARedundancyClasses = $self->aa_degeneracy_classes; #
526	26					52	my %FavgByClass; # record the average F from each class
527	26					247	while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
528							{
529							# number of observed AA types in this class
530	130					168	my $numAAInClass = 0; # number of amino acid species in this class
531	130					157	my $Fsum = 0;
532	130					421	while(my ($AA, $codonArray) = each %$AAHash)
533							{
534	494	100				1095	if($redundancy == 1) # this class has only one codon
535							{
536	26					3570	$numAAInClass = scalar(keys %$AAHash);
537	26					51	$Fsum = $numAAInClass; # each AA contribute 1
538	26					78	last;
539							}
540							# total count of observed residules for this AA
541	468					541	my $AAcnt = 0;
542	468					847	foreach (@$codonArray)
543							{
544							# check the codon exists in this seq
545	1534	100				3166	next unless(exists $codonList->{$_});
546	1508					2844	$AAcnt += $codonList->{$_};
547							}
548
549							# skip if occurence of this amino acid is less than the
550							# minimal threshold
551	468	100	66			3486	next if($AAcnt < $minTotal or $AAcnt < 2);
552
553							# now calculate F for this AA species
554	464	50				741	if($correctBaseComp) # correct base composition
555							{
556	0					0	my $chisq = 0;
557							# get the freq of codons of this amino acids
558	0					0	my $totalFreq = 0;
559	0					0	foreach (@$codonArray)
560							{
561	0					0	$totalFreq += $expectedCodonFreq->{$_};
562							}
563	0					0	foreach (@$codonArray)
564							{
565							# set unobserved codons to 0
566	0		0			0	my $codonCnt = $codonList->{$_} \|\| 0;
567	0					0	my $expectedFreq =
568							$expectedCodonFreq->{$_}/$totalFreq;
569	0					0	$chisq += ($codonCnt/$AAcnt -
570							$expectedFreq)**2/$expectedFreq;
571							}
572	0					0	$chisq *= $AAcnt; # don't forget multiply this
573	0					0	$Fsum += ($chisq + $AAcnt -
574							$redundancy)/($redundancy*($AAcnt-1));
575							}else # no correction, use old Wright method
576							{
577	464					721	my $pSquareSum = 0;
578	464					951	foreach (@$codonArray)
579							{
580	1526					2238	my $codonCnt = $codonList->{$_};
581	1526	100				3230	next unless($codonCnt);
582	1504					3302	$pSquareSum += ($codonCnt/$AAcnt)**2;
583							}
584	464					1234	$Fsum += ($AAcnt*$pSquareSum -1)/($AAcnt-1);
585							}
586							# increase the number of AA species in this class
587	464					1958	$numAAInClass++;
588							}
589							# check whether all AA species are ignored or not observed
590	130	50				387	if($numAAInClass > 0)
591							{
592							# note, in some special cases, Fsum == 0 even though
593							# $numAAInClass >0, for example for a 6-fold amino acid,
594							# if each of its codon is observed only once, it would
595							# result in Faa = 0. so we need add restriction on this
596	130	50				877	$FavgByClass{$redundancy} = $Fsum/$numAAInClass if($Fsum >
597							0);
598							} # otherwise no data
599							}
600
601							# estimate missing redundancy classes due to no observation of
602							# that class's AAs, and get the final Nc values
603	26					72	my $enc = 0;
604	26					141	while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
605							{
606							# the number of AA species in this class, determined by the
607							# codon table, not the input seq
608	130					169	my $AAcntInClass = scalar(keys %$AAHash);
609	130	50				278	if(exists $FavgByClass{$redundancy})
610							{
611	130	50				279	die "$redundancy, $AAcntInClass in seq '$seqId':$!"
612							unless($FavgByClass{$redundancy});
613	130					300	$enc += $AAcntInClass/$FavgByClass{$redundancy};
614	130					420	next;
615							}
616
617							# otherwise this class was not observed
618	0	0				0	my $equalRatio = $F_EstimateMethod eq 'mean'? 0 : 1;
619	0					0	my $estimatedFavg = _estimate_F(\%FavgByClass, $redundancy,
620							$equalRatio);
621	0	0				0	unless($estimatedFavg)
622							{
623	0					0	$self->warn("Cannot estimate average F for class with",
624							"redundancy=$redundancy in sequence $seqId, ",
625							"probably no known F values for any class");
626	0					0	return undef;
627							}
628	0					0	$enc += $AAcntInClass/$estimatedFavg;
629							}
630
631	26					680	return $enc;
632							}
633
634							# estimate F average
635							sub _estimate_F
636							{
637	0			0			my ($knownF,$redundancy,$equalRatio) = @_;
638
639	0	0					return 1 if($redundancy == 1);
640
641	0	0					if($equalRatio) # get the mean (1/Fr-1)/(r-1)
642							{
643	0						my $ratioSum;
644	0						my $cnt = 0; # number of known Fs
645	0						while(my ($r, $F) = each %$knownF)
646							{
647	0	0					next if $r < 2; # excluding class of redundancy==1
648	0						$ratioSum += (1/$F-1)/($r-1);
649	0						$cnt++;
650							}
651
652	0	0					if( $cnt > 0)
653							{
654	0						my $Fx = 1/($ratioSum/$cnt*($redundancy-1)+1);
655	0						return $Fx;
656							}else # no known F for any class with redundancy > 1
657							{
658	0						return undef;
659							}
660
661							}else # otherwise use Wright's method
662							{
663	0	0					if($redundancy == 3)
664							{
665	0		0				my $F2 = $knownF->{2} \|\| 1/2; # class 2
666	0		0				my $F4 = $knownF->{4} \|\| 1/4; # class 4
667	0						return ($F2 + $F4)/2;
668							}else
669							{
670	0						return 1/$redundancy; # assuming no bias
671							}
672							}
673
674							}
675
676							# get the default base compostion of this object
677							sub base_composition
678							{
679	0			0	0		my $self = shift;
680
681	0						return $self->{'_base_comp'};
682							}
683
684							=head2 estimate_base_composition
685
686							Title : estimate_base_composition
687							Usage : @baseComp = $self->estimate_base_composition($seq,[$pos])
688							Function: estimate base compositions in the sequence
689							Returns : an array of numbers in the order of A,T,C,G, or its
690							reference if in the scalar context
691							Args : a sequence string or a reference of hash containing codons
692							and their counts (eg., AGG => 30), and optionally an integer; the integer
693							specifies which codon position's nucleotide will be used instead of
694							all three codon positions.
695
696							=cut
697
698							sub estimate_base_composition
699							{
700	0			0	1		my ($self, $seq, $pos) = @_;
701
702	0						my %bases;
703							# check if input is a codon list
704							my $codonList;
705	0	0					if(ref($seq) eq 'HASH') # a codon list
706							{
707	0						$codonList = $seq;
708							}else # a sequence string or object
709							{
710	0						$seq = $self->_get_seq_str($seq);
711							}
712
713	0	0					if($pos)
714							{
715	0	0	0				$self->throw("Only 1, 2, or 3 are acceptable for pos,",
716							"'$pos' is not valid here") unless($pos > 0 and $pos < 4);
717	0	0					if($codonList) # input is a codon list
718							{
719	0						my $base;
720	0						while(my ($codon, $cnt) = each %$codonList)
721							{
722	0						$base = substr($codon, $pos-1,1);
723	0						$bases{$base} += $cnt;
724							}
725							}else # a sequence
726							{
727	0						my $seqLen = length($seq);
728	0						my $accuLen = $pos - 1;
729	0						my $period = 3; # a codon length
730	0						my $base;
731	0						while($accuLen < $seqLen)
732							{
733	0						$base = substr($seq,$accuLen,1);
734	0						$bases{$base}++;
735	0						$accuLen += $period;
736							}
737							}
738							}else # all nucleotides
739							{
740	0	0					if($codonList) # input is a codon list
741							{
742	0						while(my ($codon, $cnt) = each %$codonList)
743							{
744	0						map { $bases{$_} += $cnt } split('', $codon);
	0
745							}
746							}else
747							{
748	0						map { $bases{$_}++ } split('',$seq);
	0
749							}
750							}
751
752	0						my $total = 0;
753	0						my @comp;
754	0						foreach ($self->bases) # only consider A,T,C,G
755							{
756	0		0				$total += $bases{$_} \|\| 0;
757	0		0				push @comp, $bases{$_} \|\| 0;
758							}
759	0						@comp = map { $_/$total } @comp;
	0
760
761	0	0					return wantarray? @comp : \@comp;
762							}
763
764							=head2 gc_fraction
765
766							Title : gc_fraction
767							Usage : $frac = $self->gc_fraction($seq,[$pos])
768							Function: get fraction of GC content in the sequence
769							Returns : a floating number between 0 and 1.
770							Args : a sequence string or a reference of hash containing codons
771							and their counts (eg., AGG => 30), and optionally an integer; the integer
772							specifies which codon position's nucleotide will be used for
773							calculation (i.e., 1, 2, or 3), instead of all three positions.
774
775							=cut
776
777							sub gc_frac
778							{
779	0			0	0		my ($self, @args) = @_;
780	0						$self->gc_fraction(@args);
781							}
782
783							sub gc_fraction
784							{
785	0			0	1		my ($self, @args) = @_;
786
787	0						my @composition = $self->estimate_base_composition(@args);
788	0						my @bases = $self->bases;
789	0						my @indice = grep { $bases[$_] =~ /[GC]/ } 0..$#bases;
	0
790
791	0						my $frac = 0;
792	0						foreach (@indice)
793							{
794	0						$frac += $composition[$_];
795							}
796
797	0						return $frac;
798							}
799
800							=head2 expect_codon_freq
801
802							Title : expect_codon_freq
803							Usage : $codonFreq = $self->expect_codon_freq($base_composition)
804							Function: return the expected frequency of codons
805							Returns : reference to a hash in which codon is hash key, and
806							fraction is hash value
807							Args : reference to an array of base compositions in the order of
808							[A, T, C, G], represented as either counts or fractions
809
810							=cut
811
812							sub expect_codon_freq
813							{
814	0			0	1		my ($self, $baseComp) = @_;
815
816	0	0	0				unless($baseComp and ref($baseComp) eq 'ARRAY')
817							{
818	0						$self->warn("Invalid base composition '$baseComp'",
819							" for expect_codon_freq, which should be an array reference");
820	0						return undef;
821							}
822
823	0						my @bases = $self->bases;
824	0						my $compSum = 0; # used to normalize in case they are not summed to 1
825	0						my $zeroCnt = 0; # count of zero values
826	0						foreach (0..3)
827							{
828	0	0					$zeroCnt++ if($baseComp->[$_] == 0);
829	0						$compSum += $baseComp->[$_];
830							}
831
832							# set zero value a pseudo count, depending on the provided values
833							# are fractions or counts
834	0	0					my $pseudoCnt = $compSum > 2? 1 : 1/100;
835	0						$compSum += $pseudoCnt * $zeroCnt;
836	0		0				my %freq = map { $bases[$_] => ($baseComp->[$_] \|\| $pseudoCnt)/$compSum } 0..3;
	0
837	0						my %result;
838	0						foreach my $b1 (@bases)
839							{
840	0						foreach my $b2 (@bases)
841							{
842	0						foreach my $b3 (@bases)
843							{
844	0						my $codon = $b1.$b2.$b3;
845	0						$result{$codon} = $freq{$b1}$freq{$b2}$freq{$b3};
846							}
847							}
848							}
849
850	0						return \%result;
851							}
852
853							=head1 AUTHOR
854
855							Zhenguo Zhang, C<< >>
856
857							=head1 BUGS
858
859							Please report any bugs or feature requests to C, or through
860							the web interface at L. I will be notified, and then you'll
861							automatically be notified of progress on your bug as I make changes.
862
863
864							=head1 SUPPORT
865
866							You can find documentation for this module with the perldoc command.
867
868							perldoc Bio::CUA::CUB::Calculator
869
870
871							You can also look for information at:
872
873							=over 4
874
875							=item * RT: CPAN's request tracker (report bugs here)
876
877							L
878
879							=item * AnnoCPAN: Annotated CPAN documentation
880
881							L
882
883							=item * CPAN Ratings
884
885							L
886
887							=item * Search CPAN
888
889							L
890
891							=back
892
893
894							=head1 ACKNOWLEDGEMENTS
895
896
897							=head1 LICENSE AND COPYRIGHT
898
899							Copyright 2015 Zhenguo Zhang.
900
901							This program is free software: you can redistribute it and/or modify
902							it under the terms of the GNU General Public License as published by
903							the Free Software Foundation, either version 3 of the License, or
904							(at your option) any later version.
905
906							This program is distributed in the hope that it will be useful,
907							but WITHOUT ANY WARRANTY; without even the implied warranty of
908							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
909							GNU General Public License for more details.
910
911							You should have received a copy of the GNU General Public License
912							along with this program. If not, see L.
913
914
915							=cut
916
917							1; # End of Bio::CUA::CUB::Calculator