File Coverage

blib/lib/Bio/CUA/CUB/Calculator.pm

Criterion	Covered	Total	%
statement	98	235	41.7
branch	36	112	32.1
condition	6	36	16.6
subroutine	12	22	54.5
pod	11	14	78.5
total	163	419	38.9

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::CUA::CUB::Calculator;
2
3							=pod
4
5							=head1 NAME
6
7							Bio::CUA::CUB::Calculator -- A module to calculate codon usage bias
8							(CUB) indice for protein-coding sequences
9
10							=head1 SYNOPSIS
11
12							use Bio::CUA::CUB::Calculator;
13
14							my $calc = Bio::CUA::CUB::Calculator->new(
15							-codon_table => 1,
16							-tAI_values => 'tai.out' # from Bio::CUA::CUB::Builder
17							);
18
19							# calculate tAI for each sequence
20							my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa");
21							or
22							my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa", -format => 'fasta');
23
24							while(my $seq = $io->next_seq)
25							{
26							my $tai = $calc->tai($seq);
27							printf("%10s: %.7f\n", $seq->id, $tai);
28							}
29
30							=head1 DESCRIPTION
31
32							Codon usage bias (CUB) can be represented at two levels, codon and
33							sequence. The latter is often computed as the geometric means of the
34							sequence's codons. This module caculates CUB metrics at sequence
35							level.
36
37							Supported CUB metrics include CAI (codon adaptation index), tAI (tRNA
38							adaptation index), Fop (Frequency of optimal codons), ENC (Effective
39							Number of Codons) and their variants. See the methods below for
40							details.
41
42							=cut
43
44	2			2		4194	use 5.006;
	2					8
	2					97
45	2			2		15	use strict;
	2					4
	2					86
46	2			2		14	use warnings;
	2					3
	2					88
47	2			2		14	use parent qw/Bio::CUA::CUB/;
	2					3
	2					22
48	2			2		111	use Bio::CUA::CodonTable;
	2					4
	2					5698
49
50							=head1 METHODS
51
52							=head2 new
53
54							Title : new
55							Usage : my $calc=Bio::CUA::CUB::Calculator->new(@args);
56							Function: initialize the calculator
57							Returns : an object of this class
58							Args : a hash with following acceptable keys:
59
60							B:
61
62							=over
63
64							=item C<-codon_table>
65
66							the genetic code table applied for following sequence analyses. It
67							can be specified by an integer (genetic code table id), an object of
68							L, or a map-file. See the method
69							L for details.
70
71							=back
72
73							B
74
75							=over
76
77							=item C<-optimal_codons>
78
79							a file contains all the optimal codons, one codon per line. Or a
80							hashref with keys being the optimal codons
81
82							=back
83
84							B
85
86							=over
87
88							=item C<-CAI_values>
89
90							a file containing CAI values for each codon, excluding 3
91							stop codons, so 61 lines with each line containing a codon and its
92							value separated by space or tab.
93							or
94							a hashref with each key being a codon and each value being CAI index
95							for the codon.
96
97							=back
98
99							B
100
101							=over
102
103							=item C<-tAI_values>
104
105							similar to C<-CAI_values>, a file or a hash containing tAI value
106							for each codon.
107
108							=back
109
110							B
111
112							=over
113
114							=item C<-base_background>
115
116							optional.
117							an arrayref containing base frequency of 4 bases (in the order
118							A,T,C, and G) derived from background data such as introns.
119							Or one of the following values: 'seq', 'seq3', which will lead to
120							estimating base frequencies from each analyzed sequence in whol or
121							its 3rd codon position, respectively.
122
123							It can also be specified for each analyzed sequence with the methods
124							L and L
125
126							=back
127
128							=cut
129
130							sub new
131							{
132	1			1	1	2873	my ($caller, @args) = @_;
133
134							# option -codon_table is processed in this parent class
135	1					12	my $self = $caller->SUPER::new(@args);
136
137							# process all the parameters
138	1					4	my $hashRef = $self->_array_to_hash(\@args);
139	1					6	while(my ($tag, $val) = each %$hashRef)
140							{
141							# tag 'codon_table' is now processed by parent package
142	3	50				29	if($tag =~ /^optimal/o) # optimal codons
		100
		100
		50
143							{
144							# a hash using codons as keys
145	0					0	my $optimalCodons = ref($val) eq 'HASH'?
146	0	0				0	{map { $_ => 1 } keys(%$val)} : $self->_parse_file($val,1);
147	0					0	$self->{'_optimal_codons'} = $optimalCodons;
148							}elsif($tag =~ /^cai/o) # CAI values
149							{
150							# a hash like codon => CAI_value
151	1	50				8	my $caiValues = ref($val) eq 'HASH'?
152							$val : $self->_parse_file($val,2);
153	1					10	$self->{'_cai_values'} = $caiValues;
154							}elsif($tag =~ /^tai/o) # tAI values
155							{
156							# a hash like codon => tAI_value
157	1	50				11	my $taiValues = ref($val) eq 'HASH'?
158							$val : $self->_parse_file($val,2);
159	1					8	$self->{'_tai_values'} = $taiValues;
160							}elsif($tag =~ /^base/o) # background base composition
161							{
162	0	0	0			0	if(ref($val) eq 'ARRAY' or $val =~ /^seq/)
163							{
164	0					0	$self->{'_base_comp'} = $val;
165							}else
166							{
167	0					0	$self->throw("Unknown value '$val' for parameter",
168							"-base_composition");
169							}
170							}else
171							{
172							# Unknown parameter '$tag', ignored
173							}
174							}
175
176	1					18	$self->no_atg(1); # exclude ATG in tAI calculation
177							# check the input values
178							# 1. make sure all the sense codons have CAI or tAI values if
179							# provided
180
181	1					11	return $self;
182							}
183
184							=head1 sequence input
185
186							all the following methods accept one of the following formats as
187							sequence input
188
189							=over
190
191							=item 1
192
193							string of nucleotide sequence with length of 3N,
194
195							=item 2
196
197							sequence object which has a method I to get the sequence string,
198
199							=item 3
200
201							a sequence file in fasta format
202
203							=item 4
204
205							reference to a codon count hash, like
206							$codons = {
207							AGC => 50,
208							GTC => 124,
209							... ...
210							}.
211
212							=back
213
214							=head2 cai
215
216							Title : cai
217							Usage : $caiValue = $self->cai($seq);
218							Function: calculate the CAI value for the sequence
219							Returns : a number, or undef if failed
220							Args : see L
221							Note: codons without synonymous competitors are excluded in
222							calculation.
223
224							=cut
225
226							sub cai
227							{
228	13			13	1	98	my ($self, $seq) = @_;
229	13					56	$self->_xai($seq, 'CAI');
230							}
231
232							# the real calculator of tAI or CAI as both have the same formula
233							sub _xai
234							{
235	26			26		60	my ($self, $seq, $type) = @_;
236
237	26					52	my $name;
238							my $xaiHash;
239	26	100				198	if($type =~ /cai/i)
		50
240							{
241	13					29	$name = 'CAI';
242	13					35	$xaiHash = $self->{"_cai_values"};
243							}elsif($type =~ /tai/i)
244							{
245	13					24	$name = 'tAI';
246	13					39	$xaiHash = $self->{"_tai_values"};
247							}else
248							{
249	0					0	$self->throw("Unknown adaptation index type '$type'");
250							}
251	26	50				79	unless($xaiHash)
252							{
253	0					0	$self->warn("$name values for codons were not provided for",
254							"this analyzer, so can not calculate $name for sequences");
255	0					0	return undef;
256							}
257
258	26	50				132	my $codonList = $self->get_codon_list($seq) or return;
259
260	26					61	my $xai = 0;
261	26					47	my $seqLen = 0; # this excludes some unsuitable codons
262							# get non-degenerative codons which are excluded in CAI
263	26					193	my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
	52					184
264	26					112	my @senseCodons = $self->codon_table->all_sense_codons;
265	26					144	foreach my $codon (@senseCodons)
266							{
267	1586	100				3138	next unless($codonList->{$codon}); # no observation of this codon
268							# excluding non-degenerate codons for CAI calculation
269	1560	100	100			3521	next if($nonDegCodons{$codon} and $type =~ /cai/i);
270	1534	100				2853	unless(exists $xaiHash->{$codon})
271							{
272	13	0	33			91	$self->warn("Codon '$codon' is ignored")
		0
273							if($self->debug and ($self->no_atg? ($codon ne 'ATG') : 1));
274	13					39	next;
275							}
276	1521					1711	my $cnt = $codonList->{$codon};
277							# to overcome real number overflow, use log
278	1521					4536	$xai += $cnt*log($xaiHash->{$codon});
279	1521					2615	$seqLen += $cnt;
280							}
281
282	26	50				104	return undef unless($xai); # no codons with CAI/tAI
283
284	26					114	$xai = exp($xai/$seqLen);
285	26					578	return $xai;
286							}
287
288							=head2 fop
289
290							Title : fop
291							Usage : $fopValue = $self->fop($seq[,$withNonDegenerate]);
292							Function: calculate the fraction of optimal codons in the sequence
293							Returns : a number, or undef if failed
294							Args : for sequence see L.
295							if optional argument '$withNonDegenerate' is true, then
296							non-degenerate codons (those do not have synonymous partners) are
297							included in calculation. Default is excluding these codons.
298
299							=cut
300
301							sub fop
302							{
303	0			0	1	0	my ($self, $seq, $withNonDeg) = @_;
304
305	0	0				0	my $optimalCodons = $self->{'_optimal_codons'} or
306							$self->throw("No optimal codons associated with $self");
307
308	0	0				0	my $codonList = $self->get_codon_list($seq) or return;
309							# get non-degenerate codons
310	0					0	my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
	0					0
311
312
313	0					0	my $optCnt = 0; # optimal codons
314	0					0	my $total = 0;
315	0					0	while(my ($codon, $cnt) = each %$codonList)
316							{
317							# excluding non-degenerate codons if necessary
318	0	0	0			0	next if(!$withNonDeg and $nonDegCodons{$codon});
319	0	0				0	$optCnt += $cnt if($optimalCodons->{$codon});
320	0					0	$total += $cnt;
321							}
322
323	0		0			0	return $optCnt/($total \|\| 1);
324							}
325
326							=head2 tai
327
328							Title : tai
329							Usage : $taiValue = $self->tai($seq);
330							Function: calculate the tAI value for the sequence
331							Returns : a number, or undef if failed
332							Args : for sequence see L.
333
334							Note: codons which do not have tAI values are ignored from input
335							sequence
336
337							=cut
338
339							sub tai
340							{
341	13			13	1	91	my ($self, $seq) = @_;
342	13					55	$self->_xai($seq, 'tAI');
343							}
344
345							# an alias
346							sub tAI
347							{
348	0			0	0	0	my ($self, $seq) = @_;
349	0					0	$self->_xai($seq, 'tAI');
350							}
351
352							=head2 enc
353
354							Title : enc
355							Usage : $encValue = $self->enc($seq,[$minTotal]);
356							Function: calculate ENC for the sequence using the original method
357							I
358							Returns : a number, or undef if failed
359							Args : for sequence see L.
360							Optional argument I specifies minimal count
361							for an amino acid; if observed count is smaller than this count, this
362							amino acid's F will not be calculated but inferred. Deafult is 5.
363
364							Note: when the F of a redundancy group is unavailable due to lack of
365							sufficient data, it will be estimated from other groups following Wright's
366							method, that is, F3=(F2+F4)/2, and for others, F=1/r where r is the
367							degeneracy degree of that group.
368
369							=cut
370							sub enc
371							{
372	13			13	1	84	my ($self, $seq, $minTotal) = @_;
373	13					57	$self->_enc_factory($seq, $minTotal, 'mean');
374							}
375
376							=head2 enc_r
377
378							Title : enc_r
379							Usage : $encValue = $self->enc_r($seq,[$minTotal]);
380							Function: similar to the method L, except that missing F values
381							are estimated in a different way.
382							Returns : a number, or undef if failed
383							Args : for sequence see L.
384							Optional argument I specifies minimal count
385							for an amino acid; if observed count is smaller than this count, this
386							amino acid's F will not be calculated but inferred. Deafult is 5.
387
388							Note: for missing Fx of degeneracy class 'x', we first estimated the
389							ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
390							classes with known F values. Then Fx is obtained by solving the simple
391							equation.
392
393							=cut
394
395							sub enc_r
396							{
397	13			13	1	77	my ($self, $seq, $minTotal) = @_;
398	13					46	$self->_enc_factory($seq, $minTotal, 'equal_ratio');
399							}
400
401							=head2 encp
402
403							Title : encp
404							Usage : $encpValue = $self->encp($seq,[$minTotal,[$A,$T,$C,$G]]);
405							Function: calculate ENC for the sequence using the updated method
406							by Novembre I<2002, MBE>, which corrects the background nucleotide
407							composition.
408							Returns : a number, or undef if failed
409							Args : for sequence see L.
410
411							Optional argument I specifies minimal count
412							for an amino acid; if observed count is smaller than this count, this
413							amino acid's F will not be calculated but inferred. Deafult is 5.
414
415							another optional argument gives the background nucleotide composition
416							in the order of A,T,C,G in an array, if not provided, it will use the
417							default one provided when calling the method L. If stil
418							unavailable, error occurs.
419
420							=cut
421
422							sub encp
423							{
424	0			0	1	0	my ($self, $seq, $minTotal, $baseComp) = @_;
425	0					0	$self->_enc_factory($seq, $minTotal, 'mean', 1, $baseComp);
426							}
427
428							=head2 encp_r
429
430							Title : encp_r
431							Usage : $encpValue =
432							$self->encp_r($seq,[$minTotal,[$A,$T,$C,$G]]);
433							Function: similar to the method L, except that missing F values
434							are estimated using a different way.
435							Returns : a number, or undef if failed
436							Args : for sequence see L.
437
438							Optional argument I specifies minimal count
439							for an amino acid; if observed count is smaller than this count, this
440							amino acid's F will not be calculated but inferred. Deafult is 5.
441
442							another optional argument gives the background nucleotide composition
443							in the order of A,T,C,G in an array, if not provided, it will use the
444							default one provided when calling the method L. If stil
445							unavailable, error occurs.
446
447							Note: for missing Fx of degeneracy class 'x', we first estimated the
448							ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
449							classes with known F values. Then Fx is obtained by solving the simple
450							equation.
451
452							=cut
453
454							sub encp_r
455							{
456	0			0	1	0	my ($self, $seq, $minTotal, $baseComp) = @_;
457	0					0	$self->_enc_factory($seq, $minTotal, 'equal_ratio', 1, $baseComp);
458							}
459
460							# real function calculate different versions of ENC
461							# parameters explanation
462							# seq: sequence string, sequence object, sequence file, or hash
463							# reference to codon list
464							# correctBaseComp: if true, correct background base composition using
465							# Novembre's method
466							# F_EstimateMethod: how to estimate average F for a certain redundancy
467							# class if that class does not have observed data so can't be
468							# calculated; 'mean' is for Wright's method, and 'equal_ratio' for
469							# Zhenbguo's method. The latter assumes a similar (1/F[r])/r for each
470							# redundancy class with redundancy degree 'r'
471							# baseComposition: optional, a reference to an array containing
472							# background nucleotide composition. If provided, it overides the
473							# values set when method L was called.
474							sub _enc_factory
475							{
476	26			26		61	my ($self, $seq, $minTotal, $F_EstimateMethod, $correctBaseComp, $baseComposition) = @_;
477
478	26	50				83	$minTotal = 5 unless(defined $minTotal); # the minumum count of residule for a given amino
479							# acid for it to be included in F calculation
480
481							# a hash ref, codon => counts
482	26	50				122	my $codonList = $self->get_codon_list($seq) or return;
483
484
485							# determine expected codon frequency if necessary
486	26					58	my $expectedCodonFreq;
487							# determine base compositions now
488	26	50				82	if($correctBaseComp)
489							{
490	0	0				0	if(!defined($baseComposition)) # not provided for this sequence
491							{
492	0	0				0	my $defaultBaseComp = $self->base_composition or
493							$self->throw("No base composition information, so can not"
494							,"compute background corrected version of ENC");
495	0	0				0	if($defaultBaseComp eq 'seq')
		0
496							{
497	0					0	$baseComposition =
498							$self->estimate_base_composition($codonList);
499							}elsif($defaultBaseComp eq 'seq3')
500							{
501	0					0	$baseComposition =
502							$self->estimate_base_composition($codonList,3);
503							}else
504							{
505	0					0	$baseComposition = $defaultBaseComp;
506							}
507							} # otherwise sequence-specific base-composition is provided
508							# here
509
510							# codon frequency may not be estimated due to invalid
511							# compositions
512							$expectedCodonFreq =
513	0					0	$self->expect_codon_freq($baseComposition);
514	0	0				0	return undef unless($expectedCodonFreq);
515							}
516
517
518							# now let us calculate F for each redundancy class
519							# determined by codon table, containing all amino acid classes
520	26					180	my $AARedundancyClasses = $self->aa_degeneracy_classes; #
521	26					52	my %FavgByClass; # record the average F from each class
522	26					217	while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
523							{
524							# number of observed AA types in this class
525	130					157	my $numAAInClass = 0; # number of amino acid species in this class
526	130					131	my $Fsum = 0;
527	130					430	while(my ($AA, $codonArray) = each %$AAHash)
528							{
529	494	100				1070	if($redundancy == 1) # this class has only one codon
530							{
531	26					49	$numAAInClass = scalar(keys %$AAHash);
532	26					38	$Fsum = $numAAInClass; # each AA contribute 1
533	26					59	last;
534							}
535							# total count of observed residules for this AA
536	468					498	my $AAcnt = 0;
537	468					792	foreach (@$codonArray)
538							{
539							# check the codon exists in this seq
540	1534	100				2948	next unless(exists $codonList->{$_});
541	1508					2597	$AAcnt += $codonList->{$_};
542							}
543
544							# skip if occurence of this amino acid is less than the
545							# minimal threshold
546	468	100	66			2131	next if($AAcnt < $minTotal or $AAcnt < 2);
547
548							# now calculate F for this AA species
549	464	50				769	if($correctBaseComp) # correct base composition
550							{
551	0					0	my $chisq = 0;
552							# get the freq of codons of this amino acids
553	0					0	my $totalFreq = 0;
554	0					0	foreach (@$codonArray)
555							{
556	0					0	$totalFreq += $expectedCodonFreq->{$_};
557							}
558	0					0	foreach (@$codonArray)
559							{
560							# set unobserved codons to 0
561	0		0			0	my $codonCnt = $codonList->{$_} \|\| 0;
562	0					0	my $expectedFreq =
563							$expectedCodonFreq->{$_}/$totalFreq;
564	0					0	$chisq += ($codonCnt/$AAcnt -
565							$expectedFreq)**2/$expectedFreq;
566							}
567	0					0	$chisq *= $AAcnt; # don't forget multiply this
568	0					0	$Fsum += ($chisq + $AAcnt -
569							$redundancy)/($redundancy*($AAcnt-1));
570							}else # no correction, use old Wright method
571							{
572	464					454	my $pSquareSum = 0;
573	464					722	foreach (@$codonArray)
574							{
575	1526					1858	my $codonCnt = $codonList->{$_};
576	1526	100				2678	next unless($codonCnt);
577	1504					3262	$pSquareSum += ($codonCnt/$AAcnt)**2;
578							}
579	464					1052	$Fsum += ($AAcnt*$pSquareSum -1)/($AAcnt-1);
580							}
581							# increase the number of AA species in this class
582	464					1742	$numAAInClass++;
583							}
584							# check whether all AA species are ignored or not observed
585	130	50				309	if($numAAInClass > 0)
586							{
587							# note, in some special cases, Fsum == 0 even though
588							# $numAAInClass >0, for example for a 6-fold amino acid,
589							# if each of its codon is observed only once, it would
590							# result in Faa = 0. so we need add restriction on this
591	130	50				778	$FavgByClass{$redundancy} = $Fsum/$numAAInClass if($Fsum >
592							0);
593							} # otherwise no data
594							}
595
596							# estimate missing redundancy classes due to no observation of
597							# that class's AAs, and get the final Nc values
598	26					41	my $enc = 0;
599	26					109	while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
600							{
601							# the number of AA species in this class, determined by the
602							# codon table, not the input seq
603	130					172	my $AAcntInClass = scalar(keys %$AAHash);
604	130	50				303	if(exists $FavgByClass{$redundancy})
605							{
606	130	50				277	die "$redundancy, $AAcntInClass:$!"
607							unless($FavgByClass{$redundancy});
608	130					215	$enc += $AAcntInClass/$FavgByClass{$redundancy};
609	130					391	next;
610							}
611
612							# otherwise this class was not observed
613	0	0				0	my $equalRatio = $F_EstimateMethod eq 'mean'? 0 : 1;
614	0					0	my $estimatedFavg = _estimate_F(\%FavgByClass, $redundancy,
615							$equalRatio);
616	0	0				0	unless($estimatedFavg)
617							{
618	0					0	$self->warn("Can not estimate average F for class with",
619							"redundancy=$redundancy, probably no known values for this class");
620	0					0	return undef;
621							}
622	0					0	$enc += $AAcntInClass/$estimatedFavg;
623							}
624
625	26					419	return $enc;
626							}
627
628							# estimate F average
629							sub _estimate_F
630							{
631	0			0			my ($knownF,$redundancy,$equalRatio) = @_;
632
633	0	0					return 1 if($redundancy == 1);
634
635	0	0					if($equalRatio) # get the mean (1/Fr-1)/(r-1)
636							{
637	0						my $ratioSum;
638	0						my $cnt = 0; # number of known Fs
639	0						while(my ($r, $F) = each %$knownF)
640							{
641	0	0					next if $r < 2; # excluding class of redundancy==1
642	0						$ratioSum += (1/$F-1)/($r-1);
643	0						$cnt++;
644							}
645
646	0	0					if( $cnt > 0)
647							{
648	0						my $Fx = 1/($ratioSum/$cnt*($redundancy-1)+1);
649	0						return $Fx;
650							}else # no known F for any class with redundancy > 1
651							{
652	0						return undef;
653							}
654
655							}else # otherwise use Wright's method
656							{
657	0	0					if($redundancy == 3)
658							{
659	0		0				my $F2 = $knownF->{2} \|\| 1/2; # class 2
660	0		0				my $F4 = $knownF->{4} \|\| 1/4; # class 4
661	0						return ($F2 + $F4)/2;
662							}else
663							{
664	0						return 1/$redundancy; # assuming no bias
665							}
666							}
667
668							}
669
670							# get the default base compostion of this object
671							sub base_composition
672							{
673	0			0	0		my $self = shift;
674
675	0						return $self->{'_base_comp'};
676							}
677
678							=head2 estimate_base_composition
679
680							Title : estimate_base_composition
681							Usage : @baseComp = $self->estimate_base_composition($seq,[$pos])
682							Function: estimate base compositions in the sequence
683							Returns : an array of numbers in the order of A,T,C,G, or its
684							reference if in the scalar context
685							Args : a sequence string or a reference of hash containing codons
686							and their counts (eg., AGG => 30), and optionally an integer; the integer
687							specifies which codon position's nucleotide will be used instead of
688							all three codon positions.
689
690							=cut
691
692							sub estimate_base_composition
693							{
694	0			0	1		my ($self, $seq, $pos) = @_;
695
696	0						my %bases;
697							# check if input is a codon list
698							my $codonList;
699	0	0					if(ref($seq) eq 'HASH') # a codon list
700							{
701	0						$codonList = $seq;
702							}else # a sequence string or object
703							{
704	0						$seq = $self->_get_seq_str($seq);
705							}
706
707	0	0					if($pos)
708							{
709	0	0	0				$self->throw("Only 1, 2, or 3 are acceptable for pos,",
710							"'$pos' is not valid here") unless($pos > 0 and $pos < 4);
711	0	0					if($codonList) # input is a codon list
712							{
713	0						my $base;
714	0						while(my ($codon, $cnt) = each %$codonList)
715							{
716	0						$base = substr($codon, $pos-1,1);
717	0						$bases{$base} += $cnt;
718							}
719							}else # a sequence
720							{
721	0						my $seqLen = length($seq);
722	0						my $accuLen = $pos - 1;
723	0						my $period = 3; # a codon length
724	0						my $base;
725	0						while($accuLen < $seqLen)
726							{
727	0						$base = substr($seq,$accuLen,1);
728	0						$bases{$base}++;
729	0						$accuLen += $period;
730							}
731							}
732							}else # all nucleotides
733							{
734	0	0					if($codonList) # input is a codon list
735							{
736	0						while(my ($codon, $cnt) = each %$codonList)
737							{
738	0						map { $bases{$_} += $cnt } split('', $codon);
	0
739							}
740							}else
741							{
742	0						map { $bases{$_}++ } split('',$seq);
	0
743							}
744							}
745
746	0						my $total = 0;
747	0						my @comp;
748	0						foreach ($self->bases) # only consider A,T,C,G
749							{
750	0		0				$total += $bases{$_} \|\| 0;
751	0		0				push @comp, $bases{$_} \|\| 0;
752							}
753	0						@comp = map { $_/$total } @comp;
	0
754
755	0	0					return wantarray? @comp : \@comp;
756							}
757
758							=head2 gc_fraction
759
760							Title : gc_fraction
761							Usage : $frac = $self->gc_fraction($seq,[$pos])
762							Function: get fraction of GC content in the sequence
763							Returns : a floating number between 0 and 1.
764							Args : a sequence string or a reference of hash containing codons
765							and their counts (eg., AGG => 30), and optionally an integer; the integer
766							specifies which codon position's nucleotide will be used for
767							calculation (i.e., 1, 2, or 3), instead of all three positions.
768
769							=cut
770
771							sub gc_frac
772							{
773	0			0	0		my ($self, @args) = @_;
774	0						$self->gc_fraction(@args);
775							}
776
777							sub gc_fraction
778							{
779	0			0	1		my ($self, @args) = @_;
780
781	0						my @composition = $self->estimate_base_composition(@args);
782	0						my @bases = $self->bases;
783	0						my @indice = grep { $bases[$_] =~ /[GC]/ } 0..$#bases;
	0
784
785	0						my $frac = 0;
786	0						foreach (@indice)
787							{
788	0						$frac += $composition[$_];
789							}
790
791	0						return $frac;
792							}
793
794							=head2 expect_codon_freq
795
796							Title : expect_codon_freq
797							Usage : $codonFreq = $self->expect_codon_freq($base_composition)
798							Function: return the expected frequency of codons
799							Returns : reference to a hash in which codon is hash key, and
800							fraction is hash value
801							Args : reference to an array of base compositions in the order of
802							[A, T, C, G], represented as either counts or fractions
803
804							=cut
805
806							sub expect_codon_freq
807							{
808	0			0	1		my ($self, $baseComp) = @_;
809
810	0	0					if(uc($baseComp) eq 'NA') # for cases the composition is
811							# unavailable
812							{
813	0						return undef;
814							}
815
816	0	0	0				$self->throw("Input base composition for expect_codon_freq is not an array reference")
817							unless($baseComp and ref($baseComp) eq 'ARRAY');
818
819	0						my @bases = $self->bases;
820	0						my $compSum = 0; # used to normalize in case they are not summed to 1
821	0						my $zeroCnt = 0; # count of zero values
822	0						foreach (0..3)
823							{
824	0	0					$zeroCnt++ if($baseComp->[$_] == 0);
825	0						$compSum += $baseComp->[$_];
826							}
827
828							# set zero value a pseudo count, depending on the provided values
829							# are fractions or counts
830	0	0					my $pseudoCnt = $compSum > 2? 1 : 1/100;
831	0						$compSum += $pseudoCnt * $zeroCnt;
832	0		0				my %freq = map { $bases[$_] => ($baseComp->[$_] \|\| $pseudoCnt)/$compSum } 0..3;
	0
833	0						my %result;
834	0						foreach my $b1 (@bases)
835							{
836	0						foreach my $b2 (@bases)
837							{
838	0						foreach my $b3 (@bases)
839							{
840	0						my $codon = $b1.$b2.$b3;
841	0						$result{$codon} = $freq{$b1}$freq{$b2}$freq{$b3};
842							}
843							}
844							}
845
846	0						return \%result;
847							}
848
849							=head1 AUTHOR
850
851							Zhenguo Zhang, C<< >>
852
853							=head1 BUGS
854
855							Please report any bugs or feature requests to C, or through
856							the web interface at L. I will be notified, and then you'll
857							automatically be notified of progress on your bug as I make changes.
858
859
860							=head1 SUPPORT
861
862							You can find documentation for this module with the perldoc command.
863
864							perldoc Bio::CUA::CUB::Calculator
865
866
867							You can also look for information at:
868
869							=over 4
870
871							=item * RT: CPAN's request tracker (report bugs here)
872
873							L
874
875							=item * AnnoCPAN: Annotated CPAN documentation
876
877							L
878
879							=item * CPAN Ratings
880
881							L
882
883							=item * Search CPAN
884
885							L
886
887							=back
888
889
890							=head1 ACKNOWLEDGEMENTS
891
892
893							=head1 LICENSE AND COPYRIGHT
894
895							Copyright 2015 Zhenguo Zhang.
896
897							This program is free software: you can redistribute it and/or modify
898							it under the terms of the GNU General Public License as published by
899							the Free Software Foundation, either version 3 of the License, or
900							(at your option) any later version.
901
902							This program is distributed in the hope that it will be useful,
903							but WITHOUT ANY WARRANTY; without even the implied warranty of
904							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
905							GNU General Public License for more details.
906
907							You should have received a copy of the GNU General Public License
908							along with this program. If not, see L.
909
910
911							=cut
912
913							1; # End of Bio::CUA::CUB::Calculator