File Coverage

Bio/Tools/SeqStats.pm

Criterion	Covered	Total	%
statement	215	243	88.4
branch	55	74	74.3
condition	3	5	60.0
subroutine	16	17	94.1
pod	5	5	100.0
total	294	344	85.4

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::Tools::SeqStats
3							#
4							# Please direct questions and support issues to <bioperl-l@bioperl.org>
5							#
6							# Cared for by
7							#
8							# Copyright Peter Schattner
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Tools::SeqStats - Object holding statistics for one
17							particular sequence
18
19							=head1 SYNOPSIS
20
21							# build a primary nucleic acid or protein sequence object somehow
22							# then build a statistics object from the sequence object
23
24							$seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
25							-alphabet => 'dna',
26							-id => 'test');
27							$seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
28
29							# obtain a hash of counts of each type of monomer
30							# (i.e. amino or nucleic acid)
31							print "\nMonomer counts using statistics object\n";
32							$seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
33							$hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
34							foreach $base (sort keys %$hash_ref) {
35							print "Number of bases of type ", $base, "= ",
36							$hash_ref->{$base},"\n";
37							}
38
39							# obtain the count directly without creating a new statistics object
40							print "\nMonomer counts without statistics object\n";
41							$hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
42							foreach $base (sort keys %$hash_ref) {
43							print "Number of bases of type ", $base, "= ",
44							$hash_ref->{$base},"\n";
45							}
46
47
48							# obtain hash of counts of each type of codon in a nucleic acid sequence
49							print "\nCodon counts using statistics object\n";
50							$hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
51							foreach $base (sort keys %$hash_ref) {
52							print "Number of codons of type ", $base, "= ",
53							$hash_ref->{$base},"\n";
54							}
55
56							# or
57							print "\nCodon counts without statistics object\n";
58							$hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
59							foreach $base (sort keys %$hash_ref) {
60							print "Number of codons of type ", $base, "= ",
61							$hash_ref->{$base},"\n";
62							}
63
64							# Obtain the molecular weight of a sequence. Since the sequence
65							# may contain ambiguous monomers, the molecular weight is returned
66							# as a (reference to) a two element array containing greatest lower
67							# bound (GLB) and least upper bound (LUB) of the molecular weight
68							$weight = $seq_stats->get_mol_wt();
69							print "\nMolecular weight (using statistics object) of sequence ",
70							$seqobj->id(), " is between ", $$weight[0], " and " ,
71							$$weight[1], "\n";
72
73							# or
74							$weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
75							print "\nMolecular weight (without statistics object) of sequence ",
76							$seqobj->id(), " is between ", $$weight[0], " and " ,
77							$$weight[1], "\n";
78
79							# Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
80							my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
81							-alphabet=>'protein');
82							my $gravy = Bio::Tools::SeqStats->hydropathicity($prot);
83							print "might be hydropathic" if $gravy > 1;
84
85							=head1 DESCRIPTION
86
87							Bio::Tools::SeqStats is a lightweight object for the calculation of
88							simple statistical and numerical properties of a sequence. By
89							"lightweight" I mean that only "primary" sequences are handled by the
90							object. The calling script needs to create the appropriate primary
91							sequence to be passed to SeqStats if statistics on a sequence feature
92							are required. Similarly if a codon count is desired for a
93							frame-shifted sequence and/or a negative strand sequence, the calling
94							script needs to create that sequence and pass it to the SeqStats
95							object.
96
97							Note that nucleotide sequences in bioperl do not strictly separate RNA
98							and DNA sequences. By convention, sequences from RNA molecules are
99							shown as if they were DNA. Objects are supposed to make the
100							distinction when needed. This class is one of the few where this
101							distinction needs to be made. Internally, it changes all Ts into Us
102							before weight and monomer count.
103
104							SeqStats can be called in two distinct manners. If only a single
105							computation is required on a given sequence object, the method can be
106							called easily using the SeqStats object directly:
107
108							$weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
109
110							Alternately, if several computations will be required on a given
111							sequence object, an "instance" statistics object can be constructed
112							and used for the method calls:
113
114							$seq_stats = Bio::Tools::SeqStats->new($seqobj);
115							$monomers = $seq_stats->count_monomers();
116							$codons = $seq_stats->count_codons();
117							$weight = $seq_stats->get_mol_wt();
118							$gravy = $seq_stats->hydropathicity();
119
120							As currently implemented the object can return the following values
121							from a sequence:
122
123							=over
124
125							=item *
126
127							The molecular weight of the sequence: get_mol_wt()
128
129							=item *
130
131							The number of each type of monomer present: count_monomers()
132
133							=item *
134
135							The number of each codon present in a nucleic acid sequence:
136							count_codons()
137
138							=item *
139
140							The mean hydropathicity ("gravy" score) of a protein:
141							hydropathicity()
142
143							=back
144
145							For DNA and RNA sequences single-stranded weights are returned. The
146							molecular weights are calculated for neutral, or not ionized,
147							nucleic acids. The returned weight is the sum of the
148							base-sugar-phosphate residues of the chain plus one weight of water to
149							account for the additional OH on the phosphate of the 5' residue
150							and the additional H on the sugar ring of the 3' residue. Note that
151							this leads to a difference of 18 in calculated molecular weights
152							compared to some other available programs (e.g. Informax VectorNTI).
153
154							Note that since sequences may contain ambiguous monomers (e.g. "M",
155							meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
156							returns a two-element array containing the greatest lower bound and
157							least upper bound of the molecule. For a sequence with no ambiguous
158							monomers, the two elements of the returned array will be equal. The
159							method count_codons() handles ambiguous bases by simply counting all
160							ambiguous codons together and issuing a warning to that effect.
161
162
163							=head1 DEVELOPERS NOTES
164
165							Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
166
167							Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
168							molecular weights to have the output match values in SWISS-PROT.
169
170							Torsten added hydropathicity calculation.
171
172							=head1 FEEDBACK
173
174							=head2 Mailing Lists
175
176							User feedback is an integral part of the evolution of this and other
177							Bioperl modules. Send your comments and suggestions preferably to one
178							of the Bioperl mailing lists. Your participation is much appreciated.
179
180							bioperl-l@bioperl.org - General discussion
181							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
182
183							=head2 Support
184
185							Please direct usage questions or support issues to the mailing list:
186
187							I<bioperl-l@bioperl.org>
188
189							rather than to the module maintainer directly. Many experienced and
190							responsive experts will be able to look at the problem and quickly
191							address it. Please include a thorough description of the problem
192							with code and data examples if at all possible.
193
194							=head2 Reporting Bugs
195
196							Report bugs to the Bioperl bug tracking system to help us keep track
197							of the bugs and their resolution. Bug reports can be submitted via the web:
198
199							https://github.com/bioperl/bioperl-live/issues
200
201							=head1 AUTHOR - Peter Schattner
202
203							Email schattner AT alum.mit.edu
204
205							=head1 CONTRIBUTOR - Torsten Seemann
206
207							Email torsten.seemann AT infotech.monash.edu.au
208
209							=head1 APPENDIX
210
211							The rest of the documentation details each of the object
212							methods. Internal methods are usually preceded with a _
213
214							=cut
215
216
217							package Bio::Tools::SeqStats;
218	12			12		1842	use strict;
	12					21
	12					392
219	12					909	use vars qw(%Alphabets %Alphabets_strict $amino_weights
220	12			12		53	$rna_weights $dna_weights %Weights $amino_hydropathicity);
	12					20
221	12			12		2079	use Bio::Seq;
	12					25
	12					387
222	12			12		72	use base qw(Bio::Root::Root);
	12					19
	12					5814
223
224							BEGIN {
225	12			12		148	%Alphabets = (
226							'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
227							'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
228							'protein' => [ qw(A R N D C Q E G H I L K M F U
229							P S T W X Y V B Z J O *) ], # sac: added B, Z
230							);
231
232							# SAC: new strict alphabet: doesn't allow any ambiguity characters.
233	12					70	%Alphabets_strict = (
234							'dna' => [ qw( A C G T ) ],
235							'rna' => [ qw( A C G U ) ],
236							'protein' => [ qw(A R N D C Q E G H I L K M F U
237							P S T W Y V O) ],
238							);
239
240
241							# IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
242							# Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
243
244							# Amino Acid alphabet
245
246							# ------------------------------------------
247							# Symbol Meaning
248							# ------------------------------------------
249
250	12					24	my $amino_A_wt = 89.09;
251	12					23	my $amino_C_wt = 121.15;
252	12					18	my $amino_D_wt = 133.1;
253	12					19	my $amino_E_wt = 147.13;
254	12					18	my $amino_F_wt = 165.19;
255	12					17	my $amino_G_wt = 75.07;
256	12					16	my $amino_H_wt = 155.16;
257	12					19	my $amino_I_wt = 131.17;
258	12					25	my $amino_K_wt = 146.19;
259	12					17	my $amino_L_wt = 131.17;
260	12					18	my $amino_M_wt = 149.21;
261	12					760	my $amino_N_wt = 132.12;
262	12					23	my $amino_O_wt = 255.31;
263	12					17	my $amino_P_wt = 115.13;
264	12					17	my $amino_Q_wt = 146.15;
265	12					14	my $amino_R_wt = 174.20;
266	12					18	my $amino_S_wt = 105.09;
267	12					15	my $amino_T_wt = 119.12;
268	12					20	my $amino_U_wt = 168.06;
269	12					22	my $amino_V_wt = 117.15;
270	12					19	my $amino_W_wt = 204.23;
271	12					20	my $amino_Y_wt = 181.19;
272
273
274	12					214	$amino_weights = {
275							'A' => [$amino_A_wt, $amino_A_wt], # Alanine
276							'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
277							'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
278							'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
279							'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
280							'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
281							'G' => [$amino_G_wt, $amino_G_wt], # Glycine
282							'H' => [$amino_H_wt, $amino_H_wt], # Histidine
283							'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
284							'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
285							'K' => [$amino_K_wt, $amino_K_wt], # Lysine
286							'L' => [$amino_L_wt, $amino_L_wt], # Leucine
287							'M' => [$amino_M_wt, $amino_M_wt], # Methionine
288							'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
289							'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
290							'P' => [$amino_P_wt, $amino_P_wt], # Proline
291							'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
292							'R' => [$amino_R_wt, $amino_R_wt], # Arginine
293							'S' => [$amino_S_wt, $amino_S_wt], # Serine
294							'T' => [$amino_T_wt, $amino_T_wt], # Threonine
295							'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
296							'V' => [$amino_V_wt, $amino_V_wt], # Valine
297							'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
298							'X' => [$amino_G_wt, $amino_W_wt], # Unknown
299							'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
300							'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
301							};
302
303							# Extended Dna / Rna alphabet
304	12			12		80	use vars ( qw($C $O $N $H $P $water) );
	12					27
	12					842
305	12			12		61	use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
	12					21
	12					603
306	12			12		55	use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
	12					19
	12					564
307	12					826	use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
308	12			12		55	$rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
	12					19
309	12			12		56	use vars ( qw($dna_weights $rna_weights %Weights));
	12					21
	12					4546
310
311	12					28	$C = 12.01;
312	12					22	$O = 16.00;
313	12					18	$N = 14.01;
314	12					17	$H = 1.01;
315	12					14	$P = 30.97;
316	12					19	$water = 18.015;
317
318	12					63	$adenine = 5 * $C + 5 * $N + 5 * $H;
319	12					38	$guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
320	12					39	$cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
321	12					39	$thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
322	12					28	$uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
323
324	12					25	$ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
325							# neutral (unionized) form
326	12					31	$deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
327
328							# the following are single strand molecular weights / base
329	12					30	$dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
330	12					20	$dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
331	12					19	$dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
332	12					21	$dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
333
334	12					18	$rna_A_wt = $adenine + $ribose_phosphate - $water;
335	12					18	$rna_C_wt = $cytosine + $ribose_phosphate - $water;
336	12					26	$rna_G_wt = $guanine + $ribose_phosphate - $water;
337	12					21	$rna_U_wt = $uracil + $ribose_phosphate - $water;
338
339	12					125	$dna_weights = {
340							'A' => [$dna_A_wt,$dna_A_wt], # Adenine
341							'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
342							'G' => [$dna_G_wt,$dna_G_wt], # Guanine
343							'T' => [$dna_T_wt,$dna_T_wt], # Thymine
344							'M' => [$dna_C_wt,$dna_A_wt], # A or C
345							'R' => [$dna_A_wt,$dna_G_wt], # A or G
346							'W' => [$dna_T_wt,$dna_A_wt], # A or T
347							'S' => [$dna_C_wt,$dna_G_wt], # C or G
348							'Y' => [$dna_C_wt,$dna_T_wt], # C or T
349							'K' => [$dna_T_wt,$dna_G_wt], # G or T
350							'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
351							'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
352							'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
353							'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
354							'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
355							'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
356							};
357
358	12					105	$rna_weights = {
359							'A' => [$rna_A_wt,$rna_A_wt], # Adenine
360							'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
361							'G' => [$rna_G_wt,$rna_G_wt], # Guanine
362							'U' => [$rna_U_wt,$rna_U_wt], # Uracil
363							'M' => [$rna_C_wt,$rna_A_wt], # A or C
364							'R' => [$rna_A_wt,$rna_G_wt], # A or G
365							'W' => [$rna_U_wt,$rna_A_wt], # A or U
366							'S' => [$rna_C_wt,$rna_G_wt], # C or G
367							'Y' => [$rna_C_wt,$rna_U_wt], # C or U
368							'K' => [$rna_U_wt,$rna_G_wt], # G or U
369							'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
370							'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
371							'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
372							'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
373							'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
374							'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
375							};
376
377	12					85	%Weights = (
378							'dna' => $dna_weights,
379							'rna' => $rna_weights,
380							'protein' => $amino_weights,
381							);
382
383							# Amino acid scale: Hydropathicity.
384							# Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
385							# http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
386
387	12					16753	$amino_hydropathicity = {
388							A => 1.800,
389							R => -4.500,
390							N => -3.500,
391							D => -3.500,
392							C => 2.500,
393							Q => -3.500,
394							E => -3.500,
395							G => -0.400,
396							H => -3.200,
397							I => 4.500,
398							L => 3.800,
399							K => -3.900,
400							M => 1.900,
401							F => 2.800,
402							P => -1.600,
403							S => -0.800,
404							T => -0.700,
405							W => -0.900,
406							Y => -1.300,
407							V => 4.200,
408							};
409
410							}
411
412							sub new {
413	5			5	1	409	my($class,@args) = @_;
414	5					14	my $self = $class->SUPER::new(@args);
415
416	5					16	my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
417	5	50				26	unless ($seqobj->isa("Bio::PrimarySeqI")) {
418	0					0	$self->throw("SeqStats works only on PrimarySeqI objects");
419							}
420	5	50	33			11	if ( !defined $seqobj->alphabet \|\|
421							!defined $Alphabets{$seqobj->alphabet}) {
422	0					0	$self->throw("Must have a valid alphabet defined for seq (".
423							join(",",keys %Alphabets));
424							}
425	5					7	$self->{'_seqref'} = $seqobj;
426							# check the letters in the sequence
427	5					8	$self->{'_is_strict'} = _is_alphabet_strict($seqobj);
428	5					20	return $self;
429							}
430
431							=head2 count_monomers
432
433							Title : count_monomers
434							Usage : $rcount = $seq_stats->count_monomers();
435							or $rcount = Bio::Tools::SeqStats->count_monomers($seqobj);
436							Function: Counts the number of each type of monomer (amino acid or
437							base) in the sequence.
438							Ts are counted as Us in RNA sequences.
439							Example :
440							Returns : Reference to a hash in which keys are letters of the
441							genetic alphabet used and values are number of occurrences
442							of the letter in the sequence.
443							Args : None or reference to sequence object
444							Throws : Throws an exception if type of sequence is unknown (ie amino
445							or nucleic) or if unknown letter in alphabet. Ambiguous
446							elements are allowed.
447
448							=cut
449
450							sub count_monomers{
451	18			18	1	260	my %count = ();
452	18					26	my $seqobj;
453							my $_is_strict;
454	18					49	my $element = '';
455	18					23	my $_is_instance = 1 ;
456	18					28	my $self = shift @_;
457	18					24	my $object_argument = shift @_;
458
459							# First we need to determine if the present object is an instance
460							# object or if the sequence object has been passed as an argument
461
462	18	100				31	if (defined $object_argument) {
463	13					17	$_is_instance = 0;
464							}
465
466							# If we are using an instance object...
467	18	100				38	if ($_is_instance) {
468	5	100				7	if ($self->{'_monomer_count'}) {
469	1					3	return $self->{'_monomer_count'}; # return count if previously calculated
470							}
471	4					5	$_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
472	4					5	$seqobj = $self->{'_seqref'};
473							} else {
474							# otherwise...
475	13					17	$seqobj = $object_argument;
476
477							# Following two lines lead to error in "throw" routine
478	13	50				40	$seqobj->isa("Bio::PrimarySeqI") \|\|
479							$self->throw("SeqStats works only on PrimarySeqI objects");
480							# is alphabet OK? Is it strict?
481	13					26	$_is_strict = _is_alphabet_strict($seqobj);
482							}
483
484							my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
485	17	100				68	$Alphabets{$seqobj->alphabet} ; # get array of allowed letters
486
487							# convert everything to upper case to be safe
488	17					41	my $seqstring = uc $seqobj->seq();
489
490							# Since T is used in RichSeq RNA sequences, do conversion locally
491	17	100				37	$seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
492
493							# For each letter, count the number of times it appears in
494							# the sequence
495							LETTER:
496	17					36	foreach $element (@$alphabet) {
497							# skip terminator symbol which may confuse regex
498	241	100				418	next LETTER if $element eq '*';
499	240					2217	$count{$element} = ( $seqstring =~ s/$element/$element/g);
500							}
501
502	17	100				43	if ($_is_instance) {
503	4					7	$self->{'_monomer_count'} = \%count; # Save in case called again later
504							}
505
506	17					44	return \%count;
507							}
508
509							=head2 get_mol_wt
510
511							Title : get_mol_wt
512							Usage : $wt = $seq_stats->get_mol_wt() or
513							$wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
514							Function: Calculate molecular weight of sequence
515							Ts are counted as Us in RNA sequences.
516							Example :
517
518							Returns : Reference to two element array containing lower and upper
519							bounds of molecule molecular weight. For DNA and RNA
520							sequences single-stranded weights are returned. If
521							sequence contains no ambiguous elements, both entries in
522							array are equal to molecular weight of molecule.
523							Args : None or reference to sequence object
524							Throws : Exception if type of sequence is unknown (ie not amino or
525							nucleic) or if unknown letter in alphabet. Ambiguous
526							elements are allowed.
527
528							=cut
529
530							sub get_mol_wt {
531	12			12	1	1962	my $seqobj;
532							my $_is_strict;
533	12					21	my $element = '';
534	12					19	my $_is_instance = 1 ;
535	12					24	my $self = shift @_;
536	12					15	my $object_argument = shift @_;
537	12					20	my ($weight_array, $rcount);
538
539	12	100				24	if (defined $object_argument) {
540	10					16	$_is_instance = 0;
541							}
542
543	12	100				26	if ($_is_instance) {
544	2	50				4	if ($weight_array = $self->{'_mol_wt'}) {
545							# return mol. weight if previously calculated
546	0					0	return $weight_array;
547							}
548	2					3	$seqobj = $self->{'_seqref'};
549	2					2	$rcount = $self->count_monomers();
550							} else {
551	10					16	$seqobj = $object_argument;
552	10	50				41	$seqobj->isa("Bio::PrimarySeqI") \|\|
553							$self->throw("Error: SeqStats works only on PrimarySeqI objects");
554	10					32	$_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK?
555	10					31	$rcount = $self->count_monomers($seqobj);
556							}
557
558							# We will also need to know what type of monomer we are dealing with
559	12					35	my $moltype = $seqobj->alphabet();
560
561							# In general, the molecular weight is bounded below by the sum of the
562							# weights of lower bounds of each alphabet symbol times the number of
563							# occurrences of the symbol in the sequence. A similar upper bound on
564							# the weight is also calculated.
565
566							# Note that for "strict" (i.e. unambiguous) sequences there is an
567							# inefficiency since the upper bound = the lower bound and there are
568							# two calculations. However, this decrease in performance will be
569							# minor and leads to significantly more readable code.
570
571	12					19	my $weight_lower_bound = 0;
572	12					16	my $weight_upper_bound = 0;
573	12					21	my $weight_table = $Weights{$moltype};
574	12					17	my $total_res;
575
576							# compute weight of all the residues
577	12					54	foreach $element (keys %$rcount) {
578	202					322	$weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
579	202					200	$weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
580
581							# this tracks only the residues used for counting MW
582	202					213	$total_res += $$rcount{$element};
583							}
584	12	100				44	if ($moltype =~ /protein/) {
585							# remove H2O during peptide bond formation.
586	7					17	$weight_lower_bound -= $water * ($total_res - 1);
587	7					19	$weight_upper_bound -= $water * ($total_res - 1);
588							} else {
589							# Correction because phosphate of 5' residue has additional OH and
590							# sugar ring of 3' residue has additional H
591	5					5	$weight_lower_bound += $water;
592	5					6	$weight_upper_bound += $water;
593							}
594
595	12					153	$weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
596	12					44	$weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
597
598	12					32	$weight_array = [$weight_lower_bound, $weight_upper_bound];
599
600	12	100				59	if ($_is_instance) {
601	2					3	$self->{'_mol_wt'} = $weight_array; # Save in case called again later
602							}
603	12					67	return $weight_array;
604							}
605
606
607							=head2 count_codons
608
609							Title : count_codons
610							Usage : $rcount = $seqstats->count_codons() or
611							$rcount = Bio::Tools::SeqStats->count_codons($seqobj)
612							Function: Counts the number of each type of codons for a dna or rna
613							sequence, starting at the 1st triple of the input sequence.
614							Example :
615							Returns : Reference to a hash in which keys are codons of the genetic
616							alphabet used and values are number of occurrences of the
617							codons in the sequence. All codons with "ambiguous" bases
618							are counted together.
619							Args : None or sequence object
620							Throws : an exception if type of sequence is unknown or protein.
621
622							=cut
623
624							sub count_codons {
625	3			3	1	2173	my $rcount = {};
626	3					9	my $codon ;
627							my $seqobj;
628	3					0	my $_is_strict;
629	3					5	my $element = '';
630	3					3	my $_is_instance = 1 ;
631	3					5	my $self = shift @_;
632	3					4	my $object_argument = shift @_;
633
634	3	100				10	if (defined $object_argument) {
635	1					1	$_is_instance = 0;
636							}
637
638	3	100				6	if ($_is_instance) {
639	2	50				6	if ($rcount = $self->{'_codon_count'}) {
640	0					0	return $rcount; # return count if previously calculated
641							}
642	2					2	$_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
643	2					3	$seqobj = $self->{'_seqref'};
644							} else {
645	1					2	$seqobj = $object_argument;
646	1	50				11	$seqobj->isa("Bio::PrimarySeqI") \|\|
647							$self->throw("Error: SeqStats works only on PrimarySeqI objects");
648	1					3	$_is_strict = _is_alphabet_strict($seqobj);
649							}
650
651							# Codon counts only make sense for nucleic acid sequences
652	3					10	my $alphabet = $seqobj->alphabet();
653
654	3	50				15	unless ($alphabet =~ /[dr]na/i) {
655	0					0	$seqobj->throw("Codon counts only meaningful for dna or rna, ".
656							"not for $alphabet sequences.");
657							}
658
659							# If sequence contains ambiguous bases, warn that codons
660							# containing them will all be lumped together in the count.
661
662	3	50				6	if (!$_is_strict ) {
663	0	0				0	$seqobj->warn("Sequence $seqobj contains ambiguous bases.".
664							" All codons with ambiguous bases will be added together in count.")
665							if $self->verbose >= 0 ;
666							}
667
668	3					7	my $seq = $seqobj->seq();
669
670							# Now step through the string by threes and count the codons
671
672							CODON:
673	3					8	while (length($seq) > 2) {
674	1112					1144	$codon = uc substr($seq,0,3);
675	1112					1588	$seq = substr($seq,3);
676	1112	50				1500	if ($codon =~ /[^ACTGU]/i) {
677	0					0	$$rcount{'ambiguous'}++; #lump together ambiguous codons
678	0					0	next CODON;
679							}
680	1112	100				1377	if (!defined $$rcount{$codon}) {
681	122					144	$$rcount{$codon}= 1 ;
682	122					156	next CODON;
683							}
684	990					1211	$$rcount{$codon}++; # default
685							}
686
687	3	100				7	if ($_is_instance) {
688	2					3	$self->{'_codon_count'} = $rcount; # Save in case called again later
689							}
690
691	3					13	return $rcount;
692							}
693
694
695							=head2 hydropathicity
696
697							Title : hydropathicity
698							Usage : $gravy = $seqstats->hydropathicity(); or
699							$gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
700
701							Function: Calculates the mean Kyte-Doolittle hydropathicity for a
702							protein sequence. Also known as the "gravy" score. Refer to
703							Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
704							Example :
705							Returns : float
706							Args : None or reference to sequence object
707
708							Throws : an exception if type of sequence is not protein.
709
710							=cut
711
712							sub hydropathicity {
713	4			4	1	8	my $seqobj;
714							my $_is_strict;
715	4					5	my $element = '';
716	4					4	my $_is_instance = 1 ;
717	4					5	my $self = shift @_;
718	4					4	my $object_argument = shift @_;
719
720	4	50				7	if (defined $object_argument) {
721	4					6	$_is_instance = 0;
722							}
723
724	4	50				7	if ($_is_instance) {
725	0	0				0	if (my $gravy = $self->{'_hydropathicity'}) {
726	0					0	return $gravy; # return value if previously calculated
727							}
728	0					0	$_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
729	0					0	$seqobj = $self->{'_seqref'};
730							} else {
731	4					4	$seqobj = $object_argument;
732	4	50				12	$seqobj->isa("Bio::PrimarySeqI") \|\|
733							$self->throw("Error: SeqStats works only on PrimarySeqI objects");
734	4					7	$_is_strict = _is_alphabet_strict($seqobj);
735							}
736
737							# hydropathicity not meaningful for empty sequences
738	4	100				9	unless ($seqobj->length() > 0) {
739	1					4	$seqobj->throw("hydropathicity not defined for zero-length sequences");
740							}
741
742							# hydropathicity only makes sense for protein sequences
743	3					5	my $alphabet = $seqobj->alphabet();
744
745	3	100				10	unless ($alphabet =~ /protein/i) {
746	1					4	$seqobj->throw("hydropathicity only meaningful for protein, ".
747							"not for $alphabet sequences.");
748							}
749
750							# If the sequence contains ambiguous amino acids, throw an exception:
751							# hydropathicity cannot be calculated for ambiguous residues.
752
753	2	100				5	unless ($_is_strict ) {
754	1					9	$seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
755							"Hydropathicity can not be caculated.")
756							}
757
758	1					2	my $seq = $seqobj->seq();
759
760							# Now step through the string and add up the hydropathicity values
761
762	1					2	my $gravy = 0;
763	1					2	for my $i ( 0 .. length($seq) ) {
764	30					27	my $codon = uc(substr($seq,$i,1));
765	30		100			45	$gravy += $amino_hydropathicity->{$codon}\|\|0; # table look-up
766							}
767	1					2	$gravy /= length($seq);
768
769
770	1	50				3	if ($_is_instance) {
771	0					0	$self->{'_hydropathicity'} = $gravy; # Save in case called again later
772							}
773
774	1					3	return $gravy;
775							}
776
777
778							=head2 _is_alphabet_strict
779
780							Title : _is_alphabet_strict
781							Usage :
782							Function: internal function to determine whether there are
783							any ambiguous elements in the current sequence
784							Example :
785							Returns : 1 if strict alphabet is being used,
786							0 if ambiguous elements are present
787							Args :
788
789							Throws : an exception if type of sequence is unknown (ie amino or
790							nucleic) or if unknown letter in alphabet. Ambiguous
791							monomers are allowed.
792
793							=cut
794
795							sub _is_alphabet_strict {
796
797	33			33		45	my ($seqobj) = @_;
798	33					75	my $moltype = $seqobj->alphabet();
799
800							# convert everything to upper case to be safe
801	33					66	my $seqstring = uc $seqobj->seq();
802
803							# Since T is used in RichSeq RNA sequences, do conversion locally
804	33	100				68	$seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
805
806							# First we check if only the 'strict' letters are present in the
807							# sequence string. If not, we check whether the remaining letters
808							# are ambiguous monomers or whether there are illegal letters in
809							# the string
810
811							# $alpha_array is a ref to an array of the 'strictly' allowed letters
812	33					66	my $alpha_array = $Alphabets_strict{$moltype} ;
813
814							# $alphabet contains the allowed letters in string form
815	33					90	my $alphabet = join ('', @$alpha_array) ;
816	33	100				371	unless ($seqstring =~ /[^$alphabet]/) {
817	25					66	return 1 ;
818							}
819
820							# Next try to match with the alphabet's ambiguous letters
821	8					12	$alpha_array = $Alphabets{$moltype} ;
822	8					16	$alphabet = join ('', @$alpha_array) ;
823
824	8	50				65	unless ($seqstring =~ /[^$alphabet]/) {
825	8					19	return 0 ;
826							}
827
828							# If we got here there is an illegal letter in the sequence
829	0						$seqobj->throw("Alphabet not OK for $seqobj");
830							}
831
832							=head2 _print_data
833
834							Title : _print_data
835							Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
836							Function: Displays dna / rna parameters (used for debugging)
837							Returns : 1
838							Args : None
839
840							Used for debugging.
841
842							=cut
843
844							sub _print_data {
845
846	0			0			print "\n adenine = : $adenine \n";
847	0						print "\n guanine = : $guanine \n";
848	0						print "\n cytosine = : $cytosine \n";
849	0						print "\n thymine = : $thymine \n";
850	0						print "\n uracil = : $uracil \n";
851
852	0						print "\n dna_A_wt = : $dna_A_wt \n";
853	0						print "\n dna_C_wt = : $dna_C_wt \n";
854	0						print "\n dna_G_wt = : $dna_G_wt \n";
855	0						print "\n dna_T_wt = : $dna_T_wt \n";
856
857	0						print "\n rna_A_wt = : $rna_A_wt \n";
858	0						print "\n rna_C_wt = : $rna_C_wt \n";
859	0						print "\n rna_G_wt = : $rna_G_wt \n";
860	0						print "\n rna_U_wt = : $rna_U_wt \n";
861
862	0						return 1;
863							}
864
865							1;