File Coverage

Bio/Tools/SeqStats.pm
Criterion Covered Total %
statement 215 243 88.4
branch 55 74 74.3
condition 3 5 60.0
subroutine 16 17 94.1
pod 5 5 100.0
total 294 344 85.4


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::SeqStats
3             #
4             # Please direct questions and support issues to <bioperl-l@bioperl.org>
5             #
6             # Cared for by
7             #
8             # Copyright Peter Schattner
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::SeqStats - Object holding statistics for one
17             particular sequence
18              
19             =head1 SYNOPSIS
20              
21             # build a primary nucleic acid or protein sequence object somehow
22             # then build a statistics object from the sequence object
23              
24             $seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
25             -alphabet => 'dna',
26             -id => 'test');
27             $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
28              
29             # obtain a hash of counts of each type of monomer
30             # (i.e. amino or nucleic acid)
31             print "\nMonomer counts using statistics object\n";
32             $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
33             $hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
34             foreach $base (sort keys %$hash_ref) {
35             print "Number of bases of type ", $base, "= ",
36             $hash_ref->{$base},"\n";
37             }
38              
39             # obtain the count directly without creating a new statistics object
40             print "\nMonomer counts without statistics object\n";
41             $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
42             foreach $base (sort keys %$hash_ref) {
43             print "Number of bases of type ", $base, "= ",
44             $hash_ref->{$base},"\n";
45             }
46              
47              
48             # obtain hash of counts of each type of codon in a nucleic acid sequence
49             print "\nCodon counts using statistics object\n";
50             $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
51             foreach $base (sort keys %$hash_ref) {
52             print "Number of codons of type ", $base, "= ",
53             $hash_ref->{$base},"\n";
54             }
55              
56             # or
57             print "\nCodon counts without statistics object\n";
58             $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
59             foreach $base (sort keys %$hash_ref) {
60             print "Number of codons of type ", $base, "= ",
61             $hash_ref->{$base},"\n";
62             }
63              
64             # Obtain the molecular weight of a sequence. Since the sequence
65             # may contain ambiguous monomers, the molecular weight is returned
66             # as a (reference to) a two element array containing greatest lower
67             # bound (GLB) and least upper bound (LUB) of the molecular weight
68             $weight = $seq_stats->get_mol_wt();
69             print "\nMolecular weight (using statistics object) of sequence ",
70             $seqobj->id(), " is between ", $$weight[0], " and " ,
71             $$weight[1], "\n";
72              
73             # or
74             $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
75             print "\nMolecular weight (without statistics object) of sequence ",
76             $seqobj->id(), " is between ", $$weight[0], " and " ,
77             $$weight[1], "\n";
78              
79             # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
80             my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
81             -alphabet=>'protein');
82             my $gravy = Bio::Tools::SeqStats->hydropathicity($prot);
83             print "might be hydropathic" if $gravy > 1;
84              
85             =head1 DESCRIPTION
86              
87             Bio::Tools::SeqStats is a lightweight object for the calculation of
88             simple statistical and numerical properties of a sequence. By
89             "lightweight" I mean that only "primary" sequences are handled by the
90             object. The calling script needs to create the appropriate primary
91             sequence to be passed to SeqStats if statistics on a sequence feature
92             are required. Similarly if a codon count is desired for a
93             frame-shifted sequence and/or a negative strand sequence, the calling
94             script needs to create that sequence and pass it to the SeqStats
95             object.
96              
97             Note that nucleotide sequences in bioperl do not strictly separate RNA
98             and DNA sequences. By convention, sequences from RNA molecules are
99             shown as if they were DNA. Objects are supposed to make the
100             distinction when needed. This class is one of the few where this
101             distinction needs to be made. Internally, it changes all Ts into Us
102             before weight and monomer count.
103              
104             SeqStats can be called in two distinct manners. If only a single
105             computation is required on a given sequence object, the method can be
106             called easily using the SeqStats object directly:
107              
108             $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
109              
110             Alternately, if several computations will be required on a given
111             sequence object, an "instance" statistics object can be constructed
112             and used for the method calls:
113              
114             $seq_stats = Bio::Tools::SeqStats->new($seqobj);
115             $monomers = $seq_stats->count_monomers();
116             $codons = $seq_stats->count_codons();
117             $weight = $seq_stats->get_mol_wt();
118             $gravy = $seq_stats->hydropathicity();
119              
120             As currently implemented the object can return the following values
121             from a sequence:
122              
123             =over
124              
125             =item *
126              
127             The molecular weight of the sequence: get_mol_wt()
128              
129             =item *
130              
131             The number of each type of monomer present: count_monomers()
132              
133             =item *
134              
135             The number of each codon present in a nucleic acid sequence:
136             count_codons()
137              
138             =item *
139              
140             The mean hydropathicity ("gravy" score) of a protein:
141             hydropathicity()
142              
143             =back
144              
145             For DNA and RNA sequences single-stranded weights are returned. The
146             molecular weights are calculated for neutral, or not ionized,
147             nucleic acids. The returned weight is the sum of the
148             base-sugar-phosphate residues of the chain plus one weight of water to
149             account for the additional OH on the phosphate of the 5' residue
150             and the additional H on the sugar ring of the 3' residue. Note that
151             this leads to a difference of 18 in calculated molecular weights
152             compared to some other available programs (e.g. Informax VectorNTI).
153              
154             Note that since sequences may contain ambiguous monomers (e.g. "M",
155             meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
156             returns a two-element array containing the greatest lower bound and
157             least upper bound of the molecule. For a sequence with no ambiguous
158             monomers, the two elements of the returned array will be equal. The
159             method count_codons() handles ambiguous bases by simply counting all
160             ambiguous codons together and issuing a warning to that effect.
161              
162              
163             =head1 DEVELOPERS NOTES
164              
165             Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
166              
167             Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
168             molecular weights to have the output match values in SWISS-PROT.
169              
170             Torsten added hydropathicity calculation.
171              
172             =head1 FEEDBACK
173              
174             =head2 Mailing Lists
175              
176             User feedback is an integral part of the evolution of this and other
177             Bioperl modules. Send your comments and suggestions preferably to one
178             of the Bioperl mailing lists. Your participation is much appreciated.
179              
180             bioperl-l@bioperl.org - General discussion
181             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
182              
183             =head2 Support
184              
185             Please direct usage questions or support issues to the mailing list:
186              
187             I<bioperl-l@bioperl.org>
188              
189             rather than to the module maintainer directly. Many experienced and
190             responsive experts will be able to look at the problem and quickly
191             address it. Please include a thorough description of the problem
192             with code and data examples if at all possible.
193              
194             =head2 Reporting Bugs
195              
196             Report bugs to the Bioperl bug tracking system to help us keep track
197             of the bugs and their resolution. Bug reports can be submitted via the web:
198              
199             https://github.com/bioperl/bioperl-live/issues
200              
201             =head1 AUTHOR - Peter Schattner
202              
203             Email schattner AT alum.mit.edu
204              
205             =head1 CONTRIBUTOR - Torsten Seemann
206              
207             Email torsten.seemann AT infotech.monash.edu.au
208              
209             =head1 APPENDIX
210              
211             The rest of the documentation details each of the object
212             methods. Internal methods are usually preceded with a _
213              
214             =cut
215              
216              
217             package Bio::Tools::SeqStats;
218 12     12   1842 use strict;
  12         21  
  12         392  
219 12         909 use vars qw(%Alphabets %Alphabets_strict $amino_weights
220 12     12   53 $rna_weights $dna_weights %Weights $amino_hydropathicity);
  12         20  
221 12     12   2079 use Bio::Seq;
  12         25  
  12         387  
222 12     12   72 use base qw(Bio::Root::Root);
  12         19  
  12         5814  
223              
224             BEGIN {
225 12     12   148 %Alphabets = (
226             'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
227             'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
228             'protein' => [ qw(A R N D C Q E G H I L K M F U
229             P S T W X Y V B Z J O *) ], # sac: added B, Z
230             );
231              
232             # SAC: new strict alphabet: doesn't allow any ambiguity characters.
233 12         70 %Alphabets_strict = (
234             'dna' => [ qw( A C G T ) ],
235             'rna' => [ qw( A C G U ) ],
236             'protein' => [ qw(A R N D C Q E G H I L K M F U
237             P S T W Y V O) ],
238             );
239              
240              
241             # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
242             # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
243              
244             # Amino Acid alphabet
245              
246             # ------------------------------------------
247             # Symbol Meaning
248             # ------------------------------------------
249              
250 12         24 my $amino_A_wt = 89.09;
251 12         23 my $amino_C_wt = 121.15;
252 12         18 my $amino_D_wt = 133.1;
253 12         19 my $amino_E_wt = 147.13;
254 12         18 my $amino_F_wt = 165.19;
255 12         17 my $amino_G_wt = 75.07;
256 12         16 my $amino_H_wt = 155.16;
257 12         19 my $amino_I_wt = 131.17;
258 12         25 my $amino_K_wt = 146.19;
259 12         17 my $amino_L_wt = 131.17;
260 12         18 my $amino_M_wt = 149.21;
261 12         760 my $amino_N_wt = 132.12;
262 12         23 my $amino_O_wt = 255.31;
263 12         17 my $amino_P_wt = 115.13;
264 12         17 my $amino_Q_wt = 146.15;
265 12         14 my $amino_R_wt = 174.20;
266 12         18 my $amino_S_wt = 105.09;
267 12         15 my $amino_T_wt = 119.12;
268 12         20 my $amino_U_wt = 168.06;
269 12         22 my $amino_V_wt = 117.15;
270 12         19 my $amino_W_wt = 204.23;
271 12         20 my $amino_Y_wt = 181.19;
272              
273              
274 12         214 $amino_weights = {
275             'A' => [$amino_A_wt, $amino_A_wt], # Alanine
276             'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
277             'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
278             'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
279             'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
280             'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
281             'G' => [$amino_G_wt, $amino_G_wt], # Glycine
282             'H' => [$amino_H_wt, $amino_H_wt], # Histidine
283             'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
284             'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
285             'K' => [$amino_K_wt, $amino_K_wt], # Lysine
286             'L' => [$amino_L_wt, $amino_L_wt], # Leucine
287             'M' => [$amino_M_wt, $amino_M_wt], # Methionine
288             'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
289             'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
290             'P' => [$amino_P_wt, $amino_P_wt], # Proline
291             'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
292             'R' => [$amino_R_wt, $amino_R_wt], # Arginine
293             'S' => [$amino_S_wt, $amino_S_wt], # Serine
294             'T' => [$amino_T_wt, $amino_T_wt], # Threonine
295             'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
296             'V' => [$amino_V_wt, $amino_V_wt], # Valine
297             'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
298             'X' => [$amino_G_wt, $amino_W_wt], # Unknown
299             'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
300             'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
301             };
302              
303             # Extended Dna / Rna alphabet
304 12     12   80 use vars ( qw($C $O $N $H $P $water) );
  12         27  
  12         842  
305 12     12   61 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
  12         21  
  12         603  
306 12     12   55 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
  12         19  
  12         564  
307 12         826 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
308 12     12   55 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
  12         19  
309 12     12   56 use vars ( qw($dna_weights $rna_weights %Weights));
  12         21  
  12         4546  
310              
311 12         28 $C = 12.01;
312 12         22 $O = 16.00;
313 12         18 $N = 14.01;
314 12         17 $H = 1.01;
315 12         14 $P = 30.97;
316 12         19 $water = 18.015;
317              
318 12         63 $adenine = 5 * $C + 5 * $N + 5 * $H;
319 12         38 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
320 12         39 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
321 12         39 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
322 12         28 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
323              
324 12         25 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
325             # neutral (unionized) form
326 12         31 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
327              
328             # the following are single strand molecular weights / base
329 12         30 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
330 12         20 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
331 12         19 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
332 12         21 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
333              
334 12         18 $rna_A_wt = $adenine + $ribose_phosphate - $water;
335 12         18 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
336 12         26 $rna_G_wt = $guanine + $ribose_phosphate - $water;
337 12         21 $rna_U_wt = $uracil + $ribose_phosphate - $water;
338              
339 12         125 $dna_weights = {
340             'A' => [$dna_A_wt,$dna_A_wt], # Adenine
341             'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
342             'G' => [$dna_G_wt,$dna_G_wt], # Guanine
343             'T' => [$dna_T_wt,$dna_T_wt], # Thymine
344             'M' => [$dna_C_wt,$dna_A_wt], # A or C
345             'R' => [$dna_A_wt,$dna_G_wt], # A or G
346             'W' => [$dna_T_wt,$dna_A_wt], # A or T
347             'S' => [$dna_C_wt,$dna_G_wt], # C or G
348             'Y' => [$dna_C_wt,$dna_T_wt], # C or T
349             'K' => [$dna_T_wt,$dna_G_wt], # G or T
350             'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
351             'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
352             'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
353             'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
354             'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
355             'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
356             };
357              
358 12         105 $rna_weights = {
359             'A' => [$rna_A_wt,$rna_A_wt], # Adenine
360             'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
361             'G' => [$rna_G_wt,$rna_G_wt], # Guanine
362             'U' => [$rna_U_wt,$rna_U_wt], # Uracil
363             'M' => [$rna_C_wt,$rna_A_wt], # A or C
364             'R' => [$rna_A_wt,$rna_G_wt], # A or G
365             'W' => [$rna_U_wt,$rna_A_wt], # A or U
366             'S' => [$rna_C_wt,$rna_G_wt], # C or G
367             'Y' => [$rna_C_wt,$rna_U_wt], # C or U
368             'K' => [$rna_U_wt,$rna_G_wt], # G or U
369             'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
370             'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
371             'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
372             'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
373             'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
374             'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
375             };
376              
377 12         85 %Weights = (
378             'dna' => $dna_weights,
379             'rna' => $rna_weights,
380             'protein' => $amino_weights,
381             );
382            
383             # Amino acid scale: Hydropathicity.
384             # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
385             # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
386            
387 12         16753 $amino_hydropathicity = {
388             A => 1.800,
389             R => -4.500,
390             N => -3.500,
391             D => -3.500,
392             C => 2.500,
393             Q => -3.500,
394             E => -3.500,
395             G => -0.400,
396             H => -3.200,
397             I => 4.500,
398             L => 3.800,
399             K => -3.900,
400             M => 1.900,
401             F => 2.800,
402             P => -1.600,
403             S => -0.800,
404             T => -0.700,
405             W => -0.900,
406             Y => -1.300,
407             V => 4.200,
408             };
409              
410             }
411              
412             sub new {
413 5     5 1 409 my($class,@args) = @_;
414 5         14 my $self = $class->SUPER::new(@args);
415              
416 5         16 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
417 5 50       26 unless ($seqobj->isa("Bio::PrimarySeqI")) {
418 0         0 $self->throw("SeqStats works only on PrimarySeqI objects");
419             }
420 5 50 33     11 if ( !defined $seqobj->alphabet ||
421             !defined $Alphabets{$seqobj->alphabet}) {
422 0         0 $self->throw("Must have a valid alphabet defined for seq (".
423             join(",",keys %Alphabets));
424             }
425 5         7 $self->{'_seqref'} = $seqobj;
426             # check the letters in the sequence
427 5         8 $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
428 5         20 return $self;
429             }
430              
431             =head2 count_monomers
432              
433             Title : count_monomers
434             Usage : $rcount = $seq_stats->count_monomers();
435             or $rcount = Bio::Tools::SeqStats->count_monomers($seqobj);
436             Function: Counts the number of each type of monomer (amino acid or
437             base) in the sequence.
438             Ts are counted as Us in RNA sequences.
439             Example :
440             Returns : Reference to a hash in which keys are letters of the
441             genetic alphabet used and values are number of occurrences
442             of the letter in the sequence.
443             Args : None or reference to sequence object
444             Throws : Throws an exception if type of sequence is unknown (ie amino
445             or nucleic) or if unknown letter in alphabet. Ambiguous
446             elements are allowed.
447              
448             =cut
449              
450             sub count_monomers{
451 18     18 1 260 my %count = ();
452 18         26 my $seqobj;
453             my $_is_strict;
454 18         49 my $element = '';
455 18         23 my $_is_instance = 1 ;
456 18         28 my $self = shift @_;
457 18         24 my $object_argument = shift @_;
458              
459             # First we need to determine if the present object is an instance
460             # object or if the sequence object has been passed as an argument
461              
462 18 100       31 if (defined $object_argument) {
463 13         17 $_is_instance = 0;
464             }
465              
466             # If we are using an instance object...
467 18 100       38 if ($_is_instance) {
468 5 100       7 if ($self->{'_monomer_count'}) {
469 1         3 return $self->{'_monomer_count'}; # return count if previously calculated
470             }
471 4         5 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
472 4         5 $seqobj = $self->{'_seqref'};
473             } else {
474             # otherwise...
475 13         17 $seqobj = $object_argument;
476              
477             # Following two lines lead to error in "throw" routine
478 13 50       40 $seqobj->isa("Bio::PrimarySeqI") ||
479             $self->throw("SeqStats works only on PrimarySeqI objects");
480             # is alphabet OK? Is it strict?
481 13         26 $_is_strict = _is_alphabet_strict($seqobj);
482             }
483              
484             my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
485 17 100       68 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
486              
487             # convert everything to upper case to be safe
488 17         41 my $seqstring = uc $seqobj->seq();
489              
490             # Since T is used in RichSeq RNA sequences, do conversion locally
491 17 100       37 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
492              
493             # For each letter, count the number of times it appears in
494             # the sequence
495             LETTER:
496 17         36 foreach $element (@$alphabet) {
497             # skip terminator symbol which may confuse regex
498 241 100       418 next LETTER if $element eq '*';
499 240         2217 $count{$element} = ( $seqstring =~ s/$element/$element/g);
500             }
501              
502 17 100       43 if ($_is_instance) {
503 4         7 $self->{'_monomer_count'} = \%count; # Save in case called again later
504             }
505              
506 17         44 return \%count;
507             }
508              
509             =head2 get_mol_wt
510              
511             Title : get_mol_wt
512             Usage : $wt = $seq_stats->get_mol_wt() or
513             $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
514             Function: Calculate molecular weight of sequence
515             Ts are counted as Us in RNA sequences.
516             Example :
517              
518             Returns : Reference to two element array containing lower and upper
519             bounds of molecule molecular weight. For DNA and RNA
520             sequences single-stranded weights are returned. If
521             sequence contains no ambiguous elements, both entries in
522             array are equal to molecular weight of molecule.
523             Args : None or reference to sequence object
524             Throws : Exception if type of sequence is unknown (ie not amino or
525             nucleic) or if unknown letter in alphabet. Ambiguous
526             elements are allowed.
527              
528             =cut
529              
530             sub get_mol_wt {
531 12     12 1 1962 my $seqobj;
532             my $_is_strict;
533 12         21 my $element = '';
534 12         19 my $_is_instance = 1 ;
535 12         24 my $self = shift @_;
536 12         15 my $object_argument = shift @_;
537 12         20 my ($weight_array, $rcount);
538              
539 12 100       24 if (defined $object_argument) {
540 10         16 $_is_instance = 0;
541             }
542              
543 12 100       26 if ($_is_instance) {
544 2 50       4 if ($weight_array = $self->{'_mol_wt'}) {
545             # return mol. weight if previously calculated
546 0         0 return $weight_array;
547             }
548 2         3 $seqobj = $self->{'_seqref'};
549 2         2 $rcount = $self->count_monomers();
550             } else {
551 10         16 $seqobj = $object_argument;
552 10 50       41 $seqobj->isa("Bio::PrimarySeqI") ||
553             $self->throw("Error: SeqStats works only on PrimarySeqI objects");
554 10         32 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK?
555 10         31 $rcount = $self->count_monomers($seqobj);
556             }
557              
558             # We will also need to know what type of monomer we are dealing with
559 12         35 my $moltype = $seqobj->alphabet();
560              
561             # In general,the molecular weight is bounded below by the sum of the
562             # weights of lower bounds of each alphabet symbol times the number of
563             # occurrences of the symbol in the sequence. A similar upper bound on
564             # the weight is also calculated.
565              
566             # Note that for "strict" (i.e. unambiguous) sequences there is an
567             # inefficiency since the upper bound = the lower bound and there are
568             # two calculations. However, this decrease in performance will be
569             # minor and leads to significantly more readable code.
570              
571 12         19 my $weight_lower_bound = 0;
572 12         16 my $weight_upper_bound = 0;
573 12         21 my $weight_table = $Weights{$moltype};
574 12         17 my $total_res;
575            
576             # compute weight of all the residues
577 12         54 foreach $element (keys %$rcount) {
578 202         322 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
579 202         200 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
580            
581             # this tracks only the residues used for counting MW
582 202         213 $total_res += $$rcount{$element};
583             }
584 12 100       44 if ($moltype =~ /protein/) {
585             # remove H2O during peptide bond formation.
586 7         17 $weight_lower_bound -= $water * ($total_res - 1);
587 7         19 $weight_upper_bound -= $water * ($total_res - 1);
588             } else {
589             # Correction because phosphate of 5' residue has additional OH and
590             # sugar ring of 3' residue has additional H
591 5         5 $weight_lower_bound += $water;
592 5         6 $weight_upper_bound += $water;
593             }
594              
595 12         153 $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
596 12         44 $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
597              
598 12         32 $weight_array = [$weight_lower_bound, $weight_upper_bound];
599              
600 12 100       59 if ($_is_instance) {
601 2         3 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
602             }
603 12         67 return $weight_array;
604             }
605              
606              
607             =head2 count_codons
608              
609             Title : count_codons
610             Usage : $rcount = $seqstats->count_codons() or
611             $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
612             Function: Counts the number of each type of codons for a dna or rna
613             sequence, starting at the 1st triple of the input sequence.
614             Example :
615             Returns : Reference to a hash in which keys are codons of the genetic
616             alphabet used and values are number of occurrences of the
617             codons in the sequence. All codons with "ambiguous" bases
618             are counted together.
619             Args : None or sequence object
620             Throws : an exception if type of sequence is unknown or protein.
621              
622             =cut
623              
# Count the codons (non-overlapping triplets, read from the first base)
# of a nucleic acid sequence.
#
# Called either as an instance method (no argument: the sequence and its
# strictness flag are read from the object, and the result is cached in
# $self->{'_codon_count'}) or as a class method with a sequence object as
# the only argument (nothing is cached).
#
# Returns a hash reference mapping each upper-cased codon to its count.
# Codons containing any character other than A, C, T, G or U are lumped
# together under the key 'ambiguous'.  Trailing bases that do not form a
# complete codon are ignored, so a sequence shorter than three bases
# yields an empty hash reference.
#
# Throws (via the sequence object) if the alphabet is not dna or rna, and
# (via $self) if the argument is not a Bio::PrimarySeqI.
sub count_codons {
    my $self            = shift @_;
    my $object_argument = shift @_;

    # An explicit sequence object means "class method" mode.
    my $_is_instance = defined $object_argument ? 0 : 1;

    my ($seqobj, $_is_strict);
    if ($_is_instance) {
        # Return the count if previously calculated.  The cached value is
        # read into its own variable so that a cache miss cannot clobber
        # the result hash built below.
        if (my $cached = $self->{'_codon_count'}) {
            return $cached;
        }
        $_is_strict = $self->{'_is_strict'};    # retrieve "strictness"
        $seqobj     = $self->{'_seqref'};
    }
    else {
        $seqobj = $object_argument;
        $seqobj->isa("Bio::PrimarySeqI") ||
            $self->throw("Error: SeqStats works only on PrimarySeqI objects");
        $_is_strict = _is_alphabet_strict($seqobj);
    }

    # Codon counts only make sense for nucleic acid sequences
    my $alphabet = $seqobj->alphabet();
    unless ($alphabet =~ /[dr]na/i) {
        $seqobj->throw("Codon counts only meaningful for dna or rna, ".
                       "not for $alphabet sequences.");
    }

    # If sequence contains ambiguous bases, warn that codons
    # containing them will all be lumped together in the count.
    if (!$_is_strict) {
        $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
            " All codons with ambiguous bases will be added together in count.")
            if $self->verbose >= 0;
    }

    my $seq = $seqobj->seq();
    $seq = '' unless defined $seq;

    # Step through the string by threes.  Indexing into the string (rather
    # than repeatedly chopping the front off it) keeps this linear in the
    # sequence length.
    my $rcount     = {};
    my $last_start = length($seq) - 3;
    CODON:
    for (my $pos = 0; $pos <= $last_start; $pos += 3) {
        my $codon = uc substr($seq, $pos, 3);
        if ($codon =~ /[^ACTGU]/) {
            $rcount->{'ambiguous'}++;    # lump together ambiguous codons
            next CODON;
        }
        $rcount->{$codon}++;
    }

    if ($_is_instance) {
        $self->{'_codon_count'} = $rcount;    # Save in case called again later
    }

    return $rcount;
}
693              
694              
695             =head2 hydropathicity
696              
697             Title : hydropathicity
698             Usage : $gravy = $seqstats->hydropathicity(); or
699             $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
700              
701             Function: Calculates the mean Kyte-Doolittle hydropathicity for a
702             protein sequence. Also known as the "gravy" score. Refer to
703             Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
704             Example :
705             Returns : float
706             Args : None or reference to sequence object
707              
708             Throws : an exception if type of sequence is not protein.
709              
710             =cut
711              
# Compute the mean hydropathicity of a protein sequence: the sum of the
# per-residue values looked up in $amino_hydropathicity, divided by the
# sequence length.
#
# Called either as an instance method (no argument: the sequence and its
# strictness flag are read from the object, and the result is cached in
# $self->{'_hydropathicity'}) or as a class method with a sequence object
# as the only argument (nothing is cached).
#
# Throws (via the sequence object) if the sequence is empty, is not a
# protein, or contains ambiguous amino acids; throws (via $self) if the
# argument is not a Bio::PrimarySeqI.
sub hydropathicity {
    my $self            = shift @_;
    my $object_argument = shift @_;

    # An explicit sequence object means "class method" mode.
    my $_is_instance = defined $object_argument ? 0 : 1;

    my ($seqobj, $_is_strict);
    if ($_is_instance) {
        # Return the value if previously calculated.  Test for definedness
        # rather than truth so that a cached score of exactly 0 is reused.
        if (defined $self->{'_hydropathicity'}) {
            return $self->{'_hydropathicity'};
        }
        $_is_strict = $self->{'_is_strict'};    # retrieve "strictness"
        $seqobj     = $self->{'_seqref'};
    }
    else {
        $seqobj = $object_argument;
        $seqobj->isa("Bio::PrimarySeqI") ||
            $self->throw("Error: SeqStats works only on PrimarySeqI objects");
        $_is_strict = _is_alphabet_strict($seqobj);
    }

    # hydropathicity not meaningful for empty sequences
    unless ($seqobj->length() > 0) {
        $seqobj->throw("hydropathicity not defined for zero-length sequences");
    }

    # hydropathicity only makes sense for protein sequences
    my $alphabet = $seqobj->alphabet();
    unless ($alphabet =~ /protein/i) {
        $seqobj->throw("hydropathicity only meaningful for protein, ".
                       "not for $alphabet sequences.");
    }

    # Unlike the counting methods, an ambiguous residue is fatal here:
    # the sequence must use only the strict alphabet.
    unless ($_is_strict) {
        $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
                       "Hydropathicity can not be caculated.");
    }

    my $seq = $seqobj->seq();

    # Add up the hydropathicity value of every residue.  Iterating over
    # the residues themselves visits exactly length($seq) positions (the
    # previous index range 0 .. length($seq) ran one past the end).
    my $gravy = 0;
    for my $residue (split //, uc $seq) {
        $gravy += $amino_hydropathicity->{$residue} || 0;    # table look-up
    }
    $gravy /= length($seq);

    if ($_is_instance) {
        $self->{'_hydropathicity'} = $gravy;    # Save in case called again later
    }

    return $gravy;
}
776              
777              
778             =head2 _is_alphabet_strict
779              
780             Title : _is_alphabet_strict
781             Usage :
782             Function: internal function to determine whether there are
783             any ambiguous elements in the current sequence
784             Example :
785             Returns : 1 if strict alphabet is being used,
786             0 if ambiguous elements are present
787             Args :
788              
789             Throws : an exception if type of sequence is unknown (ie amino or
790             nucleic) or if unknown letter in alphabet. Ambiguous
791             monomers are allowed.
792              
793             =cut
794              
# Internal helper (plain function, not a method): decide whether a
# sequence uses only the strict letters of its alphabet.
#
# Args    : a sequence object providing alphabet(), seq() and throw()
# Returns : 1 if every letter is in %Alphabets_strict for the alphabet,
#           0 if the remaining letters are all in %Alphabets
# Throws  : via the sequence object, if the alphabet has no entry in the
#           lookup tables or the sequence contains a letter found in
#           neither table.
sub _is_alphabet_strict {
    my ($seqobj) = @_;

    my $moltype = $seqobj->alphabet();

    # Fail with a proper exception for an alphabet we have no table for,
    # instead of dereferencing an undefined value further down.
    my $strict_letters = defined $moltype ? $Alphabets_strict{$moltype} : undef;
    unless ($strict_letters && @$strict_letters) {
        my $shown = defined $moltype ? $moltype : '(undefined)';
        $seqobj->throw("Unknown alphabet '$shown' for $seqobj");
    }

    # convert everything to upper case to be safe
    my $seqstring = uc $seqobj->seq();

    # Since T is used in RichSeq RNA sequences, do conversion locally
    $seqstring =~ tr/T/U/ if $moltype eq 'rna';

    # First check whether only the 'strict' letters are present.  The
    # letters are escaped so that none of them can be read as regex
    # syntax inside the character class.
    my $strict_class = quotemeta join('', @$strict_letters);
    return 1 unless $seqstring =~ /[^$strict_class]/;

    # Next try to match with the alphabet's ambiguous letters
    my $all_letters = $Alphabets{$moltype};
    if ($all_letters && @$all_letters) {
        my $all_class = quotemeta join('', @$all_letters);
        return 0 unless $seqstring =~ /[^$all_class]/;
    }

    # If we got here there is an illegal letter in the sequence
    $seqobj->throw("Alphabet not OK for $seqobj");
}
831              
832             =head2 _print_data
833              
834             Title : _print_data
835             Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
836             Function: Displays dna / rna parameters (used for debugging)
837             Returns : 1
838             Args : None
839              
840             Used for debugging.
841              
842             =cut
843              
# Debugging aid: print the base and nucleotide-weight parameters to the
# currently selected output handle, one per line in a fixed
# "name = : value" layout.  Takes no arguments and always returns 1.
sub _print_data {
    my @parameters = (
        [ 'adenine',  $adenine  ],
        [ 'guanine',  $guanine  ],
        [ 'cytosine', $cytosine ],
        [ 'thymine',  $thymine  ],
        [ 'uracil',   $uracil   ],

        [ 'dna_A_wt', $dna_A_wt ],
        [ 'dna_C_wt', $dna_C_wt ],
        [ 'dna_G_wt', $dna_G_wt ],
        [ 'dna_T_wt', $dna_T_wt ],

        [ 'rna_A_wt', $rna_A_wt ],
        [ 'rna_C_wt', $rna_C_wt ],
        [ 'rna_G_wt', $rna_G_wt ],
        [ 'rna_U_wt', $rna_U_wt ],
    );

    for my $parameter (@parameters) {
        my ($label, $value) = @$parameter;
        print "\n $label = : $value \n";
    }

    return 1;
}
864              
865             1;