File Coverage

Bio/Tools/OddCodes.pm
Criterion Covered Total %
statement 118 122 96.7
branch 4 10 40.0
condition 3 9 33.3
subroutine 12 12 100.0
pod 9 9 100.0
total 146 162 90.1


line stmt bran cond sub pod time code
1             #$Id$
2             #-----------------------------------------------------------------------------
3             # PACKAGE : OddCodes.pm
4             # PURPOSE : To write amino acid sequences in alternative alphabets
5             # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
6             # SOURCE :
7             # CREATED : 8th July 2000
8             # MODIFIED :
9             # DISCLAIMER : I am employed in the pharmaceutical industry but my
10             # : employers do not endorse or sponsor this module
11             # : in any way whatsoever. The above email address is
12             # : given purely for the purpose of easy communication
13             # : with the author, and does not imply any connection
14             # : between my employers and anything written below.
15             # LICENCE : You may distribute this module under the same terms
16             # : as the rest of BioPerl.
17             #----------------------------------------------------------------------------
18              
19             =head1 NAME
20              
21             Bio::Tools::OddCodes - Object holding alternative alphabet coding for
22             one protein sequence
23              
24             =head1 SYNOPSIS
25              
26             # Takes a sequence object from, e.g., an input stream, and creates an
27             # object for the purpose of rewriting that sequence in another
28             # alphabet. These are abbreviated amino acid sequence alphabets,
29             # designed to simplify the statistical aspects of analysing protein
30             # sequences, by reducing the combinatorial explosion of the
31             # 20-letter alphabet. These abbreviated alphabets range in size
32             # from 2 to 8.
33              
34             # Creating the OddCodes object, eg:
35              
36             my $inputstream = Bio::SeqIO->new( '-file' => "seqfile",
37             '-format' => 'Fasta');
38             my $seqobj = $inputstream->next_seq();
39             my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
40              
41             # or:
42              
43             my $seqobj = Bio::PrimarySeq->new
44             (-seq=>'[cut and paste a sequence here]',
45             -alphabet => 'protein',
46             -id => 'test');
47             my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
48              
49             # do the alternative coding, returning the answer as a reference to
50             # a string
51              
52             my $output = $oddcode_obj->structural();
53             my $output = $oddcode_obj->chemical();
54             my $output = $oddcode_obj->functional();
55             my $output = $oddcode_obj->charge();
56             my $output = $oddcode_obj->hydrophobic();
57             my $output = $oddcode_obj->Dayhoff();
58             my $output = $oddcode_obj->Sneath();
59             my $output = $oddcode_obj->Stanfel();
60              
61              
62             # display sequence in new form, eg:
63              
64             my $new_coding = $$output;
65             print "\n$new_coding";
66              
67             =head1 DESCRIPTION
68              
69             Bio::Tools::OddCodes is a welterweight object for rewriting a protein
70             sequence in an alternative alphabet. Eight of these are provided, ranging
71             from the 2-letter hydrophobic alphabet, to the 8-letter chemical
72             alphabet. These are useful for the statistical analysis of protein
73             sequences since they can partially avoid the combinatorial explosion
74             produced by the full 20-letter alphabet (eg. 400 dimers, 8000 trimers
75             etc.)
76              
77             The objects will print out a warning if the input sequence is not a
78             protein. If you know what you are doing, you can silence the warning
79             by setting verbose() to a negative value.
80              
81             See SYNOPSIS above for object creation code.
82              
83             =head1 REFERENCES
84              
85             Stanfel LE (1996) A new approach to clustering the amino acids. J. theor.
86             Biol. 183, 195-205.
87              
88             Karlin S, Ost F and Blaisdell BE (1989) Patterns in DNA and amino acid
89             sequences and their statistical significance. Chapter 6 of: Mathematical
90             Methods for DNA Sequences. Waterman MS (ed.) CRC Press, Boca Raton , FL.
91              
92             =head1 FEEDBACK
93              
94             =head2 Mailing Lists
95              
96             User feedback is an integral part of the evolution of this and other
97             Bioperl modules. Send your comments and suggestions preferably to one
98             of the Bioperl mailing lists. Your participation is much appreciated.
99              
100             bioperl-l@bioperl.org - General discussion
101             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
102              
103             =head2 Support
104              
105             Please direct usage questions or support issues to the mailing list:
106              
107             I<bioperl-l@bioperl.org>
108              
109             rather than to the module maintainer directly. Many experienced and
110             responsive experts will be able to look at the problem and quickly
111             address it. Please include a thorough description of the problem
112             with code and data examples if at all possible.
113              
114             =head2 Reporting Bugs
115              
116             Report bugs to the Bioperl bug tracking system to help us keep track
117             of the bugs and their resolution. Bug reports can be submitted via the
118             web:
119              
120             https://github.com/bioperl/bioperl-live/issues
121              
122             =head1 AUTHOR
123              
124             Derek Gatherer
125              
126             =head1 APPENDIX
127              
128             The rest of the documentation details each of the object methods.
129             Internal methods are usually preceded with a _
130              
131             =cut
132              
133             package Bio::Tools::OddCodes;
134 1     1   1107 use strict;
  1         1  
  1         30  
135              
136              
137 1     1   3 use base qw(Bio::Root::Root);
  1         1  
  1         761  
138              
139             sub new
140             {
141 1     1 1 10 my($class,@args) = @_;
142              
143 1         5 my $self = $class->SUPER::new(@args);
144              
145 1         6 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
146 1 0 33     4 if((! defined($seqobj)) && @args && ref($args[0])) {
      33        
147             # parameter not passed as named parameter?
148 0         0 $seqobj = $args[0];
149             }
150 1 50       8 unless ($seqobj->isa("Bio::PrimarySeqI"))
151             {
152 0         0 $self->throw("Bio::Tools::OddCodes only works on PrimarySeqI objects");
153             }
154              
155 1         2 $self->{'_seqref'} = $seqobj;
156              
157 1         2 return $self;
158             }
159              
160             =head2 structural
161              
162             Title : structural
163             Usage : $output = $oddcode_obj->structural();
164             Function: turns amino acid sequence into 3-letter structural alphabet
165             : A (ambivalent), E (external), I (internal)
166             Example : a sequence ACDEFGH will become AAEEIAE
167             Returns : Reference to the new sequence string
168             Args : none
169              
170             =cut
171              
172             sub structural()
173             {
174 1     1 1 711 my $self = $_[0];
175 1         4 my $seqstring = &_pullseq($self); # see _pullseq() below
176              
177             # now the real business
178              
179 1         1 $seqstring =~ tr/[ACGPSTWY]/1/;
180 1         2 $seqstring =~ tr/[RNDQEHK]/2/;
181 1         1 $seqstring =~ tr/[ILMFV]/3/;
182 1         1 $seqstring =~ tr/1/A/;
183 1         1 $seqstring =~ tr/2/E/;
184 1         2 $seqstring =~ tr/3/I/;
185              
186 1         4 return \$seqstring;
187              
188             # and that's that one
189             }
190              
191             =head2 functional
192              
193             Title : functional
194             Usage : $output = $oddcode_obj->functional();
195             Function: turns amino acid sequence into 4-letter functional alphabet
196             : A (acidic), C (basic), H (hydrophobic), P (polar)
197             Example : a sequence ACDEFGH will become HPAAHPC
198             Returns : Reference to the new sequence string
199             Args : none
200              
201             =cut
202              
203             sub functional()
204             {
205 1     1 1 1 my $self = $_[0];
206 1         2 my $seqstring = &_pullseq($self);
207              
208             # now the real business
209              
210 1         2 $seqstring =~ tr/[DE]/1/;
211 1         1 $seqstring =~ tr/[HKR]/2/;
212 1         2 $seqstring =~ tr/[AFILMPVW]/3/;
213 1         1 $seqstring =~ tr/[CGNQSTY]/4/;
214 1         1 $seqstring =~ tr/1/A/;
215 1         1 $seqstring =~ tr/2/C/;
216 1         1 $seqstring =~ tr/3/H/;
217 1         1 $seqstring =~ tr/4/P/;
218              
219 1         3 return \$seqstring;
220              
221             # and that's that one
222             }
223              
224             =head2 hydrophobic
225              
226             Title : hydrophobic
227             Usage : $output = $oddcode_obj->hydrophobic();
228             Function: turns amino acid sequence into 2-letter hydrophobicity alphabet
229             : I (hydrophobic), O (hydrophilic)
230             Example : a sequence ACDEFGH will become IOOOIOO
231             Returns : Reference to the new sequence string
232             Args : none
233              
234             =cut
235              
236             sub hydrophobic()
237             {
238 1     1 1 2 my $self = $_[0];
239 1         2 my $seqstring = &_pullseq($self);
240              
241             # now the real business
242              
243 1         2 $seqstring =~ tr/[AFILMPVW]/1/;
244 1         2 $seqstring =~ tr/[CDEGHKNQRSTY]/2/;
245 1         1 $seqstring =~ tr/1/I/;
246 1         2 $seqstring =~ tr/2/O/;
247              
248 1         2 return \$seqstring;
249              
250             # and that's that one
251             }
252              
253             =head2 Dayhoff
254              
255             Title : Dayhoff
256             Usage : $output = $oddcode_obj->Dayhoff();
257             Function: turns amino acid sequence into 6-letter Dayhoff alphabet
258             Example : a sequence ACDEFGH will become CADDGCE
259             : A (=C), C (=AGPST), D (=DENQ),
260             : E (=HKR), F (=ILMV), G (=FWY)
261             Returns : Reference to the new sequence string
262             Args : none
263              
264             =cut
265              
266             sub Dayhoff()
267             {
268 1     1 1 1 my $self = $_[0];
269 1         3 my $seqstring = &_pullseq($self);
270              
271             # now the real business
272              
273 1         2 $seqstring =~ tr/[C]/1/;
274 1         2 $seqstring =~ tr/[AGPST]/2/;
275 1         1 $seqstring =~ tr/[DENQ]/3/;
276 1         1 $seqstring =~ tr/[HKR]/4/;
277 1         1 $seqstring =~ tr/[ILMV]/5/;
278 1         1 $seqstring =~ tr/[FWY]/6/;
279 1         2 $seqstring =~ tr/1/A/;
280 1         1 $seqstring =~ tr/2/C/;
281 1         1 $seqstring =~ tr/3/D/;
282 1         1 $seqstring =~ tr/4/E/;
283 1         1 $seqstring =~ tr/5/F/;
284 1         1 $seqstring =~ tr/6/G/;
285              
286 1         4 return \$seqstring;
287              
288             # and that's that one
289             }
290              
291             =head2 Sneath
292              
293             Title : Sneath
294             Usage : $output = $oddcode_obj->Sneath();
295             Function: turns amino acid sequence into 7-letter Sneath alphabet
296             Example : a sequence ACDEFGH will become CEFFHCH
297             : A (=ILV), C (=AGP), D (=MNQ), E (=CST),
298             : F (=DE), G (=KR), H (=FHWY)
299             Returns : Reference to the new sequence string
300             Args : none
301              
302             =cut
303              
304             sub Sneath()
305             {
306 1     1 1 1 my $self = $_[0];
307 1         2 my $seqstring = &_pullseq($self);
308              
309             # now the real business
310              
311 1         2 $seqstring =~ tr/[ILV]/1/;
312 1         3 $seqstring =~ tr/[AGP]/2/;
313 1         1 $seqstring =~ tr/[MNQ]/3/;
314 1         1 $seqstring =~ tr/[CST]/4/;
315 1         2 $seqstring =~ tr/[DE]/5/;
316 1         0 $seqstring =~ tr/[KR]/6/;
317 1         1 $seqstring =~ tr/[FHWY]/7/;
318 1         2 $seqstring =~ tr/1/A/;
319 1         1 $seqstring =~ tr/2/C/;
320 1         1 $seqstring =~ tr/3/D/;
321 1         1 $seqstring =~ tr/4/E/;
322 1         2 $seqstring =~ tr/5/F/;
323 1         1 $seqstring =~ tr/6/G/;
324 1         1 $seqstring =~ tr/7/H/;
325              
326 1         4 return \$seqstring;
327              
328             # and that's that one
329             }
330              
331             =head2 Stanfel
332              
333             Title : Stanfel
334             Usage : $output = $oddcode_obj->Stanfel();
335             Function: turns amino acid sequence into 4-letter Stanfel alphabet
336             Example : a sequence ACDEFGH will become AACCDAE
337             : A (=ACGILMPSTV), C (=DENQ), D (=FWY), E (=HKR)
338             Returns : Reference to the new sequence string
339             Args : none
340              
341             =cut
342              
343             sub Stanfel()
344             {
345 1     1 1 2 my $self = $_[0];
346 1         9 my $seqstring = &_pullseq($self);
347              
348             # now the real business
349              
350 1         1 $seqstring =~ tr/[ACGILMPSTV]/1/;
351 1         2 $seqstring =~ tr/[DENQ]/2/;
352 1         1 $seqstring =~ tr/[FWY]/3/;
353 1         2 $seqstring =~ tr/[HKR]/4/;
354 1         1 $seqstring =~ tr/1/A/;
355 1         1 $seqstring =~ tr/2/C/;
356 1         1 $seqstring =~ tr/3/D/;
357 1         1 $seqstring =~ tr/4/E/;
358              
359 1         4 return \$seqstring;
360              
361             # and that's that one
362             }
363              
364             =head2 chemical
365              
366             Title : chemical
367             Usage : $output = $oddcode_obj->chemical();
368             Function: turns amino acid sequence into 8-letter chemical alphabet
369             : A (acidic), L (aliphatic), M (amide), R (aromatic)
370             : C (basic), H (hydroxyl), I (imino), S (sulphur)
371             Example : a sequence ACDEFGH will become LSAARLC
372             Returns : Reference to the new sequence string
373             Args : none
374              
375             =cut
376              
377             sub chemical()
378             {
379 1     1 1 1 my $self = $_[0];
380 1         2 my $seqstring = &_pullseq($self);
381              
382             # now the real business
383              
384 1         2 $seqstring =~ tr/[DE]/1/;
385 1         2 $seqstring =~ tr/[AGILV]/2/;
386 1         1 $seqstring =~ tr/[NQ]/3/;
387 1         1 $seqstring =~ tr/[FWY]/4/;
388 1         1 $seqstring =~ tr/[RHK]/5/;
389 1         1 $seqstring =~ tr/[ST]/6/;
390 1         1 $seqstring =~ tr/P/7/;
391 1         2 $seqstring =~ tr/[CM]/8/;
392 1         1 $seqstring =~ tr/1/A/;
393 1         1 $seqstring =~ tr/2/L/;
394 1         1 $seqstring =~ tr/3/M/;
395 1         1 $seqstring =~ tr/4/R/;
396 1         1 $seqstring =~ tr/5/C/;
397 1         1 $seqstring =~ tr/6/H/;
398 1         1 $seqstring =~ tr/7/I/;
399 1         2 $seqstring =~ tr/8/S/;
400              
401 1         4 return \$seqstring;
402              
403             # and that's that one
404             }
405              
406             =head2 charge
407              
408             Title : charge
409             Usage : $output = $oddcode_obj->charge();
410             Function: turns amino acid sequence into 3-letter charge alphabet
411             Example : a sequence ACDEFGH will become NNAANNC
412             : A (negative; NOT anode), C (positive; NOT cathode), N (neutral)
413             Returns : Reference to the new sequence string
414             Args : none
415              
416             =cut
417              
418             sub charge()
419             {
420 1     1 1 2 my $self = $_[0];
421 1         2 my $seqstring = &_pullseq($self);
422              
423             # now the real business
424              
425 1         3 $seqstring =~ tr/[DE]/1/;
426 1         2 $seqstring =~ tr/[HKR]/2/;
427 1         2 $seqstring =~ tr/[ACFGILMNPQSTVWY]/3/;
428 1         1 $seqstring =~ tr/1/A/;
429 1         2 $seqstring =~ tr/2/C/;
430 1         1 $seqstring =~ tr/3/N/;
431              
432 1         4 return \$seqstring;
433              
434             # and that's that one
435             }
436              
437             # _pullseq is called within each of the subroutines
438             # it just checks a few things and returns the sequence
439              
440             sub _pullseq
441             {
442 8     8   8 my $self = $_[0];
443              
444 8         7 my $seqobj = $self->{'_seqref'};
445              
446 8 50       29 unless ($seqobj->isa("Bio::PrimarySeqI"))
447             {
448 0         0 $self->throw("die, OddCodes works only on PrimarySeqI objects\n");
449             }
450 8 50 33     13 $self->warn("\tAll OddCode alphabets need a protein sequence,\n".
451             "\tbut BioPerl thinks this is not: [". $seqobj->id. "]")
452             unless $seqobj->alphabet eq 'protein' or $self->verbose < 0;;
453              
454 8         12 my $seqstring = uc $seqobj->seq();
455              
456 8 50       12 if(length($seqstring)<1)
457             {
458 0         0 $self->throw("$seqstring: die, sequence has zero length\n");
459             }
460 8         11 return $seqstring;
461             }
462              
463             1;