File Coverage

Bio/Tools/IUPAC.pm

Criterion	Covered	Total	%
statement	91	104	87.5
branch	22	36	61.1
condition	11	19	57.8
subroutine	16	17	94.1
pod	11	11	100.0
total	151	187	80.7

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for IUPAC
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Aaron Mackey
7							#
8							# Copyright Aaron Mackey
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
17							an ambiguous IUPAC sequence
18
19							=head1 SYNOPSIS
20
21							use Bio::PrimarySeq;
22							use Bio::Tools::IUPAC;
23
24							# Get the IUPAC code for proteins
25							my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
26
27							# Create a sequence with degenerate residues
28							my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
29
30							# Create all possible non-degenerate sequences
31							my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
32							while ($uniqueseq = $iupac->next_seq()) {
33							# process the unique Bio::Seq object.
34							}
35
36							# Get a regular expression that matches all possible sequences
37							my $regexp = $iupac->regexp();
38
39							=head1 DESCRIPTION
40
41							Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
42							following the IUPAC conventions. Non-standard characters have the meaning
43							described below:
44
45							IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
46							Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
47
48							---------------------------------------------------------------
49							Symbol Meaning Nucleic Acid
50							---------------------------------------------------------------
51							A A Adenine
52							C C Cytosine
53							G G Guanine
54							T T Thymine
55							U U Uracil
56							M A or C aMino
57							R A or G puRine
58							W A or T Weak
59							S C or G Strong
60							Y C or T pYrimidine
61							K G or T Keto
62							V A or C or G not T (closest unused char after T)
63							H A or C or T not G (closest unused char after G)
64							D A or G or T not C (closest unused char after C)
65							B C or G or T not A (closest unused char after A)
66							X G or A or T or C Unknown (very rarely used)
67							N G or A or T or C Unknown (commonly used)
68
69
70							IUPAC-IUP AMINO ACID SYMBOLS:
71							Biochem J. 1984 Apr 15; 219(2): 345-373
72							Eur J Biochem. 1993 Apr 1; 213(1): 2
73
74							------------------------------------------
75							Symbol Meaning
76							------------------------------------------
77							A Alanine
78							B Aspartic Acid, Asparagine
79							C Cysteine
80							D Aspartic Acid
81							E Glutamic Acid
82							F Phenylalanine
83							G Glycine
84							H Histidine
85							I Isoleucine
86							J Isoleucine/Leucine
87							K Lysine
88							L Leucine
89							M Methionine
90							N Asparagine
91							O Pyrrolysine
92							P Proline
93							Q Glutamine
94							R Arginine
95							S Serine
96							T Threonine
97							U Selenocysteine
98							V Valine
99							W Tryptophan
100							X Unknown
101							Y Tyrosine
102							Z Glutamic Acid, Glutamine
103							* Terminator
104
105							There are a few things Bio::Tools::IUPAC can do for you:
106
107							=over
108
109							=item *
110
111							report the IUPAC mapping between ambiguous and non-ambiguous residues
112
113							=item *
114
115							produce a stream of all possible corresponding unambiguous Bio::Seq objects given
116							an ambiguous sequence object
117
118							=item *
119
120							convert an ambiguous sequence object to a corresponding regular expression
121
122							=back
123
124							=head1 FEEDBACK
125
126							=head2 Mailing Lists
127
128							User feedback is an integral part of the evolution of this and other
129							Bioperl modules. Send your comments and suggestions preferably to one
130							of the Bioperl mailing lists. Your participation is much appreciated.
131
132							bioperl-l@bioperl.org - General discussion
133							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
134
135							=head2 Support
136
137							Please direct usage questions or support issues to the mailing list:
138
139							I
140
141							rather than to the module maintainer directly. Many experienced and
142							reponsive experts will be able look at the problem and quickly
143							address it. Please include a thorough description of the problem
144							with code and data examples if at all possible.
145
146							=head2 Reporting Bugs
147
148							Report bugs to the Bioperl bug tracking system to help us keep track
149							the bugs and their resolution. Bug reports can be submitted via the
150							web:
151
152							https://github.com/bioperl/bioperl-live/issues
153
154							=head1 AUTHOR - Aaron Mackey
155
156							Email amackey-at-virginia.edu
157
158							=head1 APPENDIX
159
160							The rest of the documentation details each of the object
161							methods. Internal methods are usually preceded with a _
162
163							=cut
164
165
166							package Bio::Tools::IUPAC;
167
168	204			204		1107	use strict;
	204					262
	204					4951
169	204			204		618	use base qw(Bio::Root::Root);
	204					216
	204					13641
170	204			204		779	use vars qw(%IUB %IUB_AMB %REV_IUB %IUP %IUP_AMB $AUTOLOAD);
	204					295
	204					43977
171
172							BEGIN {
173							# Ambiguous nucleic residues are matched to unambiguous residues
174	204			204		3681	%IUB = (
175							A => [qw(A)],
176							C => [qw(C)],
177							G => [qw(G)],
178							T => [qw(T)],
179							U => [qw(U)],
180							M => [qw(A C)],
181							R => [qw(A G)],
182							S => [qw(C G)],
183							W => [qw(A T)],
184							Y => [qw(C T)],
185							K => [qw(G T)],
186							V => [qw(A C G)],
187							H => [qw(A C T)],
188							D => [qw(A G T)],
189							B => [qw(C G T)],
190							N => [qw(A C G T)],
191							X => [qw(A C G T)],
192							);
193
194							# Same as %IUB but ambiguous residues are matched to ambiguous residues only
195	204					1673	%IUB_AMB = (
196							M => [qw(M)],
197							R => [qw(R)],
198							W => [qw(W)],
199							S => [qw(S)],
200							Y => [qw(Y)],
201							K => [qw(K)],
202							V => [qw(M R S V)],
203							H => [qw(H M W Y)],
204							D => [qw(D K R W)],
205							B => [qw(B K S Y)],
206							N => [qw(B D H K M N R S V W Y)],
207							);
208
209							# The inverse of %IUB
210	204					2809	%REV_IUB = (
211							A => 'A',
212							T => 'T',
213							U => 'U',
214							C => 'C',
215							G => 'G',
216							AC => 'M',
217							AG => 'R',
218							AT => 'W',
219							CG => 'S',
220							CT => 'Y',
221							GT => 'K',
222							ACG => 'V',
223							ACT => 'H',
224							AGT => 'D',
225							CGT => 'B',
226							ACGT => 'N',
227							N => 'N'
228							);
229
230							# Same thing with proteins now
231	204					3349	%IUP = (
232							A => [qw(A)],
233							B => [qw(D N)],
234							C => [qw(C)],
235							D => [qw(D)],
236							E => [qw(E)],
237							F => [qw(F)],
238							G => [qw(G)],
239							H => [qw(H)],
240							I => [qw(I)],
241							J => [qw(I L)],
242							K => [qw(K)],
243							L => [qw(L)],
244							M => [qw(M)],
245							N => [qw(N)],
246							O => [qw(O)],
247							P => [qw(P)],
248							Q => [qw(Q)],
249							R => [qw(R)],
250							S => [qw(S)],
251							T => [qw(T)],
252							U => [qw(U)],
253							V => [qw(V)],
254							W => [qw(W)],
255							X => [qw(X)],
256							Y => [qw(Y)],
257							Z => [qw(E Q)],
258							'' => [qw()],
259							);
260
261	204					181742	%IUP_AMB = (
262							B => [qw(B)],
263							J => [qw(J)],
264							Z => [qw(Z)],
265							);
266
267							}
268
269
270							=head2 new
271
272							Title : new
273							Usage : Bio::Tools::IUPAC->new($seq);
274							Function: Create a new IUPAC object, which acts as a sequence stream (akin to
275							SeqIO)
276							Args : an ambiguously coded sequence object that has a specified 'alphabet'
277							Returns : a Bio::Tools::IUPAC object.
278
279							=cut
280
281							sub new {
282	34			34	1	51	my ($class,@args) = @_;
283	34					70	my $self = $class->SUPER::new(@args);
284	34					80	my ($seq) = $self->_rearrange([qw(SEQ)],@args);
285
286	34	50	66			81	if ( (not defined $seq) && @args && ref($args[0]) ) {
			33
287							# parameter not passed as named parameter?
288	0					0	$seq = $args[0];
289							}
290
291	34	100				49	if (defined $seq) {
292	33	50				81	if (not $seq->isa('Bio::PrimarySeqI')) {
293	0					0	$self->throw('Must supply a sequence object');
294							}
295	33	50				48	if (length $seq->seq == 0) {
296	0					0	$self->throw('Sequence had zero-length');
297							}
298	33					45	$self->{'_seq'} = $seq;
299							}
300
301	34					75	return $self;
302							}
303
304
305							sub _initialize {
306	1			1		1	my ($self) = @_;
307	1					2	my %iupac = $self->iupac;
308	1					4	$self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
	9					10
309	1					3	$self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
310	1					3	$self->{'_string'}->[0] = -1;
311							}
312
313
314							=head2 next_seq
315
316							Title : next_seq
317							Usage : $iupac->next_seq();
318							Function: returns the next unique sequence object
319							Args : none.
320							Returns : a Bio::Seq object
321
322							=cut
323
324							sub next_seq {
325	9			9	1	8	my ($self) = @_;
326
327	9	50				16	if (not exists $self->{'_string'}) {
328	0					0	$self->_initialize();
329							}
330
331	9					8	for my $i ( 0 .. $#{$self->{'_string'}} ) {
	9					22
332	45	100	100			61	next unless $self->{'_string'}->[$i] \|\| @{$self->{'_alpha'}->[$i]} > 1;
	37					79
333	13	100				10	if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
	13					21
334	5	100				3	if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
	5					7
335	1					2	return;
336							} else {
337	4					4	$self->{'_string'}->[$i] = 0;
338	4					4	next;
339							}
340							} else {
341	8					8	$self->{'_string'}->[$i]++;
342	8					7	my $j = -1;
343	8					7	my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
	72					38
	72					70
	8					10
344	8		50			19	my $desc = $self->{'_seq'}->desc() \|\| '';
345	8					9	$self->{'_num'}++;
346	8					17	1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
347	8					44	$desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])\|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
348	8					11	$self->{'_num'} =~ s/,//g;
349
350							# Return a fresh sequence object
351	8					23	return Bio::PrimarySeq->new(-seq => $seqstr, -desc => $desc);
352							}
353							}
354							}
355
356
357							=head2 iupac
358
359							Title : iupac
360							Usage : my %symbols = $iupac->iupac;
361							Function: Returns a hash of symbols -> symbol components of the right type
362							for the given sequence, i.e. it is the same as iupac_iup() if
363							Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
364							sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
365							Args : none
366							Returns : Hash
367
368							=cut
369
370							sub iupac {
371	35			35	1	29	my ($self) = @_;
372	35					65	my $alphabet = lc( $self->{'_seq'}->alphabet() );
373	35	50	33			83	if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
		0
374	35					251	return %IUB; # nucleic
375							} elsif ( $alphabet eq 'protein' ) {
376	0					0	return %IUP; # proteic
377							} else {
378	0					0	$self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
379							}
380							}
381
382
383
384							=head2 iupac_amb
385
386							Title : iupac_amb
387							Usage : my %symbols = $iupac->iupac_amb;
388							Function: Same as iupac() but only contains a mapping between ambiguous residues
389							and the ambiguous residues they map to. For example, the key 'N' has
390							the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
391							i.e. it matches all other ambiguous residues.
392							Args : none
393							Returns : Hash
394
395							=cut
396
397							sub iupac_amb {
398	34			34	1	27	my ($self) = @_;
399	34					47	my $alphabet = lc( $self->{'_seq'}->alphabet() );
400	34	50	33			74	if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
		0
401	34					126	return %IUB_AMB; # nucleic
402							} elsif ( $alphabet eq 'protein' ) {
403	0					0	return %IUP_AMB; # proteic
404							} else {
405	0					0	$self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
406							}
407							}
408
409
410							=head2 iupac_iup
411
412							Title : iupac_iup
413							Usage : my %aasymbols = $iupac->iupac_iup;
414							Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
415							Args : none
416							Returns : Hash
417
418							=cut
419
420							sub iupac_iup {
421	206			206	1	2369	return %IUP;
422							}
423
424
425							=head2 iupac_iup_amb
426
427							Title : iupac_iup_amb
428							Usage : my %aasymbols = $iupac->iupac_iup_amb;
429							Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
430							Args : none
431							Returns : Hash
432
433							=cut
434
435							sub iupac_iup_amb {
436	1			1	1	8	return %IUP_AMB;
437							}
438
439
440							=head2 iupac_iub
441
442							Title : iupac_iub
443							Usage : my %dnasymbols = $iupac->iupac_iub;
444							Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
445							Args : none
446							Returns : Hash
447
448							=cut
449
450							sub iupac_iub {
451	205			205	1	2216	return %IUB;
452							}
453
454
455							=head2 iupac_iub_amb
456
457							Title : iupac_iub_amb
458							Usage : my %dnasymbols = $iupac->iupac_iub;
459							Function: Returns a hash of DNA symbols -> ambiguous symbol components
460							Args : none
461							Returns : Hash
462
463							=cut
464
465							sub iupac_iub_amb {
466	1			1	1	10	return %IUB_AMB;
467							}
468
469
470							=head2 iupac_rev_iub
471
472							Title : iupac_rev_iub
473							Usage : my %dnasymbols = $iupac->iupac_rev_iub;
474							Function: Returns a hash of nucleotide combinations -> IUPAC code
475							(a reverse of the iupac_iub hash).
476							Args : none
477							Returns : Hash
478
479							=cut
480
481							sub iupac_rev_iub {
482	23			23	1	270	return %REV_IUB;
483							}
484
485
486							=head2 count
487
488							Title : count
489							Usage : my $total = $iupac->count();
490							Function: Calculates the number of unique, unambiguous sequences that
491							this ambiguous sequence could generate
492							Args : none
493							Return : int
494
495							=cut
496
497							sub count {
498	1			1	1	442	my ($self) = @_;
499	1	50				4	if (not exists $self->{'_string'}) {
500	1					2	$self->_initialize();
501							}
502	1					2	my $count = 1;
503	1					1	$count *= scalar(@$_) for (@{$self->{'_alpha'}});
	1					5
504	1					2	return $count;
505							}
506
507
508							=head2 regexp
509
510							Title : regexp
511							Usage : my $re = $iupac->regexp();
512							Function: Converts the ambiguous sequence into a regular expression that
513							matches all of the corresponding ambiguous and non-ambiguous sequences.
514							You can further manipulate the resulting regular expression with the
515							Bio::Tools::SeqPattern module. After you are done building your
516							regular expression, you might want to compile it and make it case-
517							insensitive:
518							$re = qr/$re/i;
519							Args : 1 to match RNA: T and U characters will match interchangeably
520							Return : regular expression
521
522							=cut
523
524							sub regexp {
525	33			33	1	27	my ($self, $match_rna) = @_;
526	33					24	my $re;
527	33					44	my $seq = $self->{'_seq'}->seq;
528	33					54	my %iupac = $self->iupac;
529	33					76	my %iupac_amb = $self->iupac_amb;
530	33					103	for my $pos (0 .. length($seq)-1) {
531	285					232	my $res = substr $seq, $pos, 1;
532	285					212	my $iupacs = $iupac{$res};
533	285		100			581	my $iupacs_amb = $iupac_amb{$res} \|\| [];
534	285	50				322	if (not defined $iupacs) {
535	0					0	$self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
536							" Offending character was '$res'.\n");
537							}
538	285					268	my $part = join '', (@$iupacs, @$iupacs_amb);
539	285	100				314	if ($match_rna) {
540	276	100				513	$part =~ s/T/TU/i \|\| $part =~ s/U/TU/i;
541							}
542	285	100				321	if (length $part > 1) {
543	103					92	$part = '['.$part.']';
544							}
545	285					287	$re .= $part;
546							}
547	33					122	return $re;
548							}
549
550
551							sub AUTOLOAD {
552	0			0			my $self = shift @_;
553	0						my $method = $AUTOLOAD;
554	0						$method =~ s/.*:://;
555	0	0					return $self->{'_seq'}->$method(@_)
556							unless $method eq 'DESTROY';
557							}
558
559							1;
560