File Coverage

blib/lib/Bio/Tools/SeqPattern/Backtranslate.pm

Criterion	Covered	Total	%
statement	126	127	99.2
branch	26	28	92.8
condition	3	9	33.3
subroutine	17	17	100.0
pod			n/a
total	172	181	95.0

line	stmt	bran	cond	sub	time	code
1						package Bio::Tools::SeqPattern::Backtranslate;
2	2			2	633	use strict;
	2				2
	2				44
3	2			2	5	use warnings;
	2				2
	2				41
4
5	2			2	5	use base qw(Bio::Root::Root);
	2				3
	2				463
6	2			2	6	use base qw(Exporter);
	2				2
	2				95
7
8						=head1 NAME
9
10						Bio::Tools::SeqPattern::Backtranslate
11
12						=head1 DESCRIPTION
13
14						This module should not be used directly. It provides helper methods to
15						Bio::Tools::SeqPattern to reverse translate protein patterns.
16
17						=cut
18
19	2			2	862	use Bio::Seq;
	2				4
	2				62
20	2			2	9	use Bio::Tools::CodonTable;
	2				2
	2				75
21
22	2			2	6	use List::MoreUtils qw(uniq);
	2				2
	2				21
23	2			2	1004	use Carp qw(croak);
	2				2
	2				2072
24
25						our @EXPORT_OK = qw(_reverse_translate_motif);
26
27						our @EXPORT = @EXPORT_OK;
28
29						sub _reverse_translate_motif {
30						# Main subroutine. It takes a Profam-like motif and returns its
31						# reverse translation using degenerate codons.
32
33						# Steps:
34						# 1. Tokenize, then parse tokens.
35						# 2. Reverse translate each token type.
36						# 3. Join tokens in original order. Return the resulting string.
37
38	12			12	3316	my $motif = shift;
39
40	12				21	$motif =~ s/\./X/g;
41	12				19	$motif = uc $motif;
42
43						### 1. Tokenize, parse the motif.
44	12				17	my ( $ordered, $classified ) = _parse_motif($motif);
45
46						### 2. Reverse translate each token type.
47						# Reverse translate the plain (unambiguous) tokens.
48	11				30	my $ct = Bio::Tools::CodonTable->new;
49	11				10	foreach my $seq ( @{ $classified->{plain} } ) {
	11				15
50	19				59	my $seqO
51						= Bio::Seq->new( -seq => $$seq, -alphabet => 'protein' );
52	19				38	$$seq = $ct->reverse_translate_all($seqO);
53						}
54
55						# Reverse translate the ambiguous tokens.
56	11				9	foreach my $token ( @{ $classified->{ambiguous} } ) {
	11				19
57	8				31	my ($aas) = $$token =~ m(([A-Za-z\.]+));
58	8				8	my @codons_to_contract;
59
60	8				16	foreach my $residue ( split '', $aas ) {
61	32				55	push @codons_to_contract, $ct->revtranslate($residue);
62						}
63
64	8				15	my $ambiguous_codon = _contract_codons(@codons_to_contract);
65	8				13	$$token = $ambiguous_codon;
66						}
67
68						# Reverse translate the negated residues.
69	11				11	foreach my $token ( @{ $classified->{negated} } ) {
	11				14
70	6				18	my ($aas) = $$token =~ m(([A-Za-z\.]+));
71	6				9	my $ambiguous_codon = _negated_aas_to_codon($aas);
72	6				10	$$token = $ambiguous_codon;
73						}
74
75						### 3. Join the profile back from its tokens.
76	11				10	return join '', map {$$_} @{$ordered};
	41				71
	11				14
77
78						}
79
80						sub _parse_motif {
81						# Profam-like motif parser. It takes the pattern as a string, and
82						# returns two data structures that contain the tokens, organized
83						# by order of appearance in the pattern (first return value) and by
84						# category (second return value).
85
86	12			12	632	my $motif = shift;
87	12				19	my $parser = _tokenize_motif($motif);
88	12				9	my ( %tokens, @tokens );
89
90	12				14	while ( my $token = $parser->() ) {
91	42	100			80	croak ("Unknown syntax token: <", $token->[1], ">")
92						if ( $token->[0] eq 'UNKNOWN' );
93	41				21	push @{ $tokens{ $token->[0] } }, \$token->[1];
	41				70
94	41				66	push @tokens, \$token->[1];
95						}
96	11				41	return ( \@tokens, \%tokens );
97						}
98
99						sub _tokenize_motif {
100
101						# Return a tokenizer iterator that sequentially recognizes and
102						# returns each token in the input pattern.
103						# Examples of each token type:
104
105						# ambiguous: a position with more than one possible residue.
106						# eg. [ALEP]
107						# negated: a position in which some residues are excluded.
108						# eg. [^WY]
109						# plain: a common sequence of residues. One position, one residue.
110						# eg. MAAEIK
111						# open_par, close_par: tags surrounding a motif that is repeated
112						# a certain number of times.
113						# eg. (...){3}
114
115	12			12	12	my $target = shift;
116						return sub {
117	53	100		53	107	return [ 'ambiguous', $1 ]
118						if $target =~ /\G (\[[A-Za-z\.]+\]) /gcx;
119	45	100			70	return [ 'negated', $1 ]
120						if $target =~ /\G (\[\^[A-Za-z\.]+\]) /gcx;
121	39	100			117	return [ 'plain', $1 ]
122						if $target =~ /\G ([A-Za-z\.]+) /gcx;
123	20	100			43	return [ 'open_par', $1 ]
124						if $target =~ /\G (\() /gcx;
125	16	100			33	return [ 'close_par', $1 ]
126						if $target =~ /\G (\)[\{\d+[,\d+]\}]) /gcx;
127	12	100			18	return [ 'UNKNOWN', $1 ]
128						if $target =~ /\G (.) /gcx;
129	11				20	return;
130	12				37	};
131						}
132
133						sub _contract_codons {
134
135						# Take a list of codons, return an ambiguous codon.
136	8			8	10	my @codons = map { uc $_ } @_;
	56				54
137
138	8				13	my @by_letter = ( [], [], [], );
139	8				8	my $ambiguous_codon;
140	8				8	foreach my $codon (@codons) {
141	56				56	my @letters = split '', $codon;
142	56				44	for my $i ( 0 .. 2 ) {
143	168				89	push @{ $by_letter[$i] }, $letters[$i];
	168				200
144						}
145						}
146	8				7	for my $i ( 0 .. 2 ) {
147						$ambiguous_codon
148	24				19	.= _convert( 'dna', _uniq_string( @{ $by_letter[$i] } ) );
	24				33
149						}
150	8				22	return $ambiguous_codon;
151						}
152
153						sub _expand_codon {
154
155						# Given a degenerate codon, return a list with all its
156						# constituents. Takes a three-letter string (codon) as
157						# input, returns a list with three-letter scalars.
158
159	300			300	227	my $codon = shift;
160	300	50			372	die "Wrong codon length!\n" if length $codon != 3;
161
162
163	300				183	my ( @codons, @return_bases );
164	300				389	my @orig_bases = split '', $codon;
165
166	300				300	for my $i ( 0 .. 2 ) {
167
168						# from each redundant base, create a list with all their
169						# components (e.g., N -> (A, C, G, T) );
170	900				913	my @components = split '', _convert('dna', $orig_bases[$i] );
171	900				1519	$orig_bases[$i] = [@components];
172						}
173
174						# Combine all the bases of each of the three positions of the
175						# codons, and build the return list.
176	300				208	for my $i ( @{ $orig_bases[0] } ) {
	300				335
177	1050				608	for my $j ( @{ $orig_bases[1] } ) {
	1050				914
178	3558				1895	for my $k ( @{ $orig_bases[2] } ) {
	3558				3021
179	11832				10954	push @return_bases, $i . $j . $k;
180						}
181						}
182						}
183	300				2390	return @return_bases;
184						}
185
186						{
187						my %convert;
188
189						sub _convert {
190						# Interconvert between redundant and non-redundant protein and
191						# dna alphabets. Takes an alphabet (protein or dna) and a string
192						# with the letter, and returns its equivalent in
193						# redundant/non-redundant alphabet. Example ACTG -> N.
194
195	924			924	668	my ($alphabet, $letter) = @_;
196	924	50	33		4998	unless (
			33
			33
197						$alphabet and $alphabet =~ /^dna$\|^protein$/i
198						and $letter and length $letter <= 4
199	0				0	) { croak "Wrong arguments!\n"; }
200
201	924	100			1694	unless (%convert) {
202	2				19	%convert = (
203						'dna' => {
204						qw(N ACGT B CGT D AGT H ACT V ACG K GT
205						M AC R AG S CG W AT Y CT A A C C T T G G)
206						},
207						'protein' => {
208						'.' => 'ACDEFGHIJKLMNOPQRSTUVWY',
209						X => 'ACDEFGHIJKLMNOPQRSTUVWY',
210						Z => 'QE',
211						B => 'ND',
212						},
213						);
214
215						# Make %convert hash key/value agnostic.
216	2				4	foreach my $alphabet ( keys %convert ) {
217	38				51	map { $convert{$alphabet}->{ $convert{$alphabet}{$_} } = $_ }
218	4				4	keys %{ $convert{$alphabet} };
	4				7
219						}
220						}
221
222	924				1644	return $convert{$alphabet}{$letter};
223						}
224
225						}
226
227						sub _uniq_string {
228						# Takes a list of letters and returns an alphabetically sorted
229						# list with unique elements.
230
231	24			24	37	my @letters = @_;
232	24				89	return join '', sort { $a cmp $b } uniq @letters;
	48				77
233						}
234
235						{
236						my ( @codon_library, $ct );
237
238						sub _negated_aas_to_codon {
239
240						# Given a string of residues, returns a degenerate codon that will
241						# not be translated into any of them, while maximizing degeneracy
242						# (ie, it tries to also translate into as many residues as possible).
243
244						# This functionality is required for reverse translating profiles
245						# that contain negative patterns: [^X]. This means that the current
246						# position should not contain aminoacid X, but can have any of the
247						# others. The reverse translated nucleotide sequence should
248						# reflect this.
249
250						# Approach: construct a list of all possible codons, incluiding all
251						# degenerate bases. This is an array of 15x15x15 = 3375 elements.
252						# Order them by descendent "degeneracy".
253						# Return the first one whose expansion in 4-lettered codons
254						# doesn't contain a codon that translates into any of the
255						# non-wanted residues.
256
257						# * Since this takes some time, I presorted them and saved them.
258						# Reading them from a file takes a fraction of the time that it taes
259						# to re-sort them every time the application is launched.
260
261	6			6	7	my $aas_to_avoid = shift;
262
263						# Initialize reusable variables if it's the first time the sub
264						# is called.
265	6	100			14	unless (@codon_library) {
266	2				6	while () { chomp; push @codon_library, split ' ', $_ }
	356				214
	356				1863
267						}
268	6	100			13	unless ($ct) { $ct = Bio::Tools::CodonTable->new; }
	2				8
269
270						# Reverse translate the unwanted aminoacids to unwanted codons.
271	6				4	my @unwanted_codons;
272	6				12	foreach my $aa ( split '', $aas_to_avoid ) {
273	18				29	push @unwanted_codons, $ct->revtranslate($aa);
274						}
275
276	6				7	foreach my $degenerate_codon (@codon_library) {
277	300				336	my @codons = _expand_codon($degenerate_codon);
278	300				402	my $success = 1;
279
280	300				233	foreach my $unwanted (@unwanted_codons) {
281	3000	100			2129	if ( grep { uc $unwanted eq $_ } @codons ) {
	118320				91757
282	1788				1370	$success = 0;
283						}
284						}
285
286	300	100			781	if ($success) { return $degenerate_codon }
	6				24
287						}
288						}
289
290						}
291
292						1;
293
294						=head1 COPYRIGHT & LICENSE
295
296						Copyright 2009 Bruno Vecchi, all rights reserved.
297
298						This program is free software; you can redistribute it and/or modify it
299						under the same terms as Perl itself.
300
301						=cut
302
303						__DATA__