line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# Basic GeneDesign libraries |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Bio::GeneDesign::Basic |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 VERSION |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
Version 5.56 |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 DESCRIPTION |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
GeneDesign is a library for the computer-assisted design of synthetic genes |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 AUTHOR |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Sarah Richardson |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=cut |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
package Bio::GeneDesign::Basic; |
24
|
|
|
|
|
|
|
require Exporter; |
25
|
|
|
|
|
|
|
|
26
|
11
|
|
|
11
|
|
73
|
use POSIX qw(log10); |
|
11
|
|
|
|
|
21
|
|
|
11
|
|
|
|
|
92
|
|
27
|
11
|
|
|
11
|
|
16223
|
use Carp; |
|
11
|
|
|
|
|
20
|
|
|
11
|
|
|
|
|
632
|
|
28
|
|
|
|
|
|
|
|
29
|
11
|
|
|
11
|
|
64
|
use strict; |
|
11
|
|
|
|
|
29
|
|
|
11
|
|
|
|
|
286
|
|
30
|
11
|
|
|
11
|
|
61
|
use warnings; |
|
11
|
|
|
|
|
17
|
|
|
11
|
|
|
|
|
499
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
our $VERSION = 5.56; |
33
|
|
|
|
|
|
|
|
34
|
11
|
|
|
11
|
|
60
|
use base qw(Exporter); |
|
11
|
|
|
|
|
32
|
|
|
11
|
|
|
|
|
25658
|
|
35
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
36
|
|
|
|
|
|
|
_generate_dimers |
37
|
|
|
|
|
|
|
_sanitize |
38
|
|
|
|
|
|
|
_count |
39
|
|
|
|
|
|
|
_gcwindows |
40
|
|
|
|
|
|
|
_ntherm |
41
|
|
|
|
|
|
|
_complement |
42
|
|
|
|
|
|
|
_toRNA |
43
|
|
|
|
|
|
|
_melt |
44
|
|
|
|
|
|
|
_regres |
45
|
|
|
|
|
|
|
_regarr |
46
|
|
|
|
|
|
|
_positions |
47
|
|
|
|
|
|
|
_find_runs |
48
|
|
|
|
|
|
|
_is_ambiguous |
49
|
|
|
|
|
|
|
_compare_sequences |
50
|
|
|
|
|
|
|
_amb_transcription |
51
|
|
|
|
|
|
|
_add_arr |
52
|
|
|
|
|
|
|
$ambnt |
53
|
|
|
|
|
|
|
@BASES |
54
|
|
|
|
|
|
|
%NTIDES |
55
|
|
|
|
|
|
|
%AA_NAMES |
56
|
|
|
|
|
|
|
%AACIDS |
57
|
|
|
|
|
|
|
); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
our %EXPORT_TAGS = (GD=> \@EXPORT_OK); |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=head1 Definitions |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=cut |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
our @BASES = qw(A T C G); |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
our %NTIDES = ( |
68
|
|
|
|
|
|
|
A => [qw(A)], T => [qw(T)], |
69
|
|
|
|
|
|
|
C => [qw(C)], G => [qw(G)], |
70
|
|
|
|
|
|
|
R => [qw(A G)], Y => [qw(C T)], |
71
|
|
|
|
|
|
|
W => [qw(A T)], S => [qw(C G)], |
72
|
|
|
|
|
|
|
K => [qw(G T)], M => [qw(A C)], |
73
|
|
|
|
|
|
|
B => [qw(C G T)], D => [qw(A G T)], |
74
|
|
|
|
|
|
|
H => [qw(A C T)], V => [qw(A C G)], |
75
|
|
|
|
|
|
|
N => [qw(A C G T)], |
76
|
|
|
|
|
|
|
); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
my %REGHSH = ( |
79
|
|
|
|
|
|
|
A => 'A', T => 'T', |
80
|
|
|
|
|
|
|
C => 'C', G => 'G', |
81
|
|
|
|
|
|
|
R => '[AGR]', Y => '[CTY]', |
82
|
|
|
|
|
|
|
W => '[ATW]', S => '[CGS]', |
83
|
|
|
|
|
|
|
K => '[GKT]', M => '[ACM]', |
84
|
|
|
|
|
|
|
B => '[BCGKSTY]', D => '[ADGKRTW]', |
85
|
|
|
|
|
|
|
H => '[ACHMTWY]', V => '[ACGMRSV]', |
86
|
|
|
|
|
|
|
N => '[ABCDGHKMNRSTVWY]', |
87
|
|
|
|
|
|
|
); |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
our $ambnt = qr/[R Y W S K M B D H V N]+/x; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
our %AACIDS = map { $_ => $_ } qw(A C D E F G H I K L M N P Q R S T V W Y); |
92
|
|
|
|
|
|
|
$AACIDS{"*"} = "[\*]"; |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
our %AA_NAMES = ( |
95
|
|
|
|
|
|
|
A => 'Ala', B => 'Unk', C => 'Cys', D => 'Asp', E => 'Glu', F => 'Phe', |
96
|
|
|
|
|
|
|
G => 'Gly', H => 'His', I => 'Ile', J => 'Unk', K => 'Lys', L => 'Leu', |
97
|
|
|
|
|
|
|
M => 'Met', N => 'Asn', O => 'Unk', P => 'Pro', Q => 'Gln', R => 'Arg', |
98
|
|
|
|
|
|
|
S => 'Ser', T => 'Thr', U => 'Unk', V => 'Val', W => 'Trp', X => 'Unk', |
99
|
|
|
|
|
|
|
Y => 'Tyr', Z => 'Unk', '*' => 'Stp'); |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# entropy, enthalpy, and free energy of paired bases |
102
|
|
|
|
|
|
|
my %EEG = ( |
103
|
|
|
|
|
|
|
"TC" => ([ 8.8, 23.5, 1.5]), "GA" => ([ 8.8, 23.5, 1.5]), |
104
|
|
|
|
|
|
|
"CT" => ([ 6.6, 16.4, 1.5]), "AG" => ([ 6.6, 16.4, 1.5]), |
105
|
|
|
|
|
|
|
"GG" => ([10.9, 28.4, 2.1]), "CC" => ([10.9, 28.4, 2.1]), |
106
|
|
|
|
|
|
|
"AA" => ([ 8.0, 21.9, 1.2]), "TT" => ([ 8.0, 21.9, 1.2]), |
107
|
|
|
|
|
|
|
"AT" => ([ 5.6, 15.2, 0.9]), "TA" => ([ 6.6, 18.4, 0.9]), |
108
|
|
|
|
|
|
|
"CG" => ([11.8, 29.0, 2.8]), "GC" => ([10.5, 26.4, 2.3]), |
109
|
|
|
|
|
|
|
"CA" => ([ 8.2, 21.0, 1.7]), "TG" => ([ 8.2, 21.0, 1.7]), |
110
|
|
|
|
|
|
|
"GT" => ([ 9.4, 25.5, 1.5]), "AC" => ([ 9.4, 25.5, 1.5]) |
111
|
|
|
|
|
|
|
); |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 Functions |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 _generate_dimers |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=cut |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub _generate_dimers |
120
|
|
|
|
|
|
|
{ |
121
|
0
|
|
|
0
|
|
0
|
my @dimers; |
122
|
0
|
|
|
|
|
0
|
foreach my $a (@BASES) |
123
|
|
|
|
|
|
|
{ |
124
|
0
|
|
|
|
|
0
|
foreach my $b (@BASES) |
125
|
|
|
|
|
|
|
{ |
126
|
0
|
|
|
|
|
0
|
push @dimers, $a . $b; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
} |
129
|
0
|
|
|
|
|
0
|
return @dimers; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 _sanitize() |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
remove non nucleotide characters from nucleotide sequences. remove non amino |
135
|
|
|
|
|
|
|
acid characters from peptide sequences. return sanitized strand and list of bad |
136
|
|
|
|
|
|
|
characters. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=cut |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub _sanitize |
141
|
|
|
|
|
|
|
{ |
142
|
20001
|
|
|
20001
|
|
37359
|
my ($strand, $swit) = @_; |
143
|
20001
|
|
50
|
|
|
40468
|
$swit = $swit || 'nuc'; |
144
|
20001
|
50
|
33
|
|
|
72571
|
die ('bad sanitize switch') if ($swit ne 'nuc' && $swit ne 'pep'); |
145
|
20001
|
|
|
|
|
30732
|
my %noncount = (); |
146
|
20001
|
|
|
|
|
27676
|
my $newstrand = q{}; |
147
|
20001
|
|
|
|
|
69752
|
my @arr = split q{}, $strand; |
148
|
20001
|
|
|
|
|
32444
|
foreach my $char (@arr) |
149
|
|
|
|
|
|
|
{ |
150
|
100199
|
|
|
|
|
131202
|
my $CH = uc $char; |
151
|
100199
|
50
|
33
|
|
|
386967
|
if ($swit eq 'nuc' && exists $NTIDES{$CH} |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
152
|
|
|
|
|
|
|
|| ($swit eq 'pep' && exists $AACIDS{$CH})) |
153
|
|
|
|
|
|
|
{ |
154
|
100199
|
|
|
|
|
157276
|
$newstrand .= $char; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
else |
157
|
|
|
|
|
|
|
{ |
158
|
0
|
|
|
|
|
0
|
$noncount{$char}++; |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
} |
161
|
20001
|
|
|
|
|
36307
|
my @baddies = keys %noncount; |
162
|
20001
|
|
|
|
|
68088
|
return ($newstrand, @baddies); |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head2 _count() |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
takes a nucleotide sequence and returns a base count. Looks for total length, |
168
|
|
|
|
|
|
|
purines, pyrimidines, and degenerate bases. If degenerate bases are present |
169
|
|
|
|
|
|
|
assumes their substitution for non degenerate bases is totally random for |
170
|
|
|
|
|
|
|
percentage estimation. |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
in: nucleotide sequence (string), |
173
|
|
|
|
|
|
|
out: base count (hash) |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=cut |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub _count |
178
|
|
|
|
|
|
|
{ |
179
|
5
|
|
|
5
|
|
8
|
my ($strand) = @_; |
180
|
5
|
|
|
|
|
9
|
my $len = length $strand; |
181
|
5
|
|
|
|
|
137
|
my @arr = split q{}, uc $strand; |
182
|
5
|
|
|
|
|
24
|
my %C = map {$_ => 0} keys %NTIDES; |
|
75
|
|
|
|
|
116
|
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
#Count bases |
185
|
5
|
|
|
|
|
196
|
$C{$_}++ foreach @arr; |
186
|
|
|
|
|
|
|
|
187
|
5
|
|
|
|
|
29
|
$C{d} += $C{$_} foreach (qw(A T C G)); |
188
|
5
|
|
|
|
|
25
|
$C{n} += $C{$_} foreach (qw(B D H K M N R S V W Y)); |
189
|
5
|
|
|
|
|
10
|
$C{'?'} = ($C{d} + $C{n}) - $len; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
#Estimate how many of each degenerate base would be a G or C |
192
|
5
|
|
|
|
|
19
|
my $split = .5*$C{R} + .5*$C{Y} + .5*$C{K} + .5*$C{M} + .5*$C{N}; |
193
|
5
|
|
|
|
|
14
|
my $trip = (2*$C{B} / 3) + (2*$C{V} / 3) + ($C{D} / 3) + ($C{H} / 3); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
#Calculate GC/AT percentage |
196
|
5
|
|
|
|
|
10
|
my $gcc = $C{S} + $C{G} + $C{C} + $split + $trip; |
197
|
5
|
|
|
|
|
39
|
my $gcp = sprintf "%.1f", ($gcc / $len) * 100; |
198
|
5
|
|
|
|
|
9
|
$C{GCp} = $gcp; |
199
|
5
|
|
|
|
|
16
|
$C{ATp} = 100 - $gcp; |
200
|
5
|
|
|
|
|
7
|
$C{len} = $len; |
201
|
|
|
|
|
|
|
|
202
|
5
|
|
|
|
|
56
|
return \%C; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head2 _gcwindows() |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
takes a nucleotide sequence, a window size, and minimum and maximum values. |
208
|
|
|
|
|
|
|
returns lists of real coordinates of subsequences that violate mimimum or |
209
|
|
|
|
|
|
|
maximum GC percentages. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Values are returned inside an array reference such that the first value is an |
212
|
|
|
|
|
|
|
array ref of minimum violators (as array refs of left/right coordinates), and |
213
|
|
|
|
|
|
|
the second value is an array ref of maximum violators. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
$return_value = [ |
216
|
|
|
|
|
|
|
[[left, right], [left, right]], #minimum violators |
217
|
|
|
|
|
|
|
[[left, right], [left, right]] #maximum violators |
218
|
|
|
|
|
|
|
]; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=cut |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub _gcwindows |
223
|
|
|
|
|
|
|
{ |
224
|
0
|
|
|
0
|
|
0
|
my ($seq, $winsize, $min, $max) = @_; |
225
|
0
|
|
|
|
|
0
|
my @minflags = (); |
226
|
0
|
|
|
|
|
0
|
my @maxflags = (); |
227
|
0
|
|
|
|
|
0
|
my $seqlen = length $seq; |
228
|
0
|
|
|
|
|
0
|
my $lefpos = 0; |
229
|
0
|
|
|
|
|
0
|
my $rigpos = $winsize - 1; |
230
|
0
|
|
|
|
|
0
|
my $win = substr $seq, $lefpos, $winsize; |
231
|
0
|
|
|
|
|
0
|
my %hsh = (G => 0, C => 0, A => 0, T => 0); |
232
|
0
|
|
|
|
|
0
|
$hsh{$_}++ foreach (split q{}, $win); |
233
|
0
|
|
|
|
|
0
|
my $initbase = substr $seq, $lefpos, 1; |
234
|
0
|
|
|
|
|
0
|
my $lastbase = substr $seq, $rigpos, 1; |
235
|
0
|
|
|
|
|
0
|
while ($rigpos < $seqlen) |
236
|
|
|
|
|
|
|
{ |
237
|
0
|
|
|
|
|
0
|
my $GCperc = 100 * (($hsh{G} + $hsh{C}) / $winsize); |
238
|
0
|
|
|
|
|
0
|
$lefpos++; |
239
|
0
|
|
|
|
|
0
|
$rigpos++; |
240
|
0
|
0
|
|
|
|
0
|
if ($GCperc < $min) |
241
|
|
|
|
|
|
|
{ |
242
|
0
|
|
|
|
|
0
|
push @minflags, [$lefpos, $rigpos]; |
243
|
|
|
|
|
|
|
} |
244
|
0
|
0
|
|
|
|
0
|
if ($GCperc > $max) |
245
|
|
|
|
|
|
|
{ |
246
|
0
|
|
|
|
|
0
|
push @maxflags, [$lefpos, $rigpos]; |
247
|
|
|
|
|
|
|
} |
248
|
0
|
|
|
|
|
0
|
$hsh{$initbase}--; |
249
|
0
|
|
|
|
|
0
|
$initbase = substr $seq, $lefpos, 1; |
250
|
0
|
|
|
|
|
0
|
$lastbase = substr $seq, $rigpos, 1; |
251
|
0
|
|
|
|
|
0
|
$hsh{$lastbase}++; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
0
|
|
|
|
|
0
|
return [\@minflags, \@maxflags]; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head2 _melt() |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
takes a nucleotide sequence and returns a melting temperature |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
in: nucleotide sequence (string) |
262
|
|
|
|
|
|
|
salt concentration (string, opt, def =.05), |
263
|
|
|
|
|
|
|
oligo concentration (string, opt, def = .0000001) |
264
|
|
|
|
|
|
|
out: temperature (float) |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub _melt |
269
|
|
|
|
|
|
|
{ |
270
|
3
|
|
|
3
|
|
7
|
my ($strand, $salt, $conc) = @_; |
271
|
3
|
|
|
|
|
5
|
my $C = _count($strand); |
272
|
3
|
|
|
|
|
29
|
my $len = length($strand); |
273
|
3
|
|
50
|
|
|
8
|
$salt = $salt || .05; |
274
|
3
|
|
50
|
|
|
6
|
$conc = $conc || .0000001; |
275
|
3
|
|
|
|
|
16
|
my $Naj = 16.6 * log10($salt); |
276
|
|
|
|
|
|
|
|
277
|
3
|
|
|
|
|
5
|
my $Tm; |
278
|
|
|
|
|
|
|
|
279
|
3
|
100
|
33
|
|
|
13
|
if ($len < 14) |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
280
|
|
|
|
|
|
|
{ |
281
|
1
|
|
|
|
|
4
|
$Tm = ((4 * ($C->{C} + $C->{G})) + (2 * ($C->{A} + $C->{T}))); |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
elsif ($len >= 14 && $len <= 50) |
284
|
|
|
|
|
|
|
{ |
285
|
2
|
|
|
|
|
7
|
$Tm = 100.5 + (41 * ($C->{G} + $C->{C}) / $len) - (820/$len) + $Naj; |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
elsif ($len > 50) |
288
|
|
|
|
|
|
|
{ |
289
|
0
|
|
|
|
|
0
|
$Tm = 81.5 + (41 * ($C->{G} + $C->{C}) / $len) - (500/$len) + $Naj - .62; |
290
|
|
|
|
|
|
|
} |
291
|
3
|
|
|
|
|
24
|
return sprintf "%.1f", $Tm; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head2 _ntherm() |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
takes a nucleotide sequence and returns a nearest neighbor melting temp. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
in: nucleotide sequence (string) |
299
|
|
|
|
|
|
|
out: temperature (float) |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=cut |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
sub _ntherm |
304
|
|
|
|
|
|
|
{ |
305
|
1
|
|
|
1
|
|
3
|
my ($strand, $salt, $conc) = @_; |
306
|
1
|
|
|
|
|
3
|
my ($dH, $dS, $dG) = (0, 0, 0); |
307
|
1
|
|
|
|
|
6
|
foreach my $w (keys %EEG) |
308
|
|
|
|
|
|
|
{ |
309
|
16
|
|
|
|
|
104
|
while ($strand =~ /(?=$w)/ixg) |
310
|
|
|
|
|
|
|
{ |
311
|
32
|
|
|
|
|
44
|
$dH += $EEG{$w}->[0]; |
312
|
32
|
|
|
|
|
90
|
$dS += $EEG{$w}->[1]; |
313
|
|
|
|
|
|
|
#$dG += $EEG{$w}->[2]; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
} |
316
|
1
|
|
50
|
|
|
6
|
$salt = $salt || .05; |
317
|
1
|
|
50
|
|
|
4
|
$conc = $conc || .0000001; |
318
|
1
|
|
|
|
|
5
|
my $Naj = 16.6 * log10($salt); |
319
|
1
|
|
|
|
|
4
|
my $lc = 1.987 * abs( log( $conc / 2 ) ); |
320
|
1
|
|
|
|
|
4
|
my $Tm = ( ( ($dH - 3.4) / ( ($dS + $lc) / 1000) ) - 273.15) + $Naj; |
321
|
1
|
|
|
|
|
10
|
return sprintf "%.1f", $Tm; |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=head2 _complement() |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
takes a nucleotide sequence and returns its complement or reverse complement. |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
in: nucleotide sequence (string), |
329
|
|
|
|
|
|
|
switch for reverse complement (1 or null) |
330
|
|
|
|
|
|
|
out: nucleotide sequence (string) |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=cut |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
sub _complement |
335
|
|
|
|
|
|
|
{ |
336
|
1190
|
|
|
1190
|
|
2692
|
my ($strand, $swit) = @_; |
337
|
1190
|
100
|
|
|
|
2964
|
$strand = scalar reverse($strand) if ($swit); |
338
|
1190
|
|
|
|
|
2261
|
$strand =~ tr/AaCcGgTtRrYyKkMmBbDdHhVv/TtGgCcAaYyRrMmKkVvHhDdBb/; |
339
|
1190
|
|
|
|
|
3671
|
return $strand; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=head2 _toRNA() |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=cut |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
sub _toRNA |
347
|
|
|
|
|
|
|
{ |
348
|
0
|
|
|
0
|
|
0
|
my ($strand) = @_; |
349
|
0
|
|
|
|
|
0
|
$strand =~ tr/Tt/Uu/; |
350
|
0
|
|
|
|
|
0
|
return $strand; |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=head2 _regres() |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
takes a sequence that may be degenerate and returns a string that is prepped |
356
|
|
|
|
|
|
|
for use in a regular expression. |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
in: sequence (string), |
359
|
|
|
|
|
|
|
switch for aa or nt sequence (1 or null) |
360
|
|
|
|
|
|
|
out: regexp string (string) |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=cut |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
sub _regres |
365
|
|
|
|
|
|
|
{ |
366
|
1052
|
|
|
1052
|
|
1734
|
my ($sequence, $swit) = @_; |
367
|
1052
|
|
100
|
|
|
1889
|
$swit = $swit || 1; |
368
|
1052
|
|
|
|
|
1308
|
my $comp = q{}; |
369
|
1052
|
|
|
|
|
3809
|
my @arr = split('', uc $sequence); |
370
|
1052
|
|
|
|
|
1899
|
foreach my $char (@arr) |
371
|
|
|
|
|
|
|
{ |
372
|
6529
|
100
|
|
|
|
8635
|
if ($swit == 1) |
|
|
50
|
|
|
|
|
|
373
|
|
|
|
|
|
|
{ |
374
|
6505
|
|
|
|
|
8194
|
my $ntide = $REGHSH{$char}; |
375
|
6505
|
50
|
|
|
|
9145
|
croak ("$char is not a nucleotide\n") unless $ntide; |
376
|
6505
|
|
|
|
|
8927
|
$comp .= $ntide; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
elsif ($swit == 2) |
379
|
|
|
|
|
|
|
{ |
380
|
24
|
|
|
|
|
31
|
my $aa = $AACIDS{$char}; |
381
|
24
|
50
|
|
|
|
31
|
croak ("$char is not an amino acid \n") unless $aa; |
382
|
24
|
|
|
|
|
27
|
$comp .= $aa; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
} |
385
|
1052
|
|
|
|
|
19268
|
return qr/$comp/ix; |
386
|
|
|
|
|
|
|
} |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=head2 regarr() |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=cut |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
sub _regarr |
393
|
|
|
|
|
|
|
{ |
394
|
773
|
|
|
773
|
|
1349
|
my ($sequence) = @_; |
395
|
773
|
|
|
|
|
1567
|
my $regex = [_regres($sequence, 1)]; |
396
|
773
|
|
|
|
|
1968
|
my $comp = _complement($sequence, 1); |
397
|
773
|
100
|
|
|
|
1942
|
if ($comp ne $sequence) |
398
|
|
|
|
|
|
|
{ |
399
|
202
|
|
|
|
|
379
|
push @$regex, _regres($comp, 1); |
400
|
|
|
|
|
|
|
} |
401
|
773
|
|
|
|
|
2272
|
return $regex; |
402
|
|
|
|
|
|
|
} |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=head2 _positions() |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=cut |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
sub _positions |
409
|
|
|
|
|
|
|
{ |
410
|
6
|
|
|
6
|
|
14
|
my ($seq, $regarr) = @_; |
411
|
6
|
|
|
|
|
14
|
my $temphash = {}; |
412
|
6
|
|
|
|
|
11
|
foreach my $sit (@$regarr) |
413
|
|
|
|
|
|
|
{ |
414
|
10
|
|
|
|
|
156
|
while ($seq =~ /(?=($sit))/ixg) |
415
|
|
|
|
|
|
|
{ |
416
|
8
|
|
|
|
|
69
|
$temphash->{pos $seq} = $1; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
} |
419
|
6
|
|
|
|
|
25
|
return $temphash; |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=head2 _find_runs |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=cut |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub _find_runs |
427
|
|
|
|
|
|
|
{ |
428
|
0
|
|
|
0
|
|
0
|
my ($seq, $pattern, $min) = @_; |
429
|
0
|
|
|
|
|
0
|
my @arr = (); |
430
|
0
|
|
0
|
|
|
0
|
$min = $min || 5; |
431
|
0
|
0
|
|
|
|
0
|
if (_is_ambiguous($pattern)) |
432
|
|
|
|
|
|
|
{ |
433
|
0
|
|
|
|
|
0
|
die "Ambiguous pattern passed to _find_runs"; |
434
|
|
|
|
|
|
|
} |
435
|
0
|
|
|
|
|
0
|
my $reg = '[' . $pattern . ']'; |
436
|
0
|
|
|
|
|
0
|
while ($seq =~ m{(($pattern) {$min,})}msixg) |
437
|
|
|
|
|
|
|
{ |
438
|
0
|
|
|
|
|
0
|
my $len = length $1; |
439
|
0
|
|
|
|
|
0
|
my $rig = pos $seq; |
440
|
0
|
|
|
|
|
0
|
my $lef = $rig - $len + 1; |
441
|
0
|
|
|
|
|
0
|
push @arr, [$lef, $rig]; |
442
|
|
|
|
|
|
|
} |
443
|
0
|
|
|
|
|
0
|
return \@arr; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 _is_ambiguous |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=cut |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
sub _is_ambiguous |
451
|
|
|
|
|
|
|
{ |
452
|
5
|
|
|
5
|
|
10
|
my ($sequence) = @_; |
453
|
5
|
100
|
|
|
|
29
|
return 1 if ($sequence =~ $ambnt); |
454
|
4
|
|
|
|
|
10
|
return 0; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=head2 _compare_sequences() |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
takes two nucleotide sequences that are assumed to be perfectly aligned and |
460
|
|
|
|
|
|
|
roughly equivalent and returns similarity metrics. should be twweeaakkeedd |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
in: 2x nucleotide sequence (string) |
463
|
|
|
|
|
|
|
out: similarity metrics (hash) |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=cut |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
sub _compare_sequences |
468
|
|
|
|
|
|
|
{ |
469
|
433
|
|
|
433
|
|
630
|
my ($topseq, $botseq) = @_; |
470
|
433
|
50
|
33
|
|
|
1094
|
return if (!$botseq || length($botseq) != length($topseq)); |
471
|
433
|
|
|
|
|
590
|
my ($tsit, $tver, $len) = (0, 0, length($topseq)); |
472
|
433
|
|
|
|
|
618
|
while (length($topseq) > 0) |
473
|
|
|
|
|
|
|
{ |
474
|
2504
|
|
|
|
|
3694
|
my ($topbit, $botbit) = (chop($topseq), chop ($botseq)); |
475
|
2504
|
100
|
|
|
|
4036
|
if ($topbit ne $botbit) |
476
|
|
|
|
|
|
|
{ |
477
|
814
|
100
|
|
|
|
1734
|
$topbit = $topbit =~ $REGHSH{R} ? 1 : 0; |
478
|
814
|
100
|
|
|
|
1477
|
$botbit = $botbit =~ $REGHSH{R} ? 1 : 0; |
479
|
814
|
100
|
|
|
|
1127
|
$tsit++ if ($topbit == $botbit); |
480
|
814
|
100
|
|
|
|
1476
|
$tver++ if ($topbit != $botbit); |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
} |
483
|
433
|
|
|
|
|
463
|
my %A; |
484
|
433
|
|
|
|
|
602
|
$A{D} = $tsit + $tver; #changes |
485
|
433
|
|
|
|
|
565
|
$A{I} = $len - $A{D}; #identities |
486
|
433
|
|
|
|
|
454
|
$A{T} = $tsit; #transitions |
487
|
433
|
|
|
|
|
456
|
$A{V} = $tver; #transversions |
488
|
433
|
|
|
|
|
1668
|
$A{P} = sprintf "%.1f", 100 - (($A{D} / $len) * 100); #percent identity |
489
|
433
|
|
|
|
|
908
|
return \%A; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 _amb_transcription |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
=cut |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
sub _amb_transcription |
497
|
|
|
|
|
|
|
{ |
498
|
10
|
|
|
10
|
|
17
|
my ($ntseq) = @_; |
499
|
10
|
|
|
|
|
21
|
my $prseq = uc $ntseq; |
500
|
10
|
|
|
|
|
16
|
my (@SEED, @NEW) = ((), ()); |
501
|
10
|
|
|
|
|
14
|
my $offset = 0; |
502
|
10
|
|
|
|
|
17
|
my $ntlen = length $prseq; |
503
|
10
|
|
|
|
|
25
|
while ($offset < $ntlen) |
504
|
|
|
|
|
|
|
{ |
505
|
47
|
|
|
|
|
93
|
my $template = substr $prseq, $offset, 1; |
506
|
47
|
|
|
|
|
67
|
my $ntides = $NTIDES{$template}; |
507
|
47
|
|
|
|
|
56
|
my $possibilities = scalar @$ntides; |
508
|
47
|
100
|
|
|
|
65
|
if ($possibilities == 1) |
509
|
|
|
|
|
|
|
{ |
510
|
44
|
100
|
|
|
|
80
|
@SEED = ( $template ) if ($offset == 0); |
511
|
44
|
100
|
|
|
|
96
|
@NEW = ( $template ) if ($offset > 0); |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
else |
514
|
|
|
|
|
|
|
{ |
515
|
3
|
50
|
|
|
|
10
|
@SEED = @$ntides if ($offset == 0); |
516
|
3
|
50
|
|
|
|
10
|
@NEW = @$ntides if ($offset > 0); |
517
|
|
|
|
|
|
|
} |
518
|
47
|
100
|
|
|
|
84
|
unless ($offset == 0) |
519
|
|
|
|
|
|
|
{ |
520
|
37
|
|
|
|
|
53
|
@SEED = _add_arr(\@SEED, \@NEW); |
521
|
|
|
|
|
|
|
} |
522
|
47
|
|
|
|
|
111
|
$offset ++; |
523
|
|
|
|
|
|
|
} |
524
|
10
|
|
|
|
|
16
|
my %hsh = map {$_ => 1 } @SEED; |
|
19
|
|
|
|
|
50
|
|
525
|
10
|
|
|
|
|
38
|
my @keys = sort keys %hsh; |
526
|
10
|
|
|
|
|
37
|
return \@keys; |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=head2 _add_arr |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
Basically builds a list of tree nodes for the amb_trans* functions. |
533
|
|
|
|
|
|
|
in: 2 x peptide lists (array reference) out: combined list of peptides |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=cut |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
sub _add_arr |
538
|
|
|
|
|
|
|
{ |
539
|
75
|
|
|
75
|
|
117
|
my ($arr1ref, $arr2ref) = @_; |
540
|
75
|
|
|
|
|
156
|
my @arr1 = @$arr1ref; |
541
|
75
|
|
|
|
|
122
|
my @arr2 = @$arr2ref; |
542
|
75
|
|
|
|
|
90
|
my @arr3 = (); |
543
|
75
|
|
|
|
|
107
|
foreach my $do (@arr1) |
544
|
|
|
|
|
|
|
{ |
545
|
465
|
|
|
|
|
605
|
foreach my $re (@arr2) |
546
|
|
|
|
|
|
|
{ |
547
|
1456
|
|
|
|
|
2331
|
push @arr3, $do . $re |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
} |
550
|
75
|
|
|
|
|
573
|
return @arr3; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
1; |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
__END__ |