line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Bio::Util::DNA - Basic DNA utilities |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSES |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Bio::Util::DNA qw(:all); |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
my $clean_ref = cleanDNA($seq_ref); |
10
|
|
|
|
|
|
|
my $seq_ref = randomDNA(100); |
11
|
|
|
|
|
|
|
my $rev_ref = reverse_complement($seq_ref); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 DESCRIPTION |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Provides a set of functions and predefined variables which are handy when |
16
|
|
|
|
|
|
|
working with DNA. |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=cut |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
package Bio::Util::DNA; |
21
|
|
|
|
|
|
|
|
22
|
1
|
|
|
1
|
|
31499
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
23
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
26
|
|
24
|
|
|
|
|
|
|
|
25
|
1
|
|
|
1
|
|
1335
|
use version; our $VERSION = qv('0.2.1'); |
|
1
|
|
|
|
|
2445
|
|
|
1
|
|
|
|
|
7
|
|
26
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
88
|
use Exporter 'import'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1109
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
our %EXPORT_TAGS; |
30
|
|
|
|
|
|
|
$EXPORT_TAGS{funcs} = [ |
31
|
|
|
|
|
|
|
qw( |
32
|
|
|
|
|
|
|
cleanDNA |
33
|
|
|
|
|
|
|
randomDNA |
34
|
|
|
|
|
|
|
unrollDNA |
35
|
|
|
|
|
|
|
reverse_complement |
36
|
|
|
|
|
|
|
) |
37
|
|
|
|
|
|
|
]; |
38
|
|
|
|
|
|
|
$EXPORT_TAGS{all} = [ |
39
|
|
|
|
|
|
|
@{ $EXPORT_TAGS{funcs} }, |
40
|
|
|
|
|
|
|
qw( |
41
|
|
|
|
|
|
|
$DNAs |
42
|
|
|
|
|
|
|
@DNAs |
43
|
|
|
|
|
|
|
$DNA_match |
44
|
|
|
|
|
|
|
$DNA_fail |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
$RNAs |
47
|
|
|
|
|
|
|
@RNAs |
48
|
|
|
|
|
|
|
$RNA_match |
49
|
|
|
|
|
|
|
$RNA_fail |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
$degenerates |
52
|
|
|
|
|
|
|
@degenerates |
53
|
|
|
|
|
|
|
$degenerate_match |
54
|
|
|
|
|
|
|
$degenerate_fail |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
$all_nucleotides |
57
|
|
|
|
|
|
|
@all_nucleotides |
58
|
|
|
|
|
|
|
$all_nucleotide_match |
59
|
|
|
|
|
|
|
$all_nucleotide_fail |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
%degenerate2nucleotides |
62
|
|
|
|
|
|
|
%nucleotides2degenerate |
63
|
|
|
|
|
|
|
%degenerate_hierarchy |
64
|
|
|
|
|
|
|
) |
65
|
|
|
|
|
|
|
]; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
our @EXPORT_OK = @{ $EXPORT_TAGS{all} }; |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 VARIABLES |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 BASIC VARIABLES |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Basic nucleotide variables that could be useful. All of the variables have a |
74
|
|
|
|
|
|
|
prefix and a suffix; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=head3 Prefixes |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=over |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item DNA [ACGT] |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=item RNA [ACGU] |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=item degenerate |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item all_nucleotide |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=back |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=head3 Suffixes |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=over |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item ${prefix}s |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
String of the different nucleotides |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=item @{prefix}s |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Array of the different nucleotides |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=item ${prefix}_match |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Precompiled regular expression which matches nucleotide characters |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item ${prefix}_fail |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Precompiled regular expression which matches non-nucleotide characters |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=back |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
our $DNAs = 'ACGT'; |
115
|
|
|
|
|
|
|
our @DNAs = split //, $DNAs; |
116
|
|
|
|
|
|
|
our $DNA_match = qr/[$DNAs]/i; |
117
|
|
|
|
|
|
|
our $DNA_fail = qr/[^$DNAs]/i; |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
our $RNAs = 'ACGU'; |
120
|
|
|
|
|
|
|
our @RNAs = split //, $RNAs; |
121
|
|
|
|
|
|
|
our $RNA_match = qr/[$RNAs]/i; |
122
|
|
|
|
|
|
|
our $RNA_fail = qr/[^$RNAs]/i; |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
our $degenerates = 'BDHKMNRSVWY'; |
125
|
|
|
|
|
|
|
our @degenerates = split //, $degenerates; |
126
|
|
|
|
|
|
|
our $degenerate_match = qr/[$degenerates]/i; |
127
|
|
|
|
|
|
|
our $degenerate_fail = qr/[^$degenerates]/i; |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
our $all_nucleotides = 'ACGTUBDHKMNRSVWY'; |
130
|
|
|
|
|
|
|
our @all_nucleotides = split //, $all_nucleotides; |
131
|
|
|
|
|
|
|
our $all_nucleotide_match = qr/[$all_nucleotides]/i; |
132
|
|
|
|
|
|
|
our $all_nucleotide_fail = qr/[^$all_nucleotides]/i; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 %degenerate2nucleotides |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Hash of degenerate nucleotide definitions. Each entry contains a reference to |
137
|
|
|
|
|
|
|
an array of DNA nucleotides that each degenerate nucleotide stands for. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=cut |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
our %degenerate2nucleotides = ( |
142
|
|
|
|
|
|
|
N => [qw( A C G T )], |
143
|
|
|
|
|
|
|
B => [qw( C G T )], # !A |
144
|
|
|
|
|
|
|
D => [qw( A G T )], # !C |
145
|
|
|
|
|
|
|
H => [qw( A C T )], # !G |
146
|
|
|
|
|
|
|
V => [qw( A C G )], # !T |
147
|
|
|
|
|
|
|
M => [qw( A C )], # aroMatic |
148
|
|
|
|
|
|
|
R => [qw( A G )], # puRine |
149
|
|
|
|
|
|
|
W => [qw( A T )], |
150
|
|
|
|
|
|
|
S => [qw( C G )], |
151
|
|
|
|
|
|
|
Y => [qw( C T )], # pYrimidine |
152
|
|
|
|
|
|
|
K => [qw( G T )] # Keto |
153
|
|
|
|
|
|
|
); |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head2 %nucleotides2degenerate |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Reverse of %degenerate2nucleotides. Keys are alphabetically-sorted DNA |
158
|
|
|
|
|
|
|
nucleotides and values are the degenerate nucleotide that can represent those |
159
|
|
|
|
|
|
|
nucleotides. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=cut |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
our %nucleotides2degenerate = ( |
164
|
|
|
|
|
|
|
ACGT => 'N', |
165
|
|
|
|
|
|
|
CGT => 'B', |
166
|
|
|
|
|
|
|
AGT => 'D', |
167
|
|
|
|
|
|
|
ACT => 'H', |
168
|
|
|
|
|
|
|
ACG => 'V', |
169
|
|
|
|
|
|
|
AC => 'M', |
170
|
|
|
|
|
|
|
AG => 'R', |
171
|
|
|
|
|
|
|
AT => 'W', |
172
|
|
|
|
|
|
|
CG => 'S', |
173
|
|
|
|
|
|
|
CT => 'Y', |
174
|
|
|
|
|
|
|
GT => 'K' |
175
|
|
|
|
|
|
|
); |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head2 %degenerate_hierarchy |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Contains the heirarchy of degenerate nucleotides; N of course contains all the |
180
|
|
|
|
|
|
|
other degenerates, and the four degenerates that can stand for three different |
181
|
|
|
|
|
|
|
bases contain three of the two-base degenerates. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=cut |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
our %degenerate_hierarchy = ( |
186
|
|
|
|
|
|
|
N => [qw( M R W S Y K V H D B )], |
187
|
|
|
|
|
|
|
B => [qw( S Y K )], # !A = [CG],[CT],[GT] |
188
|
|
|
|
|
|
|
D => [qw( R W K )], # !C = [AT],[AG],[GT] |
189
|
|
|
|
|
|
|
H => [qw( M W Y )], # !G = [AC],[AT],[CT] |
190
|
|
|
|
|
|
|
V => [qw( M R S )] # !T = [AC],[AG],[CG] |
191
|
|
|
|
|
|
|
); |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 FUNCTIONS |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 cleanDNA |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
my $clean_ref = cleanDNA($seq_ref); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Cleans the sequence for use. Strips out comments (lines starting with '>') and |
200
|
|
|
|
|
|
|
whitespace, converts uracil to thymine, and capitalizes all characters. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Examples: |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
my $clean_ref = cleanDNA($seq_ref); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
my $seq_ref = cleanDNA(\'actg'); |
207
|
|
|
|
|
|
|
my $seq_ref = cleanDNA(\'act tag cta'); |
208
|
|
|
|
|
|
|
my $seq_ref = cleanDNA(\'>some mRNA |
209
|
|
|
|
|
|
|
acugauauagau |
210
|
|
|
|
|
|
|
uauagacgaucc'); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=cut |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
sub cleanDNA { |
215
|
0
|
|
|
0
|
1
|
|
my $seq_ref = shift; |
216
|
|
|
|
|
|
|
|
217
|
0
|
|
|
|
|
|
my $clean = uc $$seq_ref; |
218
|
0
|
|
|
|
|
|
$clean =~ s/^>.*//m; |
219
|
0
|
|
|
|
|
|
$clean =~ s/$all_nucleotide_fail+//g; |
220
|
0
|
|
|
|
|
|
$clean =~ tr/U/T/; |
221
|
|
|
|
|
|
|
|
222
|
0
|
|
|
|
|
|
return \$clean; |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head2 randomDNA |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
my $seq_ref = randomDNA($length); |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Generate random DNA for testing this module or your own scripts. Default length |
230
|
|
|
|
|
|
|
is 100 nucleotides. |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
Example: |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
my $seq_ref = randomDNA(); |
235
|
|
|
|
|
|
|
my $seq_ref = randomDNA(600); |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=cut |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
sub randomDNA { |
240
|
0
|
|
|
0
|
1
|
|
my $length = shift; |
241
|
0
|
|
0
|
|
|
|
$length = $length || 100; |
242
|
|
|
|
|
|
|
|
243
|
0
|
|
|
|
|
|
my $seq; |
244
|
0
|
|
|
|
|
|
$seq .= int rand 4 while ( $length-- > 0 ); |
245
|
0
|
|
|
|
|
|
$seq =~ tr/0123/ACGT/; |
246
|
|
|
|
|
|
|
|
247
|
0
|
|
|
|
|
|
return \$seq; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=head2 reverse_complement |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=head2 rev_comp |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
my $reverse_ref = reverse_complement($seq_ref); |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Finds the reverse complement of the sequence and handles degenerate |
257
|
|
|
|
|
|
|
nucleotides. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Example: |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
$reverse_ref = reverse_complement(\'act'); |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=cut |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
sub reverse_complement { |
266
|
0
|
|
|
0
|
1
|
|
my $seq_ref = shift; |
267
|
|
|
|
|
|
|
|
268
|
0
|
|
|
|
|
|
my $reverse = reverse $$seq_ref; |
269
|
0
|
|
|
|
|
|
$reverse =~ tr/acgtmrykvhdbnACGTMRYKVHDBN/tgcakyrmbdhvnTGCAKYRMBDHVN/; |
270
|
|
|
|
|
|
|
|
271
|
0
|
|
|
|
|
|
return \$reverse; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=head2 unrollDNA |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
my $seq_arrayref = unrollDNA( $seq_ref ); |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Unroll a DNA string containing degenerate nucleotides. The first entry of the |
279
|
|
|
|
|
|
|
arrayref will be the actual sequence. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
Example: |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
my $seq_arrayref = unrollDNA( \'ACSTAD' ) = |
284
|
|
|
|
|
|
|
[ |
285
|
|
|
|
|
|
|
'ACSTAD', 'ACCTAD', 'ACGTAD', |
286
|
|
|
|
|
|
|
'ACSTAR', 'ACCTAR', 'ACGTAR', |
287
|
|
|
|
|
|
|
'ACSTAW', 'ACCTAW', 'ACGTAW', |
288
|
|
|
|
|
|
|
'ACSTAK', 'ACCTAK', 'ACGTAK', |
289
|
|
|
|
|
|
|
'ACSTAA', 'ACCTAA', 'ACGTAA', |
290
|
|
|
|
|
|
|
'ACSTAG', 'ACCTAG', 'ACGTAG', |
291
|
|
|
|
|
|
|
'ACSTAT', 'ACCTAT', 'ACGTAT' |
292
|
|
|
|
|
|
|
]; |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=cut |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub unrollDNA { |
297
|
0
|
|
|
0
|
1
|
|
my ($seq_ref) = @_; |
298
|
|
|
|
|
|
|
|
299
|
0
|
|
|
|
|
|
my @nucleotides = map { |
300
|
0
|
|
|
|
|
|
[ |
301
|
|
|
|
|
|
|
$_, |
302
|
|
|
|
|
|
|
( |
303
|
0
|
|
|
|
|
|
$degenerate_hierarchy{$_} ? @{ $degenerate_hierarchy{$_} } |
304
|
|
|
|
|
|
|
: () |
305
|
|
|
|
|
|
|
), |
306
|
|
|
|
|
|
|
( |
307
|
0
|
0
|
|
|
|
|
$degenerate2nucleotides{$_} ? @{ $degenerate2nucleotides{$_} } |
|
|
0
|
|
|
|
|
|
308
|
|
|
|
|
|
|
: () |
309
|
|
|
|
|
|
|
), |
310
|
|
|
|
|
|
|
] |
311
|
|
|
|
|
|
|
} split //, uc $$seq_ref; |
312
|
|
|
|
|
|
|
|
313
|
0
|
|
|
|
|
|
my @possibilities = ( [ (undef) x @nucleotides ] ); |
314
|
|
|
|
|
|
|
|
315
|
0
|
|
|
|
|
|
for ( my $i = 0 ; $i < @nucleotides ; $i++ ) { |
316
|
0
|
|
|
|
|
|
my $variants = $nucleotides[$i]; |
317
|
|
|
|
|
|
|
|
318
|
0
|
|
|
|
|
|
push @possibilities, map { |
319
|
0
|
|
|
|
|
|
map { [@$_] } |
|
0
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
@possibilities |
321
|
|
|
|
|
|
|
} (undef) x $#$variants; |
322
|
|
|
|
|
|
|
|
323
|
0
|
|
|
|
|
|
my $block_size = @possibilities / @$variants; |
324
|
0
|
|
|
|
|
|
for ( my $j = 0 ; $j < @$variants ; $j++ ) { |
325
|
0
|
|
|
|
|
|
my $variant = $variants->[$j]; |
326
|
0
|
|
|
|
|
|
for ( my $k = $block_size * $j; $k < $block_size * ($j + 1); $k++ ) { |
327
|
0
|
|
|
|
|
|
$possibilities[$k][$i] = $variant; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
0
|
|
|
|
|
|
return [ map { join( '', @$_ ) } @possibilities ]; |
|
0
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
1; |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head1 AUTHOR |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
Kevin Galinsky, |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Copyright (c) 2010-2011, Broad Institute. |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Copyright (c) 2008-2009, J. Craig Venter Institute. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
348
|
|
|
|
|
|
|
under the same terms as Perl itself. |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |