| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package BioX::Seq::Utils; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
1143
|
use v5.10.1; |
|
|
2
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
12
|
use strict; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
41
|
|
|
6
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
52
|
|
|
7
|
2
|
|
|
2
|
|
9
|
use Exporter qw/import/; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
1054
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw/ |
|
10
|
|
|
|
|
|
|
build_ORF_regex |
|
11
|
|
|
|
|
|
|
all_orfs |
|
12
|
|
|
|
|
|
|
rev_com |
|
13
|
|
|
|
|
|
|
is_nucleic |
|
14
|
|
|
|
|
|
|
/; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub build_ORF_regex { |
|
17
|
|
|
|
|
|
|
|
|
18
|
4
|
|
|
4
|
1
|
7
|
my ($mode, $min_len) = @_; |
|
19
|
|
|
|
|
|
|
|
|
20
|
4
|
100
|
|
|
|
26
|
die "Missing mode" |
|
21
|
|
|
|
|
|
|
if (! defined $mode); |
|
22
|
3
|
100
|
|
|
|
34
|
die "Missing minimum length" |
|
23
|
|
|
|
|
|
|
if (! defined $min_len); |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# mode 0 : any set of codons not STOP |
|
26
|
|
|
|
|
|
|
# mode 1 : must end with STOP codon |
|
27
|
|
|
|
|
|
|
# mode 2 : must start with START codon |
|
28
|
|
|
|
|
|
|
# mode 3 : START -> STOP |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
|
31
|
2
|
|
|
|
|
9
|
my $aasize = int($min_len/3); |
|
32
|
2
|
100
|
|
|
|
6
|
my $tail_size = $aasize - 1 - ($mode & 0x2 ? 1 : 0); |
|
33
|
2
|
|
|
|
|
9
|
my $codon = ".{3}"; |
|
34
|
2
|
100
|
|
|
|
10
|
my $first_codon = $mode & 0x2 ? 'A[TU]G' : ''; |
|
35
|
2
|
|
|
|
|
8
|
my $stop_codon = "[TU](?:AA|AG|GA|AR|RA)"; |
|
36
|
2
|
100
|
|
|
|
6
|
my $last_codon = $mode & 0x1 ? $stop_codon : ''; |
|
37
|
2
|
|
|
|
|
145
|
my $re_orf = qr/ |
|
38
|
|
|
|
|
|
|
\G # anchor at pos() - forces codon boundaries |
|
39
|
|
|
|
|
|
|
(?:$codon)*? # discard codons before start codon |
|
40
|
|
|
|
|
|
|
( # begin matching ORF |
|
41
|
|
|
|
|
|
|
$first_codon # match start codon |
|
42
|
|
|
|
|
|
|
(?! $stop_codon ) # (but only if not followed by stop codon) |
|
43
|
|
|
|
|
|
|
(?: $codon # match codon(s) |
|
44
|
|
|
|
|
|
|
(?! $stop_codon ) # (but only if not followed by stop codon) |
|
45
|
|
|
|
|
|
|
) {$tail_size,} # any only if >= $tail_size codons long |
|
46
|
|
|
|
|
|
|
$codon # add final codon |
|
47
|
|
|
|
|
|
|
) # end matching ORF |
|
48
|
|
|
|
|
|
|
$last_codon |
|
49
|
|
|
|
|
|
|
/ix; |
|
50
|
|
|
|
|
|
|
|
|
51
|
2
|
|
|
|
|
10
|
return $re_orf; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
} |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub all_orfs { |
|
56
|
|
|
|
|
|
|
|
|
57
|
4
|
|
|
4
|
1
|
714
|
my ($seq, $mode, $min_len) = @_; |
|
58
|
|
|
|
|
|
|
|
|
59
|
4
|
|
|
|
|
12
|
my $orf_re = build_ORF_regex($mode, $min_len); |
|
60
|
|
|
|
|
|
|
|
|
61
|
2
|
|
|
|
|
13
|
my $len = length $seq; |
|
62
|
2
|
|
|
|
|
4
|
my @orfs; |
|
63
|
2
|
|
|
|
|
11
|
for my $strand (0..1) { |
|
64
|
4
|
100
|
|
|
|
16
|
my $str = $strand ? rev_com($seq) : $seq; |
|
65
|
4
|
|
|
|
|
10
|
for my $frame (0..2) { |
|
66
|
12
|
|
|
|
|
39
|
pos($str) = $frame; |
|
67
|
12
|
|
|
|
|
4961
|
while ($str =~ /$orf_re/g) { |
|
68
|
9
|
100
|
|
|
|
55
|
my ($s, $e) = map {$strand ? $len-$_+1 : $_} ($-[1]+1, $+[1]); |
|
|
18
|
|
|
|
|
54
|
|
|
69
|
9
|
|
|
|
|
1821
|
push @orfs, [$1, $s, $e]; |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
|
|
74
|
2
|
|
|
|
|
12
|
return @orfs; |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub is_nucleic { |
|
79
|
|
|
|
|
|
|
|
|
80
|
6
|
|
|
6
|
1
|
348
|
my ($seq) = @_; |
|
81
|
6
|
|
|
|
|
70
|
return $seq !~ /[^ACGTUMRWSYKVHDBN.-]/i; |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub rev_com { |
|
86
|
|
|
|
|
|
|
|
|
87
|
4
|
|
|
4
|
1
|
173
|
my ($seq) = @_; |
|
88
|
4
|
|
|
|
|
15
|
$seq =~ tr/Xx/Nn/; |
|
89
|
4
|
100
|
|
|
|
23
|
die "Bad input sequence" if (! is_nucleic($seq)); |
|
90
|
3
|
|
|
|
|
15
|
$seq = reverse $seq; |
|
91
|
3
|
|
|
|
|
11
|
$seq =~ tr |
|
92
|
|
|
|
|
|
|
{ACGTMRWSYKVHDBNacgtmrwsykvhdbn} |
|
93
|
|
|
|
|
|
|
{TGCAKYWSRMBDHVNtgcakywsrmbdhvn}; |
|
94
|
|
|
|
|
|
|
|
|
95
|
3
|
|
|
|
|
7
|
return $seq; |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
1; |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
__END__ |