line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package BioX::Seq::Utils; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
1143
|
use v5.10.1; |
|
2
|
|
|
|
|
7
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
12
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
41
|
|
6
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
52
|
|
7
|
2
|
|
|
2
|
|
9
|
use Exporter qw/import/; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
1054
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw/ |
10
|
|
|
|
|
|
|
build_ORF_regex |
11
|
|
|
|
|
|
|
all_orfs |
12
|
|
|
|
|
|
|
rev_com |
13
|
|
|
|
|
|
|
is_nucleic |
14
|
|
|
|
|
|
|
/; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub build_ORF_regex { |
17
|
|
|
|
|
|
|
|
18
|
4
|
|
|
4
|
1
|
7
|
my ($mode, $min_len) = @_; |
19
|
|
|
|
|
|
|
|
20
|
4
|
100
|
|
|
|
26
|
die "Missing mode" |
21
|
|
|
|
|
|
|
if (! defined $mode); |
22
|
3
|
100
|
|
|
|
34
|
die "Missing minimum length" |
23
|
|
|
|
|
|
|
if (! defined $min_len); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# mode 0 : any set of codons not STOP |
26
|
|
|
|
|
|
|
# mode 1 : must end with STOP codon |
27
|
|
|
|
|
|
|
# mode 2 : must start with START codon |
28
|
|
|
|
|
|
|
# mode 3 : START -> STOP |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
31
|
2
|
|
|
|
|
9
|
my $aasize = int($min_len/3); |
32
|
2
|
100
|
|
|
|
6
|
my $tail_size = $aasize - 1 - ($mode & 0x2 ? 1 : 0); |
33
|
2
|
|
|
|
|
9
|
my $codon = ".{3}"; |
34
|
2
|
100
|
|
|
|
10
|
my $first_codon = $mode & 0x2 ? 'A[TU]G' : ''; |
35
|
2
|
|
|
|
|
8
|
my $stop_codon = "[TU](?:AA|AG|GA|AR|RA)"; |
36
|
2
|
100
|
|
|
|
6
|
my $last_codon = $mode & 0x1 ? $stop_codon : ''; |
37
|
2
|
|
|
|
|
145
|
my $re_orf = qr/ |
38
|
|
|
|
|
|
|
\G # anchor at pos() - forces codon boundaries |
39
|
|
|
|
|
|
|
(?:$codon)*? # discard codons before start codon |
40
|
|
|
|
|
|
|
( # begin matching ORF |
41
|
|
|
|
|
|
|
$first_codon # match start codon |
42
|
|
|
|
|
|
|
(?! $stop_codon ) # (but only if not followed by stop codon) |
43
|
|
|
|
|
|
|
(?: $codon # match codon(s) |
44
|
|
|
|
|
|
|
(?! $stop_codon ) # (but only if not followed by stop codon) |
45
|
|
|
|
|
|
|
) {$tail_size,} # any only if >= $tail_size codons long |
46
|
|
|
|
|
|
|
$codon # add final codon |
47
|
|
|
|
|
|
|
) # end matching ORF |
48
|
|
|
|
|
|
|
$last_codon |
49
|
|
|
|
|
|
|
/ix; |
50
|
|
|
|
|
|
|
|
51
|
2
|
|
|
|
|
10
|
return $re_orf; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub all_orfs { |
56
|
|
|
|
|
|
|
|
57
|
4
|
|
|
4
|
1
|
714
|
my ($seq, $mode, $min_len) = @_; |
58
|
|
|
|
|
|
|
|
59
|
4
|
|
|
|
|
12
|
my $orf_re = build_ORF_regex($mode, $min_len); |
60
|
|
|
|
|
|
|
|
61
|
2
|
|
|
|
|
13
|
my $len = length $seq; |
62
|
2
|
|
|
|
|
4
|
my @orfs; |
63
|
2
|
|
|
|
|
11
|
for my $strand (0..1) { |
64
|
4
|
100
|
|
|
|
16
|
my $str = $strand ? rev_com($seq) : $seq; |
65
|
4
|
|
|
|
|
10
|
for my $frame (0..2) { |
66
|
12
|
|
|
|
|
39
|
pos($str) = $frame; |
67
|
12
|
|
|
|
|
4961
|
while ($str =~ /$orf_re/g) { |
68
|
9
|
100
|
|
|
|
55
|
my ($s, $e) = map {$strand ? $len-$_+1 : $_} ($-[1]+1, $+[1]); |
|
18
|
|
|
|
|
54
|
|
69
|
9
|
|
|
|
|
1821
|
push @orfs, [$1, $s, $e]; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
|
74
|
2
|
|
|
|
|
12
|
return @orfs; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub is_nucleic { |
79
|
|
|
|
|
|
|
|
80
|
6
|
|
|
6
|
1
|
348
|
my ($seq) = @_; |
81
|
6
|
|
|
|
|
70
|
return $seq !~ /[^ACGTUMRWSYKVHDBN.-]/i; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub rev_com { |
86
|
|
|
|
|
|
|
|
87
|
4
|
|
|
4
|
1
|
173
|
my ($seq) = @_; |
88
|
4
|
|
|
|
|
15
|
$seq =~ tr/Xx/Nn/; |
89
|
4
|
100
|
|
|
|
23
|
die "Bad input sequence" if (! is_nucleic($seq)); |
90
|
3
|
|
|
|
|
15
|
$seq = reverse $seq; |
91
|
3
|
|
|
|
|
11
|
$seq =~ tr |
92
|
|
|
|
|
|
|
{ACGTMRWSYKVHDBNacgtmrwsykvhdbn} |
93
|
|
|
|
|
|
|
{TGCAKYWSRMBDHVNtgcakywsrmbdhvn}; |
94
|
|
|
|
|
|
|
|
95
|
3
|
|
|
|
|
7
|
return $seq; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
1; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
__END__ |