line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Tools::Prepeat; |
2
|
1
|
|
|
1
|
|
12381
|
use 5.006; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
36
|
|
3
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
41
|
|
4
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
5
|
1
|
|
|
1
|
|
5
|
use XSLoader; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
34
|
|
6
|
|
|
|
|
|
|
XSLoader::load 'Bio::Tools::Prepeat'; |
7
|
1
|
|
|
1
|
|
5
|
use Exporter; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
67
|
|
8
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw(random_sequence); |
10
|
1
|
|
|
1
|
|
5
|
use Cwd qw(abs_path); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
47
|
|
11
|
1
|
|
|
1
|
|
899
|
use String::Random qw(random_string); |
|
1
|
|
|
|
|
3611
|
|
|
1
|
|
|
|
|
92
|
|
12
|
1
|
|
|
1
|
|
1032
|
use IO::File; |
|
1
|
|
|
|
|
18956
|
|
|
1
|
|
|
|
|
185
|
|
13
|
|
|
|
|
|
|
|
14
|
1
|
|
|
1
|
|
26039
|
use Data::Dumper; |
|
1
|
|
|
|
|
15118
|
|
|
1
|
|
|
|
|
1884
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @amino = qw/A C D E F G H I K L M N P Q R S T V W Y/; |
17
|
|
|
|
|
|
|
our @extamino = 'A'..'Z'; # extended amino acid |
18
|
2
|
100
|
|
2
|
1
|
3853819
|
sub random_sequence { ref($_[0]) ? shift : undef;random_string('0'x(shift), \@amino) } |
|
2
|
|
|
|
|
1302
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub new { |
21
|
1
|
|
|
1
|
1
|
142
|
my $pkg = shift; |
22
|
1
|
50
|
|
|
|
7
|
my $arg = ref($_[0]) ? $_[0] : {@_}; |
23
|
1
|
50
|
33
|
|
|
25
|
die "It exists as a non-directory\n" if -e $arg->{wd} && !-d $arg->{wd}; |
24
|
1
|
|
|
|
|
210
|
mkdir abs_path($arg->{wd}); |
25
|
1
|
|
|
|
|
36
|
bless { |
26
|
|
|
|
|
|
|
wd => abs_path($arg->{wd}), |
27
|
|
|
|
|
|
|
seqarr => undef, |
28
|
|
|
|
|
|
|
}, $pkg; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub reset { |
32
|
1
|
|
|
1
|
1
|
205
|
my $pkg = shift; |
33
|
1
|
|
|
|
|
2
|
@{$pkg->{seqarr}} = (); |
|
1
|
|
|
|
|
12
|
|
34
|
1
|
|
|
|
|
2
|
@{$pkg->{acclen}} = (); |
|
1
|
|
|
|
|
4
|
|
35
|
1
|
|
|
|
|
4
|
$pkg->{built} = 0; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
1
|
321
|
sub feed { push @{(shift)->{seqarr}}, map{uc$_}@_ } |
|
1
|
|
|
|
|
10
|
|
|
13
|
|
|
|
|
45
|
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# cartesian product over @extamino X @extamino |
41
|
|
|
|
|
|
|
sub bigram_set { |
42
|
2
|
|
|
2
|
0
|
3
|
my @ret; |
43
|
2
|
|
|
|
|
7
|
foreach my $f (@extamino){ |
44
|
52
|
|
|
|
|
85
|
foreach my $s (@extamino){ |
45
|
1352
|
|
|
|
|
3240
|
push @ret, $f.$s; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
} |
48
|
2
|
|
|
|
|
478
|
@ret; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub loadseq { |
52
|
1
|
|
|
1
|
0
|
2
|
my $pkg = shift; |
53
|
1
|
50
|
|
|
|
94
|
open F, $pkg->{wd}."/seqs" or die "Cannot load sequence file\n"; |
54
|
1
|
|
|
|
|
113
|
while(chomp($_=)){ |
55
|
13
|
|
|
|
|
15
|
push @{$pkg->{seqarr}}, $_; |
|
13
|
|
|
|
|
354
|
|
56
|
|
|
|
|
|
|
} |
57
|
1
|
|
|
|
|
26
|
close F; |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# accumulating lengths |
61
|
|
|
|
|
|
|
sub acclen { |
62
|
2
|
|
|
2
|
0
|
4
|
my $pkg = shift; |
63
|
2
|
|
|
|
|
4
|
my $sum; |
64
|
2
|
|
|
|
|
5
|
push @{$pkg->{acclen}}, 0; |
|
2
|
|
|
|
|
8
|
|
65
|
2
|
|
|
|
|
3
|
for (1..$#{$pkg->{seqarr}}){ |
|
2
|
|
|
|
|
13
|
|
66
|
24
|
|
|
|
|
35
|
$sum += length $pkg->{seqarr}->[$_-1]; |
67
|
24
|
|
|
|
|
27
|
push @{$pkg->{acclen}}, $sum; |
|
24
|
|
|
|
|
50
|
|
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub loadidx { |
72
|
1
|
|
|
1
|
1
|
83
|
my $pkg = shift; |
73
|
1
|
|
|
|
|
5
|
$pkg->{seqarr} = $pkg->{acclen} = undef; |
74
|
1
|
|
|
|
|
7
|
$pkg->loadseq; |
75
|
1
|
|
|
|
|
5
|
$pkg->acclen; |
76
|
1
|
|
|
|
|
4
|
$pkg->{built} = 1; |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub buildidx { |
80
|
1
|
|
|
1
|
1
|
100
|
my $pkg = shift; |
81
|
1
|
|
|
|
|
2
|
my %fh; |
82
|
|
|
|
|
|
|
|
83
|
1
|
|
|
|
|
7
|
$pkg->acclen; |
84
|
|
|
|
|
|
|
|
85
|
1
|
|
|
|
|
6
|
foreach (bigram_set){ |
86
|
676
|
|
|
|
|
4356
|
$fh{$_} = new IO::File $pkg->{wd}."/idx.$_", O_CREAT|O_WRONLY|O_TRUNC; |
87
|
676
|
50
|
|
|
|
125804
|
die "Cannot open index file $_ for writing\n" unless defined $fh{$_}; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
1
|
|
|
|
|
121
|
open F, '>', $pkg->{wd}."/seqs"; |
91
|
1
|
|
|
|
|
2
|
my $cnt = 0; |
92
|
1
|
|
|
|
|
2
|
foreach my $seq (@{$pkg->{seqarr}}){ |
|
1
|
|
|
|
|
6
|
|
93
|
13
|
|
|
|
|
57
|
print F "$seq\n"; |
94
|
13
|
|
|
|
|
34
|
for(my $i = 0; $i < length($seq)-1; $i++){ |
95
|
4122
|
|
|
|
|
6233
|
my $bigram = substr($seq, $i, 2); |
96
|
4122
|
|
|
|
|
6995
|
my $h = $fh{$bigram}; |
97
|
4122
|
|
|
|
|
21149
|
print $h ($pkg->{acclen}->[$cnt] + $i)."\n"; |
98
|
|
|
|
|
|
|
} |
99
|
13
|
|
|
|
|
25
|
$cnt++; |
100
|
|
|
|
|
|
|
} |
101
|
1
|
|
|
|
|
123
|
close F; |
102
|
|
|
|
|
|
|
|
103
|
1
|
|
|
|
|
65
|
$_->close foreach (values %fh); |
104
|
1
|
|
|
|
|
26720
|
$pkg->{built} = 1; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
# relative position |
108
|
|
|
|
|
|
|
sub relpos { |
109
|
75266
|
|
|
75266
|
0
|
100894
|
my ($pkg, $abspos) = @_; |
110
|
75266
|
|
|
|
|
104804
|
my ($seqid, $pos) = (0, 0); |
111
|
75266
|
|
|
|
|
87782
|
for( my $i=$#{$pkg->{acclen}} ; $i>=0 ; $i--){ |
|
75266
|
|
|
|
|
223058
|
|
112
|
481073
|
100
|
|
|
|
1282159
|
if( $abspos >= $pkg->{acclen}->[$i] ){ |
113
|
75266
|
|
|
|
|
108709
|
$pos = $abspos - $pkg->{acclen}->[$i]; |
114
|
75266
|
|
|
|
|
86196
|
$seqid = $i; |
115
|
75266
|
|
|
|
|
92245
|
last; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
} |
118
|
75266
|
|
|
|
|
177990
|
return ($seqid, $pos); |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# absolute position |
122
|
|
|
|
|
|
|
sub abspos { |
123
|
9175
|
|
|
9175
|
0
|
13245
|
my ($pkg, $seqid, $pos) = @_; |
124
|
9175
|
|
|
|
|
30501
|
return $pkg->{acclen}->[$seqid] + $pos; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub coincidence_length { |
128
|
37633
|
|
|
37633
|
0
|
57841
|
my ($pkg, $prev, $this) = @_; |
129
|
37633
|
|
|
|
|
66832
|
my @prevrel = $pkg->relpos($prev); |
130
|
37633
|
|
|
|
|
76644
|
my @thisrel = $pkg->relpos($this); |
131
|
37633
|
|
|
|
|
49392
|
my @range = @{$pkg->{length}}; |
|
37633
|
|
|
|
|
81274
|
|
132
|
37633
|
|
|
|
|
50467
|
my @ret; |
133
|
|
|
|
|
|
|
|
134
|
37633
|
|
|
|
|
53518
|
foreach my $len (@range){ |
135
|
44045
|
|
|
|
|
81885
|
my $str_a = substr($pkg->{seqarr}->[$prevrel[0]], $prevrel[1], $len); |
136
|
44045
|
|
|
|
|
69424
|
my $str_b = substr($pkg->{seqarr}->[$thisrel[0]], $thisrel[1], $len); |
137
|
|
|
|
|
|
|
# print "$len $str_a $str_b$/" if $str_a =~/^ACL/o || $str_b =~ /^ACL/o; |
138
|
|
|
|
|
|
|
|
139
|
44045
|
100
|
100
|
|
|
176368
|
last if length $str_a != $len || length $str_b != $len; |
140
|
43410
|
100
|
|
|
|
97404
|
last if $str_a ne $str_b; |
141
|
|
|
|
|
|
|
# print "$prev, $this => ", substr($pkg->{seqarr}->[$prevrel[0]], $prevrel[1], $pkg->{length}), '/', substr($pkg->{seqarr}->[$thisrel[0]], $thisrel[1], $pkg->{length}), $/; |
142
|
9175
|
|
|
|
|
32600
|
push @ret, [ \@prevrel, \@thisrel, $len ]; |
143
|
|
|
|
|
|
|
} |
144
|
37633
|
|
|
|
|
94429
|
\@ret; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
1
|
|
|
1
|
|
13
|
use List::Util qw/min/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1236
|
|
148
|
|
|
|
|
|
|
sub query { |
149
|
1
|
|
|
1
|
1
|
104
|
my ($pkg, $length) = @_; |
150
|
1
|
50
|
|
|
|
7
|
die "Index files are not built or loaded. Please use 'buildidx' or 'loadidx' first\n" unless $pkg->{built}; |
151
|
1
|
50
|
|
|
|
6
|
die "Length of a repeat sequence must exceed 3\n" unless $length >= 3; |
152
|
1
|
50
|
|
|
|
13
|
$pkg->{length} = ref($length) ? [sort{$a<=>$b}@$length] : [ $length ]; |
|
3
|
|
|
|
|
10
|
|
153
|
1
|
|
|
|
|
168
|
open R, '>', $pkg->{wd}."/result"; |
154
|
1
|
|
|
|
|
2
|
my ($prev, $this); |
155
|
1
|
|
|
|
|
5
|
foreach (bigram_set){ |
156
|
676
|
100
|
|
|
|
18505
|
next unless -s $pkg->{wd}."/idx.$_"; |
157
|
368
|
50
|
|
|
|
15720
|
open F, $pkg->{wd}."/idx.$_" or die "Cannot open index file $_ for query\n"; |
158
|
368
|
|
|
|
|
610
|
my @posarr; |
159
|
368
|
|
|
|
|
5761
|
while ( chomp ($_ = ) ){ push @posarr, $_; } |
|
4122
|
|
|
|
|
11643
|
|
160
|
|
|
|
|
|
|
|
161
|
368
|
|
|
|
|
567
|
my %checked; |
162
|
368
|
|
|
|
|
855
|
foreach $prev (@posarr){ |
163
|
4122
|
50
|
|
|
|
9305
|
next if $checked{$prev}; |
164
|
4122
|
|
|
|
|
5696
|
foreach $this (@posarr){ |
165
|
79724
|
100
|
|
|
|
101596
|
if($this - $prev > min( @{$pkg->{length}})){ |
|
79724
|
|
|
|
|
250625
|
|
166
|
37633
|
|
|
|
|
73817
|
my $occs = $pkg->coincidence_length($prev, $this); |
167
|
37633
|
50
|
|
|
|
86909
|
if(ref $occs){ |
168
|
37633
|
|
|
|
|
41895
|
foreach my $occ (@{$occs}){ |
|
37633
|
|
|
|
|
99814
|
|
169
|
9175
|
|
|
|
|
23255
|
$checked{$pkg->abspos($occ->[0]->[0], $occ->[0]->[1])} = 1; |
170
|
|
|
|
|
|
|
# print substr($pkg->{seqarr}->[$occ->[0]->[0]],$occ->[0]->[1], $occ->[2]),$/; |
171
|
|
|
|
|
|
|
|
172
|
9175
|
|
|
|
|
25061
|
print R |
173
|
|
|
|
|
|
|
join ' ', |
174
|
|
|
|
|
|
|
substr($pkg->{seqarr}->[$occ->[0]->[0]], |
175
|
|
|
|
|
|
|
$occ->[0]->[1], $occ->[2]), |
176
|
9175
|
|
|
|
|
21432
|
"@{$occ->[0]} @{$occ->[1]}\n"; |
|
9175
|
|
|
|
|
39057
|
|
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
368
|
|
|
|
|
24115
|
close F; |
183
|
|
|
|
|
|
|
} |
184
|
1
|
|
|
|
|
177
|
close R; |
185
|
|
|
|
|
|
|
|
186
|
1
|
50
|
|
|
|
51
|
open R, $pkg->{wd}."/result" or die "Cannot open result file\n"; |
187
|
1
|
|
|
|
|
3
|
my $ret; |
188
|
1
|
|
|
|
|
43
|
while(chomp($_=)){ |
189
|
9175
|
|
|
|
|
35704
|
my @e = split /\s/, $_; |
190
|
9175
|
|
|
|
|
22907
|
$ret->{$e[0]}->{join q/ /,@e[1..2]} = 1; |
191
|
9175
|
|
|
|
|
32349
|
$ret->{$e[0]}->{join q/ /,@e[3..4]} = 1; |
192
|
|
|
|
|
|
|
} |
193
|
1
|
|
|
|
|
35
|
close R; |
194
|
1
|
|
|
|
|
2
|
foreach (keys %{$ret}){ |
|
1
|
|
|
|
|
930
|
|
195
|
7062
|
|
|
|
|
15695
|
$ret->{$_} = [ |
196
|
6559
|
|
|
|
|
10994
|
sort{ $a->[0] <=> $b->[0] } |
197
|
7582
|
|
|
|
|
22850
|
sort{ $a->[1] <=> $b->[1] } |
198
|
2692
|
|
|
|
|
7406
|
map { [split/ /] } |
199
|
2692
|
|
|
|
|
2898
|
keys %{$ret->{$_}} |
200
|
|
|
|
|
|
|
]; |
201
|
|
|
|
|
|
|
} |
202
|
1
|
|
|
|
|
266
|
$ret; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub cleanidx { |
207
|
1
|
|
|
1
|
0
|
15500
|
map{unlink $_} glob($_[0]->{wd}."/*"); |
|
678
|
|
|
|
|
722987
|
|
208
|
1
|
|
|
|
|
559
|
rmdir $_[0]->{wd}; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
1; |
214
|
|
|
|
|
|
|
__END__ |