line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Bio::Polloc::Rule::repeat - A rule of type repeat |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 AUTHOR - Luis M. Rodriguez-R |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Email lmrodriguezr at gmail dot com |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=cut |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
package Bio::Polloc::Rule::repeat; |
12
|
1
|
|
|
1
|
|
709
|
use base qw(Bio::Polloc::RuleI); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
508
|
|
13
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
14
|
1
|
|
|
1
|
|
5
|
use Bio::Polloc::Polloc::IO; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
15
|
1
|
|
|
1
|
|
640
|
use Bio::Polloc::LocusI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
16
|
1
|
|
|
1
|
|
892
|
use Bio::SeqIO; |
|
1
|
|
|
|
|
54755
|
|
|
1
|
|
|
|
|
1422
|
|
17
|
|
|
|
|
|
|
our $VERSION = 1.0503; # [a-version] from Bio::Polloc::Polloc::Version |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 APPENDIX |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Methods provided by the package |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head2 new |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Generic initialization method |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=cut |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub new { |
31
|
0
|
|
|
0
|
1
|
0
|
my($caller,@args) = @_; |
32
|
0
|
|
|
|
|
0
|
my $self = $caller->SUPER::new(@args); |
33
|
0
|
|
|
|
|
0
|
$self->_initialize(@args); |
34
|
0
|
|
|
|
|
0
|
return $self; |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head2 execute |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This is where magic happens. Translates the parameters of the object into a call to |
40
|
|
|
|
|
|
|
B<mreps>, and scans the sequence for repeats. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 Arguments |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The sequence (C<-seq>) as a L<Bio::Seq> or a L<Bio::SeqIO> object |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head3 Returns |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
An array reference populated with L<Bio::Polloc::Locus::repeat> objects |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub execute { |
53
|
0
|
|
|
0
|
1
|
0
|
my($self,@args) = @_; |
54
|
0
|
|
|
|
|
0
|
my($seq) = $self->_rearrange([qw(SEQ)], @args); |
55
|
|
|
|
|
|
|
|
56
|
0
|
0
|
|
|
|
0
|
$self->throw("You must provide a sequence to evaluate the rule", $seq) unless $seq; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# For Bio::SeqIO objects |
59
|
0
|
0
|
|
|
|
0
|
if($seq->isa('Bio::SeqIO')){ |
60
|
0
|
|
|
|
|
0
|
my @feats = (); |
61
|
0
|
|
|
|
|
0
|
while(my $s = $seq->next_seq){ |
62
|
0
|
|
|
|
|
0
|
push(@feats, @{$self->execute(-seq=>$s)}) |
|
0
|
|
|
|
|
0
|
|
63
|
|
|
|
|
|
|
} |
64
|
0
|
0
|
|
|
|
0
|
return wantarray ? @feats : \@feats; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
0
|
$self->throw("Illegal class of sequence '".ref($seq)."'", $seq) unless $seq->isa('Bio::Seq'); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# Include safe_value parameters |
70
|
0
|
|
|
|
|
0
|
my $cmd_vars = {}; |
71
|
0
|
|
|
|
|
0
|
for my $p ( qw(RES MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP ALLOWSMALL WIN) ){ |
72
|
0
|
|
|
|
|
0
|
my $tv = $self->_search_value($p); |
73
|
0
|
0
|
|
|
|
0
|
$cmd_vars->{"-" . lc $p} = $tv if defined $tv; |
74
|
|
|
|
|
|
|
} |
75
|
0
|
|
0
|
|
|
0
|
my $minsim = ($self->_search_value("minsim") || 0)+0; |
76
|
0
|
|
0
|
|
|
0
|
my $maxsim = ($self->_search_value("maxsim") || 0)+0; |
77
|
0
|
|
0
|
|
|
0
|
$maxsim ||= 100; |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Create the IO master |
80
|
0
|
|
|
|
|
0
|
my $io = Bio::Polloc::Polloc::IO->new(); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Search for mreps |
83
|
0
|
|
|
|
|
0
|
$self->source('mreps'); |
84
|
0
|
0
|
|
|
|
0
|
my $mreps = $self->_executable(defined $self->ruleset ? $self->ruleset->value("path") : undef) |
|
|
0
|
|
|
|
|
|
85
|
|
|
|
|
|
|
or $self->throw("Could not find the mreps binary"); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Write the sequence |
88
|
0
|
|
|
|
|
0
|
my($seq_fh, $seq_file) = $io->tempfile; |
89
|
0
|
|
|
|
|
0
|
close $seq_fh; |
90
|
0
|
|
|
|
|
0
|
my $seqO = Bio::SeqIO->new(-file=>">$seq_file", -format=>'Fasta'); |
91
|
0
|
|
|
|
|
0
|
$seqO->write_seq($seq); |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Run it |
94
|
0
|
|
|
|
|
0
|
my @run = ($mreps); |
95
|
0
|
|
|
|
|
0
|
push @run, %{$cmd_vars}; |
|
0
|
|
|
|
|
0
|
|
96
|
0
|
|
|
|
|
0
|
push @run, "-fasta", $seq_file; |
97
|
0
|
|
|
|
|
0
|
push @run, "2>&1"; |
98
|
0
|
|
|
|
|
0
|
push @run, "|"; |
99
|
0
|
|
|
|
|
0
|
$self->debug("Running: ".join(" ",@run)); |
100
|
0
|
|
|
|
|
0
|
my $run = Bio::Polloc::Polloc::IO->new(-file=>join(" ",@run)); |
101
|
0
|
|
|
|
|
0
|
my $ontable = 0; |
102
|
0
|
|
|
|
|
0
|
my @feats = (); |
103
|
0
|
|
|
|
|
0
|
while(my $line = $run->_readline){ |
104
|
0
|
0
|
|
|
|
0
|
if($line =~ m/^ -----------------------------/){ |
|
|
0
|
|
|
|
|
|
105
|
0
|
|
|
|
|
0
|
$ontable = !$ontable; |
106
|
|
|
|
|
|
|
}elsif($ontable){ |
107
|
0
|
|
|
|
|
0
|
chomp $line; |
108
|
|
|
|
|
|
|
# from -> to : size <per.> [exp.] err-rate sequence |
109
|
0
|
0
|
|
|
|
0
|
$line =~ m/^\s+(\d+)\s+->\s+(\d+)\s+:\s+(\d+)\s+<(\d+)>\s+\[([\d\.]+)\]\s+([\d\.]+)\s+([\w\s]+)$/ |
110
|
|
|
|
|
|
|
or $self->throw("Unexpected line $.",$line,"Bio::Polloc::Polloc::ParsingException"); |
111
|
0
|
|
|
|
|
0
|
my $score = 100 - $6*100; |
112
|
0
|
0
|
0
|
|
|
0
|
next if $score > $maxsim or $score < $minsim; |
113
|
0
|
|
|
|
|
0
|
my $id = $self->_next_child_id; |
114
|
0
|
|
|
|
|
0
|
my $cons = $self->_calculate_consensus($7); |
115
|
0
|
0
|
|
|
|
0
|
push @feats, Bio::Polloc::LocusI->new( |
116
|
|
|
|
|
|
|
-type=>$self->type, -rule=>$self, -seq=>$seq, |
117
|
|
|
|
|
|
|
-from=>$1+0, -to=>$2+0, -strand=>"+", |
118
|
|
|
|
|
|
|
-name=>$self->name, |
119
|
|
|
|
|
|
|
-id=>(defined $id ? $id : ""), |
120
|
|
|
|
|
|
|
-period=>$4+0, -exponent=>$5+0, |
121
|
|
|
|
|
|
|
-error=>$6*100, |
122
|
|
|
|
|
|
|
-score=>$score, |
123
|
|
|
|
|
|
|
-consensus=>$cons, |
124
|
|
|
|
|
|
|
-repeats=>$7); |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
} |
127
|
0
|
|
|
|
|
0
|
$run->close(); |
128
|
0
|
0
|
|
|
|
0
|
return wantarray ? @feats : \@feats; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 stringify_value |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Produces a readable string containing the value of the rule. |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=cut |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub stringify_value { |
138
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
139
|
0
|
|
|
|
|
0
|
my $out = ""; |
140
|
0
|
|
|
|
|
0
|
for my $k (keys %{$self->value}){ |
|
0
|
|
|
|
|
0
|
|
141
|
0
|
0
|
|
|
|
0
|
$out.= "$k=>".(defined $self->value->{$k} ? $self->value->{$k} : "")." "; |
142
|
|
|
|
|
|
|
} |
143
|
0
|
|
|
|
|
0
|
return $out; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head2 value |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Implements the C<_qualify_value()> from the L<Bio::Polloc::RuleI> interface |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head3 Arguments |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Value (str or ref-to-hash or ref-to-array). The supported keys are: |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=over |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item -res I<float> |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
Resolution (allowed error) |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item -minsize I<int> |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Minimum size of the repeat |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item -maxsize I<int> |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Maximum size of the repeat |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=item -minperiod I<float> |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Minimum period of the repeat |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=item -maxperiod I<float> |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Maximum period of the repeat |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=item -exp I<float> |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Minimum exponent (number of repeats) |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=item -allowsmall I<bool (int)> |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
If true, allows spurious results |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item -win I<float> |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Process by sliding windows of size C<2*n> overlaping by C<n> |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item -minsim I<float> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Minimum similarity percent |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=item -maxsim I<float> |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
Maximum similarity percent |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=back |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=head3 Return |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Value (I<hashref> or C<undef>). |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=head1 INTERNAL METHODS |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Methods intended to be used only within the scope of Bio::Polloc::* |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head2 _calculate_consensus |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Attempts to calculate the consensus of the repeat units. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head3 Arguments |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
The repetitive sequence with repeat units separated by spaces (I<str>). |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head3 Returns |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
The consensus sequence (I<str>) |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub _calculate_consensus { |
221
|
0
|
|
|
0
|
|
0
|
my($self,$seq) = @_; |
222
|
0
|
0
|
|
|
|
0
|
return unless $seq; |
223
|
0
|
|
|
|
|
0
|
my $io = Bio::Polloc::Polloc::IO->new(); |
224
|
0
|
|
|
|
|
0
|
my $emma = $io->exists_exe("emma"); |
225
|
0
|
|
|
|
|
0
|
my $cons = $io->exists_exe("cons"); |
226
|
0
|
0
|
|
|
|
0
|
return "no-emma" unless $emma; |
227
|
0
|
0
|
|
|
|
0
|
return "no-cons" unless $cons; |
228
|
0
|
|
|
|
|
0
|
my ($outseq_fh, $outseq) = $io->tempfile; |
229
|
0
|
|
|
|
|
0
|
my $i=0; |
230
|
0
|
|
|
|
|
0
|
print $outseq_fh ">".(++$i)."\n$_\n" for split /\s+/, $seq; |
231
|
0
|
|
|
|
|
0
|
close $outseq_fh; |
232
|
0
|
0
|
|
|
|
0
|
return "err-seq" unless -s $outseq; |
233
|
0
|
|
|
|
|
0
|
my $outaln = "$outseq.aln"; |
234
|
0
|
|
|
|
|
0
|
my $emmarun = Bio::Polloc::Polloc::IO->new(-file=>"$emma '$outseq' '$outaln' '/dev/null' -auto >/dev/null |"); |
235
|
0
|
|
|
|
|
0
|
while($emmarun->_readline){ print STDERR $_ } |
|
0
|
|
|
|
|
0
|
|
236
|
0
|
|
|
|
|
0
|
$emmarun->close(); |
237
|
0
|
0
|
|
|
|
0
|
unless(-s $outaln){ |
238
|
0
|
0
|
|
|
|
0
|
unlink $outaln if -e $outaln; |
239
|
0
|
|
|
|
|
0
|
return "err-aln"; |
240
|
|
|
|
|
|
|
} |
241
|
0
|
|
|
|
|
0
|
my $consout = ""; |
242
|
0
|
|
|
|
|
0
|
my $consrun = Bio::Polloc::Polloc::IO->new(-file=>"$cons '$outaln' stdout -auto |"); |
243
|
0
|
|
|
|
|
0
|
while(my $ln = $consrun->_readline){ |
244
|
0
|
|
|
|
|
0
|
chomp $ln; |
245
|
0
|
0
|
|
|
|
0
|
next if $ln =~ /^>/; |
246
|
0
|
|
|
|
|
0
|
$consout .= $ln; |
247
|
|
|
|
|
|
|
} |
248
|
0
|
|
|
|
|
0
|
unlink $outaln; |
249
|
0
|
|
|
|
|
0
|
$consrun->close(); |
250
|
0
|
|
|
|
|
0
|
$io->close(); |
251
|
0
|
|
|
|
|
0
|
return $consout; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 _parameters |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub _parameters { |
259
|
0
|
|
|
0
|
|
0
|
return [qw(RES MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP ALLOWSMALL WIN MINSIM MAXSIM)]; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head2 _executable |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
Attempts to get the mreps executable. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub _executable { |
269
|
1
|
|
|
1
|
|
48079
|
my($self, $path) = @_; |
270
|
1
|
|
|
|
|
2
|
my $name = 'mreps'; |
271
|
1
|
|
|
|
|
3
|
my $io = "Bio::Polloc::Polloc::IO"; |
272
|
1
|
|
|
|
|
17
|
$self->debug("Searching the $name binary for $^O"); |
273
|
|
|
|
|
|
|
# Note the darwin support. This is because darwin is unable to execute |
274
|
|
|
|
|
|
|
#Â the linux binary (despite its origin) |
275
|
1
|
50
|
|
|
|
12
|
my $bin = $^O =~ /(macos|darwin)/i ? "mreps.macosx.bin" : |
|
|
50
|
|
|
|
|
|
276
|
|
|
|
|
|
|
$^O =~ /mswin/i ? "mreps.exe" : |
277
|
|
|
|
|
|
|
"mreps.linux.bin"; |
278
|
1
|
|
|
|
|
2
|
my @where = (''); |
279
|
1
|
50
|
|
|
|
3
|
unshift @where, $path if $path; |
280
|
1
|
|
|
|
|
4
|
for my $p (@where){ |
281
|
1
|
|
|
|
|
2
|
for my $n (($bin, $name, "$name.bin")){ |
282
|
3
|
|
|
|
|
17
|
my $exe = $io->exists_exe($p . $n); |
283
|
3
|
50
|
|
|
|
12
|
return $exe if $exe; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=head2 _initialize |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _initialize { |
293
|
0
|
|
|
0
|
|
|
my($self,@args) = @_; |
294
|
0
|
|
|
|
|
|
$self->type('repeat'); |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
1; |