line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Anchr::Command::anchors; |
2
|
23
|
|
|
23
|
|
387491
|
use strict; |
|
23
|
|
|
|
|
53
|
|
|
23
|
|
|
|
|
675
|
|
3
|
23
|
|
|
23
|
|
109
|
use warnings; |
|
23
|
|
|
|
|
44
|
|
|
23
|
|
|
|
|
640
|
|
4
|
23
|
|
|
23
|
|
5636
|
use autodie; |
|
23
|
|
|
|
|
263803
|
|
|
23
|
|
|
|
|
110
|
|
5
|
|
|
|
|
|
|
|
6
|
23
|
|
|
23
|
|
138150
|
use App::Anchr -command; |
|
23
|
|
|
|
|
60
|
|
|
23
|
|
|
|
|
312
|
|
7
|
23
|
|
|
23
|
|
17754
|
use App::Anchr::Common; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# One-line summary of this command; description() below reuses it.
use constant abstract => "select anchors from k-unitigs or superreads";
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Describe the command-line options of this command.
#
# Returns a list with one [ spec, description, { default => ... } ] entry
# per option, followed by a settings hash asking for defaults to be shown.
sub opt_spec {
    my @option_rows = (
        [ 'outfile|o=s',  'output filename, [stdout] for screen',  { default => 'anchors.sh' } ],
        [ 'min=i',        'minimal length of anchors',             { default => 1000 } ],
        [ 'unambi=i',     'minimal coverage of unambiguous reads', { default => 2 } ],
        [ 'parallel|p=i', 'number of threads',                     { default => 8 } ],
    );

    return ( @option_rows, { show_defaults => 1 } );
}
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Return the one-line usage synopsis: command name, options placeholder
# and the two positional input files.
sub usage_desc {
    my $synopsis = "anchr anchors [options] <k_unitigs.fasta> <pe.cor.fa>";
    return $synopsis;
}
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Build the long help text: the capitalised abstract as a sentence,
# followed by a tab-indented note about gzipped input.
sub description {
    return join q{},
        ucfirst(abstract), ".\n",
        "\tFasta files can be gzipped\n";
}
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Check the positional arguments.
#
# Parameters:
#   $self - command object; its usage_error() method is called on failure
#   $opt  - parsed options (not used here)
#   $args - array ref of positional arguments
#
# Reports a usage error unless exactly two arguments were supplied, and
# again for every argument that is not an existing regular file.
sub validate_args {
    my ( $self, $opt, $args ) = @_;

    my @infiles = @{$args};

    unless ( @infiles == 2 ) {
        # List whatever was actually passed so the user can spot the mistake.
        my $found = join q{}, map { sprintf " [%s]", $_ } @infiles;
        $self->usage_error("This command need two input files.\n\tIt found$found.\n");
    }

    for my $infile (@infiles) {
        next if Path::Tiny::path($infile)->is_file;
        $self->usage_error("The input file [$infile] doesn't exist.");
    }
}
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Generate the anchors shell script.
#
# Parameters:
#   $self - command object (not used here)
#   $opt  - parsed options; reads outfile, parallel, unambi and min
#   $args - array ref holding the two input file paths
#
# Fills the inline template with $args and $opt and writes the result to
# $opt->{outfile}, or to STDOUT when that option is "stdout"
# (case-insensitive). Dies if the template cannot be processed.
#
# NOTE(review): Template is not loaded by any `use` line visible in this
# file; presumably App::Anchr::Common pulls it in -- confirm.
sub execute {
    my ( $self, $opt, $args ) = @_;

    # A stream to 'stdout' or a standard file.
    my $out_fh;
    if ( lc $opt->{outfile} eq "stdout" ) {
        $out_fh = *STDOUT{IO};
    }
    else {
        # No explicit error check: `use autodie` makes open throw on failure.
        open $out_fh, ">", $opt->{outfile};
    }

    my $tt   = Template->new;
    my $text = <<'EOF';
#!/usr/bin/env bash

#----------------------------#
# Colors in term
#----------------------------#
# http://stackoverflow.com/questions/5947742/how-to-change-the-output-color-of-echo-in-linux
GREEN=
RED=
NC=
if tty -s < /dev/fd/1 2> /dev/null; then
    GREEN='\033[0;32m'
    RED='\033[0;31m'
    NC='\033[0m' # No Color
fi

log_warn () {
    echo >&2 -e "${RED}==> $@ <==${NC}"
}

log_info () {
    echo >&2 -e "${GREEN}==> $@${NC}"
}

log_debug () {
    echo >&2 -e " * $@"
}

#----------------------------#
# helper functions
#----------------------------#
set +e

signaled () {
    log_warn Interrupted
    exit 1
}
trap signaled TERM QUIT INT

#----------------------------#
# Prepare SR
#----------------------------#
log_info Symlink/copy input files

if [ ! -e SR.fasta ]; then
    ln -s [% args.0 %] SR.fasta
fi

if [ ! -e pe.cor.fa ]; then
    ln -s [% args.1 %] pe.cor.fa
fi

log_debug "SR sizes"
faops size SR.fasta > sr.chr.sizes

#----------------------------#
# unambiguous
#----------------------------#
log_info "Unambiguous regions"

log_debug "bbmap"
bbmap.sh \
    maxindel=0 strictmaxindel perfectmode \
    threads=[% opt.parallel %] \
    ambiguous=toss \
    nodisk \
    ref=SR.fasta in=pe.cor.fa \
    outm=unambiguous.sam outu=unmapped.sam \
    basecov=basecov.txt \
    1>bbmap.err 2>&1

# at least [% opt.unambi %] unambiguous reads covered
# Pos is 0-based
#RefName Pos Coverage
log_debug "covered"
cat basecov.txt \
    | grep -v '^#' \
    | perl -nla -e '
        BEGIN { our $name; our @list; }

        sub list_to_ranges {
            my @ranges;
            my $count = scalar @list;
            my $pos = 0;
            while ( $pos < $count ) {
                my $end = $pos + 1;
                $end++ while $end < $count && $list[$end] <= $list[ $end - 1 ] + 1;
                push @ranges, ( $list[$pos], $list[ $end - 1 ] );
                $pos = $end;
            }

            return @ranges;
        }

        $F[2] < [% opt.unambi %] and next;

        if ( !defined $name ) {
            $name = $F[0];
            @list = ( $F[1] );
        }
        elsif ( $name eq $F[0] ) {
            push @list, $F[1];
        }
        else {
            my @ranges = list_to_ranges();
            for ( my $i = 0; $i < $#ranges; $i += 2 ) {
                if ( $ranges[$i] == $ranges[ $i + 1 ] ) {
                    printf qq{%s:%s\n}, $name, $ranges[$i] + 1;
                }
                else {
                    printf qq{%s:%s-%s\n}, $name, $ranges[$i] + 1, $ranges[ $i + 1 ] + 1;
                }
            }

            $name = $F[0];
            @list = ( $F[1] );
        }

        END {
            my @ranges = list_to_ranges();
            for ( my $i = 0; $i < $#ranges; $i += 2 ) {
                if ( $ranges[$i] == $ranges[ $i + 1 ] ) {
                    printf qq{%s:%s\n}, $name, $ranges[$i] + 1;
                }
                else {
                    printf qq{%s:%s-%s\n}, $name, $ranges[$i] + 1, $ranges[ $i + 1 ] + 1;
                }
            }
        }
    ' \
    > unambiguous.covered.txt

#find . -type f -name "*.sam" | parallel --no-run-if-empty -j 1 rm

#----------------------------#
# anchor
#----------------------------#
log_info "anchor - unambiguous"
jrunlist cover unambiguous.covered.txt -o unambiguous.covered.yml
jrunlist stat sr.chr.sizes unambiguous.covered.yml -o unambiguous.covered.csv

cat unambiguous.covered.csv \
    | perl -nla -F"," -e '
        $F[0] eq q{chr} and next;
        $F[0] eq q{all} and next;
        $F[2] < [% opt.min %] and next;
        $F[3] < 0.95 and next;
        print $F[0];
    ' \
    | sort -n \
    > anchor.txt

rm unambiguous.covered.txt

#----------------------------#
# anchor2
#----------------------------#
log_info "anchor2 - unambiguous2"

# contiguous unique region longer than [% opt.min %]
jrunlist span unambiguous.covered.yml --op excise -n [% opt.min %] -o unambiguous2.covered.yml
jrunlist stat sr.chr.sizes unambiguous2.covered.yml -o unambiguous2.covered.csv

cat unambiguous2.covered.csv \
    | perl -nla -F"," -e '
        $F[0] eq q{chr} and next;
        $F[0] eq q{all} and next;
        $F[2] < [% opt.min %] and next;
        print $F[0];
    ' \
    | sort -n \
    > unambiguous2.txt

cat unambiguous2.txt \
    | perl -nl -MPath::Tiny -e '
        BEGIN {
            %seen = ();
            @ls = grep {/\S/}
                path(q{anchor.txt})->lines({ chomp => 1});
            $seen{$_}++ for @ls;
        }

        $seen{$_} and next;
        print;
    ' \
    > anchor2.txt

rm unambiguous2.*

#----------------------------#
# basecov
#----------------------------#
log_info "basecov"
cat basecov.txt \
    | grep -v '^#' \
    | perl -nla -MApp::Fasops::Common -e '
        BEGIN { our $name; our @list; }

        if ( !defined $name ) {
            $name = $F[0];
            @list = ( $F[2] );
        }
        elsif ( $name eq $F[0] ) {
            push @list, $F[2];
        }
        else {
            my $mean_cov = App::Fasops::Common::mean(@list);
            printf qq{%s\t%d\n}, $name, int $mean_cov;

            $name = $F[0];
            @list = ( $F[2] );
        }

        END {
            my $mean_cov = App::Fasops::Common::mean(@list);
            printf qq{%s\t%d\n}, $name, int $mean_cov;
        }
    ' \
    > unambiguous.coverage.tsv

# How to best eliminate values in a list that are outliers
# http://www.perlmonks.org/?node_id=1147296
# http://exploringdatablog.blogspot.com/2013/02/finding-outliers-in-numerical-data.html
cat unambiguous.coverage.tsv \
    | perl -nla -MStatistics::Descriptive -e '
        BEGIN {
            our $stat = Statistics::Descriptive::Full->new();
            our %cov_of = ();
        }

        $cov_of{ $F[0] } = $F[1];
        $stat->add_data( $F[1] );

        END {
            my $median = $stat->median();
            my @abs_res = map { abs( $median - $_ ) } $stat->get_data();
            my $abs_res_stat = Statistics::Descriptive::Full->new();
            $abs_res_stat->add_data(@abs_res);
            my $MAD = $abs_res_stat->median();
            my $k = 3; # the scale factor

            my $lower_limit = ( $median - $k * $MAD ) / 2;
            my $upper_limit = ( $median + $k * $MAD ) * 1.5;

            for my $key ( keys %cov_of ) {
                if ( $cov_of{$key} < $lower_limit or $cov_of{$key} > $upper_limit ) {
                    print $key;
                }
            }
        }
    ' \
    > outlier.txt

cat anchor.txt anchor2.txt \
    | grep -Fx -f outlier.txt -v \
    > wanted.txt

#----------------------------#
# Split SR.fasta to anchor and others
#----------------------------#
log_info "pe.anchor.fa & pe.others.fa"
faops some -l 0 SR.fasta wanted.txt pe.anchor.fa

faops some -l 0 -i SR.fasta wanted.txt pe.others.fa

#----------------------------#
# Done.
#----------------------------#
touch anchor.success
log_info "Done."

exit 0

EOF

    # process() returns false on failure; without this check a broken
    # template would silently produce an undefined $output.
    my $output;
    $tt->process(
        \$text,
        {   args => $args,
            opt  => $opt,
        },
        \$output
    ) or die $tt->error, "\n";

    print {$out_fh} $output;
    close $out_fh;
}
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
1; |