File Coverage

blib/lib/App/Anchr/Command/anchors.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package App::Anchr::Command::anchors;
2	23	23	387491	use strict;
	23		53
	23		675
3	23	23	109	use warnings;
	23		44
	23		640
4	23	23	5636	use autodie;
	23		263803
	23		110
5
6	23	23	138150	use App::Anchr -command;
	23		60
	23		312
7	23	23	17754	use App::Anchr::Common;
	0
	0
8
9				use constant abstract => "selete anchors from k-unitigs or superreads";
10
11				sub opt_spec {
12				return (
13				[ "outfile\|o=s", "output filename, [stdout] for screen", { default => "anchors.sh" }, ],
14				[ 'min=i', 'minimal length of anchors', { default => 1000, }, ],
15				[ 'unambi=i', 'minimal coverage of unambiguous reads', { default => 2, }, ],
16				[ 'parallel\|p=i', 'number of threads', { default => 8, }, ],
17				{ show_defaults => 1, }
18				);
19				}
20
21				sub usage_desc {
22				return "anchr anchors [options] <k_unitigs.fasta> <pe.cor.fa>";
23				}
24
25				sub description {
26				my $desc;
27				$desc .= ucfirst(abstract) . ".\n";
28				$desc .= "\tFasta files can be gzipped\n";
29				return $desc;
30				}
31
32				sub validate_args {
33				my ( $self, $opt, $args ) = @_;
34
35				if ( !( @{$args} == 2 ) ) {
36				my $message = "This command need two input files.\n\tIt found";
37				$message .= sprintf " [%s]", $_ for @{$args};
38				$message .= ".\n";
39				$self->usage_error($message);
40				}
41				for ( @{$args} ) {
42				if ( !Path::Tiny::path($_)->is_file ) {
43				$self->usage_error("The input file [$_] doesn't exist.");
44				}
45				}
46				}
47
48				sub execute {
49				my ( $self, $opt, $args ) = @_;
50
51				# A stream to 'stdout' or a standard file.
52				my $out_fh;
53				if ( lc $opt->{outfile} eq "stdout" ) {
54				$out_fh = *STDOUT{IO};
55				}
56				else {
57				open $out_fh, ">", $opt->{outfile};
58				}
59
60				my $tt = Template->new;
61				my $text = <<'EOF';
62				#!/usr/bin/env bash
63
64				#----------------------------#
65				# Colors in term
66				#----------------------------#
67				# http://stackoverflow.com/questions/5947742/how-to-change-the-output-color-of-echo-in-linux
68				GREEN=
69				RED=
70				NC=
71				if tty -s < /dev/fd/1 2> /dev/null; then
72				GREEN='\033[0;32m'
73				RED='\033[0;31m'
74				NC='\033[0m' # No Color
75				fi
76
77				log_warn () {
78				echo >&2 -e "${RED}==> $@ <==${NC}"
79				}
80
81				log_info () {
82				echo >&2 -e "${GREEN}==> $@${NC}"
83				}
84
85				log_debug () {
86				echo >&2 -e " * $@"
87				}
88
89				#----------------------------#
90				# helper functions
91				#----------------------------#
92				set +e
93
94				signaled () {
95				log_warn Interrupted
96				exit 1
97				}
98				trap signaled TERM QUIT INT
99
100				#----------------------------#
101				# Prepare SR
102				#----------------------------#
103				log_info Symlink/copy input files
104
105				if [ ! -e SR.fasta ]; then
106				ln -s [% args.0 %] SR.fasta
107				fi
108
109				if [ ! -e pe.cor.fa ]; then
110				ln -s [% args.1 %] pe.cor.fa
111				fi
112
113				log_debug "SR sizes"
114				faops size SR.fasta > sr.chr.sizes
115
116				#----------------------------#
117				# unambiguous
118				#----------------------------#
119				log_info "Unambiguous regions"
120
121				log_debug "bbmap"
122				bbmap.sh \
123				maxindel=0 strictmaxindel perfectmode \
124				threads=[% opt.parallel %] \
125				ambiguous=toss \
126				nodisk \
127				ref=SR.fasta in=pe.cor.fa \
128				outm=unambiguous.sam outu=unmapped.sam \
129				basecov=basecov.txt \
130				1>bbmap.err 2>&1
131
132				# at least [% opt.unambi %] unambiguous reads covered
133				# Pos is 0-based
134				#RefName Pos Coverage
135				log_debug "covered"
136				cat basecov.txt \
137				\| grep -v '^#' \
138				\| perl -nla -e '
139				BEGIN { our $name; our @list; }
140
141				sub list_to_ranges {
142				my @ranges;
143				my $count = scalar @list;
144				my $pos = 0;
145				while ( $pos < $count ) {
146				my $end = $pos + 1;
147				$end++ while $end < $count && $list[$end] <= $list[ $end - 1 ] + 1;
148				push @ranges, ( $list[$pos], $list[ $end - 1 ] );
149				$pos = $end;
150				}
151
152				return @ranges;
153				}
154
155				$F[2] < 2 and next;
156
157				if ( !defined $name ) {
158				$name = $F[0];
159				@list = ( $F[1] );
160				}
161				elsif ( $name eq $F[0] ) {
162				push @list, $F[1];
163				}
164				else {
165				my @ranges = list_to_ranges();
166				for ( my $i = 0; $i < $#ranges; $i += 2 ) {
167				if ( $ranges[$i] == $ranges[ $i + 1 ] ) {
168				printf qq{%s:%s\n}, $name, $ranges[$i] + 1;
169				}
170				else {
171				printf qq{%s:%s-%s\n}, $name, $ranges[$i] + 1, $ranges[ $i + 1 ] + 1;
172				}
173				}
174
175				$name = $F[0];
176				@list = ( $F[1] );
177				}
178
179				END {
180				my @ranges = list_to_ranges();
181				for ( my $i = 0; $i < $#ranges; $i += 2 ) {
182				if ( $ranges[$i] == $ranges[ $i + 1 ] ) {
183				printf qq{%s:%s\n}, $name, $ranges[$i] + 1;
184				}
185				else {
186				printf qq{%s:%s-%s\n}, $name, $ranges[$i] + 1, $ranges[ $i + 1 ] + 1;
187				}
188				}
189				}
190				' \
191				> unambiguous.covered.txt
192
193				#find . -type f -name "*.sam" \| parallel --no-run-if-empty -j 1 rm
194
195				#----------------------------#
196				# anchor
197				#----------------------------#
198				log_info "anchor - unambiguous"
199				jrunlist cover unambiguous.covered.txt -o unambiguous.covered.yml
200				jrunlist stat sr.chr.sizes unambiguous.covered.yml -o unambiguous.covered.csv
201
202				cat unambiguous.covered.csv \
203				\| perl -nla -F"," -e '
204				$F[0] eq q{chr} and next;
205				$F[0] eq q{all} and next;
206				$F[2] < [% opt.min %] and next;
207				$F[3] < 0.95 and next;
208				print $F[0];
209				' \
210				\| sort -n \
211				> anchor.txt
212
213				rm unambiguous.covered.txt
214
215				#----------------------------#
216				# anchor2
217				#----------------------------#
218				log_info "anchor2 - unambiguous2"
219
220				# contiguous unique region longer than [% opt.min %]
221				jrunlist span unambiguous.covered.yml --op excise -n [% opt.min %] -o unambiguous2.covered.yml
222				jrunlist stat sr.chr.sizes unambiguous2.covered.yml -o unambiguous2.covered.csv
223
224				cat unambiguous2.covered.csv \
225				\| perl -nla -F"," -e '
226				$F[0] eq q{chr} and next;
227				$F[0] eq q{all} and next;
228				$F[2] < [% opt.min %] and next;
229				print $F[0];
230				' \
231				\| sort -n \
232				> unambiguous2.txt
233
234				cat unambiguous2.txt \
235				\| perl -nl -MPath::Tiny -e '
236				BEGIN {
237				%seen = ();
238				@ls = grep {/\S/}
239				path(q{anchor.txt})->lines({ chomp => 1});
240				$seen{$_}++ for @ls;
241				}
242
243				$seen{$_} and next;
244				print;
245				' \
246				> anchor2.txt
247
248				rm unambiguous2.*
249
250				#----------------------------#
251				# basecov
252				#----------------------------#
253				log_info "basecov"
254				cat basecov.txt \
255				\| grep -v '^#' \
256				\| perl -nla -MApp::Fasops::Common -e '
257				BEGIN { our $name; our @list; }
258
259				if ( !defined $name ) {
260				$name = $F[0];
261				@list = ( $F[2] );
262				}
263				elsif ( $name eq $F[0] ) {
264				push @list, $F[2];
265				}
266				else {
267				my $mean_cov = App::Fasops::Common::mean(@list);
268				printf qq{%s\t%d\n}, $name, int $mean_cov;
269
270				$name = $F[0];
271				@list = ( $F[2] );
272				}
273
274				END {
275				my $mean_cov = App::Fasops::Common::mean(@list);
276				printf qq{%s\t%d\n}, $name, int $mean_cov;
277				}
278				' \
279				> unambiguous.coverage.tsv
280
281				# How to best eliminate values in a list that are outliers
282				# http://www.perlmonks.org/?node_id=1147296
283				# http://exploringdatablog.blogspot.com/2013/02/finding-outliers-in-numerical-data.html
284				cat unambiguous.coverage.tsv \
285				\| perl -nla -MStatistics::Descriptive -e '
286				BEGIN {
287				our $stat = Statistics::Descriptive::Full->new();
288				our %cov_of = ();
289				}
290
291				$cov_of{ $F[0] } = $F[1];
292				$stat->add_data( $F[1] );
293
294				END {
295				my $median = $stat->median();
296				my @abs_res = map { abs( $median - $_ ) } $stat->get_data();
297				my $abs_res_stat = Statistics::Descriptive::Full->new();
298				$abs_res_stat->add_data(@abs_res);
299				my $MAD = $abs_res_stat->median();
300				my $k = 3; # the scale factor
301
302				my $lower_limit = ( $median - $k * $MAD ) / 2;
303				my $upper_limit = ( $median + $k * $MAD ) * 1.5;
304
305				for my $key ( keys %cov_of ) {
306				if ( $cov_of{$key} < $lower_limit or $cov_of{$key} > $upper_limit ) {
307				print $key;
308				}
309				}
310				}
311				' \
312				> outlier.txt
313
314				cat anchor.txt anchor2.txt \
315				\| grep -Fx -f outlier.txt -v \
316				> wanted.txt
317
318				#----------------------------#
319				# Split SR.fasta to anchor and others
320				#----------------------------#
321				log_info "pe.anchor.fa & pe.others.fa"
322				faops some -l 0 SR.fasta wanted.txt pe.anchor.fa
323
324				faops some -l 0 -i SR.fasta wanted.txt pe.others.fa
325
326				#----------------------------#
327				# Done.
328				#----------------------------#
329				touch anchor.success
330				log_info "Done."
331
332				exit 0
333
334				EOF
335				my $output;
336				$tt->process(
337				\$text,
338				{ args => $args,
339				opt => $opt,
340				},
341				\$output
342				);
343
344				print {$out_fh} $output;
345				close $out_fh;
346				}
347
348				1;