| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
|
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2014-12-20 00:31:03 mtw> |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::Bam; |
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1177
|
use 5.12.0; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
38
|
|
|
7
|
1
|
|
|
1
|
|
4
|
use Exporter; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
40
|
|
|
8
|
1
|
|
|
1
|
|
3
|
use version; our $VERSION = qv('0.12_07'); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
5
|
|
|
9
|
1
|
|
|
1
|
|
63
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
24
|
|
|
10
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
32
|
|
|
11
|
1
|
|
|
1
|
|
202
|
use Bio::Perl 1.00690001; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Bio::DB::Sam 1.39; |
|
13
|
|
|
|
|
|
|
use Data::Dumper; |
|
14
|
|
|
|
|
|
|
use File::Basename qw(fileparse); |
|
15
|
|
|
|
|
|
|
use File::Temp qw(tempfile); |
|
16
|
|
|
|
|
|
|
use Path::Class; |
|
17
|
|
|
|
|
|
|
use Carp; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
20
|
|
|
|
|
|
|
our @EXPORT = (); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our @EXPORT_OK = qw ( split_bam uniquify_bam ); |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
|
25
|
|
|
|
|
|
|
#^^^^^^^^^^^ Subroutines ^^^^^^^^^^# |
|
26
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub split_bam { |
|
29
|
|
|
|
|
|
|
my %data = (); |
|
30
|
|
|
|
|
|
|
my @NHval = (); |
|
31
|
|
|
|
|
|
|
my @processed_files = (); |
|
32
|
|
|
|
|
|
|
my $verbose = 0; |
|
33
|
|
|
|
|
|
|
my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_; |
|
34
|
|
|
|
|
|
|
my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp); |
|
35
|
|
|
|
|
|
|
my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg); |
|
36
|
|
|
|
|
|
|
my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg); |
|
37
|
|
|
|
|
|
|
my ($seq_id,$start,$stop,$strand,$target_names,$id,$score); |
|
38
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
39
|
|
|
|
|
|
|
my %count_entries = ( |
|
40
|
|
|
|
|
|
|
total => 0, |
|
41
|
|
|
|
|
|
|
uniq => 0, |
|
42
|
|
|
|
|
|
|
pos => 0, |
|
43
|
|
|
|
|
|
|
neg => 0, |
|
44
|
|
|
|
|
|
|
skip => 0, |
|
45
|
|
|
|
|
|
|
cur => 0, |
|
46
|
|
|
|
|
|
|
mult => 0, |
|
47
|
|
|
|
|
|
|
se_alis => 0, |
|
48
|
|
|
|
|
|
|
pe_alis => 0, |
|
49
|
|
|
|
|
|
|
flag => 0, |
|
50
|
|
|
|
|
|
|
); |
|
51
|
|
|
|
|
|
|
$data{count} = \%count_entries; |
|
52
|
|
|
|
|
|
|
$data{flag} = (); |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
croak "ERROR [$this_function] $bamfile does not exist\n" |
|
55
|
|
|
|
|
|
|
unless (-e $bamfile); |
|
56
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest_dir does not exist\n" |
|
57
|
|
|
|
|
|
|
unless (-d $dest_dir); |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
open(LOG, ">", $log) or croak $!; |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
(undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0); |
|
62
|
|
|
|
|
|
|
(undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0); |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
$bam = Bio::DB::Bam->open($bamfile, "r"); |
|
65
|
|
|
|
|
|
|
$header = $bam->header; |
|
66
|
|
|
|
|
|
|
$target_names = $header->target_name; |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); |
|
69
|
|
|
|
|
|
|
unless ($dest_dir =~ /\/$/){$dest_dir .= "/";} |
|
70
|
|
|
|
|
|
|
$bamname_pos = $dest_dir.$bn.".pos".$ext; |
|
71
|
|
|
|
|
|
|
$bamname_neg = $dest_dir.$bn.".neg".$ext; |
|
72
|
|
|
|
|
|
|
$bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w') |
|
73
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!"; |
|
74
|
|
|
|
|
|
|
$bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w') |
|
75
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!"; |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
if ($want_bed == 1){ |
|
78
|
|
|
|
|
|
|
$bedname_pos = $dest_dir.$bn.".pos.bed"; |
|
79
|
|
|
|
|
|
|
$bedname_neg = $dest_dir.$bn.".neg.bed"; |
|
80
|
|
|
|
|
|
|
open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg); |
|
81
|
|
|
|
|
|
|
} |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
$bam_pos->header_write($header);$bam_neg->header_write($header); |
|
84
|
|
|
|
|
|
|
if($reverse == 1) { # switch +/- strand mapping |
|
85
|
|
|
|
|
|
|
$tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp; |
|
86
|
|
|
|
|
|
|
$tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
while (my $read= $bam->read1() ) { |
|
90
|
|
|
|
|
|
|
@NHval = (); |
|
91
|
|
|
|
|
|
|
$data{count}{total}++; |
|
92
|
|
|
|
|
|
|
if($verbose == 1){print STDERR $read->query->name."\t";} |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# check if NH (the SAM tag used to indicate multiple mappings) is set |
|
95
|
|
|
|
|
|
|
if ($read->has_tag("NH")) { |
|
96
|
|
|
|
|
|
|
@NHval = $read->get_tag_values("NH"); |
|
97
|
|
|
|
|
|
|
$NH = $NHval[0]; |
|
98
|
|
|
|
|
|
|
if ($NH == 1) { |
|
99
|
|
|
|
|
|
|
$data{count}{uniq}++; |
|
100
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "NH:i:1\t";} |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
else { |
|
103
|
|
|
|
|
|
|
$data{count}{mult}++; |
|
104
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";} |
|
105
|
|
|
|
|
|
|
if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper |
|
106
|
|
|
|
|
|
|
$data{count}{skip}++; |
|
107
|
|
|
|
|
|
|
next; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
$data{count}{cur}++; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";} |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# print Dumper ($read->query); |
|
115
|
|
|
|
|
|
|
$strand = $read->strand; |
|
116
|
|
|
|
|
|
|
$seq_id = $target_names->[$read->tid]; |
|
117
|
|
|
|
|
|
|
$start = $read->start; |
|
118
|
|
|
|
|
|
|
$stop = $read->end; |
|
119
|
|
|
|
|
|
|
$id = "x"; # $read->qname; |
|
120
|
|
|
|
|
|
|
$score = 100; |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
if ( $read->get_tag_values('PAIRED') ) { # paired-end |
|
123
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "pe\t";} |
|
124
|
|
|
|
|
|
|
$data{count}{pe_alis}++; |
|
125
|
|
|
|
|
|
|
if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted |
|
126
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";} |
|
127
|
|
|
|
|
|
|
if ( $strand eq "1" ){ |
|
128
|
|
|
|
|
|
|
$bam_pos->write1($read); |
|
129
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
|
130
|
|
|
|
|
|
|
if ($reverse == 0){ |
|
131
|
|
|
|
|
|
|
$data{count}{pos}++; $eff_strand=$strand; |
|
132
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
else { |
|
135
|
|
|
|
|
|
|
$data{count}{neg}++; $eff_strand=-1*$strand; |
|
136
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
|
137
|
|
|
|
|
|
|
} |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
elsif ($strand eq "-1") { |
|
140
|
|
|
|
|
|
|
$bam_neg->write1($read); |
|
141
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
|
142
|
|
|
|
|
|
|
if ($reverse == 0){ |
|
143
|
|
|
|
|
|
|
$data{count}{neg}++; $eff_strand=$strand; |
|
144
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
else { |
|
147
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=-1*$strand; |
|
148
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located |
|
154
|
|
|
|
|
|
|
# on the other strand |
|
155
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";} |
|
156
|
|
|
|
|
|
|
if ( $strand eq "1" ) { |
|
157
|
|
|
|
|
|
|
$bam_neg->write1($read); |
|
158
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
|
159
|
|
|
|
|
|
|
if ($reverse == 0){ |
|
160
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=$strand; |
|
161
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
else { |
|
164
|
|
|
|
|
|
|
$data{count}{pos}++; $eff_strand=-1*$strand; |
|
165
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
elsif ( $strand eq "-1" ) { |
|
169
|
|
|
|
|
|
|
$bam_pos->write1($read); |
|
170
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
|
171
|
|
|
|
|
|
|
if ($reverse == 0){ |
|
172
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=$strand; |
|
173
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
else { |
|
176
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=-1*$strand; |
|
177
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
else { # single-end |
|
184
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "se\t";} |
|
185
|
|
|
|
|
|
|
$data{count}{se_alis}++; |
|
186
|
|
|
|
|
|
|
if ( $read->strand eq "1" ){ |
|
187
|
|
|
|
|
|
|
$bam_pos->write1($read); |
|
188
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
|
189
|
|
|
|
|
|
|
if ($reverse == 0){ |
|
190
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=$strand; |
|
191
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
else { |
|
194
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=-1*$strand; |
|
195
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
elsif ($read->strand eq "-1") { |
|
199
|
|
|
|
|
|
|
$bam_neg->write1($read); |
|
200
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
|
201
|
|
|
|
|
|
|
if ($reverse == 0){ |
|
202
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=$strand; |
|
203
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
else { |
|
206
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=-1*$strand; |
|
207
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";} |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# collect statistics of SAM flags |
|
215
|
|
|
|
|
|
|
$flag = $read->flag; |
|
216
|
|
|
|
|
|
|
unless (exists $data{flag}{$flag}){ |
|
217
|
|
|
|
|
|
|
$data{flag}{$flag} = 0; |
|
218
|
|
|
|
|
|
|
} |
|
219
|
|
|
|
|
|
|
$data{flag}{$flag}++; |
|
220
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "\n";} |
|
221
|
|
|
|
|
|
|
} # end while |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
rename ($tmp_bam_pos, $bamname_pos); |
|
224
|
|
|
|
|
|
|
rename ($tmp_bam_neg, $bamname_neg); |
|
225
|
|
|
|
|
|
|
push (@processed_files, ($bamname_pos,$bamname_neg)); |
|
226
|
|
|
|
|
|
|
push (@processed_files, ($data{count}{pos},$data{count}{neg})); |
|
227
|
|
|
|
|
|
|
if ($want_bed){ |
|
228
|
|
|
|
|
|
|
push (@processed_files, ($bedname_pos,$bedname_neg)) |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# error checks |
|
232
|
|
|
|
|
|
|
unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) { |
|
233
|
|
|
|
|
|
|
printf "ERROR: paired-end + single-end alignments != total alignment count\n"; |
|
234
|
|
|
|
|
|
|
print Dumper(\%data); |
|
235
|
|
|
|
|
|
|
croak $!; |
|
236
|
|
|
|
|
|
|
} |
|
237
|
|
|
|
|
|
|
unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) { |
|
238
|
|
|
|
|
|
|
printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos}; |
|
239
|
|
|
|
|
|
|
printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg}; |
|
240
|
|
|
|
|
|
|
printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg}); |
|
241
|
|
|
|
|
|
|
printf STDERR "%20d cur_count (should be)\n",$data{count}{cur}; |
|
242
|
|
|
|
|
|
|
printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n"; |
|
243
|
|
|
|
|
|
|
print Dumper(\%data); |
|
244
|
|
|
|
|
|
|
croak $!; |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
foreach (keys %{$data{flag}}){ |
|
247
|
|
|
|
|
|
|
$data{count}{flag} += $data{flag}{$_}; |
|
248
|
|
|
|
|
|
|
} |
|
249
|
|
|
|
|
|
|
unless ($data{count}{flag} == $data{count}{cur}){ |
|
250
|
|
|
|
|
|
|
printf STDERR "%20d alignments considered\n",$data{count}{cur}; |
|
251
|
|
|
|
|
|
|
printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag}; |
|
252
|
|
|
|
|
|
|
printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n"; |
|
253
|
|
|
|
|
|
|
print Dumper(\%data); |
|
254
|
|
|
|
|
|
|
croak $!; |
|
255
|
|
|
|
|
|
|
} |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# logging output |
|
258
|
|
|
|
|
|
|
printf LOG "# bam_split.pl log for file \'$bamfile\'\n"; |
|
259
|
|
|
|
|
|
|
printf LOG "#-----------------------------------------------------------------\n"; |
|
260
|
|
|
|
|
|
|
printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total}; |
|
261
|
|
|
|
|
|
|
printf LOG "%20d unique-mappers (%6.2f%% of total)\n", |
|
262
|
|
|
|
|
|
|
$data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ; |
|
263
|
|
|
|
|
|
|
printf LOG "%20d multi-mappers (%6.2f%% of total)\n", |
|
264
|
|
|
|
|
|
|
$data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total}); |
|
265
|
|
|
|
|
|
|
printf LOG "%20d multi-mappers skipped\n", $data{count}{skip}; |
|
266
|
|
|
|
|
|
|
printf LOG "%20d alignments considered\n", $data{count}{cur}; |
|
267
|
|
|
|
|
|
|
printf LOG "%20d paired-end\n", $data{count}{pe_alis}; |
|
268
|
|
|
|
|
|
|
printf LOG "%20s single-end\n", $data{count}{se_alis}; |
|
269
|
|
|
|
|
|
|
printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n", |
|
270
|
|
|
|
|
|
|
$data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur}); |
|
271
|
|
|
|
|
|
|
printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n", |
|
272
|
|
|
|
|
|
|
$data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur}); |
|
273
|
|
|
|
|
|
|
printf LOG "#-----------------------------------------------------------------\n"; |
|
274
|
|
|
|
|
|
|
printf LOG "Dumper output:\n". Dumper(\%data); |
|
275
|
|
|
|
|
|
|
close(LOG); |
|
276
|
|
|
|
|
|
|
return @processed_files; |
|
277
|
|
|
|
|
|
|
} |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
sub uniquify_bam { |
|
280
|
|
|
|
|
|
|
my ($bamfile,$dest,$log) = @_; |
|
281
|
|
|
|
|
|
|
my ($bam, $bn,$path,$ext,$read,$header); |
|
282
|
|
|
|
|
|
|
my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult); |
|
283
|
|
|
|
|
|
|
my ($count_all,$count_uniq,$count_mult) = (0)x3; |
|
284
|
|
|
|
|
|
|
my @processed_files = (); |
|
285
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $bamfile\n" |
|
288
|
|
|
|
|
|
|
unless (-e $bamfile); |
|
289
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist\n" |
|
290
|
|
|
|
|
|
|
unless (-d $dest); |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
(undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0); |
|
295
|
|
|
|
|
|
|
(undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0); |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
$bam = Bio::DB::Bam->open($bamfile, "r"); |
|
298
|
|
|
|
|
|
|
$fn_uniq = file($dest,$bn.".uniq".$ext); |
|
299
|
|
|
|
|
|
|
$fn_mult = file($dest,$bn.".mult".$ext); |
|
300
|
|
|
|
|
|
|
$header = $bam->header; # TODO: modify header, leave traces ... |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
$bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w') |
|
303
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; |
|
304
|
|
|
|
|
|
|
$bam_mult = Bio::DB::Bam->open($tmp_mult,'w') |
|
305
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; |
|
306
|
|
|
|
|
|
|
$bam_uniq->header_write($header); |
|
307
|
|
|
|
|
|
|
$bam_mult->header_write($header); |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
while ($read = $bam->read1() ) { |
|
310
|
|
|
|
|
|
|
$count_all++; |
|
311
|
|
|
|
|
|
|
if ($read->aux_get("NH") == 1){ # uniquely mapped reads |
|
312
|
|
|
|
|
|
|
$bam_uniq->write1($read); |
|
313
|
|
|
|
|
|
|
$count_uniq++; |
|
314
|
|
|
|
|
|
|
} |
|
315
|
|
|
|
|
|
|
else { # multiply mapped reads |
|
316
|
|
|
|
|
|
|
$bam_mult->write1($read); |
|
317
|
|
|
|
|
|
|
$count_mult++; |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
croak "ERROR [$this_function] Read counts don't match\n" |
|
322
|
|
|
|
|
|
|
unless ($count_uniq + $count_mult == $count_all); |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
rename ($tmp_uniq, $fn_uniq); |
|
325
|
|
|
|
|
|
|
rename ($tmp_mult, $fn_mult); |
|
326
|
|
|
|
|
|
|
push (@processed_files, ($fn_uniq,$fn_mult)); |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
if (defined $log){ |
|
329
|
|
|
|
|
|
|
my $lf = file($dest,$log); |
|
330
|
|
|
|
|
|
|
open(LOG, ">>", $lf) or croak $!; |
|
331
|
|
|
|
|
|
|
printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n", |
|
332
|
|
|
|
|
|
|
$count_all,$count_uniq,$count_mult; |
|
333
|
|
|
|
|
|
|
close(LOG); |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
} |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
1; |
|
339
|
|
|
|
|
|
|
__END__ |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=head1 NAME |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Bio::ViennaNGS::Bam - High-level access to BAM files |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
use Bio::ViennaNGS::Bam; |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
# split a single-end or paired-end BAM file by strands |
|
351
|
|
|
|
|
|
|
@result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile); |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# extract unique and multi mappers from a BAM file |
|
354
|
|
|
|
|
|
|
@result = uniquify_bam($bam_in,$outdir,$logfile); |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Bio::ViennaNGS::BAM provides high-level access to BAM file. Building |
|
359
|
|
|
|
|
|
|
on L<Bio::DB::Sam>, it provides code to extract specific portions from |
|
360
|
|
|
|
|
|
|
BAM files. It comes with routines for splitting BAM files by strand |
|
361
|
|
|
|
|
|
|
(which is often required for visualization of NGS data) and extracting |
|
362
|
|
|
|
|
|
|
uniquely and multiply aligned reads from BAM files. |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head2 ROUTINES |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=over |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log) |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
Splits BAM file $bam according to [+] and [-] strand. C<$reverse>, |
|
371
|
|
|
|
|
|
|
C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1, |
|
372
|
|
|
|
|
|
|
triggering forced reversion of strand mapping (due to RNA-seq protocol |
|
373
|
|
|
|
|
|
|
constraints), filtering of unique mappers (identified via NH:i:1 SAM |
|
374
|
|
|
|
|
|
|
argument), and forced output of a BED file corresponding to |
|
375
|
|
|
|
|
|
|
strand-specific mapping, respectively. C<$log> holds name and path of |
|
376
|
|
|
|
|
|
|
the log file. |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Strand-splitting is done in a way that in paired-end alignments, FIRST |
|
379
|
|
|
|
|
|
|
and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE |
|
380
|
|
|
|
|
|
|
reads determine the strand, while SECOND_MATE reads are assigned the |
|
381
|
|
|
|
|
|
|
opposite strand I<per definitionem>. This also holds if the reads are |
|
382
|
|
|
|
|
|
|
not mapped in proper pairs and even if there is no mapping partner at |
|
383
|
|
|
|
|
|
|
all. |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
Sometimes the library preparation protocol causes inversion of the |
|
386
|
|
|
|
|
|
|
read assignment (with respect to the underlying annotation). In those |
|
387
|
|
|
|
|
|
|
cases, the natural mapping of the reads can be obtained by the |
|
388
|
|
|
|
|
|
|
C<$reverse> flag. |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
This routine returns an array whose fist two elements are the file |
|
391
|
|
|
|
|
|
|
names of the newly generate BAM files with reads mapped to the |
|
392
|
|
|
|
|
|
|
positive, and negative strand, respectively. Elements three and four |
|
393
|
|
|
|
|
|
|
are the number of fragments mapped to the positive and negative |
|
394
|
|
|
|
|
|
|
strand. If the C<$want_bed> option was given elements fiveand six are |
|
395
|
|
|
|
|
|
|
the file names of the output BED files for positive and negative |
|
396
|
|
|
|
|
|
|
strand, respectively. |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
NOTE: Filtering of unique mappers is only safe for single-end |
|
399
|
|
|
|
|
|
|
experiments; In paired-end experiments, read and mate are treated |
|
400
|
|
|
|
|
|
|
separately, thus allowing for scenarios where eg. one read is a |
|
401
|
|
|
|
|
|
|
multi-mapper, whereas its associate mate is a unique mapper, resulting |
|
402
|
|
|
|
|
|
|
in an ambiguous alignment of the entire fragment. |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=item uniquify_bam($bam,$dest,$log) |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
Extract I<uniquely> and I<multiply> aligned reads from BAM file |
|
407
|
|
|
|
|
|
|
C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for |
|
408
|
|
|
|
|
|
|
unique and multi mappers are created in the output folder C<$dest>, |
|
409
|
|
|
|
|
|
|
which are named B<basename.uniq.bam> and B<basename.mult.bam>, |
|
410
|
|
|
|
|
|
|
respectively. If defined, a logfile named C<$log> is created in the |
|
411
|
|
|
|
|
|
|
output folder. |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
This routine returns an array holding file names of the newly created |
|
414
|
|
|
|
|
|
|
BAM files for unique and multi mappers, respectively. |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to |
|
417
|
|
|
|
|
|
|
decorate unique and multi mappers. As such, this routine will not work |
|
418
|
|
|
|
|
|
|
unless your BAM file has these attributes. |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=over 6 |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item L<Bio::Perl> >= 1.00690001 |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=item L<BIO::DB::Sam> >= 1.39 |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item L<File::Basename> |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item L<File::Temp> |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=item L<Path::Class> |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item L<Carp> |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=back |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head1 AUTHORS |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=over |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=back |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Copyright (C) 2014 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
451
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.12.4 or, |
|
452
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
This software is distributed in the hope that it will be useful, but |
|
455
|
|
|
|
|
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of |
|
456
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |