line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2014-12-20 00:31:03 mtw> |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::Bam; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1177
|
use 5.12.0; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
38
|
|
7
|
1
|
|
|
1
|
|
4
|
use Exporter; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
40
|
|
8
|
1
|
|
|
1
|
|
3
|
use version; our $VERSION = qv('0.12_07'); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
5
|
|
9
|
1
|
|
|
1
|
|
63
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
10
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
32
|
|
11
|
1
|
|
|
1
|
|
202
|
use Bio::Perl 1.00690001; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Bio::DB::Sam 1.39; |
13
|
|
|
|
|
|
|
use Data::Dumper; |
14
|
|
|
|
|
|
|
use File::Basename qw(fileparse); |
15
|
|
|
|
|
|
|
use File::Temp qw(tempfile); |
16
|
|
|
|
|
|
|
use Path::Class; |
17
|
|
|
|
|
|
|
use Carp; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
20
|
|
|
|
|
|
|
our @EXPORT = (); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our @EXPORT_OK = qw ( split_bam uniquify_bam ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
25
|
|
|
|
|
|
|
#^^^^^^^^^^^ Subroutines ^^^^^^^^^^# |
26
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub split_bam { |
29
|
|
|
|
|
|
|
my %data = (); |
30
|
|
|
|
|
|
|
my @NHval = (); |
31
|
|
|
|
|
|
|
my @processed_files = (); |
32
|
|
|
|
|
|
|
my $verbose = 0; |
33
|
|
|
|
|
|
|
my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_; |
34
|
|
|
|
|
|
|
my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp); |
35
|
|
|
|
|
|
|
my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg); |
36
|
|
|
|
|
|
|
my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg); |
37
|
|
|
|
|
|
|
my ($seq_id,$start,$stop,$strand,$target_names,$id,$score); |
38
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
39
|
|
|
|
|
|
|
my %count_entries = ( |
40
|
|
|
|
|
|
|
total => 0, |
41
|
|
|
|
|
|
|
uniq => 0, |
42
|
|
|
|
|
|
|
pos => 0, |
43
|
|
|
|
|
|
|
neg => 0, |
44
|
|
|
|
|
|
|
skip => 0, |
45
|
|
|
|
|
|
|
cur => 0, |
46
|
|
|
|
|
|
|
mult => 0, |
47
|
|
|
|
|
|
|
se_alis => 0, |
48
|
|
|
|
|
|
|
pe_alis => 0, |
49
|
|
|
|
|
|
|
flag => 0, |
50
|
|
|
|
|
|
|
); |
51
|
|
|
|
|
|
|
$data{count} = \%count_entries; |
52
|
|
|
|
|
|
|
$data{flag} = (); |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
croak "ERROR [$this_function] $bamfile does not exist\n" |
55
|
|
|
|
|
|
|
unless (-e $bamfile); |
56
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest_dir does not exist\n" |
57
|
|
|
|
|
|
|
unless (-d $dest_dir); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
open(LOG, ">", $log) or croak $!; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
(undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0); |
62
|
|
|
|
|
|
|
(undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
$bam = Bio::DB::Bam->open($bamfile, "r"); |
65
|
|
|
|
|
|
|
$header = $bam->header; |
66
|
|
|
|
|
|
|
$target_names = $header->target_name; |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); |
69
|
|
|
|
|
|
|
unless ($dest_dir =~ /\/$/){$dest_dir .= "/";} |
70
|
|
|
|
|
|
|
$bamname_pos = $dest_dir.$bn.".pos".$ext; |
71
|
|
|
|
|
|
|
$bamname_neg = $dest_dir.$bn.".neg".$ext; |
72
|
|
|
|
|
|
|
$bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w') |
73
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!"; |
74
|
|
|
|
|
|
|
$bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w') |
75
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!"; |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
if ($want_bed == 1){ |
78
|
|
|
|
|
|
|
$bedname_pos = $dest_dir.$bn.".pos.bed"; |
79
|
|
|
|
|
|
|
$bedname_neg = $dest_dir.$bn.".neg.bed"; |
80
|
|
|
|
|
|
|
open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg); |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
$bam_pos->header_write($header);$bam_neg->header_write($header); |
84
|
|
|
|
|
|
|
if($reverse == 1) { # switch +/- strand mapping |
85
|
|
|
|
|
|
|
$tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp; |
86
|
|
|
|
|
|
|
$tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
while (my $read= $bam->read1() ) { |
90
|
|
|
|
|
|
|
@NHval = (); |
91
|
|
|
|
|
|
|
$data{count}{total}++; |
92
|
|
|
|
|
|
|
if($verbose == 1){print STDERR $read->query->name."\t";} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# check if NH (the SAM tag used to indicate multiple mappings) is set |
95
|
|
|
|
|
|
|
if ($read->has_tag("NH")) { |
96
|
|
|
|
|
|
|
@NHval = $read->get_tag_values("NH"); |
97
|
|
|
|
|
|
|
$NH = $NHval[0]; |
98
|
|
|
|
|
|
|
if ($NH == 1) { |
99
|
|
|
|
|
|
|
$data{count}{uniq}++; |
100
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "NH:i:1\t";} |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
else { |
103
|
|
|
|
|
|
|
$data{count}{mult}++; |
104
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";} |
105
|
|
|
|
|
|
|
if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper |
106
|
|
|
|
|
|
|
$data{count}{skip}++; |
107
|
|
|
|
|
|
|
next; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
$data{count}{cur}++; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# print Dumper ($read->query); |
115
|
|
|
|
|
|
|
$strand = $read->strand; |
116
|
|
|
|
|
|
|
$seq_id = $target_names->[$read->tid]; |
117
|
|
|
|
|
|
|
$start = $read->start; |
118
|
|
|
|
|
|
|
$stop = $read->end; |
119
|
|
|
|
|
|
|
$id = "x"; # $read->qname; |
120
|
|
|
|
|
|
|
$score = 100; |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
if ( $read->get_tag_values('PAIRED') ) { # paired-end |
123
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "pe\t";} |
124
|
|
|
|
|
|
|
$data{count}{pe_alis}++; |
125
|
|
|
|
|
|
|
if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted |
126
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";} |
127
|
|
|
|
|
|
|
if ( $strand eq "1" ){ |
128
|
|
|
|
|
|
|
$bam_pos->write1($read); |
129
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
130
|
|
|
|
|
|
|
if ($reverse == 0){ |
131
|
|
|
|
|
|
|
$data{count}{pos}++; $eff_strand=$strand; |
132
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
else { |
135
|
|
|
|
|
|
|
$data{count}{neg}++; $eff_strand=-1*$strand; |
136
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
elsif ($strand eq "-1") { |
140
|
|
|
|
|
|
|
$bam_neg->write1($read); |
141
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
142
|
|
|
|
|
|
|
if ($reverse == 0){ |
143
|
|
|
|
|
|
|
$data{count}{neg}++; $eff_strand=$strand; |
144
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
else { |
147
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=-1*$strand; |
148
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located |
154
|
|
|
|
|
|
|
# on the other strand |
155
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";} |
156
|
|
|
|
|
|
|
if ( $strand eq "1" ) { |
157
|
|
|
|
|
|
|
$bam_neg->write1($read); |
158
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
159
|
|
|
|
|
|
|
if ($reverse == 0){ |
160
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=$strand; |
161
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
else { |
164
|
|
|
|
|
|
|
$data{count}{pos}++; $eff_strand=-1*$strand; |
165
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
elsif ( $strand eq "-1" ) { |
169
|
|
|
|
|
|
|
$bam_pos->write1($read); |
170
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
171
|
|
|
|
|
|
|
if ($reverse == 0){ |
172
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=$strand; |
173
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
else { |
176
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=-1*$strand; |
177
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
else { # single-end |
184
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "se\t";} |
185
|
|
|
|
|
|
|
$data{count}{se_alis}++; |
186
|
|
|
|
|
|
|
if ( $read->strand eq "1" ){ |
187
|
|
|
|
|
|
|
$bam_pos->write1($read); |
188
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
189
|
|
|
|
|
|
|
if ($reverse == 0){ |
190
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=$strand; |
191
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
else { |
194
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=-1*$strand; |
195
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
elsif ($read->strand eq "-1") { |
199
|
|
|
|
|
|
|
$bam_neg->write1($read); |
200
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
201
|
|
|
|
|
|
|
if ($reverse == 0){ |
202
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=$strand; |
203
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
else { |
206
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=-1*$strand; |
207
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# collect statistics of SAM flags |
215
|
|
|
|
|
|
|
$flag = $read->flag; |
216
|
|
|
|
|
|
|
unless (exists $data{flag}{$flag}){ |
217
|
|
|
|
|
|
|
$data{flag}{$flag} = 0; |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
$data{flag}{$flag}++; |
220
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "\n";} |
221
|
|
|
|
|
|
|
} # end while |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
rename ($tmp_bam_pos, $bamname_pos); |
224
|
|
|
|
|
|
|
rename ($tmp_bam_neg, $bamname_neg); |
225
|
|
|
|
|
|
|
push (@processed_files, ($bamname_pos,$bamname_neg)); |
226
|
|
|
|
|
|
|
push (@processed_files, ($data{count}{pos},$data{count}{neg})); |
227
|
|
|
|
|
|
|
if ($want_bed){ |
228
|
|
|
|
|
|
|
push (@processed_files, ($bedname_pos,$bedname_neg)) |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# error checks |
232
|
|
|
|
|
|
|
unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) { |
233
|
|
|
|
|
|
|
printf "ERROR: paired-end + single-end alignments != total alignment count\n"; |
234
|
|
|
|
|
|
|
print Dumper(\%data); |
235
|
|
|
|
|
|
|
croak $!; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) { |
238
|
|
|
|
|
|
|
printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos}; |
239
|
|
|
|
|
|
|
printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg}; |
240
|
|
|
|
|
|
|
printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg}); |
241
|
|
|
|
|
|
|
printf STDERR "%20d cur_count (should be)\n",$data{count}{cur}; |
242
|
|
|
|
|
|
|
printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n"; |
243
|
|
|
|
|
|
|
print Dumper(\%data); |
244
|
|
|
|
|
|
|
croak $!; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
foreach (keys %{$data{flag}}){ |
247
|
|
|
|
|
|
|
$data{count}{flag} += $data{flag}{$_}; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
unless ($data{count}{flag} == $data{count}{cur}){ |
250
|
|
|
|
|
|
|
printf STDERR "%20d alignments considered\n",$data{count}{cur}; |
251
|
|
|
|
|
|
|
printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag}; |
252
|
|
|
|
|
|
|
printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n"; |
253
|
|
|
|
|
|
|
print Dumper(\%data); |
254
|
|
|
|
|
|
|
croak $!; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# logging output |
258
|
|
|
|
|
|
|
printf LOG "# bam_split.pl log for file \'$bamfile\'\n"; |
259
|
|
|
|
|
|
|
printf LOG "#-----------------------------------------------------------------\n"; |
260
|
|
|
|
|
|
|
printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total}; |
261
|
|
|
|
|
|
|
printf LOG "%20d unique-mappers (%6.2f%% of total)\n", |
262
|
|
|
|
|
|
|
$data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ; |
263
|
|
|
|
|
|
|
printf LOG "%20d multi-mappers (%6.2f%% of total)\n", |
264
|
|
|
|
|
|
|
$data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total}); |
265
|
|
|
|
|
|
|
printf LOG "%20d multi-mappers skipped\n", $data{count}{skip}; |
266
|
|
|
|
|
|
|
printf LOG "%20d alignments considered\n", $data{count}{cur}; |
267
|
|
|
|
|
|
|
printf LOG "%20d paired-end\n", $data{count}{pe_alis}; |
268
|
|
|
|
|
|
|
printf LOG "%20s single-end\n", $data{count}{se_alis}; |
269
|
|
|
|
|
|
|
printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n", |
270
|
|
|
|
|
|
|
$data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur}); |
271
|
|
|
|
|
|
|
printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n", |
272
|
|
|
|
|
|
|
$data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur}); |
273
|
|
|
|
|
|
|
printf LOG "#-----------------------------------------------------------------\n"; |
274
|
|
|
|
|
|
|
printf LOG "Dumper output:\n". Dumper(\%data); |
275
|
|
|
|
|
|
|
close(LOG); |
276
|
|
|
|
|
|
|
return @processed_files; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
sub uniquify_bam { |
280
|
|
|
|
|
|
|
my ($bamfile,$dest,$log) = @_; |
281
|
|
|
|
|
|
|
my ($bam, $bn,$path,$ext,$read,$header); |
282
|
|
|
|
|
|
|
my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult); |
283
|
|
|
|
|
|
|
my ($count_all,$count_uniq,$count_mult) = (0)x3; |
284
|
|
|
|
|
|
|
my @processed_files = (); |
285
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $bamfile\n" |
288
|
|
|
|
|
|
|
unless (-e $bamfile); |
289
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist\n" |
290
|
|
|
|
|
|
|
unless (-d $dest); |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
(undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0); |
295
|
|
|
|
|
|
|
(undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0); |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
$bam = Bio::DB::Bam->open($bamfile, "r"); |
298
|
|
|
|
|
|
|
$fn_uniq = file($dest,$bn.".uniq".$ext); |
299
|
|
|
|
|
|
|
$fn_mult = file($dest,$bn.".mult".$ext); |
300
|
|
|
|
|
|
|
$header = $bam->header; # TODO: modify header, leave traces ... |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
$bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w') |
303
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; |
304
|
|
|
|
|
|
|
$bam_mult = Bio::DB::Bam->open($tmp_mult,'w') |
305
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; |
306
|
|
|
|
|
|
|
$bam_uniq->header_write($header); |
307
|
|
|
|
|
|
|
$bam_mult->header_write($header); |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
while ($read = $bam->read1() ) { |
310
|
|
|
|
|
|
|
$count_all++; |
311
|
|
|
|
|
|
|
if ($read->aux_get("NH") == 1){ # uniquely mapped reads |
312
|
|
|
|
|
|
|
$bam_uniq->write1($read); |
313
|
|
|
|
|
|
|
$count_uniq++; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
else { # multiply mapped reads |
316
|
|
|
|
|
|
|
$bam_mult->write1($read); |
317
|
|
|
|
|
|
|
$count_mult++; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
croak "ERROR [$this_function] Read counts don't match\n" |
322
|
|
|
|
|
|
|
unless ($count_uniq + $count_mult == $count_all); |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
rename ($tmp_uniq, $fn_uniq); |
325
|
|
|
|
|
|
|
rename ($tmp_mult, $fn_mult); |
326
|
|
|
|
|
|
|
push (@processed_files, ($fn_uniq,$fn_mult)); |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
if (defined $log){ |
329
|
|
|
|
|
|
|
my $lf = file($dest,$log); |
330
|
|
|
|
|
|
|
open(LOG, ">>", $lf) or croak $!; |
331
|
|
|
|
|
|
|
printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n", |
332
|
|
|
|
|
|
|
$count_all,$count_uniq,$count_mult; |
333
|
|
|
|
|
|
|
close(LOG); |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
1; |
339
|
|
|
|
|
|
|
__END__ |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=head1 NAME |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Bio::ViennaNGS::Bam - High-level access to BAM files |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=head1 SYNOPSIS |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
use Bio::ViennaNGS::Bam; |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
# split a single-end or paired-end BAM file by strands |
351
|
|
|
|
|
|
|
@result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile); |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# extract unique and multi mappers from a BAM file |
354
|
|
|
|
|
|
|
@result = uniquify_bam($bam_in,$outdir,$logfile); |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=head1 DESCRIPTION |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Bio::ViennaNGS::BAM provides high-level access to BAM file. Building |
359
|
|
|
|
|
|
|
on L<Bio::DB::Sam>, it provides code to extract specific portions from |
360
|
|
|
|
|
|
|
BAM files. It comes with routines for splitting BAM files by strand |
361
|
|
|
|
|
|
|
(which is often required for visualization of NGS data) and extracting |
362
|
|
|
|
|
|
|
uniquely and multiply aligned reads from BAM files. |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head2 ROUTINES |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=over |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log) |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
Splits BAM file $bam according to [+] and [-] strand. C<$reverse>, |
371
|
|
|
|
|
|
|
C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1, |
372
|
|
|
|
|
|
|
triggering forced reversion of strand mapping (due to RNA-seq protocol |
373
|
|
|
|
|
|
|
constraints), filtering of unique mappers (identified via NH:i:1 SAM |
374
|
|
|
|
|
|
|
argument), and forced output of a BED file corresponding to |
375
|
|
|
|
|
|
|
strand-specific mapping, respectively. C<$log> holds name and path of |
376
|
|
|
|
|
|
|
the log file. |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Strand-splitting is done in a way that in paired-end alignments, FIRST |
379
|
|
|
|
|
|
|
and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE |
380
|
|
|
|
|
|
|
reads determine the strand, while SECOND_MATE reads are assigned the |
381
|
|
|
|
|
|
|
opposite strand I<per definitionem>. This also holds if the reads are |
382
|
|
|
|
|
|
|
not mapped in proper pairs and even if there is no mapping partner at |
383
|
|
|
|
|
|
|
all. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
Sometimes the library preparation protocol causes inversion of the |
386
|
|
|
|
|
|
|
read assignment (with respect to the underlying annotation). In those |
387
|
|
|
|
|
|
|
cases, the natural mapping of the reads can be obtained by the |
388
|
|
|
|
|
|
|
C<$reverse> flag. |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
This routine returns an array whose fist two elements are the file |
391
|
|
|
|
|
|
|
names of the newly generate BAM files with reads mapped to the |
392
|
|
|
|
|
|
|
positive, and negative strand, respectively. Elements three and four |
393
|
|
|
|
|
|
|
are the number of fragments mapped to the positive and negative |
394
|
|
|
|
|
|
|
strand. If the C<$want_bed> option was given elements fiveand six are |
395
|
|
|
|
|
|
|
the file names of the output BED files for positive and negative |
396
|
|
|
|
|
|
|
strand, respectively. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
NOTE: Filtering of unique mappers is only safe for single-end |
399
|
|
|
|
|
|
|
experiments; In paired-end experiments, read and mate are treated |
400
|
|
|
|
|
|
|
separately, thus allowing for scenarios where eg. one read is a |
401
|
|
|
|
|
|
|
multi-mapper, whereas its associate mate is a unique mapper, resulting |
402
|
|
|
|
|
|
|
in an ambiguous alignment of the entire fragment. |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=item uniquify_bam($bam,$dest,$log) |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
Extract I<uniquely> and I<multiply> aligned reads from BAM file |
407
|
|
|
|
|
|
|
C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for |
408
|
|
|
|
|
|
|
unique and multi mappers are created in the output folder C<$dest>, |
409
|
|
|
|
|
|
|
which are named B<basename.uniq.bam> and B<basename.mult.bam>, |
410
|
|
|
|
|
|
|
respectively. If defined, a logfile named C<$log> is created in the |
411
|
|
|
|
|
|
|
output folder. |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
This routine returns an array holding file names of the newly created |
414
|
|
|
|
|
|
|
BAM files for unique and multi mappers, respectively. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to |
417
|
|
|
|
|
|
|
decorate unique and multi mappers. As such, this routine will not work |
418
|
|
|
|
|
|
|
unless your BAM file has these attributes. |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=over 6 |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item L<Bio::Perl> >= 1.00690001 |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=item L<BIO::DB::Sam> >= 1.39 |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item L<File::Basename> |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item L<File::Temp> |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=item L<Path::Class> |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item L<Carp> |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=back |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head1 AUTHORS |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=over |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=back |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Copyright (C) 2014 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
451
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.12.4 or, |
452
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
This software is distributed in the hope that it will be useful, but |
455
|
|
|
|
|
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of |
456
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |