line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2015-02-09 09:20:43 mtw> |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::Bam; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1330
|
use Exporter; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
64
|
|
7
|
1
|
|
|
1
|
|
35
|
use version; our $VERSION = qv('0.12_16'); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
8
|
1
|
|
|
1
|
|
98
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
39
|
|
9
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
10
|
1
|
|
|
1
|
|
497
|
use Bio::Perl 1.00690001; |
|
1
|
|
|
|
|
83321
|
|
|
1
|
|
|
|
|
95
|
|
11
|
1
|
|
|
1
|
|
206
|
use Bio::DB::Sam 1.37; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Data::Dumper; |
13
|
|
|
|
|
|
|
use File::Basename qw(fileparse); |
14
|
|
|
|
|
|
|
use File::Temp qw(tempfile); |
15
|
|
|
|
|
|
|
use Path::Class; |
16
|
|
|
|
|
|
|
use Carp; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
19
|
|
|
|
|
|
|
our @EXPORT = (); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our @EXPORT_OK = qw ( split_bam uniquify_bam ); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
24
|
|
|
|
|
|
|
#^^^^^^^^^^^ Subroutines ^^^^^^^^^^# |
25
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub split_bam { |
28
|
|
|
|
|
|
|
my %data = (); |
29
|
|
|
|
|
|
|
my @NHval = (); |
30
|
|
|
|
|
|
|
my @processed_files = (); |
31
|
|
|
|
|
|
|
my $verbose = 0; |
32
|
|
|
|
|
|
|
my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_; |
33
|
|
|
|
|
|
|
my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp); |
34
|
|
|
|
|
|
|
my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg); |
35
|
|
|
|
|
|
|
my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg); |
36
|
|
|
|
|
|
|
my ($seq_id,$start,$stop,$strand,$target_names,$id,$score); |
37
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
38
|
|
|
|
|
|
|
my %count_entries = ( |
39
|
|
|
|
|
|
|
total => 0, |
40
|
|
|
|
|
|
|
uniq => 0, |
41
|
|
|
|
|
|
|
pos => 0, |
42
|
|
|
|
|
|
|
neg => 0, |
43
|
|
|
|
|
|
|
skip => 0, |
44
|
|
|
|
|
|
|
cur => 0, |
45
|
|
|
|
|
|
|
mult => 0, |
46
|
|
|
|
|
|
|
se_alis => 0, |
47
|
|
|
|
|
|
|
pe_alis => 0, |
48
|
|
|
|
|
|
|
flag => 0, |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
$data{count} = \%count_entries; |
51
|
|
|
|
|
|
|
$data{flag} = (); |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
croak "ERROR [$this_function] $bamfile does not exist\n" |
54
|
|
|
|
|
|
|
unless (-e $bamfile); |
55
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest_dir does not exist\n" |
56
|
|
|
|
|
|
|
unless (-d $dest_dir); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
open(LOG, ">", $log) or croak $!; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
(undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0); |
61
|
|
|
|
|
|
|
(undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0); |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
$bam = Bio::DB::Bam->open($bamfile, "r"); |
64
|
|
|
|
|
|
|
$header = $bam->header; |
65
|
|
|
|
|
|
|
$target_names = $header->target_name; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); |
68
|
|
|
|
|
|
|
unless ($dest_dir =~ /\/$/){$dest_dir .= "/";} |
69
|
|
|
|
|
|
|
$bamname_pos = $dest_dir.$bn.".pos".$ext; |
70
|
|
|
|
|
|
|
$bamname_neg = $dest_dir.$bn.".neg".$ext; |
71
|
|
|
|
|
|
|
$bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w') |
72
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!"; |
73
|
|
|
|
|
|
|
$bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w') |
74
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!"; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
if ($want_bed == 1){ |
77
|
|
|
|
|
|
|
$bedname_pos = $dest_dir.$bn.".pos.bed"; |
78
|
|
|
|
|
|
|
$bedname_neg = $dest_dir.$bn.".neg.bed"; |
79
|
|
|
|
|
|
|
open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg); |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
$bam_pos->header_write($header);$bam_neg->header_write($header); |
83
|
|
|
|
|
|
|
if($reverse == 1) { # switch +/- strand mapping |
84
|
|
|
|
|
|
|
$tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp; |
85
|
|
|
|
|
|
|
$tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
while (my $read= $bam->read1() ) { |
89
|
|
|
|
|
|
|
@NHval = (); |
90
|
|
|
|
|
|
|
$data{count}{total}++; |
91
|
|
|
|
|
|
|
if($verbose == 1){print STDERR $read->query->name."\t";} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# check if NH (the SAM tag used to indicate multiple mappings) is set |
94
|
|
|
|
|
|
|
if ($read->has_tag("NH")) { |
95
|
|
|
|
|
|
|
@NHval = $read->get_tag_values("NH"); |
96
|
|
|
|
|
|
|
$NH = $NHval[0]; |
97
|
|
|
|
|
|
|
if ($NH == 1) { |
98
|
|
|
|
|
|
|
$data{count}{uniq}++; |
99
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "NH:i:1\t";} |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
else { |
102
|
|
|
|
|
|
|
$data{count}{mult}++; |
103
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";} |
104
|
|
|
|
|
|
|
if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper |
105
|
|
|
|
|
|
|
$data{count}{skip}++; |
106
|
|
|
|
|
|
|
next; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
$data{count}{cur}++; |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# print Dumper ($read->query); |
114
|
|
|
|
|
|
|
$strand = $read->strand; |
115
|
|
|
|
|
|
|
$seq_id = $target_names->[$read->tid]; |
116
|
|
|
|
|
|
|
$start = $read->start; |
117
|
|
|
|
|
|
|
$stop = $read->end; |
118
|
|
|
|
|
|
|
$id = "x"; # $read->qname; |
119
|
|
|
|
|
|
|
$score = 100; |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
if ( $read->get_tag_values('PAIRED') ) { # paired-end |
122
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "pe\t";} |
123
|
|
|
|
|
|
|
$data{count}{pe_alis}++; |
124
|
|
|
|
|
|
|
if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted |
125
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";} |
126
|
|
|
|
|
|
|
if ( $strand eq "1" ){ |
127
|
|
|
|
|
|
|
$bam_pos->write1($read); |
128
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
129
|
|
|
|
|
|
|
if ($reverse == 0){ |
130
|
|
|
|
|
|
|
$data{count}{pos}++; $eff_strand=$strand; |
131
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
else { |
134
|
|
|
|
|
|
|
$data{count}{neg}++; $eff_strand=-1*$strand; |
135
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
elsif ($strand eq "-1") { |
139
|
|
|
|
|
|
|
$bam_neg->write1($read); |
140
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
141
|
|
|
|
|
|
|
if ($reverse == 0){ |
142
|
|
|
|
|
|
|
$data{count}{neg}++; $eff_strand=$strand; |
143
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
else { |
146
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=-1*$strand; |
147
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located |
153
|
|
|
|
|
|
|
# on the other strand |
154
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";} |
155
|
|
|
|
|
|
|
if ( $strand eq "1" ) { |
156
|
|
|
|
|
|
|
$bam_neg->write1($read); |
157
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
158
|
|
|
|
|
|
|
if ($reverse == 0){ |
159
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=$strand; |
160
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
else { |
163
|
|
|
|
|
|
|
$data{count}{pos}++; $eff_strand=-1*$strand; |
164
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
elsif ( $strand eq "-1" ) { |
168
|
|
|
|
|
|
|
$bam_pos->write1($read); |
169
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
170
|
|
|
|
|
|
|
if ($reverse == 0){ |
171
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=$strand; |
172
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
else { |
175
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=-1*$strand; |
176
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
else { # single-end |
183
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "se\t";} |
184
|
|
|
|
|
|
|
$data{count}{se_alis}++; |
185
|
|
|
|
|
|
|
if ( $read->strand eq "1" ){ |
186
|
|
|
|
|
|
|
$bam_pos->write1($read); |
187
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
188
|
|
|
|
|
|
|
if ($reverse == 0){ |
189
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=$strand; |
190
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "+";} |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
else { |
193
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=-1*$strand; |
194
|
|
|
|
|
|
|
if ($want_bed){printf $bed_pos "%s\n", "-";} |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
elsif ($read->strand eq "-1") { |
198
|
|
|
|
|
|
|
$bam_neg->write1($read); |
199
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} |
200
|
|
|
|
|
|
|
if ($reverse == 0){ |
201
|
|
|
|
|
|
|
$data{count}{neg}++;$eff_strand=$strand; |
202
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "-";} |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
else { |
205
|
|
|
|
|
|
|
$data{count}{pos}++;$eff_strand=-1*$strand; |
206
|
|
|
|
|
|
|
if ($want_bed){printf $bed_neg "%s\n", "+";} |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
else {croak "Strand neither + nor - ...exiting!\n";} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# collect statistics of SAM flags |
214
|
|
|
|
|
|
|
$flag = $read->flag; |
215
|
|
|
|
|
|
|
unless (exists $data{flag}{$flag}){ |
216
|
|
|
|
|
|
|
$data{flag}{$flag} = 0; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
$data{flag}{$flag}++; |
219
|
|
|
|
|
|
|
if ($verbose == 1) {print STDERR "\n";} |
220
|
|
|
|
|
|
|
} # end while |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
rename ($tmp_bam_pos, $bamname_pos); |
223
|
|
|
|
|
|
|
rename ($tmp_bam_neg, $bamname_neg); |
224
|
|
|
|
|
|
|
push (@processed_files, ($bamname_pos,$bamname_neg)); |
225
|
|
|
|
|
|
|
push (@processed_files, ($data{count}{pos},$data{count}{neg})); |
226
|
|
|
|
|
|
|
if ($want_bed){ |
227
|
|
|
|
|
|
|
push (@processed_files, ($bedname_pos,$bedname_neg)) |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# error checks |
231
|
|
|
|
|
|
|
unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) { |
232
|
|
|
|
|
|
|
printf "ERROR: paired-end + single-end alignments != total alignment count\n"; |
233
|
|
|
|
|
|
|
print Dumper(\%data); |
234
|
|
|
|
|
|
|
croak $!; |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) { |
237
|
|
|
|
|
|
|
printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos}; |
238
|
|
|
|
|
|
|
printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg}; |
239
|
|
|
|
|
|
|
printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg}); |
240
|
|
|
|
|
|
|
printf STDERR "%20d cur_count (should be)\n",$data{count}{cur}; |
241
|
|
|
|
|
|
|
printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n"; |
242
|
|
|
|
|
|
|
print Dumper(\%data); |
243
|
|
|
|
|
|
|
croak $!; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
foreach (keys %{$data{flag}}){ |
246
|
|
|
|
|
|
|
$data{count}{flag} += $data{flag}{$_}; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
unless ($data{count}{flag} == $data{count}{cur}){ |
249
|
|
|
|
|
|
|
printf STDERR "%20d alignments considered\n",$data{count}{cur}; |
250
|
|
|
|
|
|
|
printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag}; |
251
|
|
|
|
|
|
|
printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n"; |
252
|
|
|
|
|
|
|
print Dumper(\%data); |
253
|
|
|
|
|
|
|
croak $!; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# logging output |
257
|
|
|
|
|
|
|
printf LOG "# bam_split.pl log for file \'$bamfile\'\n"; |
258
|
|
|
|
|
|
|
printf LOG "#-----------------------------------------------------------------\n"; |
259
|
|
|
|
|
|
|
printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total}; |
260
|
|
|
|
|
|
|
printf LOG "%20d unique-mappers (%6.2f%% of total)\n", |
261
|
|
|
|
|
|
|
$data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ; |
262
|
|
|
|
|
|
|
printf LOG "%20d multi-mappers (%6.2f%% of total)\n", |
263
|
|
|
|
|
|
|
$data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total}); |
264
|
|
|
|
|
|
|
printf LOG "%20d multi-mappers skipped\n", $data{count}{skip}; |
265
|
|
|
|
|
|
|
printf LOG "%20d alignments considered\n", $data{count}{cur}; |
266
|
|
|
|
|
|
|
printf LOG "%20d paired-end\n", $data{count}{pe_alis}; |
267
|
|
|
|
|
|
|
printf LOG "%20s single-end\n", $data{count}{se_alis}; |
268
|
|
|
|
|
|
|
printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n", |
269
|
|
|
|
|
|
|
$data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur}); |
270
|
|
|
|
|
|
|
printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n", |
271
|
|
|
|
|
|
|
$data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur}); |
272
|
|
|
|
|
|
|
printf LOG "#-----------------------------------------------------------------\n"; |
273
|
|
|
|
|
|
|
printf LOG "Dumper output:\n". Dumper(\%data); |
274
|
|
|
|
|
|
|
close(LOG); |
275
|
|
|
|
|
|
|
return @processed_files; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
sub uniquify_bam { |
279
|
|
|
|
|
|
|
my ($bamfile,$dest,$log) = @_; |
280
|
|
|
|
|
|
|
my ($bam, $bn,$path,$ext,$read,$header); |
281
|
|
|
|
|
|
|
my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult); |
282
|
|
|
|
|
|
|
my ($count_all,$count_uniq,$count_mult) = (0)x3; |
283
|
|
|
|
|
|
|
my @processed_files = (); |
284
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $bamfile\n" |
287
|
|
|
|
|
|
|
unless (-e $bamfile); |
288
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist\n" |
289
|
|
|
|
|
|
|
unless (-d $dest); |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
(undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0); |
294
|
|
|
|
|
|
|
(undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0); |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
$bam = Bio::DB::Bam->open($bamfile, "r"); |
297
|
|
|
|
|
|
|
$fn_uniq = file($dest,$bn.".uniq".$ext); |
298
|
|
|
|
|
|
|
$fn_mult = file($dest,$bn.".mult".$ext); |
299
|
|
|
|
|
|
|
$header = $bam->header; # TODO: modify header, leave traces ... |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w') |
302
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; |
303
|
|
|
|
|
|
|
$bam_mult = Bio::DB::Bam->open($tmp_mult,'w') |
304
|
|
|
|
|
|
|
or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; |
305
|
|
|
|
|
|
|
$bam_uniq->header_write($header); |
306
|
|
|
|
|
|
|
$bam_mult->header_write($header); |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
while ($read = $bam->read1() ) { |
309
|
|
|
|
|
|
|
$count_all++; |
310
|
|
|
|
|
|
|
if ($read->aux_get("NH") == 1){ # uniquely mapped reads |
311
|
|
|
|
|
|
|
$bam_uniq->write1($read); |
312
|
|
|
|
|
|
|
$count_uniq++; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
else { # multiply mapped reads |
315
|
|
|
|
|
|
|
$bam_mult->write1($read); |
316
|
|
|
|
|
|
|
$count_mult++; |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
croak "ERROR [$this_function] Read counts don't match\n" |
321
|
|
|
|
|
|
|
unless ($count_uniq + $count_mult == $count_all); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
rename ($tmp_uniq, $fn_uniq); |
324
|
|
|
|
|
|
|
rename ($tmp_mult, $fn_mult); |
325
|
|
|
|
|
|
|
push (@processed_files, ($fn_uniq,$fn_mult)); |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
if (defined $log){ |
328
|
|
|
|
|
|
|
my $lf = file($dest,$log); |
329
|
|
|
|
|
|
|
open(LOG, ">>", $lf) or croak $!; |
330
|
|
|
|
|
|
|
printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n", |
331
|
|
|
|
|
|
|
$count_all,$count_uniq,$count_mult; |
332
|
|
|
|
|
|
|
close(LOG); |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
1; |
338
|
|
|
|
|
|
|
__END__ |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head1 NAME |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Bio::ViennaNGS::Bam - High-level access to BAM files |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head1 SYNOPSIS |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
use Bio::ViennaNGS::Bam; |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# split a single-end or paired-end BAM file by strands |
350
|
|
|
|
|
|
|
@result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile); |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# extract unique and multi mappers from a BAM file |
353
|
|
|
|
|
|
|
@result = uniquify_bam($bam_in,$outdir,$logfile); |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head1 DESCRIPTION |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
Bio::ViennaNGS::BAM provides high-level access to BAM file. Building |
358
|
|
|
|
|
|
|
on L<Bio::DB::Sam>, it provides code to extract specific portions from |
359
|
|
|
|
|
|
|
BAM files. It comes with routines for splitting BAM files by strand |
360
|
|
|
|
|
|
|
(which is often required for visualization of NGS data) and extracting |
361
|
|
|
|
|
|
|
uniquely and multiply aligned reads from BAM files. |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=head2 ROUTINES |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=over |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log) |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Splits BAM file $bam according to [+] and [-] strand. C<$reverse>, |
370
|
|
|
|
|
|
|
C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1, |
371
|
|
|
|
|
|
|
triggering forced reversion of strand mapping (due to RNA-seq protocol |
372
|
|
|
|
|
|
|
constraints), filtering of unique mappers (identified via NH:i:1 SAM |
373
|
|
|
|
|
|
|
argument), and forced output of a BED file corresponding to |
374
|
|
|
|
|
|
|
strand-specific mapping, respectively. C<$log> holds name and path of |
375
|
|
|
|
|
|
|
the log file. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
Strand-splitting is done in a way that in paired-end alignments, FIRST |
378
|
|
|
|
|
|
|
and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE |
379
|
|
|
|
|
|
|
reads determine the strand, while SECOND_MATE reads are assigned the |
380
|
|
|
|
|
|
|
opposite strand I<per definitionem>. This also holds if the reads are |
381
|
|
|
|
|
|
|
not mapped in proper pairs and even if there is no mapping partner at |
382
|
|
|
|
|
|
|
all. |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
Sometimes the library preparation protocol causes inversion of the |
385
|
|
|
|
|
|
|
read assignment (with respect to the underlying annotation). In those |
386
|
|
|
|
|
|
|
cases, the natural mapping of the reads can be obtained by the |
387
|
|
|
|
|
|
|
C<$reverse> flag. |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
This routine returns an array whose fist two elements are the file |
390
|
|
|
|
|
|
|
names of the newly generate BAM files with reads mapped to the |
391
|
|
|
|
|
|
|
positive, and negative strand, respectively. Elements three and four |
392
|
|
|
|
|
|
|
are the number of fragments mapped to the positive and negative |
393
|
|
|
|
|
|
|
strand. If the C<$want_bed> option was given elements fiveand six are |
394
|
|
|
|
|
|
|
the file names of the output BED files for positive and negative |
395
|
|
|
|
|
|
|
strand, respectively. |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
NOTE: Filtering of unique mappers is only safe for single-end |
398
|
|
|
|
|
|
|
experiments; In paired-end experiments, read and mate are treated |
399
|
|
|
|
|
|
|
separately, thus allowing for scenarios where eg. one read is a |
400
|
|
|
|
|
|
|
multi-mapper, whereas its associate mate is a unique mapper, resulting |
401
|
|
|
|
|
|
|
in an ambiguous alignment of the entire fragment. |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=item uniquify_bam($bam,$dest,$log) |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Extract I<uniquely> and I<multiply> aligned reads from BAM file |
406
|
|
|
|
|
|
|
C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for |
407
|
|
|
|
|
|
|
unique and multi mappers are created in the output folder C<$dest>, |
408
|
|
|
|
|
|
|
which are named B<basename.uniq.bam> and B<basename.mult.bam>, |
409
|
|
|
|
|
|
|
respectively. If defined, a logfile named C<$log> is created in the |
410
|
|
|
|
|
|
|
output folder. |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
This routine returns an array holding file names of the newly created |
413
|
|
|
|
|
|
|
BAM files for unique and multi mappers, respectively. |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to |
416
|
|
|
|
|
|
|
decorate unique and multi mappers. As such, this routine will not work |
417
|
|
|
|
|
|
|
unless your BAM file has these attributes. |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=over |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=item L<Bio::Perl> >= 1.00690001 |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=item L<BIO::DB::Sam> >= 1.37 |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=item L<File::Basename> |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item L<File::Temp> |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=item L<Path::Class> |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=item L<Carp> |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=back |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
=head1 AUTHORS |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=over |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=back |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
Copyright (C) 2015 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
450
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.10.0 or, |
451
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
This software is distributed in the hope that it will be useful, but |
454
|
|
|
|
|
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of |
455
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=cut |