File Coverage

blib/lib/Bio/ViennaNGS/Bam.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				# --CPerl--
2				# Last changed Time-stamp: <2015-02-09 09:20:43 mtw>
3
4				package Bio::ViennaNGS::Bam;
5
6	1	1	1330	use Exporter;
	1		3
	1		64
7	1	1	35	use version; our $VERSION = qv('0.12_16');
	1		2
	1		10
8	1	1	98	use strict;
	1		1
	1		39
9	1	1	4	use warnings;
	1		2
	1		38
10	1	1	497	use Bio::Perl 1.00690001;
	1		83321
	1		95
11	1	1	206	use Bio::DB::Sam 1.37;
	0
	0
12				use Data::Dumper;
13				use File::Basename qw(fileparse);
14				use File::Temp qw(tempfile);
15				use Path::Class;
16				use Carp;
17
18				our @ISA = qw(Exporter);
19				our @EXPORT = ();
20
21				our @EXPORT_OK = qw ( split_bam uniquify_bam );
22
23				#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
24				#^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
25				#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
26
27				sub split_bam {
28				my %data = ();
29				my @NHval = ();
30				my @processed_files = ();
31				my $verbose = 0;
32				my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_;
33				my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp);
34				my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg);
35				my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg);
36				my ($seq_id,$start,$stop,$strand,$target_names,$id,$score);
37				my $this_function = (caller(0))[3];
38				my %count_entries = (
39				total => 0,
40				uniq => 0,
41				pos => 0,
42				neg => 0,
43				skip => 0,
44				cur => 0,
45				mult => 0,
46				se_alis => 0,
47				pe_alis => 0,
48				flag => 0,
49				);
50				$data{count} = \%count_entries;
51				$data{flag} = ();
52
53				croak "ERROR [$this_function] $bamfile does not exist\n"
54				unless (-e $bamfile);
55				croak "ERROR [$this_function] $dest_dir does not exist\n"
56				unless (-d $dest_dir);
57
58				open(LOG, ">", $log) or croak $!;
59
60				(undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0);
61				(undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0);
62
63				$bam = Bio::DB::Bam->open($bamfile, "r");
64				$header = $bam->header;
65				$target_names = $header->target_name;
66
67				($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
68				unless ($dest_dir =~ /\/$/){$dest_dir .= "/";}
69				$bamname_pos = $dest_dir.$bn.".pos".$ext;
70				$bamname_neg = $dest_dir.$bn.".neg".$ext;
71				$bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w')
72				or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!";
73				$bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w')
74				or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!";
75
76				if ($want_bed == 1){
77				$bedname_pos = $dest_dir.$bn.".pos.bed";
78				$bedname_neg = $dest_dir.$bn.".neg.bed";
79				open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg);
80				}
81
82				$bam_pos->header_write($header);$bam_neg->header_write($header);
83				if($reverse == 1) { # switch +/- strand mapping
84				$tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp;
85				$tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp;
86				}
87
88				while (my $read= $bam->read1() ) {
89				@NHval = ();
90				$data{count}{total}++;
91				if($verbose == 1){print STDERR $read->query->name."\t";}
92
93				# check if NH (the SAM tag used to indicate multiple mappings) is set
94				if ($read->has_tag("NH")) {
95				@NHval = $read->get_tag_values("NH");
96				$NH = $NHval[0];
97				if ($NH == 1) {
98				$data{count}{uniq}++;
99				if ($verbose == 1) {print STDERR "NH:i:1\t";}
100				}
101				else {
102				$data{count}{mult}++;
103				if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";}
104				if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper
105				$data{count}{skip}++;
106				next;
107				}
108				}
109				$data{count}{cur}++;
110				}
111				else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";}
112
113				# print Dumper ($read->query);
114				$strand = $read->strand;
115				$seq_id = $target_names->[$read->tid];
116				$start = $read->start;
117				$stop = $read->end;
118				$id = "x"; # $read->qname;
119				$score = 100;
120
121				if ( $read->get_tag_values('PAIRED') ) { # paired-end
122				if($verbose == 1) {print STDERR "pe\t";}
123				$data{count}{pe_alis}++;
124				if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted
125				if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";}
126				if ( $strand eq "1" ){
127				$bam_pos->write1($read);
128				if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
129				if ($reverse == 0){
130				$data{count}{pos}++; $eff_strand=$strand;
131				if ($want_bed){printf $bed_pos "%s\n", "+";}
132				}
133				else {
134				$data{count}{neg}++; $eff_strand=-1*$strand;
135				if ($want_bed){printf $bed_pos "%s\n", "-";}
136				}
137				}
138				elsif ($strand eq "-1") {
139				$bam_neg->write1($read);
140				if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
141				if ($reverse == 0){
142				$data{count}{neg}++; $eff_strand=$strand;
143				if ($want_bed){printf $bed_neg "%s\n", "-";}
144				}
145				else {
146				$data{count}{pos}++;$eff_strand=-1*$strand;
147				if ($want_bed){printf $bed_neg "%s\n", "+";}
148				}
149				}
150				else {croak "Strand neither + nor - ...exiting!\n";}
151				}
152				else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located
153				# on the other strand
154				if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";}
155				if ( $strand eq "1" ) {
156				$bam_neg->write1($read);
157				if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
158				if ($reverse == 0){
159				$data{count}{neg}++;$eff_strand=$strand;
160				if ($want_bed){printf $bed_neg "%s\n", "-";}
161				}
162				else {
163				$data{count}{pos}++; $eff_strand=-1*$strand;
164				if ($want_bed){printf $bed_neg "%s\n", "+";}
165				}
166				}
167				elsif ( $strand eq "-1" ) {
168				$bam_pos->write1($read);
169				if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
170				if ($reverse == 0){
171				$data{count}{pos}++;$eff_strand=$strand;
172				if ($want_bed){printf $bed_pos "%s\n", "+";}
173				}
174				else {
175				$data{count}{neg}++;$eff_strand=-1*$strand;
176				if ($want_bed){printf $bed_pos "%s\n", "-";}
177				}
178				}
179				else {croak "Strand neither + nor - ...exiting!\n";}
180				}
181				}
182				else { # single-end
183				if($verbose == 1) {print STDERR "se\t";}
184				$data{count}{se_alis}++;
185				if ( $read->strand eq "1" ){
186				$bam_pos->write1($read);
187				if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
188				if ($reverse == 0){
189				$data{count}{pos}++;$eff_strand=$strand;
190				if ($want_bed){printf $bed_pos "%s\n", "+";}
191				}
192				else {
193				$data{count}{neg}++;$eff_strand=-1*$strand;
194				if ($want_bed){printf $bed_pos "%s\n", "-";}
195				}
196				}
197				elsif ($read->strand eq "-1") {
198				$bam_neg->write1($read);
199				if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
200				if ($reverse == 0){
201				$data{count}{neg}++;$eff_strand=$strand;
202				if ($want_bed){printf $bed_neg "%s\n", "-";}
203				}
204				else {
205				$data{count}{pos}++;$eff_strand=-1*$strand;
206				if ($want_bed){printf $bed_neg "%s\n", "+";}
207				}
208				}
209				else {croak "Strand neither + nor - ...exiting!\n";}
210				}
211				if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";}
212
213				# collect statistics of SAM flags
214				$flag = $read->flag;
215				unless (exists $data{flag}{$flag}){
216				$data{flag}{$flag} = 0;
217				}
218				$data{flag}{$flag}++;
219				if ($verbose == 1) {print STDERR "\n";}
220				} # end while
221
222				rename ($tmp_bam_pos, $bamname_pos);
223				rename ($tmp_bam_neg, $bamname_neg);
224				push (@processed_files, ($bamname_pos,$bamname_neg));
225				push (@processed_files, ($data{count}{pos},$data{count}{neg}));
226				if ($want_bed){
227				push (@processed_files, ($bedname_pos,$bedname_neg))
228				}
229
230				# error checks
231				unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) {
232				printf "ERROR: paired-end + single-end alignments != total alignment count\n";
233				print Dumper(\%data);
234				croak $!;
235				}
236				unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) {
237				printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos};
238				printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg};
239				printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg});
240				printf STDERR "%20d cur_count (should be)\n",$data{count}{cur};
241				printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n";
242				print Dumper(\%data);
243				croak $!;
244				}
245				foreach (keys %{$data{flag}}){
246				$data{count}{flag} += $data{flag}{$_};
247				}
248				unless ($data{count}{flag} == $data{count}{cur}){
249				printf STDERR "%20d alignments considered\n",$data{count}{cur};
250				printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag};
251				printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n";
252				print Dumper(\%data);
253				croak $!;
254				}
255
256				# logging output
257				printf LOG "# bam_split.pl log for file \'$bamfile\'\n";
258				printf LOG "#-----------------------------------------------------------------\n";
259				printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total};
260				printf LOG "%20d unique-mappers (%6.2f%% of total)\n",
261				$data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ;
262				printf LOG "%20d multi-mappers (%6.2f%% of total)\n",
263				$data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total});
264				printf LOG "%20d multi-mappers skipped\n", $data{count}{skip};
265				printf LOG "%20d alignments considered\n", $data{count}{cur};
266				printf LOG "%20d paired-end\n", $data{count}{pe_alis};
267				printf LOG "%20s single-end\n", $data{count}{se_alis};
268				printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n",
269				$data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur});
270				printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n",
271				$data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur});
272				printf LOG "#-----------------------------------------------------------------\n";
273				printf LOG "Dumper output:\n". Dumper(\%data);
274				close(LOG);
275				return @processed_files;
276				}
277
278				sub uniquify_bam {
279				my ($bamfile,$dest,$log) = @_;
280				my ($bam, $bn,$path,$ext,$read,$header);
281				my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult);
282				my ($count_all,$count_uniq,$count_mult) = (0)x3;
283				my @processed_files = ();
284				my $this_function = (caller(0))[3];
285
286				croak "ERROR [$this_function] Cannot find $bamfile\n"
287				unless (-e $bamfile);
288				croak "ERROR [$this_function] $dest does not exist\n"
289				unless (-d $dest);
290
291				($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
292
293				(undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0);
294				(undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0);
295
296				$bam = Bio::DB::Bam->open($bamfile, "r");
297				$fn_uniq = file($dest,$bn.".uniq".$ext);
298				$fn_mult = file($dest,$bn.".mult".$ext);
299				$header = $bam->header; # TODO: modify header, leave traces ...
300
301				$bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w')
302				or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
303				$bam_mult = Bio::DB::Bam->open($tmp_mult,'w')
304				or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
305				$bam_uniq->header_write($header);
306				$bam_mult->header_write($header);
307
308				while ($read = $bam->read1() ) {
309				$count_all++;
310				if ($read->aux_get("NH") == 1){ # uniquely mapped reads
311				$bam_uniq->write1($read);
312				$count_uniq++;
313				}
314				else { # multiply mapped reads
315				$bam_mult->write1($read);
316				$count_mult++;
317				}
318				}
319
320				croak "ERROR [$this_function] Read counts don't match\n"
321				unless ($count_uniq + $count_mult == $count_all);
322
323				rename ($tmp_uniq, $fn_uniq);
324				rename ($tmp_mult, $fn_mult);
325				push (@processed_files, ($fn_uniq,$fn_mult));
326
327				if (defined $log){
328				my $lf = file($dest,$log);
329				open(LOG, ">>", $lf) or croak $!;
330				printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n",
331				$count_all,$count_uniq,$count_mult;
332				close(LOG);
333				}
334				}
335
336
337				1;
338				__END__
339
340
341				=head1 NAME
342
343				Bio::ViennaNGS::Bam - High-level access to BAM files
344
345				=head1 SYNOPSIS
346
347				use Bio::ViennaNGS::Bam;
348
349				# split a single-end or paired-end BAM file by strands
350				@result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile);
351
352				# extract unique and multi mappers from a BAM file
353				@result = uniquify_bam($bam_in,$outdir,$logfile);
354
355				=head1 DESCRIPTION
356
357				Bio::ViennaNGS::BAM provides high-level access to BAM file. Building
358				on L<Bio::DB::Sam>, it provides code to extract specific portions from
359				BAM files. It comes with routines for splitting BAM files by strand
360				(which is often required for visualization of NGS data) and extracting
361				uniquely and multiply aligned reads from BAM files.
362
363				=head2 ROUTINES
364
365				=over
366
367				=item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log)
368
369				Splits BAM file $bam according to [+] and [-] strand. C<$reverse>,
370				C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1,
371				triggering forced reversion of strand mapping (due to RNA-seq protocol
372				constraints), filtering of unique mappers (identified via NH:i:1 SAM
373				argument), and forced output of a BED file corresponding to
374				strand-specific mapping, respectively. C<$log> holds name and path of
375				the log file.
376
377				Strand-splitting is done in a way that in paired-end alignments, FIRST
378				and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE
379				reads determine the strand, while SECOND_MATE reads are assigned the
380				opposite strand I<per definitionem>. This also holds if the reads are
381				not mapped in proper pairs and even if there is no mapping partner at
382				all.
383
384				Sometimes the library preparation protocol causes inversion of the
385				read assignment (with respect to the underlying annotation). In those
386				cases, the natural mapping of the reads can be obtained by the
387				C<$reverse> flag.
388
389				This routine returns an array whose fist two elements are the file
390				names of the newly generate BAM files with reads mapped to the
391				positive, and negative strand, respectively. Elements three and four
392				are the number of fragments mapped to the positive and negative
393				strand. If the C<$want_bed> option was given elements fiveand six are
394				the file names of the output BED files for positive and negative
395				strand, respectively.
396
397				NOTE: Filtering of unique mappers is only safe for single-end
398				experiments; In paired-end experiments, read and mate are treated
399				separately, thus allowing for scenarios where eg. one read is a
400				multi-mapper, whereas its associate mate is a unique mapper, resulting
401				in an ambiguous alignment of the entire fragment.
402
403				=item uniquify_bam($bam,$dest,$log)
404
405				Extract I<uniquely> and I<multiply> aligned reads from BAM file
406				C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for
407				unique and multi mappers are created in the output folder C<$dest>,
408				which are named B<basename.uniq.bam> and B<basename.mult.bam>,
409				respectively. If defined, a logfile named C<$log> is created in the
410				output folder.
411
412				This routine returns an array holding file names of the newly created
413				BAM files for unique and multi mappers, respectively.
414
415				NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to
416				decorate unique and multi mappers. As such, this routine will not work
417				unless your BAM file has these attributes.
418
419				=head1 DEPENDENCIES
420
421				=over
422
423				=item L<Bio::Perl> >= 1.00690001
424
425				=item L<BIO::DB::Sam> >= 1.37
426
427				=item L<File::Basename>
428
429				=item L<File::Temp>
430
431				=item L<Path::Class>
432
433				=item L<Carp>
434
435				=back
436
437				=head1 AUTHORS
438
439				=over
440
441				=item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
442
443				=back
444
445				=head1 COPYRIGHT AND LICENSE
446
447				Copyright (C) 2015 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
448
449				This library is free software; you can redistribute it and/or modify
450				it under the same terms as Perl itself, either Perl version 5.10.0 or,
451				at your option, any later version of Perl 5 you may have available.
452
453				This software is distributed in the hope that it will be useful, but
454				WITHOUT ANY WARRANTY; without even the implied warranty of
455				MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
456
457				=cut