File Coverage

blib/lib/Bio/ViennaNGS/Bam.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				# --CPerl--
2				# Last changed Time-stamp: <2014-12-20 00:31:03 mtw>
3
4				package Bio::ViennaNGS::Bam;
5
6	1	1	1177	use 5.12.0;
	1		3
	1		38
7	1	1	4	use Exporter;
	1		1
	1		40
8	1	1	3	use version; our $VERSION = qv('0.12_07');
	1		1
	1		5
9	1	1	63	use strict;
	1		2
	1		24
10	1	1	7	use warnings;
	1		1
	1		32
11	1	1	202	use Bio::Perl 1.00690001;
	0
	0
12				use Bio::DB::Sam 1.39;
13				use Data::Dumper;
14				use File::Basename qw(fileparse);
15				use File::Temp qw(tempfile);
16				use Path::Class;
17				use Carp;
18
19				our @ISA = qw(Exporter);
20				our @EXPORT = ();
21
22				our @EXPORT_OK = qw ( split_bam uniquify_bam );
23
24				#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
25				#^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
26				#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
27
28				sub split_bam {
29				my %data = ();
30				my @NHval = ();
31				my @processed_files = ();
32				my $verbose = 0;
33				my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_;
34				my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp);
35				my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg);
36				my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg);
37				my ($seq_id,$start,$stop,$strand,$target_names,$id,$score);
38				my $this_function = (caller(0))[3];
39				my %count_entries = (
40				total => 0,
41				uniq => 0,
42				pos => 0,
43				neg => 0,
44				skip => 0,
45				cur => 0,
46				mult => 0,
47				se_alis => 0,
48				pe_alis => 0,
49				flag => 0,
50				);
51				$data{count} = \%count_entries;
52				$data{flag} = ();
53
54				croak "ERROR [$this_function] $bamfile does not exist\n"
55				unless (-e $bamfile);
56				croak "ERROR [$this_function] $dest_dir does not exist\n"
57				unless (-d $dest_dir);
58
59				open(LOG, ">", $log) or croak $!;
60
61				(undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0);
62				(undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0);
63
64				$bam = Bio::DB::Bam->open($bamfile, "r");
65				$header = $bam->header;
66				$target_names = $header->target_name;
67
68				($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
69				unless ($dest_dir =~ /\/$/){$dest_dir .= "/";}
70				$bamname_pos = $dest_dir.$bn.".pos".$ext;
71				$bamname_neg = $dest_dir.$bn.".neg".$ext;
72				$bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w')
73				or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!";
74				$bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w')
75				or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!";
76
77				if ($want_bed == 1){
78				$bedname_pos = $dest_dir.$bn.".pos.bed";
79				$bedname_neg = $dest_dir.$bn.".neg.bed";
80				open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg);
81				}
82
83				$bam_pos->header_write($header);$bam_neg->header_write($header);
84				if($reverse == 1) { # switch +/- strand mapping
85				$tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp;
86				$tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp;
87				}
88
89				while (my $read= $bam->read1() ) {
90				@NHval = ();
91				$data{count}{total}++;
92				if($verbose == 1){print STDERR $read->query->name."\t";}
93
94				# check if NH (the SAM tag used to indicate multiple mappings) is set
95				if ($read->has_tag("NH")) {
96				@NHval = $read->get_tag_values("NH");
97				$NH = $NHval[0];
98				if ($NH == 1) {
99				$data{count}{uniq}++;
100				if ($verbose == 1) {print STDERR "NH:i:1\t";}
101				}
102				else {
103				$data{count}{mult}++;
104				if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";}
105				if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper
106				$data{count}{skip}++;
107				next;
108				}
109				}
110				$data{count}{cur}++;
111				}
112				else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";}
113
114				# print Dumper ($read->query);
115				$strand = $read->strand;
116				$seq_id = $target_names->[$read->tid];
117				$start = $read->start;
118				$stop = $read->end;
119				$id = "x"; # $read->qname;
120				$score = 100;
121
122				if ( $read->get_tag_values('PAIRED') ) { # paired-end
123				if($verbose == 1) {print STDERR "pe\t";}
124				$data{count}{pe_alis}++;
125				if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted
126				if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";}
127				if ( $strand eq "1" ){
128				$bam_pos->write1($read);
129				if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
130				if ($reverse == 0){
131				$data{count}{pos}++; $eff_strand=$strand;
132				if ($want_bed){printf $bed_pos "%s\n", "+";}
133				}
134				else {
135				$data{count}{neg}++; $eff_strand=-1*$strand;
136				if ($want_bed){printf $bed_pos "%s\n", "-";}
137				}
138				}
139				elsif ($strand eq "-1") {
140				$bam_neg->write1($read);
141				if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
142				if ($reverse == 0){
143				$data{count}{neg}++; $eff_strand=$strand;
144				if ($want_bed){printf $bed_neg "%s\n", "-";}
145				}
146				else {
147				$data{count}{pos}++;$eff_strand=-1*$strand;
148				if ($want_bed){printf $bed_neg "%s\n", "+";}
149				}
150				}
151				else {croak "Strand neither + nor - ...exiting!\n";}
152				}
153				else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located
154				# on the other strand
155				if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";}
156				if ( $strand eq "1" ) {
157				$bam_neg->write1($read);
158				if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
159				if ($reverse == 0){
160				$data{count}{neg}++;$eff_strand=$strand;
161				if ($want_bed){printf $bed_neg "%s\n", "-";}
162				}
163				else {
164				$data{count}{pos}++; $eff_strand=-1*$strand;
165				if ($want_bed){printf $bed_neg "%s\n", "+";}
166				}
167				}
168				elsif ( $strand eq "-1" ) {
169				$bam_pos->write1($read);
170				if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
171				if ($reverse == 0){
172				$data{count}{pos}++;$eff_strand=$strand;
173				if ($want_bed){printf $bed_pos "%s\n", "+";}
174				}
175				else {
176				$data{count}{neg}++;$eff_strand=-1*$strand;
177				if ($want_bed){printf $bed_pos "%s\n", "-";}
178				}
179				}
180				else {croak "Strand neither + nor - ...exiting!\n";}
181				}
182				}
183				else { # single-end
184				if($verbose == 1) {print STDERR "se\t";}
185				$data{count}{se_alis}++;
186				if ( $read->strand eq "1" ){
187				$bam_pos->write1($read);
188				if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
189				if ($reverse == 0){
190				$data{count}{pos}++;$eff_strand=$strand;
191				if ($want_bed){printf $bed_pos "%s\n", "+";}
192				}
193				else {
194				$data{count}{neg}++;$eff_strand=-1*$strand;
195				if ($want_bed){printf $bed_pos "%s\n", "-";}
196				}
197				}
198				elsif ($read->strand eq "-1") {
199				$bam_neg->write1($read);
200				if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
201				if ($reverse == 0){
202				$data{count}{neg}++;$eff_strand=$strand;
203				if ($want_bed){printf $bed_neg "%s\n", "-";}
204				}
205				else {
206				$data{count}{pos}++;$eff_strand=-1*$strand;
207				if ($want_bed){printf $bed_neg "%s\n", "+";}
208				}
209				}
210				else {croak "Strand neither + nor - ...exiting!\n";}
211				}
212				if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";}
213
214				# collect statistics of SAM flags
215				$flag = $read->flag;
216				unless (exists $data{flag}{$flag}){
217				$data{flag}{$flag} = 0;
218				}
219				$data{flag}{$flag}++;
220				if ($verbose == 1) {print STDERR "\n";}
221				} # end while
222
223				rename ($tmp_bam_pos, $bamname_pos);
224				rename ($tmp_bam_neg, $bamname_neg);
225				push (@processed_files, ($bamname_pos,$bamname_neg));
226				push (@processed_files, ($data{count}{pos},$data{count}{neg}));
227				if ($want_bed){
228				push (@processed_files, ($bedname_pos,$bedname_neg))
229				}
230
231				# error checks
232				unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) {
233				printf "ERROR: paired-end + single-end alignments != total alignment count\n";
234				print Dumper(\%data);
235				croak $!;
236				}
237				unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) {
238				printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos};
239				printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg};
240				printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg});
241				printf STDERR "%20d cur_count (should be)\n",$data{count}{cur};
242				printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n";
243				print Dumper(\%data);
244				croak $!;
245				}
246				foreach (keys %{$data{flag}}){
247				$data{count}{flag} += $data{flag}{$_};
248				}
249				unless ($data{count}{flag} == $data{count}{cur}){
250				printf STDERR "%20d alignments considered\n",$data{count}{cur};
251				printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag};
252				printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n";
253				print Dumper(\%data);
254				croak $!;
255				}
256
257				# logging output
258				printf LOG "# bam_split.pl log for file \'$bamfile\'\n";
259				printf LOG "#-----------------------------------------------------------------\n";
260				printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total};
261				printf LOG "%20d unique-mappers (%6.2f%% of total)\n",
262				$data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ;
263				printf LOG "%20d multi-mappers (%6.2f%% of total)\n",
264				$data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total});
265				printf LOG "%20d multi-mappers skipped\n", $data{count}{skip};
266				printf LOG "%20d alignments considered\n", $data{count}{cur};
267				printf LOG "%20d paired-end\n", $data{count}{pe_alis};
268				printf LOG "%20s single-end\n", $data{count}{se_alis};
269				printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n",
270				$data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur});
271				printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n",
272				$data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur});
273				printf LOG "#-----------------------------------------------------------------\n";
274				printf LOG "Dumper output:\n". Dumper(\%data);
275				close(LOG);
276				return @processed_files;
277				}
278
279				sub uniquify_bam {
280				my ($bamfile,$dest,$log) = @_;
281				my ($bam, $bn,$path,$ext,$read,$header);
282				my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult);
283				my ($count_all,$count_uniq,$count_mult) = (0)x3;
284				my @processed_files = ();
285				my $this_function = (caller(0))[3];
286
287				croak "ERROR [$this_function] Cannot find $bamfile\n"
288				unless (-e $bamfile);
289				croak "ERROR [$this_function] $dest does not exist\n"
290				unless (-d $dest);
291
292				($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
293
294				(undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0);
295				(undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0);
296
297				$bam = Bio::DB::Bam->open($bamfile, "r");
298				$fn_uniq = file($dest,$bn.".uniq".$ext);
299				$fn_mult = file($dest,$bn.".mult".$ext);
300				$header = $bam->header; # TODO: modify header, leave traces ...
301
302				$bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w')
303				or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
304				$bam_mult = Bio::DB::Bam->open($tmp_mult,'w')
305				or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
306				$bam_uniq->header_write($header);
307				$bam_mult->header_write($header);
308
309				while ($read = $bam->read1() ) {
310				$count_all++;
311				if ($read->aux_get("NH") == 1){ # uniquely mapped reads
312				$bam_uniq->write1($read);
313				$count_uniq++;
314				}
315				else { # multiply mapped reads
316				$bam_mult->write1($read);
317				$count_mult++;
318				}
319				}
320
321				croak "ERROR [$this_function] Read counts don't match\n"
322				unless ($count_uniq + $count_mult == $count_all);
323
324				rename ($tmp_uniq, $fn_uniq);
325				rename ($tmp_mult, $fn_mult);
326				push (@processed_files, ($fn_uniq,$fn_mult));
327
328				if (defined $log){
329				my $lf = file($dest,$log);
330				open(LOG, ">>", $lf) or croak $!;
331				printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n",
332				$count_all,$count_uniq,$count_mult;
333				close(LOG);
334				}
335				}
336
337
338				1;
339				__END__
340
341
342				=head1 NAME
343
344				Bio::ViennaNGS::Bam - High-level access to BAM files
345
346				=head1 SYNOPSIS
347
348				use Bio::ViennaNGS::Bam;
349
350				# split a single-end or paired-end BAM file by strands
351				@result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile);
352
353				# extract unique and multi mappers from a BAM file
354				@result = uniquify_bam($bam_in,$outdir,$logfile);
355
356				=head1 DESCRIPTION
357
358				Bio::ViennaNGS::BAM provides high-level access to BAM file. Building
359				on L<Bio::DB::Sam>, it provides code to extract specific portions from
360				BAM files. It comes with routines for splitting BAM files by strand
361				(which is often required for visualization of NGS data) and extracting
362				uniquely and multiply aligned reads from BAM files.
363
364				=head2 ROUTINES
365
366				=over
367
368				=item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log)
369
370				Splits BAM file $bam according to [+] and [-] strand. C<$reverse>,
371				C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1,
372				triggering forced reversion of strand mapping (due to RNA-seq protocol
373				constraints), filtering of unique mappers (identified via NH:i:1 SAM
374				argument), and forced output of a BED file corresponding to
375				strand-specific mapping, respectively. C<$log> holds name and path of
376				the log file.
377
378				Strand-splitting is done in a way that in paired-end alignments, FIRST
379				and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE
380				reads determine the strand, while SECOND_MATE reads are assigned the
381				opposite strand I<per definitionem>. This also holds if the reads are
382				not mapped in proper pairs and even if there is no mapping partner at
383				all.
384
385				Sometimes the library preparation protocol causes inversion of the
386				read assignment (with respect to the underlying annotation). In those
387				cases, the natural mapping of the reads can be obtained by the
388				C<$reverse> flag.
389
390				This routine returns an array whose fist two elements are the file
391				names of the newly generate BAM files with reads mapped to the
392				positive, and negative strand, respectively. Elements three and four
393				are the number of fragments mapped to the positive and negative
394				strand. If the C<$want_bed> option was given elements fiveand six are
395				the file names of the output BED files for positive and negative
396				strand, respectively.
397
398				NOTE: Filtering of unique mappers is only safe for single-end
399				experiments; In paired-end experiments, read and mate are treated
400				separately, thus allowing for scenarios where eg. one read is a
401				multi-mapper, whereas its associate mate is a unique mapper, resulting
402				in an ambiguous alignment of the entire fragment.
403
404				=item uniquify_bam($bam,$dest,$log)
405
406				Extract I<uniquely> and I<multiply> aligned reads from BAM file
407				C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for
408				unique and multi mappers are created in the output folder C<$dest>,
409				which are named B<basename.uniq.bam> and B<basename.mult.bam>,
410				respectively. If defined, a logfile named C<$log> is created in the
411				output folder.
412
413				This routine returns an array holding file names of the newly created
414				BAM files for unique and multi mappers, respectively.
415
416				NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to
417				decorate unique and multi mappers. As such, this routine will not work
418				unless your BAM file has these attributes.
419
420				=head1 DEPENDENCIES
421
422				=over 6
423
424				=item L<Bio::Perl> >= 1.00690001
425
426				=item L<BIO::DB::Sam> >= 1.39
427
428				=item L<File::Basename>
429
430				=item L<File::Temp>
431
432				=item L<Path::Class>
433
434				=item L<Carp>
435
436				=back
437
438				=head1 AUTHORS
439
440				=over
441
442				=item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
443
444				=back
445
446				=head1 COPYRIGHT AND LICENSE
447
448				Copyright (C) 2014 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
449
450				This library is free software; you can redistribute it and/or modify
451				it under the same terms as Perl itself, either Perl version 5.12.4 or,
452				at your option, any later version of Perl 5 you may have available.
453
454				This software is distributed in the hope that it will be useful, but
455				WITHOUT ANY WARRANTY; without even the implied warranty of
456				MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
457
458				=cut