File Coverage

blib/lib/Bio/Tools/Run/BWA.pm
Criterion Covered Total %
statement 29 197 14.7
branch 0 88 0.0
condition 0 17 0.0
subroutine 10 23 43.4
pod 6 8 75.0
total 45 333 13.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Run::BWA
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Mark A. Jensen
7             #
8             # Copyright Mark A. Jensen
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Run::BWA - Run wrapper for the BWA short-read assembler *BETA*
17              
18             =head1 SYNOPSIS
19              
20             # create an assembly
21              
22             # run BWA commands separately
23              
24             =head1 DESCRIPTION
25              
26             This module provides a wrapper interface for Heng Li's
27             reference-directed short read assembly suite C (see
28             L for manuals and
29             downloads).
30              
31             Manipulating the alignments requires C
32             (L) and Lincoln Stein's package
33             C (L).
34              
35             There are two modes of action.
36              
37             =over
38              
39             =item * Easy assembly
40              
41             The first is a simple pipeline through the C commands, taking
42             your read data in and squirting out an assembly object of type
43             L. The pipeline is based on the one performed
44             by C:
45              
46             Action maq commands
47             ------ ------------
48             data conversion to fasta2bfa, fastq2bfq
49             maq binary formats
50              
51             map sequence reads map
52             to reference seq
53              
54             assemble, creating assemble
55             consensus
56              
57             convert map & cns mapview, cns2fq
58             files to plaintext
59             (for B:A:IO:maq)
60              
61             Command-line options can be directed to the C, C, and
62             C steps. See L below.
63              
64             =item * BWA command mode
65              
66             The second mode is direct access to C commands. To run a C
67             command, construct a run factory, specifying the desired command using
68             the C<-command> argument in the factory constructor, along with
69             options specific to that command (see L):
70              
71             $bwafac = Bio::Tools::Run::BWA->new( -command => 'fasta2bfa' );
72              
73             To execute, use the C methods. Input and output files are
74             specified in the arguments of C (see L):
75              
76             $bwafac->run_bwa( -fas => "myref.fas", -bfa => "myref.bfa" );
77              
78             =back
79              
80             =head1 OPTIONS
81              
82             C is complex, with many subprograms (commands) and command-line
83             options and file specs for each. This module attempts to provide
84             commands and options comprehensively. You can browse the choices like so:
85              
86             $bwafac = Bio::Tools::Run::BWA->new( -command => 'aln' );
87             # all maq commands
88             @all_commands = $bwafac->available_parameters('commands');
89             @all_commands = $bwafac->available_commands; # alias
90             # just for aln
91             @aln_params = $bwafac->available_parameters('params');
92             @aln_switches = $bwafac->available_parameters('switches');
93             @aln_all_options = $bwafac->available_parameters();
94              
95             Reasonably mnemonic names have been assigned to the single-letter
96             command line options. These are the names returned by
97             C, and can be used in the factory constructor
98             like typical BioPerl named parameters.
99              
100             See L for the gory details.
101              
102             =head1 FILES
103              
104             When a command requires filenames, these are provided to the C method, not
105             the constructor (C). To see the set of files required by a command, use
106             C or the alias C:
107              
108             $bwafac = Bio::Tools::Run::BWA->new( -command => 'aln' );
109             @filespec = $bwafac->filespec;
110              
111             This example returns the following array:
112              
113             fas
114             faq
115             >sai
116              
117             This indicates that the FASTA database (faq) and the FASTQ reads (faq)
118             MUST be specified, and the STDOUT of this program (SA coordinates) MAY be
119             slurped into a file specified in the C argument list:
120              
121             $bwafac->run_bwa( -fas => 'my.db.fas', -faq => 'reads.faq',
122             -sai => 'out.sai' );
123              
124             If files are not specified per the filespec, text sent to STDOUT and
125             STDERR is saved and is accessible with C<$bwafac->stdout()> and
126             C<$bwafac->stderr()>.
127              
128             =head1 FEEDBACK
129              
130             =head2 Mailing Lists
131              
132             User feedback is an integral part of the evolution of this and other
133             Bioperl modules. Send your comments and suggestions preferably to
134             the Bioperl mailing list. Your participation is much appreciated.
135              
136             bioperl-l@bioperl.org - General discussion
137             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
138              
139             =head2 Support
140              
141             Please direct usage questions or support issues to the mailing list:
142              
143             L
144              
145             rather than to the module maintainer directly. Many experienced and
146             reponsive experts will be able look at the problem and quickly
147             address it. Please include a thorough description of the problem
148             with code and data examples if at all possible.
149              
150             =head2 Reporting Bugs
151              
152             Report bugs to the Bioperl bug tracking system to help us keep track
153             of the bugs and their resolution. Bug reports can be submitted via
154             the web:
155              
156             http://redmine.open-bio.org/projects/bioperl/
157              
158             =head1 AUTHOR - Mark A. Jensen
159              
160             Email maj -at- fortinbras -dot- us
161              
162             =head1 APPENDIX
163              
164             The rest of the documentation details each of the object methods.
165             Internal methods are usually preceded with a _
166              
167             =cut
168              
169             # Let the code begin...
170              
171             package Bio::Tools::Run::BWA;
172 1     1   127311 use strict;
  1         2  
  1         24  
173              
174 1     1   3 use IPC::Run;
  1         1  
  1         65  
175             our $HAVE_IO_UNCOMPRESS;
176             our $HAVE_SAMTOOLS;
177             BEGIN {
178 1     1   122 eval "require IO::Uncompress::Gunzip; \$HAVE_IO_UNCOMPRESS = 1";
179 1         55 eval "require Bio::Tools::Run::Samtools; \$HAVE_SAMTOOLS = 1";
180             }
181              
182             # Object preamble - inherits from Bio::Root::Root
183              
184 1     1   5 use Bio::Root::Root;
  1         1  
  1         3  
185 1     1   372 use Bio::Tools::Run::BWA::Config;
  1         2  
  1         96  
186 1     1   473 use Bio::Tools::GuessSeqFormat;
  1         2699  
  1         9  
187 1     1   34 use File::Basename qw(fileparse);
  1         1  
  1         53  
188 1     1   3 use File::Copy;
  1         1  
  1         33  
189 1     1   3 use Cwd;
  1         1  
  1         44  
190              
191 1     1   3 use base qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase );
  1         1  
  1         439  
192              
193             our $program_name = 'bwa'; # name of the executable
194              
195             # Note:
196             # other globals required by Bio::Tools::Run::AssemblerBase are
197             # imported from Bio::Tools::Run::BWA::Config
198              
199             our $qual_param = 'quality_file';
200             our $use_dash = 1;
201             our $join = ' ';
202              
203             # Bio::Assembly::IO::sam now workable!
204             our $asm_format = 'sam';
205              
206             =head2 new()
207              
208             Title : new
209             Usage : my $obj = new Bio::Tools::Run::BWA();
210             Function: Builds a new Bio::Tools::Run::BWA object
211             Returns : an instance of Bio::Tools::Run::BWA
212             Args :
213              
214             =cut
215              
216             sub new {
217 0     0 1   my ($class,@args) = @_;
218 0           my $self = $class->SUPER::new(@args);
219 0           $self->parameters_changed(1);
220 0           $self->_register_program_commands( \@program_commands, \%command_prefixes );
221 0 0         unless (grep /command/, @args) {
222 0           push @args, '-command', 'run';
223             }
224 0           $self->_set_program_options(\@args, \@program_params, \@program_switches,
225             \%param_translation, $qual_param, $use_dash, $join);
226 0 0         $self->program_name($program_name) if not defined $self->program_name();
227 0           $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI
228 0           $self->_assembly_format($asm_format);
229 0           return $self;
230             }
231              
232             =head2 run
233              
234             Title : run
235             Usage : $assembly = $bwafac->run( @args );
236             Function: Run the bwa assembly pipeline.
237             Returns : Assembly results (file, IO object or Assembly object)
238             Args : - fastq file containing single-end reads
239             - fasta file containing the reference sequence
240             - [optional] fastq file containing paired-end reads
241            
242             =cut
243              
244             sub run {
245 0     0 1   my ($self, $rd1_file, $ref_file, $rd2_file) = @_;
246              
247             # Sanity checks
248 0           $self->_check_executable();
249 0 0         unless ($HAVE_SAMTOOLS) {
250 0           cluck( "Bio::Tools::Run::Samtools is not available. A .sam output alignment will be created, but must be converted to binary SAM (.bam) before it can be passed to Bio::Assembly::IO, as follows: \n\t\$ samtools view -Sb out.sam > out.bam" );
251             }
252 0 0         $rd1_file or $self->throw("Fastq reads file required at arg 1");
253 0 0         $ref_file or $self->throw("Fasta refseq file required at arg 2");
254 0           for ($rd1_file, $ref_file, $rd2_file) {
255 0 0         next unless $_;
256 0 0         if (/\.gz[^.]*$/) {
257 0 0         unless ($HAVE_IO_UNCOMPRESS) {
258 0           croak( "IO::Uncompress::Gunzip not available, can't expand '$_'" );
259             }
260 0           my ($tfh, $tf) = $self->io->tempfile;
261 0           my $z = IO::Uncompress::Gunzip->new($_);
262 0           while (<$z>) { print $tfh $_ }
  0            
263 0           close $tfh;
264 0           $_ = $tf;
265             }
266             }
267              
268 0           my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$rd1_file);
269              
270 0 0         $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 1");
271 0           $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$ref_file);
272 0 0         $guesser->guess eq 'fasta' or $self->throw("Refseq file doesn't look like fasta at arg 2");
273 0 0         if ($rd2_file) {
274 0           $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$rd2_file);
275 0 0         $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 3");
276             }
277              
278             #Assemble
279 0           my ($sam_file) = $self->_run($rd1_file, $ref_file, $rd2_file);
280              
281 0 0         if ($HAVE_SAMTOOLS) {
282 0           my ($nm,$dr,$suf) = fileparse($sam_file, ".sam");
283             # goofy kludge for samtools...
284 0           my $pwd = getcwd;
285 0           chdir($dr);
286 0           my $samt = Bio::Tools::Run::Samtools->new( -command => 'view',
287             -sam_input => 1,
288             -bam_output => 1,
289             -refseq => $ref_file);
290 0           my $bam_file = $nm.'.bam';
291 0 0         $samt->run( -bam => $nm.$suf, -out => $bam_file ) or croak( "Problem converting .sam file");
292 0           $samt = Bio::Tools::Run::Samtools->new( -command => 'sort' );
293 0 0         $samt->run( -bam => $bam_file, -pfx => $nm.'.srt' ) or croak( "Problem sorting .bam file");
294 0           move( $nm.'.srt.bam', $bam_file );
295 0           $samt = Bio::Tools::Run::Samtools->new( -command => 'index' );
296 0           $samt->run( -bam => $bam_file );
297 0           $bam_file = File::Spec->catfile($dr, $bam_file);
298 0           $sam_file = $bam_file;
299 0           chdir($pwd);
300             }
301            
302             # Export results in desired object type
303 0           my $asm = $self->_export_results($sam_file, -refdb => $ref_file, -keep_asm => 1);
304 0           return $asm;
305              
306             }
307              
308             =head2 run_bwa()
309              
310             Title : run_bwa
311             Usage : $obj->run_bwa( @file_args )
312             Function: Run a bwa command as specified during object contruction
313             Returns :
314             Args : a specification of the files to operate on:
315              
316             =cut
317              
318             sub run_bwa {
319 0     0 1   my ($self, @args) = @_;
320             # _translate_params will provide an array of command/parameters/switches
321             # -- these are set at object construction
322             # to set up the run, need to add the files to the call
323             # -- provide these as arguments to this function
324              
325 0 0         my $cmd = $self->command if $self->can('command');
326 0 0         $self->throw("No maq command specified for the object") unless $cmd;
327             # setup files necessary for this command
328 0           my $filespec = $command_files{$cmd};
329 0 0         $self->throw("No command-line file specification is defined for command '$cmd'; check Bio::Tools::Run::Maq::Config") unless $filespec;
330              
331             # parse args based on filespec
332             # require named args
333 0 0         $self->throw("Named args are required") unless !(@args % 2);
334 0           s/^-// for @args;
335 0           my %args = @args;
336             # validate
337             my @req = map {
338 0           my $s = $_;
  0            
339 0           $s =~ s/^[012]?[<>]//;
340 0           $s =~ s/[^a-zA-Z0-9_]//g;
341 0           $s
342             } grep !/[#]/, @$filespec;
343 0   0       !defined($args{$_}) && $self->throw("Required filearg '$_' not specified") for @req;
344             # set up redirects
345 0           my ($in, $out, $err);
346 0           for (@$filespec) {
347 0 0         m/^1?>(.*)/ && do {
348 0 0 0       defined($args{$1}) && ( open($out,">", $args{$1}) or $self->throw("Open for write error : $!"));
349 0           next;
350             };
351 0 0         m/^2>#?(.*)/ && do {
352 0 0 0       defined($args{$1}) && (open($err, ">", $args{$1}) or $self->throw("Open for write error : $!"));
353 0           next;
354             };
355 0 0         m/^<#?(.*)/ && do {
356 0 0 0       defined($args{$1}) && (open($in, "<", $args{$1}) or $self->throw("Open for read error : $!"));
357 0           next;
358             }
359             }
360 0           my $dum;
361 0 0         $in || ($in = \$dum);
362 0 0         $out || ($out = \$self->{'stdout'});
363 0 0         $err || ($err = \$self->{'stderr'});
364            
365             # Get program executable
366 0           my $exe = $self->executable;
367             # Get command-line options
368 0           my $options = $self->_translate_params();
369             # Get file specs sans redirects in correct order
370             my @specs = map {
371 0           my $s = $_;
  0            
372 0           $s =~ s/[^a-zA-Z0-9_]//g;
373 0           $s
374             } grep !/[<>]/, @$filespec;
375 0           my @files = @args{@specs};
376             # expand arrayrefs
377 0           my $l = $#files;
378 0           for (0..$l) {
379 0 0         splice(@files, $_, 1, @{$files[$_]}) if (ref($files[$_]) eq 'ARRAY');
  0            
380             }
381 0 0         @files = map { defined $_ ? $_ : () } @files; # squish undefs
  0            
382 0           my @ipc_args = ( $exe, @$options, @files );
383              
384 0           eval {
385 0 0         IPC::Run::run(\@ipc_args, $in, $out, $err) or
386             die ("There was a problem running $exe : $!");
387             };
388 0 0         if ($@) {
389 0           $self->throw("$exe call crashed: $@");
390             }
391              
392             # return arguments as specified on call
393 0           return @args;
394             }
395              
396             =head2 stdout()
397              
398             Title : stdout
399             Usage : $fac->stdout()
400             Function: store the output from STDOUT for the run,
401             if no file specified in run_maq()
402             Example :
403             Returns : scalar string
404             Args : on set, new value (a scalar or undef, optional)
405              
406             =cut
407              
408             sub stdout {
409 0     0 1   my $self = shift;
410            
411 0 0         return $self->{'stdout'} = shift if @_;
412 0           return $self->{'stdout'};
413             }
414              
415             =head2 stderr()
416              
417             Title : stderr
418             Usage : $fac->stderr()
419             Function: store the output from STDERR for the run,
420             if no file is specified in run_maq()
421             Example :
422             Returns : scalar string
423             Args : on set, new value (a scalar or undef, optional)
424              
425             =cut
426              
427             sub stderr {
428 0     0 1   my $self = shift;
429            
430 0 0         return $self->{'stderr'} = shift if @_;
431 0           return $self->{'stderr'};
432             }
433              
434             =head1 Bio::Tools::Run::AssemblerBase overrides
435              
436             =head2 _check_sequence_input()
437              
438             No-op.
439              
440             =cut
441              
442             sub _check_sequence_input {
443 0     0     return 1;
444             }
445              
446             =head2 _check_optional_quality_input()
447              
448             No-op.
449              
450             =cut
451              
452             sub _check_optional_quality_input {
453 0     0     return 1;
454             }
455              
456             =head2 _prepare_input_sequences
457              
458             Convert input fastq and fasta to maq format.
459              
460             =cut
461              
462             sub _prepare_input_sequences {
463 0     0     shift->throw_not_implemented;
464             }
465              
466             =head2 _collate_subcmd_args()
467              
468             Title : _collate_subcmd_args
469             Usage : $args_hash = $self->_collate_subcmd_args
470             Function: collate parameters and switches into command-specific
471             arg lists for passing to new()
472             Returns : hash of named argument lists
473             Args : [optional] composite cmd prefix (scalar string)
474             [default is 'run']
475              
476             =cut
477              
478             sub _collate_subcmd_args {
479 0     0     my $self = shift;
480 0           my $cmd = shift;
481 0           my %ret;
482             # default command is 'run'
483 0   0       $cmd ||= 'run';
484 0           my @subcmds = @{$composite_commands{$cmd}};
  0            
485 0           my %subcmds;
486 0           my $cur_options = $self->{'_options'};
487              
488             # collate
489 0           foreach my $subcmd (@subcmds) {
490             # find the composite cmd form of the argument in
491             # the current params and switches
492             # e.g., map_max_mismatches
493 0           my @params = grep /^${subcmd}_/, @{$$cur_options{'_params'}};
  0            
494 0           my @switches = grep /^${subcmd}_/, @{$$cur_options{'_switches'}};
  0            
495 0           $ret{$subcmd} = [];
496             # create an argument list suitable for passing to new() of
497             # the subcommand factory...
498 0           foreach my $opt (@params, @switches) {
499 0           my $subopt = $opt;
500 0           $subopt =~ s/^${subcmd}_//;
501 0 0         push(@{$ret{$subcmd}}, '-'.$subopt => $self->$opt) if defined $self->$opt;
  0            
502             }
503             }
504 0           return \%ret;
505             }
506              
507             =head2 _run()
508              
509             Title : _run
510             Usage : $factory->_run()
511             Function: Run a bwa assembly pipeline
512             Returns : a text-formatted sam alignment
513             Args : - single end read file in maq bfq format
514             - reference seq file in maq bfa format
515             - [optional] paired end read file in maq bfq format
516              
517             =cut
518              
519             sub _run {
520 0     0     my ($self, $rd1_file, $ref_file, $rd2_file) = @_;
521 0           my ($cmd, $filespec, @ipc_args);
522             # Get program executable
523 0           my $exe = $self->executable;
524 0   0       my $paired = $rd1_file && $rd2_file;
525            
526 0           my $tdir = $self->tempdir();
527 0           my ($saih, $saif) = $self->io->tempfile(-template => 'saiXXXX', -dir=>$tdir);
528 0 0         my ($sai2h, $sai2f) = $self->io->tempfile(-template => 'saiXXXX', -dir=>$tdir)
529             if $paired;
530 0           my ($samh, $samf) = $self->io->tempfile(-template => 'saiXXXX', -dir=>$tdir);
531 0           $_->close for ($saih, $samh);
532 0 0         $sai2h->close if $paired;
533              
534 0           my $subcmd_args = $self->_collate_subcmd_args();
535             # index the fasta file (bwa's, not samtools', index...)
536             my $bwa = Bio::Tools::Run::BWA->new(
537             -command => 'index',
538 0           @{$subcmd_args->{idx}}
  0            
539             );
540 0           $bwa->run_bwa( -fas => $ref_file );
541             # map reads to reference seqs
542             $bwa = Bio::Tools::Run::BWA->new(
543             -command => 'aln',
544 0           @{$subcmd_args->{aln}}
  0            
545             );
546              
547 0           $bwa->run_bwa( -fas => $ref_file, -faq => $rd1_file, -sai => $saif );
548             # do paired run if nec
549 0 0         $bwa->run_bwa( -fas => $ref_file, -faq => $rd2_file, -sai => $sai2f )
550             if $paired;
551             # assemble reads
552             $bwa = Bio::Tools::Run::BWA->new(
553             -command => ($paired ? 'sampe' : 'samse'),
554 0 0         @{$subcmd_args->{ ($paired ? 'smp' : 'sms' ) }}
  0 0          
555             );
556 0 0         if ($paired) {
557 0           $bwa->run_bwa( -fas => $ref_file, -sai1 => $saif, -faq1 => $rd1_file,
558             -sai2 => $sai2f, -faq2 => $rd2_file, -sam => $samf );
559             }
560             else {
561 0           $bwa->run_bwa( -fas => $ref_file, -sai => $saif, -faq => $rd1_file,
562             -sam => $samf );
563             }
564             # note this returns a text-sam file-- needs conversion for B:A:IO::sam...
565             # conversion done in run(), if Bio::Tools::Run::Samtools available.
566 0           return $samf;
567              
568             }
569              
570             =head2 available_parameters()
571              
572             Title : available_parameters
573             Usage : @cmds = $fac->available_commands('commands');
574             Function: Use to browse available commands, params, or switches
575             Returns : array of scalar strings
576             Args : 'commands' : all bwa commands
577             'params' : parameters for this object's command
578             'switches' : boolean switches for this object's command
579             'filespec' : the filename spec for this object's command
580             4Geeks : Overrides Bio::ParameterBaseI via
581             Bio::Tools::Run::AssemblerBase
582              
583             =cut
584              
585             sub available_parameters {
586 0     0 1   my $self = shift;
587 0           my $subset = shift;
588 0           for ($subset) { # get commands
589 0 0         !defined && do { # delegate
590 0           return $self->SUPER::available_parameters($subset);
591             };
592 0 0         m/^c/i && do {
593 0           return grep !/^run$/, @program_commands;
594             };
595 0 0         m/^f/i && do { # get file spec
596 0           return @{$command_files{$self->command}};
  0            
597             };
598 0           do { #else delegate...
599 0           return $self->SUPER::available_parameters($subset);
600             };
601             }
602             }
603              
604 0     0 0   sub available_commands { shift->available_parameters('commands') };
605              
606 0     0 0   sub filespec { shift->available_parameters('filespec') };
607            
608             1;