File Coverage

blib/lib/Bio/Tools/Run/Newbler.pm
Criterion Covered Total %
statement 37 86 43.0
branch 5 22 22.7
condition 0 6 0.0
subroutine 9 10 90.0
pod 1 1 100.0
total 52 125 41.6


line stmt bran cond sub pod time code
1             # BioPerl module for Bio::Tools::Run::Newbler
2             #
3             # Copyright Florent E Angly
4             #
5             # You may distribute this module under the same terms as perl itself
6             #
7             # POD documentation - main docs before the code
8              
9             =head1 NAME
10              
11             Bio::Tools::Run::Newbler - Wrapper for local execution of Newbler
12              
13             =head1 SYNOPSIS
14              
15             use Bio::Tools::Run::Newbler;
16             # Run Minmo using an input FASTA file
17             my $factory = Bio::Tools::Run::Newbler->new( -minimum_overlap_length => 35 );
18             my $asm_obj = $factory->run($fasta_file, $qual_file);
19             # An assembly object is returned by default
20             for my $contig ($assembly->all_contigs) {
21             ... do something ...
22             }
23              
24             # Read some sequences
25             use Bio::SeqIO;
26             my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta');
27             my @seqs;
28             while (my $seq = $sio->next_seq()) {
29             push @seqs,$seq;
30             }
31              
32             # Run Newbler using input sequence objects and returning an assembly file
33             my $asm_file = 'results.ace';
34             $factory->out_type($asm_file);
35             $factory->run(\@seqs);
36              
37             =head1 DESCRIPTION
38              
39             Wrapper module for the local execution of the proprietary DNA assembly
40             program GS De Novo Assembler (Newbler) from Roche/454 v2.0.00.20:
41             http://www.454.com/products-solutions/analysis-tools/gs-de-novo-assembler.asp
42              
43             =head1 FEEDBACK
44              
45             =head2 Mailing Lists
46              
47             User feedback is an integral part of the evolution of this and other Bioperl
48             modules. Send your comments and suggestions preferably to one of the Bioperl
49             mailing lists. Your participation is much appreciated.
50              
51             bioperl-l@bioperl.org - General discussion
52             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53              
54             =head2 Support
55              
56             Please direct usage questions or support issues to the mailing list:
57              
58             I
59              
60             rather than to the module maintainer directly. Many experienced and
61             reponsive experts will be able look at the problem and quickly
62             address it. Please include a thorough description of the problem
63             with code and data examples if at all possible.
64              
65             =head2 Reporting Bugs
66              
67             Report bugs to the Bioperl bug tracking system to help us keep track the bugs
68             and their resolution. Bug reports can be submitted via the web:
69              
70             http://redmine.open-bio.org/projects/bioperl/
71              
72             =head1 AUTHOR - Florent E Angly
73              
74             Email: florent-dot-angly-at-gmail-dot-com
75              
76             =head1 APPENDIX
77              
78             The rest of the documentation details each of the object methods. Internal
79             methods are usually preceded with a _
80              
81             =cut
82              
83              
84             package Bio::Tools::Run::Newbler;
85              
86 1     1   128331 use strict;
  1         3  
  1         24  
87 1     1   5 use IPC::Run;
  1         2  
  1         27  
88 1     1   255 use File::Copy;
  1         1724  
  1         46  
89 1     1   5 use File::Path;
  1         2  
  1         36  
90 1     1   5 use File::Spec;
  1         1  
  1         14  
91 1     1   4 use File::Basename;
  1         1  
  1         43  
92              
93 1     1   5 use base qw( Bio::Root::Root Bio::Tools::Run::AssemblerBase );
  1         2  
  1         288  
94              
95             our $program_name = 'runAssembly'; # name of the executable
96             our @program_params = (qw( expected_depth mid_conf_file vector_trim vector_screen
97             aln_identity_score aln_difference_score min_ovl_identity min_ovl_length
98             seed_count seed_length seed_step out_dir ));
99             our @program_switches = (qw( large ace ace_raw ace_trimmed no_trim in_memory
100             no_auto_rescore no_duplicates ));
101             our %param_translation = (
102             'large' => 'large',
103             'ace' => 'ace',
104             'ace_raw' => 'ar',
105             'ace_trimmed' => 'at',
106             'expected_depth' => 'e',
107             'mid_conf_file' => 'mcf',
108             'no_trim' => 'notrim',
109             'vector_trim' => 'vt',
110             'vector_screen' => 'vs',
111             'aln_identity_score' => 'ais',
112             'aln_difference_score'=> 'ads',
113             'in_memory' => 'm',
114             'min_ovl_identity' => 'mi',
115             'min_ovl_length' => 'ml',
116             'no_auto_rescore' => 'nor',
117             'seed_count' => 'sc',
118             'seed_length' => 'sl',
119             'seed_step' => 'ss',
120             'no_duplicates' => 'ud',
121             'out_dir' => 'o'
122             );
123              
124             our $qual_param;
125             our $use_dash = 1;
126             our $join = ' ';
127             our $asm_format = 'ace';
128             our $asm_variant = '454';
129              
130              
131             =head2 new
132              
133             Title : new
134             Usage : $assembler->new( -min_len => 50,
135             -min_ident => 95 );
136             Function: Creates a Newbler factory
137             Returns : A Bio::Tools::Run::Newbler object
138             Args : Newbler options available in this module (from the Newbler manual):
139              
140             large Shortcut some of the computationally expensive algorithms
141             to save some time. Useful for large or complex datasets
142             (default: off).
143             ace_raw Output the full "raw" read sequence (default: off).
144             ace_trimmed Output only the "trimmed" sequences (after low quality,
145             vector and key trimming) (default: on).
146             expected_depth Expected depth of the assembly. Filters out random-chance
147             level events at bigger depths. 0 means to not use the
148             expected depth information (default: 0).
149             mid_conf_file MID configuration file for decoding the multiplex data.
150             no_trim Disable the quality and primer trimming of the input
151             sequences (default: off).
152             vector_trim Specify a vector trimming database (in FASTA format) to
153             trim the ends of input sequences.
154             vector_screen Specify a vector screening database (in FASTA format) to
155             remove contaminants, i.e. input reads that align
156             against a sequence in the database.
157             aln_identity_score Set the alignment identity score. When multiple alignments
158             are found, it is the per-overlap column identity score
159             used to sort the overlaps for use in the progressive
160             alignment (default: 2).
161             aln_difference_score Set the alignment difference score. For multiple alignments
162             this is the per-overlap difference score used to sort the
163             overlaps for use in the progressive multi-alignment
164             (default: -3).
165             in_memory Keep all sequence data in memory throughout the computation.
166             Can speed up the computation but requires more computer
167             memory (default: off).
168             min_ovl_identity / minimum_overlap_similarity
169             Minimum overlap identity, i.e. the minimum percent identity
170             of overlaps used by the assembler (default: 40).
171             min_ovl_length / minimum_overlap_length
172             Minimum overlap length, i.e. the minimum length of overlaps
173             considered by the assembler (default: 90). Warning: It
174             seems like this parameter is not respected by the program
175             in the current version
176             no_auto_rescore Do not use the quality score re-scoring algorithm (default:
177             off).
178             seed_count Set the seed count parameter, the number of seeds required
179             in a window before an extension is made (default: 1).
180             seed_length Set the seed length parameter, i.e. the number of bases
181             between seed generation locations used in the exact
182             k-mer matching part of the overlap detection (between 6
183             16) (default: 16).
184             seed_step Set the seed step parameter, i.e. the number of bases used
185             for each seed in the exact k-mer matching part of the
186             overlap detection (i.e. the "k" value) (default: 12).
187             no_duplicates Treat each read as a separate read and do not group them
188             into duplicates for assembly or consensus calling
189             (default: off).
190              
191             =cut
192              
193             sub new {
194 1     1 1 80 my ($class,@args) = @_;
195 1         10 my $self = $class->SUPER::new(@args);
196 1         27 $self->_set_program_options(\@args, \@program_params, \@program_switches,
197             \%param_translation, $qual_param, $use_dash, $join);
198 1         3 *minimum_overlap_length = \&min_ovl_length;
199 1         2 *minimum_overlap_similarity = \&min_ovl_identity;
200 1 50       4 $self->program_name($program_name) if not defined $self->program_name();
201 1         5 $self->_assembly_format($asm_format);
202 1         4 $self->_assembly_variant($asm_variant);
203 1         5 return $self;
204             }
205              
206             =head2 _check_sequence_input
207              
208             Title : _check_sequence_input
209             Usage : $assembler->_check_sequence_input($seqs)
210             Function: Check that the sequence input is arrayref of sequence objects or
211             a FASTA file, or a MIDinfo + dir, or a MIDinfo + file. If not, an
212             error is thrown.
213             Returns : 1 if the check passed
214             Args : sequence input
215              
216             =cut
217              
218             sub _check_sequence_input {
219 4     4   502 my ($self, $seqs) = @_;
220 4 50       10 if (not $seqs) {
221 0         0 $self->throw("Must supply sequences as a FASTA filename or a sequence object".
222             " (Bio::PrimarySeqI or Bio::SeqI) array reference");
223             } else {
224 4 50       8 if (ref($seqs) =~ m/ARRAY/i ) {
225 0         0 for my $seq (@$seqs) {
226 0 0 0     0 unless ($seq->isa('Bio::PrimarySeqI') || $seq->isa('Bio::SeqI')) {
227 0         0 $self->throw("Not a valid Bio::PrimarySeqI or Bio::SeqI object");
228             }
229             }
230             } else {
231             # [midinfo@]sffile|[midinfo@]projectdir|fastafile
232 4         21 my ($mid, $file_or_dir) = ($seqs =~ m/^(.+@)?(.+)$/);
233 4 50       9 if (not defined $file_or_dir) {
234 0         0 $self->throw("Input string $seqs does not seem valid.");
235             } else {
236 4 50       57 if (not -e $file_or_dir) {
237 0         0 $self->throw("Input file or directory '$file_or_dir' does not seem to exist.");
238             }
239             }
240             }
241             }
242 4         16 return 1;
243             }
244              
245             =head2 out_type
246              
247             Title : out_type
248             Usage : $factory->out_type('Bio::Assembly::ScaffoldI')
249             Function: Get/set the desired type of output
250             Returns : The type of results to return
251             Args : Desired type of results to return (optional):
252             'Bio::Assembly::IO' object
253             'Bio::Assembly::ScaffoldI' object (default)
254             The name of a file to save the results in
255              
256             =cut
257              
258              
259             =head2 run
260              
261             Title : run
262             Usage : $factory->run($fasta_file);
263             Function: Run TIGR Assembler
264             Returns : - a Bio::Assembly::ScaffoldI object, a Bio::Assembly::IO
265             object, a filename, or undef if all sequences were too small to
266             be usable
267             Returns : Assembly results (file, IO object or assembly object)
268             Args : Sequence input can be:
269             * a sequence object arrayref
270             * a FASTA file
271             * a SFF file and optional MID information. Example:
272             mid2@/home/xxx/myreads.sff
273             * the path to an run analysis directory and MID information
274             The reads must be between 50 and 2000 bp. Newbler does not support
275             for input quality files. See the Newbler manual for details.
276              
277             =cut
278              
279              
280             =head2 _run
281              
282             Title : _run
283             Usage : $factory->_run()
284             Function: Make a system call and run TIGR Assembler
285             Returns : An assembly file
286             Args : - FASTA file, SFF file and MID, or analysis dir and MID
287             - optional QUAL file
288              
289             =cut
290              
291              
292             sub _run {
293 0     0     my ($self, $fasta_file, $qual_file) = @_;
294              
295             # fasta_file: [midinfo@]sffile|[midinfo@]projectdir|fastafile
296             # qual_file: not supported by newbler
297              
298             # Specify that we want a single ACE output file containing all contigs
299 0           $self->ace(1);
300              
301             # Setup needed files and filehandles first
302 0           my ($output_fh, $output_file) = $self->_prepare_output_file( );
303              
304             # Set the output directory based on the the output file name
305 0           my $output_dir = dirname($output_file);
306 0           $self->out_dir($output_dir);
307              
308             # Set a log file
309 0           my $log_file = File::Spec->catfile($output_dir, '454Log.txt');
310              
311             # Get program executable
312 0           my $exe = $self->executable;
313              
314             # Get command-line options
315 0           my $options = $self->_translate_params();
316              
317             # Usage: runAssembly [options] (sfffile | [regionlist:]analysisDir | readfastafile)...
318             # where options is: [-o projdir] [-nrm] [-p (sfffile | [regionlist:]analysisDir)]...
319 0           my @program_args = ( $exe, @$options, $fasta_file);
320 0           my @ipc_args = ( \@program_args, '>', $log_file );
321              
322             # Print command for debugging
323 0 0         if ($self->verbose() >= 0) {
324 0           my $cmd = '';
325 0           $cmd .= join ( ' ', @program_args );
326 0           for ( my $i = 1 ; $i < scalar @ipc_args ; $i++ ) {
327 0           my $element = $ipc_args[$i];
328 0           my $ref = ref($element);
329 0           my $value;
330 0 0 0       if ( $ref && $ref eq 'SCALAR') {
331 0           $value = $$element;
332             } else {
333 0           $value = $element;
334             }
335 0           $cmd .= " $value";
336             }
337 0           $self->debug( "$exe command = $cmd\n" );
338             }
339              
340             # Execute command
341 0           eval {
342 0 0         IPC::Run::run(@ipc_args) || die("There was a problem running $exe. The ".
343             "error message is: $!.");
344             };
345 0 0         if ($@) {
346 0           $self->throw("$exe call crashed: $@");
347             }
348              
349             # Close filehandles
350 0           close($output_fh);
351              
352             # Result files
353 0           my $ace_file = File::Spec->catfile($output_dir, '454Contigs.ace');
354 0           my $aln_file = File::Spec->catfile($output_dir, '454AlignmentInfo.tsv');
355 0           my $all_cont_fasta_file = File::Spec->catfile($output_dir, '454AllContigs.fna');
356 0           my $all_cont_qual_file = File::Spec->catfile($output_dir, '454AllContigs.qual');
357 0           my $large_cont_fasta_file = File::Spec->catfile($output_dir, '454LargeContigs.fna');
358 0           my $large_cont_qual_file = File::Spec->catfile($output_dir, '454LargeContigs.qual');
359 0           my $metrics_file = File::Spec->catfile($output_dir, '454NewblerMetrics.txt');
360 0           my $status_file = File::Spec->catfile($output_dir, '454ReadStatus.txt');
361 0           my $progress_file = File::Spec->catfile($output_dir, '454NewblerProgress.txt');
362 0           my $trim_file = File::Spec->catfile($output_dir, '454TrimStatus.txt');
363 0           my $sff_dir = File::Spec->catfile($output_dir, 'sff');
364              
365             # Remove all files except for the ACE file
366 0           for my $file ($aln_file, $all_cont_fasta_file, $all_cont_qual_file,
367             $large_cont_fasta_file, $large_cont_qual_file, $metrics_file, $status_file,
368             $progress_file, $trim_file, $log_file ) {
369 0           unlink $file;
370             }
371 0           rmtree( $sff_dir );
372              
373             # Move output file to proper location/name
374 0 0         move ($ace_file, $output_file) or $self->throw("Could not move file ".
375             "'$ace_file' to '$output_file': $!");
376              
377 0           return $output_file;
378             }
379              
380             1;