File Coverage

blib/lib/Bio/Tools/Run/Phrap.pm
Criterion Covered Total %
statement 15 40 37.5
branch 1 12 8.3
condition 0 6 0.0
subroutine 4 5 80.0
pod 1 1 100.0
total 21 64 32.8


line stmt bran cond sub pod time code
1             #
2             # Phrap wraper module
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Shawn Hoon
7             #
8             # Copyright Shawn Hoon
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Run::Phrap - a wrapper for running Phrap
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Run::Phrap;
21             # Run Phrap using an input FASTA file
22             my $factory = Bio::Tools::Run::Phrap->new( -penalty => -2, -raw => 1 );
23             my $asm_obj = $factory->run($fasta_file, $qual_file);
24             # An assembly object is returned by default
25             for my $contig ($assembly->all_contigs) {
26             ... do something ...
27             }
28              
29             # Read some sequences
30             use Bio::SeqIO;
31             my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta');
32             my @seqs;
33             while (my $seq = $sio->next_seq()) {
34             push @seqs,$seq;
35             }
36              
37             # Run Phrap using input sequence objects and returning an assembly file
38             my $asm_file = 'results.phrap';
39             $factory->out_type($asm_file);
40             $factory->run(\@seqs);
41              
42              
43             =head1 DESCRIPTION
44              
45             Wrapper module for the Phrap assembly program
46             Phrap is available at: http://www.phrap.org/
47              
48             =head1 FEEDBACK
49              
50             =head2 Mailing Lists
51              
52             User feedback is an integral part of the evolution of this and other
53             Bioperl modules. Send your comments and suggestions preferably to one
54             of the Bioperl mailing lists. Your participation is much appreciated.
55              
56             bioperl-l@bioperl.org - General discussion
57             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58              
59             =head2 Support
60              
61             Please direct usage questions or support issues to the mailing list:
62              
63             I
64              
65             rather than to the module maintainer directly. Many experienced and
66             reponsive experts will be able look at the problem and quickly
67             address it. Please include a thorough description of the problem
68             with code and data examples if at all possible.
69              
70             =head2 Reporting Bugs
71              
72             Report bugs to the Bioperl bug tracking system to help us keep track
73             the bugs and their resolution. Bug reports can be submitted via the
74             web:
75              
76             http://redmine.open-bio.org/projects/bioperl/
77              
78             =head1 AUTHOR - Shawn Hoon
79              
80             Email shawnh-at-stanford.edu
81              
82             =head1 APPENDIX
83              
84             The rest of the documentation details each of the object
85             methods. Internal methods are usually preceded with a _
86              
87             =cut
88              
89             package Bio::Tools::Run::Phrap;
90              
91 1     1   103256 use strict;
  1         3  
  1         34  
92 1     1   334 use File::Copy;
  1         2243  
  1         47  
93              
94 1     1   6 use base qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase);
  1         1  
  1         286  
95              
96             our $program_name = 'phrap';
97             our @program_params = (qw(penalty gap_init gap_ext ins_gap_ext del_gap_ext
98             matrix minmatch maxmatch max_group_size bandwidth minscore vector_bound masklevel
99             default_qual subclone_delim n_delim group_delim trim_start forcelevel
100             bypasslevel maxgap repeat_stringency node_seg node_space max_subclone_size
101             trim_penalty trim_score trim_qual confirm_length confirm_trim confirm_penalty
102             confirm_score indexwordsize));
103             our @program_switches = (qw(raw word_raw revise_greedy shatter_greedy preassemble
104             force_high retain_duplicates));
105             our %param_translation;
106             our $qual_param;
107             our $use_dash = 1;
108             our $join = ' ';
109             our $asm_format = 'phrap';
110              
111              
112             =head2 new
113              
114             Title : new
115             Usage : $factory = Bio::Tools::Run::Phrap->new(
116             -penalty => -2, # parameter option and value
117             -raw => 1 # flag (1=yes, 0=no)
118             );
119             Function: Create a new Phrap factory
120             Returns : A Bio::Tools::Run::Phrap object
121             Args : Phrap options available in this module:
122              
123             Option names & default values taken from the PHRAP manual:
124              
125             1. Scoring of pairwise alignments
126              
127             -penalty -2
128             Mismatch (substitution) penalty for SWAT comparisons.
129              
130             -gap_init penalty-2
131             Gap initiation penalty for SWAT comparisons.
132              
133             -gap_ext penalty-1
134             Gap extension penalty for SWAT comparisons.
135              
136             -ins_gap_ext gap_ext
137             Insertion gap extension penalty for SWAT comparisons (insertion in
138             subject relative to query).
139              
140             -del_gap_ext gap_ext
141             Deletion gap extension penalty for SWAT comparisons (deletion in
142             subject relative to query).
143              
144             -matrix [None]
145             Score matrix for SWAT comparisons (if present, supersedes -penalty)
146              
147             -raw *
148             Use raw rather than complexity-adjusted Smith-Waterman scores.
149              
150             2. Banded search
151              
152             -maxmatch 30
153             Maximum length of matching word. For cross_match, the default value
154             is equal to minmatch, instead of 30.
155              
156             -max_group_size 20
157             Group size (query file, forward strand words)
158              
159             -word_raw *
160             Use raw rather than complexity-adjusted word length, in testing
161             against minmatch (N.B. maxmatch always refer to raw lengths). (The
162             default is to adjust word length to reflect complexity of matching
163             sequence).
164              
165             -bandwidth 14
166             1/2 band width for banded SWAT searches (full width is 2 times
167             bandwidth + 1). Decreasing bandwidth also decreases running time at
168             the expense of sensitivity. Phrap assemblies of clones containing long
169             tandem repeats of a short repeat unit (< 30 bp) may be more accurately
170             assembled by decreasing -bandwidth; -bandwidth should be set such that
171             2 bandwidth + 1 is less than the length of a repeat unit. -bandwidth 0
172             can be used to find gap-free alignments.
173              
174             3. Filtering of matches
175              
176             -minscore 30
177             Minimum alignment score.
178              
179             -vector_bound 80
180             Number of potential vector bases at beginning of each read. Matches
181             that lie entirely within this region are assumed to represent vector
182             matches and are ignored. For cross_match, the default value is 0
183             instead of 80.
184              
185             -masklevel 80
186             (cross_match only). A match is reported only if at least (100 -
187             masklevel)% of the bases in its "domain" (the part of the query that
188             is aligned) are not contained within the domain of any higher-scoring
189             match.
190             Special cases:
191             -masklevel 0 report only the single highest scoring match for each query
192             -masklevel 100 report any match whose domain is not completely contained
193             within a higher scoring match
194             -masklevel 101 report all matches
195              
196             4. Input data interpretation
197              
198             -default_qual 15
199             Quality value to be used for each base, when no input .qual file is
200             provided. Note that a quality value of 15 corresponds to an error rate
201             of approximately 1 in 30 bases, i.e. relatively accurate sequence. If
202             you are using sequence that is substantially less accurate than this
203             and do not have phred-generated quality values you should be sure to
204             decrease the value of this parameter.
205              
206             -subclone_delim .
207             (phrap only). Subclone name delimiter: Character used to indicate end
208             of that part of the read name that corresponds to the subclone name
209              
210             -n_delim 1
211             (phrap only). Indicates which occurrence of the subclone delimiter
212             character denotes the end of the subclone name (so for example
213             -subclone_delim _ -n_delim 2
214             means that the end of the subclone name occurs at the
215             second occurrence of the character '_'). Must be the same for all
216             reads!
217              
218             -group_delim _
219             (phrap only). Group name delimiter: Character used to indicate end of
220             that part of the read name that corresponds to the group name
221             (relevant only if option -preassemble is used); this character must
222             occur before the subclone delimiter (else it has no effect, and the
223             read is not assigned to a group).
224              
225             -trim_start 0
226             (phrap only). No. of bases to be removed at beginning of each read.
227              
228             5. Assembly
229              
230             -forcelevel 0
231             (phrap only). Relaxes stringency to varying degree during final
232             contig merge pass. Allowed values are integers from 0 (most
233             stringent) to 10 (least stringent), inclusive.
234              
235             -bypasslevel 1
236             (phrap only). Controls treatment of inconsistent reads in merge.
237             Currently allowed values are 0 (no bypasses allowed; most stringent)
238             and 1 (a single conflicting read may be bypassed).
239              
240             -maxgap 30
241             (phrap only). Maximum permitted size of an unmatched region in
242             merging contigs, during first (most stringent) merging pass.
243              
244             -repeat_stringency .95
245             (phrap only). Controls stringency of match required for joins. Must
246             be less than 1 (highest stringency), and greater than 0 (lowest
247             stringency).
248              
249             -revise_greedy *
250             (phrap only). Splits initial greedy assembly into pieces at "weak
251             joins", and then tries to reattach them to give higher overall score.
252             Use of this option should correct some types of missassembly.
253              
254             -shatter_greedy *
255             (phrap only). Breaks assembly at weak joins (as with -revise_greedy)
256             but does not try to reattach pieces.
257              
258             -preassemble *
259             (phrap only). Preassemble reads within groups, prior to merging with
260             other groups. This is useful for example when the input data set
261             consists of reads from two distinct but overlapping clones, and it is
262             desired to assemble the reads from each clone separately before
263             merging in order to reduce the risk of incorrect joins due to
264             repeats. The preassemble merging pass is relatively stringent and not
265             guaranteed to merge all of the reads from a group.
266             Groups are indicated by the first part of the read name, up to the
267             character specified by -group_delim.
268              
269             -force_high *
270             (phrap only). Causes edited high-quality discrepancies to be ignored
271             during final contig merge pass. This option may be useful when it is
272             suspected that incorrect edits are causing a misassembly.
273              
274             6. Consensus sequence construction
275              
276             -node_seg 8
277             (phrap only). Minimum segment size (for purposes of traversing
278             weighted directed graph).
279              
280             -node_space 4
281             (phrap only). Spacing between nodes (in weighted directed graph).
282              
283             7. Output
284              
285             Not implemented in this Perl module.
286              
287             8. Miscellaneous
288              
289             -retain_duplicates *
290             (phrap only). Retain exact duplicate reads, rather than eliminating
291             them.
292              
293             -max_subclone_size 5000
294             (phrap only). Maximum subclone size -- for forward-reverse read pair
295             consistency checks.
296              
297             -trim_penalty -2
298             (phrap only). Penalty used for identifying degenerate sequence at
299             beginning & end of read.
300              
301             -trim_score 20
302             (phrap only). Minimum score for identifying degenerate sequence at
303             beginning & end of read.
304              
305             -trim_qual 13
306             (phrap only). Quality value used in to define the "high-quality" part
307             of a read, (the part which should overlap; this is used to adjust
308             qualities at ends of reads.
309              
310             -confirm_length 8
311             (phrap only). Minimum size of confirming segment (segment starts at
312             3d distinct nuc following discrepancy).
313              
314             -confirm_trim 1
315             (phrap only). Amount by which confirming segments are trimmed at
316             edges.
317              
318             -confirm_penalty -5
319             (phrap only). Penalty used in aligning against "confirming" reads.
320              
321             -confirm_score 30
322             (phrap only). Minimum alignment score for a read to be allowed to
323             "confirm" part of another read.
324              
325             -indexwordsize 10
326             Size of indexing (hashing) words, used in finding word matches
327             between sequences. The value of this parameter has a generally minor
328             effect on run time and memory usage.
329              
330             =cut
331              
332             sub new {
333 1     1 1 80 my ($class,@args) = @_;
334 1         9 my $self = $class->SUPER::new(@args);
335 1         22 $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation,
336             $qual_param, $use_dash, $join);
337 1 50       10 $self->program_name($program_name) if not defined $self->program_name();
338 1         5 $self->_assembly_format($asm_format);
339 1         5 return $self;
340             }
341              
342             =head2 out_type
343              
344             Title : out_type
345             Usage : $assembler->out_type('Bio::Assembly::ScaffoldI')
346             Function: Get/set the desired type of output
347             Returns : The type of results to return
348             Args : Desired type of results to return (optional):
349             'Bio::Assembly::IO' object
350             'Bio::Assembly::ScaffoldI' object (default)
351             The name of a file to save the results in
352              
353             =cut
354              
355              
356             =head2 run
357              
358             Title : run
359             Usage : $asm = $factory->run($fasta_file)
360             Function: Run Phrap
361             Returns : Assembly results (file, IO object or assembly object)
362             Args : - sequence input (FASTA file or sequence object arrayref)
363             - optional quality score input (QUAL file or quality score object
364             arrayref)
365             =cut
366              
367              
368             =head2 _run
369              
370             Title : _run
371             Usage : $factory->_run()
372             Function: Make a system call and run Phrap
373             Returns : An assembly file
374             Args : - FASTA file
375             - optional QUAL file
376              
377             =cut
378              
379             sub _run {
380 0     0     my ($self, $fasta_file, $qual_file) = @_;
381              
382             # Move quality file to proper place
383 0           my $tmp_qual_file = "$fasta_file.qual";
384 0 0 0       if ($qual_file && not -f $tmp_qual_file) {
385 0           $tmp_qual_file = "$fasta_file.qual"; # by Cap3 convention
386 0 0 0       link ($qual_file, $tmp_qual_file) or copy ($qual_file, $tmp_qual_file) or
387             $self->throw("Could not copy file '$qual_file' to '$tmp_qual_file': $!");
388             }
389              
390             # Setup needed files and filehandles
391 0           my ($output_fh, $output_file) = $self->_prepare_output_file( );
392              
393             # Get program executable
394 0           my $exe = $self->executable;
395              
396             # Get command-line options
397 0           my $options = join ' ', @{$self->_translate_params()};
  0            
398              
399             # Usage: phrap seq_file1 [seq_file2 ...] [-option value] [-option value] ...
400 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
401 0           my $str = "$exe $options $fasta_file 1> $output_file 2> $null";
402 0 0         if ($self->verbose() >= 0) {
403 0           $self->debug( "$exe command = $str\n" );
404             };
405 0           my $status = system($str);
406 0 0         $self->throw( "Phrap call ($str) crashed: $? \n") unless $status==0;
407 0           close($output_fh);
408              
409             # Result files
410 0           my $log_file = "$fasta_file.log";
411 0           my $contigs_file = "$fasta_file.contigs";
412 0           my $problems_file = "$fasta_file.problems";
413 0           my $problems_qual_file = "$fasta_file.problems.qual";
414 0           my $contigs_qual_file = "$fasta_file.contigs.qual";
415 0           my $singlets_file = "$fasta_file.singlets";
416              
417             # Remove all files except for the PHRAP file
418 0           for my $file ($log_file, $contigs_file, $problems_file, $problems_qual_file,
419             $contigs_qual_file, $singlets_file, $tmp_qual_file) {
420 0           unlink $file;
421             }
422              
423 0           return $output_file;
424             }
425              
426             1;