File Coverage

lib/App/SimulateReads/Command/Digest.pm
Criterion Covered Total %
statement 27 141 19.1
branch 0 58 0.0
condition 0 12 0.0
subroutine 9 15 60.0
pod 3 3 100.0
total 39 229 17.0


line stmt bran cond sub pod time code
1             package App::SimulateReads::Command::Digest;
2             # ABSTRACT: digest command class. Simulate single-end and paired-end reads.
3              
4 1     1   1837 use App::SimulateReads::Base 'class';
  1         3  
  1         8  
5 1     1   8 use App::SimulateReads::Quality::Handle;
  1         2  
  1         41  
6 1     1   382 use App::SimulateReads::Fastq::SingleEnd;
  1         410  
  1         52  
7 1     1   376 use App::SimulateReads::Fastq::PairedEnd;
  1         401  
  1         41  
8 1     1   323 use App::SimulateReads::Simulator;
  1         449  
  1         45  
9 1     1   8 use Path::Class 'file';
  1         2  
  1         63  
10 1     1   5 use File::Path 'make_path';
  1         2  
  1         43  
11 1     1   5 use Pod::Usage;
  1         2  
  1         146  
12              
13             extends 'App::SimulateReads::CLI::Command';
14              
15             our $VERSION = '0.05'; # VERSION
16              
17             use constant {
18 1         1602 COUNT_LOOPS_BY_OPT => ['coverage', 'number-of-reads'],
19             STRAND_BIAS_OPT => ['random', 'plus', 'minus'],
20             SEQID_WEIGHT_OPT => ['length', 'same', 'file'],
21             SEQUENCING_TYPE_OPT => ['single-end', 'paired-end']
22 1     1   6 };
  1         3  
23              
24             override 'opt_spec' => sub {
25             super,
26             'prefix|p=s',
27             'verbose|v',
28             'output-dir|o=s',
29             'jobs|j=i',
30             'gzip|z!',
31             'coverage|c=f',
32             'read-size|r=i',
33             'fragment-mean|m=i',
34             'fragment-stdd|d=i',
35             'sequencing-error|e=f',
36             'sequencing-type|t=s',
37             'quality-profile|q=s',
38             'strand-bias|b=s',
39             'seqid-weight|w=s',
40             'number-of-reads|n=i',
41             'weight-file|f=s'
42             };
43              
44             sub _default_opt {
45 0     0     'verbose' => 0,
46             'prefix' => 'out',
47             'output-dir' => '.',
48             'jobs' => 1,
49             'gzip' => 1,
50             'count-loops-by' => 'coverage',
51             'coverage' => 1,
52             'strand-bias' => 'random',
53             'seqid-weight' => 'length',
54             'sequencing-type' => 'paired-end',
55             'fragment-mean' => 300,
56             'fragment-stdd' => 50,
57             'sequencing-error' => 0.005,
58             'read-size' => 101,
59             'quality-profile' => 'hiseq'
60             }
61              
62             sub _log_msg_opt {
63 0     0     my ($self, $opts) = @_;
64 0           while (my ($key, $value) = each %$opts) {
65 0 0         next if ref($value) =~ /Fastq/;
66 0 0         $value = "not defined" if not defined $value;
67 0           $key =~ s/_/ /g;
68 0           log_msg " => $key $value";
69             }
70             }
71              
72             sub _quality_profile_report {
73 0     0     my $quality = App::SimulateReads::Quality::Handle->new;
74 0           return $quality->make_report;
75             }
76              
77             sub validate_args {
78 0     0 1   my ($self, $args) = @_;
79 0           my $fasta_file = shift @$args;
80              
81             # Mandatory fasta file
82 0 0         if (not defined $fasta_file) {
83 0           die "Missing fasta file\n";
84             }
85              
86             # Is it really a file?
87 0 0         if (not -f $fasta_file) {
88 0           die "<$fasta_file> is not a file. Please, give me a valid fasta file\n";
89             }
90              
91             # Check the file extension: fasta, fa, fna, ffn followed, or not, by .gz
92 0 0         if ($fasta_file !~ /.+\.(fasta|fa|fna|ffn)(\.gz)?$/) {
93 0           die "<$fasta_file> does not seem to be a fasta file. Please check the file extension\n";
94             }
95              
96 0 0         die "Too many arguments: '@$args'\n" if @$args;
97             }
98              
99             sub validate_opts {
100 0     0 1   my ($self, $opts) = @_;
101 0           my %default_opt = $self->_default_opt;
102 0           $self->fill_opts($opts, \%default_opt);
103              
104             # Possible alternatives
105 0           my %STRAND_BIAS = map { $_ => 1 } @{ &STRAND_BIAS_OPT };
  0            
  0            
106 0           my %SEQID_WEIGHT = map { $_ => 1 } @{ &SEQID_WEIGHT_OPT };
  0            
  0            
107 0           my %SEQUENCING_TYPE = map { $_ => 1 } @{ &SEQUENCING_TYPE_OPT };
  0            
  0            
108 0           my %QUALITY_PROFILE = %{ $self->_quality_profile_report };
  0            
109              
110             # prefix
111 0 0         if ($opts->{prefix} =~ /([\/\\])/) {
112 0           die "Invalid character in 'prefix' option: $opts->{prefix} => '$1'\n";
113             }
114              
115             # jobs > 0
116 0 0         if ($opts->{jobs} <= 0) {
117 0           die "Option 'jobs' requires an integer greater than zero, not $opts->{jobs}\n";
118             }
119              
120             # 0 <= sequencing_error <= 1
121 0 0 0       if (0 > $opts->{'sequencing-error'} || $opts->{'sequencing-error'} > 1) {
122 0           die "Option 'sequencing-error' requires a value between zero and one, not $opts->{'sequencing-error'}\n";
123             }
124              
125             # quality_profile
126 0 0 0       if ((not exists $QUALITY_PROFILE{$opts->{'quality-profile'}}) && ($opts->{'quality-profile'} ne 'poisson')) {
127 0           my $opt = join ', ' => keys %QUALITY_PROFILE;
128 0           die "Option 'quality-profile' requires one of these arguments: $opt and poisson. Not $opts->{'quality-profile'}\n";
129             }
130              
131             # 0 < read-size <= 101
132 0 0         if (0 > $opts->{'read-size'}) {
133 0           die "Option 'read-size' requires an integer greater than zero, not $opts->{'read-size'}\n";
134             }
135              
136             # read-size if quality-profile is not poisson, test for available sizes
137 0           my $quality_entry = $QUALITY_PROFILE{$opts->{'quality-profile'}};
138 0           my %sizes = map { $_->{size} => 1 } @$quality_entry;
  0            
139 0 0 0       if ((not $sizes{$opts->{'read-size'}}) && ($opts->{'quality-profile'} ne 'poisson')) {
140 0           my $opt = join ', ' => keys %sizes;
141 0           die "Option 'read-size' requires one of these arguments for $opts->{'quality-profile'}: $opt. Not $opts->{'read-size'}\n";
142             }
143              
144             # strand_bias (STRAND_BIAS_OPT)
145 0 0         if (not exists $STRAND_BIAS{$opts->{'strand-bias'}}) {
146 0           my $opt = join ', ' => keys %STRAND_BIAS;
147 0           die "Option 'strand-bias' requires one of these arguments: $opt. Not $opts->{'strand-bias'}\n";
148             }
149              
150             # sequencing_type (SEQUENCING_TYPE_OPT)
151 0 0         if (not exists $SEQUENCING_TYPE{$opts->{'sequencing-type'}}) {
152 0           my $opt = join ', ' => keys %SEQUENCING_TYPE;
153 0           die "Option 'sequencing-type' requires one of these arguments: $opt not $opts->{'sequencing-type'}\n";
154             }
155              
156             ## Dependently validated arguments
157             # fragment_mean and fragment_stdd
158 0 0         if ($opts->{'sequencing-type'} eq 'paired-end') {
159             # fragment_mean > 0
160 0 0         if ($opts->{'fragment-mean'} <= 0) {
161 0           die "Option 'fragment-mean' requires an integer greater than zero, not $opts->{'fragment-mean'}\n";
162             }
163              
164             # fragment_stdd > 0
165 0 0         if ($opts->{'fragment-stdd'} < 0) {
166 0           die "Option 'fragment-stdd' requires an integer greater or equal to zero, not $opts->{'fragment-stdd'}\n";
167             }
168              
169             # (fragment_mean - fragment_stdd) >= read_size
170 0 0         if (($opts->{'fragment-mean'} - $opts->{'fragment-stdd'}) < $opts->{'read-size'}) {
171             die "Option 'fragment-mean' minus 'fragment-stdd' requires a value greater or equal read-size, not " .
172 0           ($opts->{'fragment-mean'} - $opts->{'fragment-stdd'}) . "\n";
173             }
174             }
175              
176             # count-loops-by (COUNT_LOOPS_BY_OPT). The default value is counting by coverage
177             # Or calculate number of reads by coverage, or the user pass the number-of-reads.
178             # number-of-reads option overrides coverage
179 0 0         if (exists $opts->{'number-of-reads'}) {
180             # number_of_reads > 0
181 0 0         if ($opts->{'number-of-reads'} <= 0) {
182 0           die "Option 'number-of-reads' requires a value greater than zero, not $opts->{'number-of-reads'}\n";
183             }
184              
185             # sequencing_type eq paired-end requires at least 2 reads
186 0 0 0       if ($opts->{'number-of-reads'} < 2 && $opts->{'sequencing-type'} eq 'paired-end') {
187 0           die "Option 'number-of-reads' requires a value greater or equal to 2 for paired-end reads, not $opts->{'number-of-reads'}\n";
188             }
189             }
190              
191             # coverage > 0
192 0 0         if ($opts->{coverage} <= 0) {
193 0           die "Option 'coverage' requires a value greater than zero, not $opts->{coverage}\n";
194             }
195              
196             # seqid-weight (SEQID_WEIGHT_OPT)
197 0 0         if (not exists $SEQID_WEIGHT{$opts->{'seqid-weight'}}) {
198 0           my $opt = join ', ' => keys %SEQID_WEIGHT;
199 0           die "Option 'seqid-weight' requires one of these arguments: $opt not $opts->{'seqid_weight'}\n";
200             }
201              
202             # seqid-weight eq 'file' requires a weight-file
203 0 0         if ($opts->{'seqid-weight'} eq 'file') {
204 0 0         if (not defined $opts->{'weight-file'}) {
205 0           die "Option 'seqid-weight=file' requires the argument 'weight-file' with a tab-separated values file\n";
206             }
207              
208             # It is defined, but the file exists?
209 0 0         if (not -f $opts->{'weight-file'}) {
210 0           die "Option 'weight-file' requires a valid file, not $opts->{'weight-file'}\n";
211             }
212             }
213             }
214              
215             sub execute {
216 0     0 1   my ($self, $opts, $args) = @_;
217 0           my $fasta_file = shift @$args;
218              
219 0           my %default_opt = $self->_default_opt;
220 0           $self->fill_opts($opts, \%default_opt);
221              
222             # Set if user wants a verbose log
223 0           $LOG_VERBOSE = $opts->{verbose};
224              
225             # Set default count-loop-by behavior
226 0 0         $opts->{'count-loops-by'} = 'number-of-reads' if exists $opts->{'number-of-reads'};
227              
228             # Create output directory if it not exist
229 0           make_path($opts->{'output-dir'}, {error => \my $err_list});
230 0           my $err_dir;
231 0 0         if (@$err_list) {
232 0           for (@$err_list) {
233 0           my ($dir, $message) = %$_;
234 0           $err_dir .= "Problem creating '$dir': $message\n";
235             }
236 0           die "$err_dir";
237             }
238              
239             # Concatenate output-dir to prefix
240 0           my $prefix = file($opts->{'output-dir'}, $opts->{prefix});
241 0           $opts->{prefix} = "$prefix";
242              
243             #-------------------------------------------------------------------------------
244             # Log presentation header
245             #-------------------------------------------------------------------------------
246 0           my $time_stamp = localtime;
247 0           my $progname = $self->progname;
248 0           my $argv = $self->argv;
249 0           log_msg <<"HEADER";
250             --------------------------------------------------------
251             Date $time_stamp
252             $progname Copyright (C) 2017 Thiago L. A. Miller
253             --------------------------------------------------------
254             :: Arguments passed by the user:
255             => '@$argv'
256             HEADER
257              
258             #-------------------------------------------------------------------------------
259             # Construct the Fastq and Simulator classes
260             #-------------------------------------------------------------------------------
261             my %paired_end_param = (
262             quality_profile => $opts->{'quality-profile'},
263             sequencing_error => $opts->{'sequencing-error'},
264             read_size => $opts->{'read-size'},
265             fragment_mean => $opts->{'fragment-mean'},
266 0           fragment_stdd => $opts->{'fragment-stdd'}
267             );
268              
269             my %single_end_param = (
270             quality_profile => $opts->{'quality-profile'},
271             sequencing_error => $opts->{'sequencing-error'},
272 0           read_size => $opts->{'read-size'}
273             );
274              
275 0           my $fastq;
276 0 0         if ($opts->{'sequencing-type'} eq 'paired-end') {
277 0           log_msg ":: Creating paired-end fastq generator ...";
278 0           $self->_log_msg_opt(\%paired_end_param);
279 0           $fastq = App::SimulateReads::Fastq::PairedEnd->new(%paired_end_param);
280             } else {
281 0           log_msg ":: Creating single-end fastq generator ...";
282 0           $self->_log_msg_opt(\%single_end_param);
283 0           $fastq = App::SimulateReads::Fastq::SingleEnd->new(%single_end_param);
284             }
285              
286             my %simulator_param = (
287             fastq => $fastq,
288             fasta_file => $fasta_file,
289             prefix => $opts->{'prefix'},
290             output_gzip => $opts->{'gzip'},
291             count_loops_by => $opts->{'count-loops-by'},
292             number_of_reads => $opts->{'number-of-reads'},
293             coverage => $opts->{'coverage'},
294             jobs => $opts->{'jobs'},
295             strand_bias => $opts->{'strand-bias'},
296             seqid_weight => $opts->{'seqid-weight'},
297 0           weight_file => $opts->{'weight-file'}
298             );
299              
300 0           my $simulator;
301 0           log_msg ":: Creating simulator ...";
302 0           $self->_log_msg_opt(\%simulator_param);
303 0           $simulator = App::SimulateReads::Simulator->new(%simulator_param);
304              
305             #-------------------------------------------------------------------------------
306             # Let's simulate it!
307             #-------------------------------------------------------------------------------
308 0           log_msg ":: Running simulation ...";
309 0           $simulator->run_simulation;
310              
311 0           log_msg ":: End simulation. So long, and thanks for all the fish!";
312             }
313              
314             __END__
315              
316             =pod
317              
318             =encoding UTF-8
319              
320             =head1 NAME
321              
322             App::SimulateReads::Command::Digest - digest command class. Simulate single-end and paired-end reads.
323              
324             =head1 VERSION
325              
326             version 0.05
327              
328             =head1 SYNOPSIS
329              
330             simulate_reads digest [options] <fasta-file>
331              
332             Arguments:
333             a fasta-file
334              
335             Options:
336             -h, --help brief help message
337             -M, --man full documentation
338             -v, --verbose print log messages
339             -p, --prefix prefix output [default:"out"]
340             -o, --output-dir output directory [default:"."]
341             -j, --jobs number of jobs [default:"1"; Integer]
342             -z, --gzip compress output file
343             -c, --coverage fastq-file coverage [default:"1", Number]
344             -n, --number-of-reads directly set the number of reads
345             [default:"1", Integer]
346             -t, --sequencing-type single-end or paired-end reads
347             [default:"paired-end"]
348             -q, --quality-profile illumina sequencing system profiles
349             [default:"hiseq"]
350             -e, --sequencing-error sequencing error rate
351             [default:"0.005"; Number]
352             -r, --read-size the read size [default:"101"; Integer]
353             -m, --fragment-mean the fragment mean size for paired-end reads
354             [default:"300"; Integer]
355             -d, --fragment-stdd the fragment standard deviation size for
356             paired-end reads [default:"50"; Integer]
357             -b, --strand-bias which strand to be used: plus, minus and random
358             [default:"random"]
359             -w, --seqid-weight seqid raffle type: length, same, file
360             [default: "length"]
361             -f, --weight-file weight file when seqid-weight=file
362              
363             =head1 DESCRIPTION
364              
365             B<simulate_reads> will read the given input file and do something
366             useful with the contents thereof.
367              
368             =head1 OPTIONS
369              
370             =over 8
371              
372             =item B<--help>
373              
374             Print a brief help message and exits.
375              
376             =item B<--man>
377              
378             Prints the manual page and exits.
379              
380             =item B<--verbose>
381              
382             Prints log information to standard error
383              
384             =item B<--prefix>
385              
386             Concatenates the prefix to the output-file name.
387              
388             =item B<--output-dir>
389              
390             Creates output-file inside output-dir. If output-dir
391             does not exist, it is created recursively
392              
393             =item B<--jobs>
394              
395             Sets the number of child jobs to be created
396              
397             =item B<--gzip>
398              
399             Compress the output-file with gzip algorithm. It is
400             possible to pass --no-gzip if one wants
401             uncompressed output-file
402              
403             =item B<--read-size>
404              
405             Sets the read size. For now the unique valid value is 101
406              
407             =item B<--coverage>
408              
409             Calculates the number of reads based on the sequence
410             coverage: number_of_reads = (sequence_size * coverage) / read_size
411              
412             =item B<--number-of-reads>
413              
414             Sets directly the number of reads desired. It overrides coverage,
415             in case the two options are given
416              
417             =item B<--sequencing-type>
418              
419             Sets the sequencing type to single-end or paired-end
420              
421             =item B<--fragment-mean>
422              
423             If the sequencing-type is set to paired-end, it sets the
424             fragment mean
425              
426             =item B<--fragment-stdd>
427              
428             If the sequencing-type is set to paired-end, it sets the
429             fragment standard deviation
430              
431             =item B<--sequencing-error>
432              
433             Sets the sequencing error rate. Valid values are between zero and one
434              
435             =item B<--quality-profile>
436              
437             Sets the illumina sequencing system profile for quality. For now, the unique
438             valid values are hiseq and poisson
439              
440             =item B<--strand-bias>
441              
442             Sets which strand to use to make a read. Valid options are plus, minus and
443             random - if you want to randomly calculte the strand for each read
444              
445             =item B<--seqid-weight>
446              
447             Sets the seqid (e.g. chromossome, ensembl id) raffle behavior. Valid options are
448             length, same and file. If it is set to 'same', all seqid receives the same weight
449             when raffling. If it is set to 'length', the seqid weight is calculated based on
450             the seqid sequence length. And finally, if it is set to 'file', the user must set
451             the option --weight-file. For details, see B<--weight-file>
452              
453             =item B<--weight-file>
454              
455             If --seqid-weight is set to file, then this option becomes mandatory. A valid
456             weight file is a tab-separated values file with 2 columns. The first column is
457             for the seqid and the second column for the desired weight. Valid weights are integers
458              
459             =back
460              
461             =head1 AUTHOR
462              
463             Thiago L. A. Miller <tmiller@mochsl.org.br>
464              
465             =head1 COPYRIGHT AND LICENSE
466              
467             This software is Copyright (c) 2017 by Teaching and Research Institute from Sírio-Libanês Hospital.
468              
469             This is free software, licensed under:
470              
471             The GNU General Public License, Version 3, June 2007
472              
473             =cut