File Coverage

lib/App/SimulateReads/Simulator.pm
Criterion Covered Total %
statement 143 185 77.3
branch 24 48 50.0
condition 8 18 44.4
subroutine 18 22 81.8
pod 0 2 0.0
total 193 275 70.1


line stmt bran cond sub pod time code
1             package App::SimulateReads::Simulator;
2             # ABSTRACT: Class responsible to make the simulation
3              
4 6     6   18196 use App::SimulateReads::Base 'class';
  6         17  
  6         58  
5 6     6   57 use App::SimulateReads::Fastq::SingleEnd;
  6         12  
  6         197  
6 6     6   29 use App::SimulateReads::Fastq::PairedEnd;
  6         12  
  6         173  
7 6     6   2247 use App::SimulateReads::InterlaceProcesses;
  6         2278  
  6         244  
8 6     6   2046 use File::Cat 'cat';
  6         2616  
  6         340  
9 6     6   1984 use Parallel::ForkManager;
  6         42889  
  6         17969  
10              
11             with qw/App::SimulateReads::Role::WeightedRaffle App::SimulateReads::Role::IO/;
12              
13             our $VERSION = '0.05'; # VERSION
14              
15             #-------------------------------------------------------------------------------
16             # Moose attributes
17             #-------------------------------------------------------------------------------
18             has 'jobs' => (is => 'ro', isa => 'My:IntGt0', required => 1);
19             has 'prefix' => (is => 'ro', isa => 'Str', required => 1);
20             has 'output_gzip' => (is => 'ro', isa => 'Bool', required => 1);
21             has 'fasta_file' => (is => 'ro', isa => 'My:Fasta', required => 1);
22             has 'coverage' => (is => 'ro', isa => 'My:NumGt0', required => 0);
23             has 'number_of_reads' => (is => 'ro', isa => 'My:IntGt0', required => 0);
24             has 'count_loops_by' => (is => 'ro', isa => 'My:CountLoopBy', required => 1);
25             has 'strand_bias' => (is => 'ro', isa => 'My:StrandBias', required => 1);
26             has 'seqid_weight' => (is => 'ro', isa => 'My:SeqIdWeight', required => 1);
27             has 'weight_file' => (is => 'ro', isa => 'My:File', required => 0);
28             has 'fastq' => (
29             is => 'ro',
30             isa => 'App::SimulateReads::Fastq::SingleEnd | App::SimulateReads::Fastq::PairedEnd',
31             required => 1,
32             handles => [ qw{ sprint_fastq } ]
33             );
34             has '_fasta' => (
35             is => 'ro',
36             isa => 'My:IdxFasta',
37             builder => '_build_fasta',
38             lazy_build => 1
39             );
40             has '_strand' => (
41             is => 'ro',
42             isa => 'CodeRef',
43             builder => '_build_strand',
44             lazy_build => 1
45             );
46             has '_seqid_raffle' => (
47             is => 'ro',
48             isa => 'CodeRef',
49             builder => '_build_seqid_raffle',
50             lazy_build => 1
51             );
52              
53             #=== CLASS METHOD ============================================================
54             # CLASS: Simulator
55             # METHOD: BUILD (Moose)
56             # PARAMETERS: Void
57             # RETURNS: Void
58             # DESCRIPTION: Validate optional attributes and/or attributes that depends of
59             # another attribute
60             # THROWS: If seqid_weight == 'file' and the user did not pass a weight
61             # file, throws an exception.
62             # If count_loops_by == 'coverage', then coverage must be defined,
63             # as well as 'number_of_reads', number_of_reads must be defined
64             # COMMENTS: We need to initialize lazy attributes here. If not, the child
65             # processes could independently initialize one
66             # SEE ALSO: n/a
67             #===============================================================================
68             sub BUILD {
69 20     20 0 45 my $self = shift;
70              
71             # If seqid_weight is 'file', then weight_file must be defined
72 20 50 33     620 if ($self->seqid_weight eq 'file' and not defined $self->weight_file) {
73 0         0 croak "seqid_weight=file requires a weight_file\n";
74             }
75              
76             # If count_loops_by is 'coverage', then coverage must be defined. Else if
77             # it is equal to 'number_of_reads', then number_of_reads must be defined
78 20 50 33     615 if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) {
    50 33        
79 0         0 croak "count_loops_by=coverage requires a coverage number\n";
80             } elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) {
81 0         0 croak "count_loops_by=number_of_reads requires a number_of_reads number\n";
82             }
83            
84             ## Just to ensure that the lazy attributes are built before &new returns
85             # Only seqid_weight=same is not a weighted raffle, so in this case
86             # not construct weight attribute
87 20 50       485 $self->weights if $self->seqid_weight ne 'same';
88 20         505 $self->_strand;
89 20         480 $self->_fasta;
90 20         635 $self->_seqid_raffle;
91             } ## --- end sub BUILD
92            
93             #=== CLASS METHOD ============================================================
94             # CLASS: Simulator
95             # METHOD: _build_strand (BUILDER)
96             # PARAMETERS: Void
97             # RETURNS: Ref Code
98             # DESCRIPTION: Build _strand attribute. (dynamic linkage)
99             # THROWS: If it is given a unknown option, throws an error
100             # COMMENTS: Valid strand_bias: 'plus', 'minus' and 'random'
101             # SEE ALSO: n/a
102             #===============================================================================
103             sub _build_strand {
104 20     20   40 my $self = shift;
105 20         35 return do {
106 20         520 given ($self->strand_bias) {
107 20     0   60 when ('plus') { sub {1} }
  0         0  
  0         0  
108 20     0   35 when ('minus') { sub {0} }
  0         0  
  0         0  
109 20     1710   30 when ('random') { sub { int(rand(2)) }}
  20         545  
  1710         7767  
110 0         0 default { croak "Unknown option '$_' for strand bias\n" }
  0         0  
111             }
112             };
113             } ## --- end sub _build_strand
114              
115             #=== CLASS METHOD ============================================================
116             # CLASS: Simulator
117             # METHOD: _build_fasta (BUILDER)
118             # PARAMETERS: Void
119             # RETURNS: $indexed_fasta My:IdxFasta
120             # DESCRIPTION: Build _fasta attribute
121             # THROWS: For single end read: If the read size required is greater than
122             # any genomic sequence, then throws an error. For paired end read:
123             # If fragment mean is greater than any genomic sequence, the throws
124             # an error.
125             # COMMENTS: none
126             # SEE ALSO: n/a
127             #===============================================================================
128             sub _build_fasta {
129 20     20   30 my $self = shift;
130 20         515 log_msg ":: Indexing fasta file '" . $self->fasta_file . "' ...";
131 20         840 my $indexed_fasta = $self->index_fasta($self->fasta_file);
132 20 50       60 croak "Error parsing " . $self->fasta_file . ". Maybe the file is empty\n"
133             unless %$indexed_fasta;
134              
135             # Validate genome about the read size required
136 20         40 my $err;
137 20         55 for my $id (keys %$indexed_fasta) {
138 100         150 my $index_size = $indexed_fasta->{$id}{size};
139 100         2390 given (ref $self->fastq) {
140 100         195 when ('App::SimulateReads::Fastq::SingleEnd') {
141 50         1240 my $read_size = $self->fastq->read_size;
142 50 50       160 if ($index_size < $read_size) {
143 0         0 $err .= "seqid sequence length (>$id => $index_size) lesser than required read size ($read_size)\n";
144             }
145             }
146 50         70 when ('App::SimulateReads::Fastq::PairedEnd') {
147 50         1140 my $fragment_mean = $self->fastq->fragment_mean;
148 50 50       155 if ($index_size < $fragment_mean) {
149 0         0 $err .= "seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)\n";
150             }
151             }
152 0         0 default {
153 0         0 croak "Unknown option '$_' for sequencing type\n";
154             }
155             }
156             }
157            
158 20 50       60 croak "Error parsing '" . $self->fasta_file . "':\n$err" if defined $err;
159 20         465 return $indexed_fasta;
160             } ## --- end sub _build_fasta
161              
162             #=== CLASS METHOD ============================================================
163             # CLASS: Simulator
164             # METHOD: _build_weights (BUILDER)
165             # PARAMETERS: Void
166             # RETURNS: My:Weights
167             # DESCRIPTION: Build weights. It is required by the WeightedRaffle role.
168             # It verifies seqid_weight and sets the required weights matrix
169             # THROWS: If seqid_weight == 'file', then it needs to be validated
170             # If it is given a unknown option, throws an error
171             # COMMENTS: none
172             # SEE ALSO: n/a
173             #===============================================================================
174             sub _build_weights {
175 20     20   45 my $self = shift;
176 20         80 return do {
177 20         590 given ($self->seqid_weight) {
178 20         85 when ('length') {
179 20         55 my %chr_size = map { $_, $self->_fasta->{$_}{size} } keys %{ $self->_fasta };
  100         2260  
  20         535  
180 20         105 $self->calculate_weights(\%chr_size);
181             }
182 0         0 when ('file') {
183 0         0 log_msg ":: Indexing weight file '" . $self->weight_file . "' ...";
184 0         0 my $indexed_file = $self->index_weight_file($self->weight_file);
185              
186             # Validate weight_file
187 0 0       0 croak "Error parsing '" . $self->weight_file . "': Maybe the file is empty\n"
188             unless %$indexed_file;
189              
190 0         0 my $indexed_fasta = $self->_fasta;
191 0         0 my $err;
192 0         0 for my $id (keys %$indexed_file) {
193 0 0       0 if (not exists $indexed_fasta->{$id}) {
194 0         0 $err .= "seqid '$id' not found in '" . $self->fasta_file . "'\n";
195             }
196             }
197 0 0       0 croak "Error in validating '" . $self->weight_file . "':\n" . $err
198             if defined $err;
199              
200 0         0 $self->calculate_weights($indexed_file);
201             }
202 0         0 when ('same') {
203 0         0 croak "Error: Cannot build raffle weights for 'seqid-weight=same'\n";
204             }
205 0         0 default {
206 0         0 croak "Unknown option '$_' for weighted raffle\n";
207             }
208             }
209             };
210             } ## --- end sub _build_weights
211              
212             #=== CLASS METHOD ============================================================
213             # CLASS: Simulator
214             # METHOD: _build_seqid_raffle (BUILDER)
215             # PARAMETERS: Void
216             # RETURNS: Ref Code
217             # DESCRIPTION: Build _seqid_raffle attribute. (dynamic linkage)
218             # THROWS: If it is given a unknown option, throws an error
219             # COMMENTS: seqid_weight can be: 'length', 'file' and 'same'
220             # SEE ALSO: n/a
221             #===============================================================================
222             sub _build_seqid_raffle {
223 20     20   35 my $self = shift;
224 20         35 return do {
225 20         500 given ($self->seqid_weight) {
226 20         55 when ('same') {
227 0         0 my @seqids = keys %{ $self->_fasta };
  0         0  
228 0         0 my $seqids_size = scalar @seqids;
229 0     0   0 sub { $seqids[int(rand($seqids_size))] };
  0         0  
230             }
231 20         75 when (/^(file|length)$/) {
232 20     1710   565 sub { $self->weighted_raffle };
  1710         4730  
233             }
234 0         0 default {
235 0         0 croak "Unknown option '$_' for seqid-raffle\n";
236             }
237             }
238             };
239             } ## --- end sub _build_seqid_raffle
240              
241             #=== CLASS METHOD ============================================================
242             # CLASS: Simulator
243             # METHOD: _calculate_number_of_reads (PRIVATE)
244             # PARAMETERS: Void
245             # RETURNS: $number_of_reads Int > 0
246             # DESCRIPTION: Calculates the number of reads to produce based on the coverage
247             # or the own value passed by the user
248             # THROWS: If count_loops_by is equal to 'coverage', it may accur that the
249             # fasta_file size, or the coverage asked is too low, which results in
250             # zero reads
251             # If it is given a unknown option, throws an error
252             # COMMENTS: count_loops_by can be: 'coverage' and 'number_of_reads'
253             # SEE ALSO: n/a
254             #===============================================================================
255             sub _calculate_number_of_reads {
256 8     8   33 my $self = shift;
257 8         19 my $number_of_reads = do {
258 8         294 given($self->count_loops_by) {
259 8         38 when ('coverage') {
260             # It is needed to calculate the genome size
261 8         408 my $fasta = $self->_fasta;
262 8         21 my $fasta_size = 0;
263 8         19 $fasta_size += $fasta->{$_}{size} for keys %{ $fasta };
  8         77  
264 8         283 int(($fasta_size * $self->coverage) / $self->fastq->read_size);
265             }
266 0         0 when ('number-of-reads') {
267 0         0 $self->number_of_reads;
268             }
269 0         0 default {
270 0         0 croak "Unknown option '$_' for calculating the number of reads\n";
271             }
272             }
273             };
274              
275             # In case it is paired-end read, divide the number of reads by 2 because App::SimulateReads::Fastq::PairedEnd class
276             # returns 2 reads at time
277 8         260 my $class = ref $self->fastq;
278 8 100       46 my $read_type_factor = $class eq 'App::SimulateReads::Fastq::PairedEnd' ? 2 : 1;
279 8         32 $number_of_reads = int($number_of_reads / $read_type_factor);
280              
281             # Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size
282 8 50 66     122 if ($number_of_reads <= 0 || ($class eq 'App::SimulateReads::Fastq::PairedEnd' && $number_of_reads == 1)) {
      33        
283 0         0 croak "The computed number of reads is equal to zero.\n" .
284             "It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n";
285             }
286              
287 8         33 return $number_of_reads;
288             } ## --- end sub _calculate_number_of_reads
289              
290             #=== CLASS METHOD ============================================================
291             # CLASS: Simulator
292             # METHOD: run_simulation
293             # PARAMETERS: Void
294             # RETURNS: Void
295             # DESCRIPTION: The main class method where the simulation unfolds
296             # THROWS: Try catch the fasta object passed by the user. If occurs an error,
297             # then throws an exception
298             # COMMENTS: It is not recommended to make parallelism in perl, so the workaround
299             # is to implement a parent, child system by forking the task and
300             # making part of the job in each child. In the end, it returns to
301             # parent and concatenate all temporary files generated
302             # SEE ALSO: n/a
303             #===============================================================================
304             sub run_simulation {
305 8     8 0 5423 my $self = shift;
306 8         344 my $fasta = $self->_fasta;
307              
308             # Calculate the number of reads to be generated
309 8         46 my $number_of_reads = $self->_calculate_number_of_reads;
310              
311             # Function that returns strand by strand_bias
312 8         280 my $strand = $self->_strand;
313              
314             # Function that returns seqid by seqid_weight
315 8         289 my $seqid = $self->_seqid_raffle;
316              
317             # Files to be generated
318 8         305 my %files = (
319             'App::SimulateReads::Fastq::SingleEnd' => [
320             $self->prefix . '_simulation_read.fastq'
321             ],
322             'App::SimulateReads::Fastq::PairedEnd' => [
323             $self->prefix . '_simulation_read_R1.fastq',
324             $self->prefix . '_simulation_read_R2.fastq'
325             ],
326             );
327              
328             # Is it single-end or paired-end?
329 8         268 my $fastq_class = ref $self->fastq;
330              
331             # Forks
332 8         409 my $number_of_jobs = $self->jobs;
333 8         129 my $pm = Parallel::ForkManager->new($number_of_jobs);
334            
335             # Parent child pids
336 8         4805 my $parent_pid = $$;
337 8         22 my @child_pid;
338              
339             # Temporary files tracker
340             my @tmp_files;
341              
342             # Run in parent right after creating child process
343             $pm->run_on_start(
344             sub {
345 10     10   21554 my ($pid, $files_ref) = @_;
346 10         129 push @child_pid => $pid;
347 10         114 push @tmp_files => @$files_ref;
348             }
349 8         90 );
350              
351 8 50       220 log_msg sprintf ":: Creating %d child %s ...",
352             $number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs";
353              
354 8         38 for my $tid (1..$number_of_jobs) {
355             #-------------------------------------------------------------------------------
356             # Inside parent
357             #-------------------------------------------------------------------------------
358 14         328 log_msg ":: Creating job $tid ...";
359 14         33 my @files_t = map { "$_.${parent_pid}_part$tid" } @{ $files{$fastq_class} };
  19         134  
  14         121  
360 14 100       82 my $pid = $pm->start(\@files_t) and next;
361              
362             #-------------------------------------------------------------------------------
363             # Inside child
364             #-------------------------------------------------------------------------------
365             # Intelace child/parent processes
366 4         10820 my $sig = App::SimulateReads::InterlaceProcesses->new(foreign_pid => [$parent_pid]);
367              
368             # Set child seed
369 4         62 my $seed = time + $$;
370 4         37 srand($seed);
371              
372             # Calculate the number of reads to this job and correct this local index
373             # to the global index
374 4         39 my $number_of_reads_t = int($number_of_reads/$number_of_jobs);
375 4         27 my $last_read_idx = $number_of_reads_t * $tid;
376 4         16 my $idx = $last_read_idx - $number_of_reads_t + 1;
377              
378             # If it is the last job, make it work on the leftover reads of int() truncation
379 4 100       26 $last_read_idx += $number_of_reads % $number_of_jobs
380             if $tid == $number_of_jobs;
381              
382 4         112 log_msg " => Job $tid: Working on reads from $idx to $last_read_idx";
383              
384             # Create temporary files
385 4         69 log_msg " => Job $tid: Creating temporary file: @files_t";
386 4         10 my @fhs = map { $self->my_open_w($_, $self->output_gzip) } @files_t;
  6         313  
387              
388             # Run simualtion in child
389 4   66     89 for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) {
390 1710         3719 my $id = $seqid->();
391 1710         3308 my @fastq_entry;
392             try {
393             @fastq_entry = $self->sprint_fastq("SR${parent_pid}.$id.$i $i",
394 1710     1710   121056 $id, \$fasta->{$id}{seq}, $fasta->{$id}{size}, $strand->());
395             } catch {
396 0     0   0 croak "Not defined entry for seqid '>$id' at job $tid: $_";
397             } finally {
398 1710 50   1710   34128 unless (@_) {
399 1710         4818 for my $fh_idx (0..$#fhs) {
400 2280 50       11508 $fhs[$fh_idx]->say(${$fastq_entry[$fh_idx]})
  2280         7161  
401             or croak "Cannot write to $files_t[$fh_idx]: $!\n";
402             }
403             }
404 1710         14137 };
405             }
406              
407 4         135 log_msg " => Job $tid: Writing and closing file: @files_t";
408             # Close temporary files
409 4         16 for my $fh_idx (0..$#fhs) {
410 6 50       227 $fhs[$fh_idx]->close
411             or croak "Cannot write file $files_t[$fh_idx]: $!\n";
412             }
413              
414             # Child exit
415 4         183 log_msg " => Job $tid is finished";
416 4         62 $pm->finish;
417             }
418              
419             # Back to parent
420             # Interlace parent/child(s) processes
421 4         818 my $sig = App::SimulateReads::InterlaceProcesses->new(foreign_pid => \@child_pid);
422 4         38 $pm->wait_all_children;
423              
424 4 50       12014794 if ($sig->signal_catched) {
425 0         0 log_msg ":: Termination signal received!";
426             }
427              
428 4         72 log_msg ":: Saving the work ...";
429              
430             # Concatenate all temporary files
431 4         32 log_msg ":: Concatenating all temporary files ...";
432 4 50       11 my @fh = map { $self->my_open_w($self->output_gzip ? "$_.gz" : $_, 0) } @{ $files{$fastq_class} };
  5         476  
  4         38  
433 4         42 for my $i (0..$#tmp_files) {
434 10         20699 my $fh_idx = $i % scalar @fh;
435 10 50       93 cat $tmp_files[$i] => $fh[$fh_idx]
436             or croak "Cannot concatenate $tmp_files[$i] to $files{$fastq_class}[$fh_idx]: $!\n";
437             }
438              
439             # Close files
440 4         15932 log_msg ":: Writing and closing output file: @{ $files{$fastq_class} }";
  4         89  
441 4         28 for my $fh_idx (0..$#fh) {
442 5 50       120 $fh[$fh_idx]->close
443             or croak "Cannot write file $files{$fastq_class}[$fh_idx]: $!\n";
444             }
445              
446             # Clean up the mess
447 4         189 log_msg ":: Removing temporary files ...";
448 4         27 for my $file_t (@tmp_files) {
449 10 50       895 unlink $file_t
450             or croak "Cannot remove temporary file: $file_t: $!\n";
451             }
452             } ## --- end sub run_simulation
453              
454             __END__
455              
456             =pod
457              
458             =encoding UTF-8
459              
460             =head1 NAME
461              
462             App::SimulateReads::Simulator - Class responsible to make the simulation
463              
464             =head1 VERSION
465              
466             version 0.05
467              
468             =head1 AUTHOR
469              
470             Thiago L. A. Miller <tmiller@mochsl.org.br>
471              
472             =head1 COPYRIGHT AND LICENSE
473              
474             This software is Copyright (c) 2017 by Teaching and Research Institute from Sírio-Libanês Hospital.
475              
476             This is free software, licensed under:
477              
478             The GNU General Public License, Version 3, June 2007
479              
480             =cut