File Coverage

lib/App/SimulateReads/Simulator.pm
Criterion Covered Total %
statement 142 188 75.5
branch 24 48 50.0
condition 8 18 44.4
subroutine 18 22 81.8
pod 0 2 0.0
total 192 278 69.0


line stmt bran cond sub pod time code
1             package App::SimulateReads::Simulator;
2             # ABSTRACT: Class responsible to make the simulation
3              
4 6     6   19751 use App::SimulateReads::Base 'class';
  6         17  
  6         51  
5 6     6   58 use App::SimulateReads::Fastq::SingleEnd;
  6         12  
  6         178  
6 6     6   34 use App::SimulateReads::Fastq::PairedEnd;
  6         12  
  6         117  
7 6     6   1940 use App::SimulateReads::InterlaceProcesses;
  6         1821  
  6         223  
8 6     6   2130 use File::Cat 'cat';
  6         2391  
  6         325  
9 6     6   1957 use Parallel::ForkManager;
  6         39882  
  6         13803  
10              
11             with qw/App::SimulateReads::Role::WeightedRaffle App::SimulateReads::Role::IO/;
12              
13             our $VERSION = '0.06'; # VERSION
14              
15             #-------------------------------------------------------------------------------
16             # Moose attributes
17             #-------------------------------------------------------------------------------
18             has 'jobs' => (is => 'ro', isa => 'My:IntGt0', required => 1);
19             has 'prefix' => (is => 'ro', isa => 'Str', required => 1);
20             has 'output_gzip' => (is => 'ro', isa => 'Bool', required => 1);
21             has 'fasta_file' => (is => 'ro', isa => 'My:Fasta', required => 1);
22             has 'coverage' => (is => 'ro', isa => 'My:NumGt0', required => 0);
23             has 'number_of_reads' => (is => 'ro', isa => 'My:IntGt0', required => 0);
24             has 'count_loops_by' => (is => 'ro', isa => 'My:CountLoopBy', required => 1);
25             has 'strand_bias' => (is => 'ro', isa => 'My:StrandBias', required => 1);
26             has 'seqid_weight' => (is => 'ro', isa => 'My:SeqIdWeight', required => 1);
27             has 'weight_file' => (is => 'ro', isa => 'My:File', required => 0);
28             has 'fastq' => (
29             is => 'ro',
30             isa => 'App::SimulateReads::Fastq::SingleEnd | App::SimulateReads::Fastq::PairedEnd',
31             required => 1,
32             handles => [ qw{ sprint_fastq } ]
33             );
34             has '_fasta' => (
35             is => 'ro',
36             isa => 'My:IdxFasta',
37             builder => '_build_fasta',
38             lazy_build => 1
39             );
40             has '_strand' => (
41             is => 'ro',
42             isa => 'CodeRef',
43             builder => '_build_strand',
44             lazy_build => 1
45             );
46             has '_seqid_raffle' => (
47             is => 'ro',
48             isa => 'CodeRef',
49             builder => '_build_seqid_raffle',
50             lazy_build => 1
51             );
52              
53             #=== CLASS METHOD ============================================================
54             # CLASS: Simulator
55             # METHOD: BUILD (Moose)
56             # PARAMETERS: Void
57             # RETURNS: Void
58             # DESCRIPTION: Validate optional attributes and/or attributes that depends of
59             # another attribute
60             # THROWS: If seqid_weight == 'file' and the user did not pass a weight
61             # file, throws an exception.
62             # If count_loops_by == 'coverage', then coverage must be defined,
63             # as well as 'number_of_reads', number_of_reads must be defined
64             # COMMENTS: We need to initialize lazy attributes here. If not, the child
65             # processes could independently initialize one
66             # SEE ALSO: n/a
67             #===============================================================================
68             sub BUILD {
69 20     20 0 45 my $self = shift;
70              
71             # If seqid_weight is 'file', then weight_file must be defined
72 20 50 33     500 if ($self->seqid_weight eq 'file' and not defined $self->weight_file) {
73 0         0 croak "seqid_weight=file requires a weight_file\n";
74             }
75              
76             # If count_loops_by is 'coverage', then coverage must be defined. Else if
77             # it is equal to 'number_of_reads', then number_of_reads must be defined
78 20 50 33     620 if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) {
    50 33        
79 0         0 croak "count_loops_by=coverage requires a coverage number\n";
80             } elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) {
81 0         0 croak "count_loops_by=number_of_reads requires a number_of_reads number\n";
82             }
83            
84             ## Just to ensure that the lazy attributes are built before &new returns
85             # Only seqid_weight=same is not a weighted raffle, so in this case
86             # not construct weight attribute
87 20 50       470 $self->weights if $self->seqid_weight ne 'same';
88 20         495 $self->_strand;
89 20         600 $self->_fasta;
90 20         520 $self->_seqid_raffle;
91             } ## --- end sub BUILD
92            
93             #=== CLASS METHOD ============================================================
94             # CLASS: Simulator
95             # METHOD: _build_strand (BUILDER)
96             # PARAMETERS: Void
97             # RETURNS: Ref Code
98             # DESCRIPTION: Build _strand attribute. (dynamic linkage)
99             # THROWS: If it is given a unknown option, throws an error
100             # COMMENTS: Valid strand_bias: 'plus', 'minus' and 'random'
101             # SEE ALSO: n/a
102             #===============================================================================
103             sub _build_strand {
104 20     20   40 my $self = shift;
105 20         25 return do {
106 20         510 given ($self->strand_bias) {
107 20     0   45 when ('plus') { sub {1} }
  0         0  
  0         0  
108 20     0   40 when ('minus') { sub {0} }
  0         0  
  0         0  
109 20     1710   35 when ('random') { sub { int(rand(2)) }}
  20         570  
  1710         6541  
110 0         0 default { croak "Unknown option '$_' for strand bias\n" }
  0         0  
111             }
112             };
113             } ## --- end sub _build_strand
114              
115             #=== CLASS METHOD ============================================================
116             # CLASS: Simulator
117             # METHOD: _build_fasta (BUILDER)
118             # PARAMETERS: Void
119             # RETURNS: $indexed_fasta My:IdxFasta
120             # DESCRIPTION: Build _fasta attribute
121             # THROWS: For single end read: If the read size required is greater than
122             # any genomic sequence, then throws an error. For paired end read:
123             # If fragment mean is greater than any genomic sequence, the throws
124             # an error.
125             # COMMENTS: none
126             # SEE ALSO: n/a
127             #===============================================================================
128             sub _build_fasta {
129 20     20   30 my $self = shift;
130 20         515 log_msg ":: Indexing fasta file '" . $self->fasta_file . "' ...";
131 20         495 my $indexed_fasta = $self->index_fasta($self->fasta_file);
132              
133 20 50       60 unless (%$indexed_fasta) {
134 0         0 croak sprintf "Error parsing '%s'. Maybe the file is empty\n" => $self->fasta_file;
135             }
136              
137             # Validate genome about the read size required
138 20         60 for my $id (keys %$indexed_fasta) {
139 100         180 my $index_size = $indexed_fasta->{$id}{size};
140 100         2440 given (ref $self->fastq) {
141 100         205 when ('App::SimulateReads::Fastq::SingleEnd') {
142 50         1135 my $read_size = $self->fastq->read_size;
143 50 50       150 if ($index_size < $read_size) {
144 0         0 log_msg ":: seqid sequence length (>$id => $index_size) lesser than required read size ($read_size)\n" .
145             " -> I'm going to include '>$id' in the blacklist\n";
146 0         0 delete $indexed_fasta->{$id};
147             }
148             }
149 50         70 when ('App::SimulateReads::Fastq::PairedEnd') {
150 50         1155 my $fragment_mean = $self->fastq->fragment_mean;
151 50 50       200 if ($index_size < $fragment_mean) {
152 0         0 log_msg ":: seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)\n" .
153             " -> I'm going to include '>$id' in the blacklist\n";
154 0         0 delete $indexed_fasta->{$id};
155             }
156             }
157 0         0 default {
158 0         0 croak "Unknown option '$_' for sequencing type\n";
159             }
160             }
161             }
162            
163 20 50       55 unless (%$indexed_fasta) {
164 0         0 croak sprintf "Fasta file '%s' has no valid entry\n" => $self->fasta_file;
165             }
166              
167 20         470 return $indexed_fasta;
168             } ## --- end sub _build_fasta
169              
170             #=== CLASS METHOD ============================================================
171             # CLASS: Simulator
172             # METHOD: _build_weights (BUILDER)
173             # PARAMETERS: Void
174             # RETURNS: My:Weights
175             # DESCRIPTION: Build weights. It is required by the WeightedRaffle role.
176             # It verifies seqid_weight and sets the required weights matrix
177             # THROWS: If seqid_weight == 'file', then it needs to be validated
178             # If it is given a unknown option, throws an error
179             # COMMENTS: none
180             # SEE ALSO: n/a
181             #===============================================================================
182             sub _build_weights {
183 20     20   35 my $self = shift;
184 20         25 return do {
185 20         505 given ($self->seqid_weight) {
186 20         75 when ('length') {
187 20         30 my %chr_size = map { $_, $self->_fasta->{$_}{size} } keys %{ $self->_fasta };
  100         2235  
  20         485  
188 20         110 $self->calculate_weights(\%chr_size);
189             }
190 0         0 when ('file') {
191 0         0 log_msg ":: Indexing weight file '" . $self->weight_file . "' ...";
192 0         0 my $indexed_file = $self->index_weight_file($self->weight_file);
193              
194             # Validate weight_file
195 0 0       0 croak "Error parsing '" . $self->weight_file . "': Maybe the file is empty\n"
196             unless %$indexed_file;
197              
198 0         0 my $indexed_fasta = $self->_fasta;
199 0         0 my $err;
200 0         0 for my $id (keys %$indexed_file) {
201 0 0       0 if (not exists $indexed_fasta->{$id}) {
202 0         0 $err .= "seqid '$id' not found in '" . $self->fasta_file . "'\n";
203             }
204             }
205 0 0       0 croak "Error in validating '" . $self->weight_file . "':\n" . $err
206             if defined $err;
207              
208 0         0 $self->calculate_weights($indexed_file);
209             }
210 0         0 when ('same') {
211 0         0 croak "Error: Cannot build raffle weights for 'seqid-weight=same'\n";
212             }
213 0         0 default {
214 0         0 croak "Unknown option '$_' for weighted raffle\n";
215             }
216             }
217             };
218             } ## --- end sub _build_weights
219              
220             #=== CLASS METHOD ============================================================
221             # CLASS: Simulator
222             # METHOD: _build_seqid_raffle (BUILDER)
223             # PARAMETERS: Void
224             # RETURNS: Ref Code
225             # DESCRIPTION: Build _seqid_raffle attribute. (dynamic linkage)
226             # THROWS: If it is given a unknown option, throws an error
227             # COMMENTS: seqid_weight can be: 'length', 'file' and 'same'
228             # SEE ALSO: n/a
229             #===============================================================================
230             sub _build_seqid_raffle {
231 20     20   30 my $self = shift;
232 20         35 return do {
233 20         490 given ($self->seqid_weight) {
234 20         50 when ('same') {
235 0         0 my @seqids = keys %{ $self->_fasta };
  0         0  
236 0         0 my $seqids_size = scalar @seqids;
237 0     0   0 sub { $seqids[int(rand($seqids_size))] };
  0         0  
238             }
239 20         85 when (/^(file|length)$/) {
240 20     1710   580 sub { $self->weighted_raffle };
  1710         4329  
241             }
242 0         0 default {
243 0         0 croak "Unknown option '$_' for seqid-raffle\n";
244             }
245             }
246             };
247             } ## --- end sub _build_seqid_raffle
248              
249             #=== CLASS METHOD ============================================================
250             # CLASS: Simulator
251             # METHOD: _calculate_number_of_reads (PRIVATE)
252             # PARAMETERS: Void
253             # RETURNS: $number_of_reads Int > 0
254             # DESCRIPTION: Calculates the number of reads to produce based on the coverage
255             # or the own value passed by the user
256             # THROWS: If count_loops_by is equal to 'coverage', it may accur that the
257             # fasta_file size, or the coverage asked is too low, which results in
258             # zero reads
259             # If it is given a unknown option, throws an error
260             # COMMENTS: count_loops_by can be: 'coverage' and 'number_of_reads'
261             # SEE ALSO: n/a
262             #===============================================================================
263             sub _calculate_number_of_reads {
264 8     8   32 my $self = shift;
265 8         19 my $number_of_reads = do {
266 8         277 given($self->count_loops_by) {
267 8         35 when ('coverage') {
268             # It is needed to calculate the genome size
269 8         249 my $fasta = $self->_fasta;
270 8         27 my $fasta_size = 0;
271 8         21 $fasta_size += $fasta->{$_}{size} for keys %{ $fasta };
  8         77  
272 8         262 int(($fasta_size * $self->coverage) / $self->fastq->read_size);
273             }
274 0         0 when ('number-of-reads') {
275 0         0 $self->number_of_reads;
276             }
277 0         0 default {
278 0         0 croak "Unknown option '$_' for calculating the number of reads\n";
279             }
280             }
281             };
282              
283             # In case it is paired-end read, divide the number of reads by 2 because App::SimulateReads::Fastq::PairedEnd class
284             # returns 2 reads at time
285 8         240 my $class = ref $self->fastq;
286 8 100       32 my $read_type_factor = $class eq 'App::SimulateReads::Fastq::PairedEnd' ? 2 : 1;
287 8         27 $number_of_reads = int($number_of_reads / $read_type_factor);
288              
289             # Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size
290 8 50 66     84 if ($number_of_reads <= 0 || ($class eq 'App::SimulateReads::Fastq::PairedEnd' && $number_of_reads == 1)) {
      33        
291 0         0 croak "The computed number of reads is equal to zero.\n" .
292             "It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n";
293             }
294              
295 8         35 return $number_of_reads;
296             } ## --- end sub _calculate_number_of_reads
297              
298             #=== CLASS METHOD ============================================================
299             # CLASS: Simulator
300             # METHOD: run_simulation
301             # PARAMETERS: Void
302             # RETURNS: Void
303             # DESCRIPTION: The main class method where the simulation unfolds
304             # THROWS: Try catch the fasta object passed by the user. If occurs an error,
305             # then throws an exception
306             # COMMENTS: It is not recommended to make parallelism in perl, so the workaround
307             # is to implement a parent, child system by forking the task and
308             # making part of the job in each child. In the end, it returns to
309             # parent and concatenate all temporary files generated
310             # SEE ALSO: n/a
311             #===============================================================================
312             sub run_simulation {
313 8     8 0 3906 my $self = shift;
314 8         316 my $fasta = $self->_fasta;
315              
316             # Calculate the number of reads to be generated
317 8         43 my $number_of_reads = $self->_calculate_number_of_reads;
318              
319             # Function that returns strand by strand_bias
320 8         270 my $strand = $self->_strand;
321              
322             # Function that returns seqid by seqid_weight
323 8         263 my $seqid = $self->_seqid_raffle;
324              
325             # Files to be generated
326 8         298 my %files = (
327             'App::SimulateReads::Fastq::SingleEnd' => [
328             $self->prefix . '_simulation_read.fastq'
329             ],
330             'App::SimulateReads::Fastq::PairedEnd' => [
331             $self->prefix . '_simulation_read_R1.fastq',
332             $self->prefix . '_simulation_read_R2.fastq'
333             ],
334             );
335              
336             # Is it single-end or paired-end?
337 8         257 my $fastq_class = ref $self->fastq;
338              
339             # Forks
340 8         1741 my $number_of_jobs = $self->jobs;
341 8         103 my $pm = Parallel::ForkManager->new($number_of_jobs);
342            
343             # Parent child pids
344 8         4480 my $parent_pid = $$;
345 8         27 my @child_pid;
346              
347             # Temporary files tracker
348             my @tmp_files;
349              
350             # Run in parent right after creating child process
351             $pm->run_on_start(
352             sub {
353 10     10   17014 my ($pid, $files_ref) = @_;
354 10         73 push @child_pid => $pid;
355 10         94 push @tmp_files => @$files_ref;
356             }
357 8         84 );
358              
359 8 50       181 log_msg sprintf ":: Creating %d child %s ...",
360             $number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs";
361              
362 8         32 for my $tid (1..$number_of_jobs) {
363             #-------------------------------------------------------------------------------
364             # Inside parent
365             #-------------------------------------------------------------------------------
366 14         314 log_msg ":: Creating job $tid ...";
367 14         31 my @files_t = map { "$_.${parent_pid}_part$tid" } @{ $files{$fastq_class} };
  19         104  
  14         114  
368 14 100       87 my $pid = $pm->start(\@files_t) and next;
369              
370             #-------------------------------------------------------------------------------
371             # Inside child
372             #-------------------------------------------------------------------------------
373             # Intelace child/parent processes
374 4         17718 my $sig = App::SimulateReads::InterlaceProcesses->new(foreign_pid => [$parent_pid]);
375              
376             # Set child seed
377 4         93 my $seed = time + $$;
378 4         46 srand($seed);
379              
380             # Calculate the number of reads to this job and correct this local index
381             # to the global index
382 4         60 my $number_of_reads_t = int($number_of_reads/$number_of_jobs);
383 4         28 my $last_read_idx = $number_of_reads_t * $tid;
384 4         13 my $idx = $last_read_idx - $number_of_reads_t + 1;
385              
386             # If it is the last job, make it work on the leftover reads of int() truncation
387 4 100       53 $last_read_idx += $number_of_reads % $number_of_jobs
388             if $tid == $number_of_jobs;
389              
390 4         115 log_msg " => Job $tid: Working on reads from $idx to $last_read_idx";
391              
392             # Create temporary files
393 4         54 log_msg " => Job $tid: Creating temporary file: @files_t";
394 4         12 my @fhs = map { $self->my_open_w($_, $self->output_gzip) } @files_t;
  6         342  
395              
396             # Run simualtion in child
397 4   66     117 for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) {
398 1710         3181 my $id = $seqid->();
399 1710         2888 my @fastq_entry;
400             try {
401             @fastq_entry = $self->sprint_fastq("SR${parent_pid}.$id.$i $i",
402 1710     1710   96368 $id, \$fasta->{$id}{seq}, $fasta->{$id}{size}, $strand->());
403             } catch {
404 0     0   0 croak "Not defined entry for seqid '>$id' at job $tid: $_";
405             } finally {
406 1710 50   1710   27565 unless (@_) {
407 1710         3978 for my $fh_idx (0..$#fhs) {
408 2280 50       8885 $fhs[$fh_idx]->say(${$fastq_entry[$fh_idx]})
  2280         5947  
409             or croak "Cannot write to $files_t[$fh_idx]: $!\n";
410             }
411             }
412 1710         11624 };
413             }
414              
415 4         128 log_msg " => Job $tid: Writing and closing file: @files_t";
416             # Close temporary files
417 4         14 for my $fh_idx (0..$#fhs) {
418 6 50       208 $fhs[$fh_idx]->close
419             or croak "Cannot write file $files_t[$fh_idx]: $!\n";
420             }
421              
422             # Child exit
423 4         196 log_msg " => Job $tid is finished";
424 4         63 $pm->finish;
425             }
426              
427             # Back to parent
428             # Interlace parent/child(s) processes
429 4         717 my $sig = App::SimulateReads::InterlaceProcesses->new(foreign_pid => \@child_pid);
430 4         52 $pm->wait_all_children;
431              
432 4 50       12015456 if ($sig->signal_catched) {
433 0         0 log_msg ":: Termination signal received!";
434             }
435              
436 4         56 log_msg ":: Saving the work ...";
437              
438             # Concatenate all temporary files
439 4         26 log_msg ":: Concatenating all temporary files ...";
440 4 50       18 my @fh = map { $self->my_open_w($self->output_gzip ? "$_.gz" : $_, 0) } @{ $files{$fastq_class} };
  5         392  
  4         28  
441 4         38 for my $i (0..$#tmp_files) {
442 10         17898 my $fh_idx = $i % scalar @fh;
443 10 50       62 cat $tmp_files[$i] => $fh[$fh_idx]
444             or croak "Cannot concatenate $tmp_files[$i] to $files{$fastq_class}[$fh_idx]: $!\n";
445             }
446              
447             # Close files
448 4         14507 log_msg ":: Writing and closing output file: @{ $files{$fastq_class} }";
  4         61  
449 4         25 for my $fh_idx (0..$#fh) {
450 5 50       80 $fh[$fh_idx]->close
451             or croak "Cannot write file $files{$fastq_class}[$fh_idx]: $!\n";
452             }
453              
454             # Clean up the mess
455 4         138 log_msg ":: Removing temporary files ...";
456 4         15 for my $file_t (@tmp_files) {
457 10 50       687 unlink $file_t
458             or croak "Cannot remove temporary file: $file_t: $!\n";
459             }
460             } ## --- end sub run_simulation
461              
462             __END__
463              
464             =pod
465              
466             =encoding UTF-8
467              
468             =head1 NAME
469              
470             App::SimulateReads::Simulator - Class responsible to make the simulation
471              
472             =head1 VERSION
473              
474             version 0.06
475              
476             =head1 AUTHOR
477              
478             Thiago L. A. Miller <tmiller@mochsl.org.br>
479              
480             =head1 COPYRIGHT AND LICENSE
481              
482             This software is Copyright (c) 2017 by Teaching and Research Institute from Sírio-Libanês Hospital.
483              
484             This is free software, licensed under:
485              
486             The GNU General Public License, Version 3, June 2007
487              
488             =cut