File Coverage

lib/App/Sandy/Simulator.pm
Criterion Covered Total %
statement 277 447 61.9
branch 72 174 41.3
condition 11 48 22.9
subroutine 32 44 72.7
pod 0 2 0.0
total 392 715 54.8


line stmt bran cond sub pod time code
1             package App::Sandy::Simulator;
2             # ABSTRACT: Class responsible to make the simulation
3              
4 6     6   24205 use App::Sandy::Base 'class';
  6         13  
  6         46  
5 6     6   435 use App::Sandy::Seq::SingleEnd;
  6         433  
  6         160  
6 6     6   450 use App::Sandy::Seq::PairedEnd;
  6         470  
  6         229  
7 6     6   3125 use App::Sandy::InterlaceProcesses;
  6         2739  
  6         279  
8 6     6   3697 use App::Sandy::WeightedRaffle;
  6         2724  
  6         299  
9 6     6   548 use App::Sandy::PieceTable;
  6         438  
  6         177  
10 6     6   3188 use App::Sandy::DB::Handle::Expression;
  6         2702  
  6         282  
11 6     6   3807 use App::Sandy::DB::Handle::Variation;
  6         2609  
  6         293  
12 6     6   68 use List::Util 'min';
  6         17  
  6         460  
13 6     6   5336 use File::Cat 'cat';
  6         3380  
  6         400  
14 6     6   3475 use Parallel::ForkManager;
  6         340548  
  6         48314  
15              
16             with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/;
17              
18             our $VERSION = '0.22'; # VERSION
19              
20             has 'argv' => (
21             is => 'ro',
22             isa => 'ArrayRef[Str]',
23             required => 1
24             );
25              
26             has 'truncate' => (
27             is => 'ro',
28             isa => 'Bool',
29             required => 1
30             );
31              
32             has 'seed' => (
33             is => 'ro',
34             isa => 'Int',
35             required => 1
36             );
37              
38             has 'jobs' => (
39             is => 'ro',
40             isa => 'My:IntGt0',
41             required => 1
42             );
43              
44             has 'prefix' => (
45             is => 'ro',
46             isa => 'Str',
47             required => 1
48             );
49              
50             has 'join_paired_ends' => (
51             is => 'ro',
52             isa => 'Bool',
53             required => 1
54             );
55              
56             has 'output_format' => (
57             is => 'ro',
58             isa => 'My:Format',
59             required => 1
60             );
61              
62             has 'compression_level' => (
63             is => 'ro',
64             isa => 'My:Level',
65             required => 1
66             );
67              
68             has 'fasta_file' => (
69             is => 'ro',
70             isa => 'My:Fasta',
71             required => 1
72             );
73              
74             has 'coverage' => (
75             is => 'ro',
76             isa => 'My:NumGt0',
77             required => 0
78             );
79              
80             has 'number_of_reads' => (
81             is => 'ro',
82             isa => 'My:IntGt0',
83             required => 0
84             );
85              
86             has 'count_loops_by' => (
87             is => 'ro',
88             isa => 'My:CountLoopBy',
89             required => 1
90             );
91              
92             has 'strand_bias' => (
93             is => 'ro',
94             isa => 'My:StrandBias',
95             required => 1
96             );
97              
98             has 'seqid_weight' => (
99             is => 'ro',
100             isa => 'My:SeqIdWeight',
101             required => 1
102             );
103              
104             has 'expression_matrix' => (
105             is => 'ro',
106             isa => 'Str',
107             required => 0
108             );
109              
110             has 'genomic_variation' => (
111             is => 'ro',
112             isa => 'ArrayRef[Str]',
113             required => 0
114             );
115              
116             has '_genomic_variation_names' => (
117             is => 'ro',
118             isa => 'Maybe[Str]',
119             builder => '_build_genomic_variation_names',
120             lazy_build => 1
121             );
122              
123             has 'seq' => (
124             is => 'ro',
125             isa => 'App::Sandy::Seq::SingleEnd | App::Sandy::Seq::PairedEnd',
126             required => 1,
127             handles => [ qw{ sprint_seq gen_sam_header gen_eof_marker } ]
128             );
129              
130             has '_fasta' => (
131             is => 'ro',
132             isa => 'My:IdxFasta',
133             builder => '_build_fasta',
134             lazy_build => 1
135             );
136              
137             has '_fasta_tree' => (
138             traits => ['Hash'],
139             is => 'ro',
140             isa => 'HashRef[ArrayRef]',
141             default => sub { {} },
142             handles => {
143             _set_fasta_tree => 'set',
144             _get_fasta_tree => 'get',
145             _exists_fasta_tree => 'exists',
146             _fasta_tree_pairs => 'kv',
147             _has_no_fasta_tree => 'is_empty'
148             }
149             );
150              
151             has '_fasta_rtree' => (
152             traits => ['Hash'],
153             is => 'ro',
154             isa => 'HashRef[Str]',
155             default => sub { {} },
156             handles => {
157             _set_fasta_rtree => 'set',
158             _get_fasta_rtree => 'get',
159             _delete_fasta_rtree => 'delete',
160             _exists_fasta_rtree => 'exists',
161             _fasta_rtree_pairs => 'kv',
162             _has_no_fasta_rtree => 'is_empty'
163             }
164             );
165              
166             has '_seqname' => (
167             traits => ['Hash'],
168             is => 'ro',
169             isa => 'HashRef[Str]',
170             default => sub { {} },
171             handles => {
172             _set_seqname => 'set',
173             _get_seqname => 'get'
174             }
175             );
176              
177             has '_piece_table' => (
178             is => 'ro',
179             isa => 'HashRef[HashRef[My:PieceTable]]',
180             builder => '_build_piece_table',
181             lazy_build => 1
182             );
183              
184             has '_strand' => (
185             is => 'ro',
186             isa => 'CodeRef',
187             builder => '_build_strand',
188             lazy_build => 1
189             );
190              
191             has '_seqid_raffle' => (
192             is => 'ro',
193             isa => 'CodeRef',
194             builder => '_build_seqid_raffle',
195             lazy_build => 1
196             );
197              
198             sub BUILD {
199 20     20 0 1105 my $self = shift;
200              
201             # If seqid_weight is 'count', then expression_matrix must be defined
202 20 50 33     620 if ($self->seqid_weight eq 'count' and not defined $self->expression_matrix) {
203 0         0 croak "seqid_weight=count requires a expression_matrix\n";
204             }
205              
206             # If count_loops_by is 'coverage', then coverage must be defined. Else if
207             # it is equal to 'number_of_reads', then number_of_reads must be defined
208 20 50 33     535 if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) {
    50 33        
209 0         0 croak "count_loops_by=coverage requires a coverage number\n";
210             } elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) {
211 0         0 croak "count_loops_by=number_of_reads requires a number_of_reads number\n";
212             }
213              
214             ## Just to ensure that the lazy attributes are built before &new returns
215 20         550 $self->_piece_table;
216 20         550 $self->_seqid_raffle;
217 20         515 $self->_fasta;
218 20         510 $self->_strand;
219             }
220              
221             sub _build_strand {
222 20     20   40 my $self = shift;
223 20         40 my $strand_sub;
224              
225 20 50       600 if ($self->strand_bias eq 'plus') {
    50          
    50          
226 0     0   0 $strand_sub = sub {1};
  0         0  
227             } elsif ($self->strand_bias eq 'minus') {
228 0     0   0 $strand_sub = sub {0};
  0         0  
229             } elsif ($self->strand_bias eq 'random') {
230 20     1710   80 $strand_sub = sub { int(rand(2)) };
  1710         7756  
231             } else {
232 0         0 croak sprintf "Unknown option '%s' for strand bias\n",
233             $self->strand_bias;
234             }
235              
236 20         495 return $strand_sub;
237             }
238              
239             sub _index_fasta {
240 20     20   45 my $self = shift;
241 20         505 my $fasta = $self->fasta_file;
242              
243 20         100 my $fh = $self->with_open_r($fasta);
244              
245             # indexed_genome = ID => (seq, len)
246 20         75 my %indexed_fasta;
247              
248             # >ID|PID as in gencode transcripts
249             my %fasta_rtree;
250 20         0 my $id;
251              
252 20         570 while (<$fh>) {
253 860         1290 chomp;
254 860 50       1540 next if /^;/;
255 860 100       1540 if (/^>/) {
256 100         315 my @fields = split /\|/;
257 100         230 $id = $fields[0];
258 100         280 $id =~ s/^>//;
259 100         325 $id =~ s/^\s+|\s+$//g;
260              
261             # Seq ID standardization in order to manage comparations
262             # between chr1, Chr1, CHR1, 1 etc;
263 100         355 my $std_id = $self->with_std_seqid($id);
264 100         3735 $self->_set_seqname(
265             $id => $std_id,
266             $std_id => $id
267             );
268              
269             # It is necessary to catch gene -> transcript relation
270             # # TODO: Make a hash tarit for indexed fasta
271 100 50       610 if (defined $fields[1]) {
272 0         0 my $pid = $fields[1];
273 0         0 $pid =~ s/^\s+|\s+$//g;
274 0         0 $fasta_rtree{$id} = $pid;
275             }
276             } else {
277 760 50       1290 die "Error reading fasta file '$fasta': Not defined id"
278             unless defined $id;
279 760         2270 $indexed_fasta{$id}{seq} .= $_;
280             }
281             }
282              
283 20         115 for (keys %indexed_fasta) {
284 100         255 $indexed_fasta{$_}{size} = length $indexed_fasta{$_}{seq};
285             }
286              
287 20 50       65 unless (%indexed_fasta) {
288 0         0 die "Error parsing '$fasta'. Maybe the file is empty\n";
289             }
290              
291             $fh->close
292 20 50       135 or die "Cannot close file $fasta: $!\n";
293              
294 20 50       420 $self->_set_fasta_rtree(%fasta_rtree) if %fasta_rtree;
295 20         110 return \%indexed_fasta;
296             }
297              
298             sub _build_fasta {
299 20     20   40 my $self = shift;
300 20         520 my $fasta = $self->fasta_file;
301              
302 20         120 log_msg ":: Indexing fasta file '$fasta' ...";
303 20         55 my $indexed_fasta = $self->_index_fasta;
304              
305             # Validate genome about the read size required
306 20         120 log_msg ":: Validating fasta file '$fasta' ...";
307             # Entries to remove
308 20         35 my @blacklist;
309              
310 20 50       695 unless ($self->truncate) {
311 20         75 for my $id (keys %$indexed_fasta) {
312 100         190 my $index_size = $indexed_fasta->{$id}{size};
313 100         2455 my $class = ref $self->seq;
314              
315 100 100       280 if ($class eq 'App::Sandy::Seq::SingleEnd') {
    50          
316 50         1170 my $read_mean = $self->seq->read_mean;
317 50 50       135 if ($index_size < $read_mean) {
318 0         0 log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required read mean ($read_mean)";
319 0         0 delete $indexed_fasta->{$id};
320 0         0 push @blacklist => $id;
321             }
322             } elsif ($class eq 'App::Sandy::Seq::PairedEnd') {
323 50         1355 my $fragment_mean = $self->seq->fragment_mean;
324 50 50       130 if ($index_size < $fragment_mean) {
325 0         0 log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)";
326 0         0 delete $indexed_fasta->{$id};
327 0         0 push @blacklist => $id;
328             }
329             } else {
330 0         0 croak "Unknown option '$class' for sequencing type\n";
331             }
332             }
333             }
334              
335 20 50       65 unless (%$indexed_fasta) {
336 0         0 die sprintf "Fasta file '%s' has no valid entry\n" => $self->fasta_file;
337             }
338              
339             # If fasta_rtree has entries
340 20 50       715 unless ($self->_has_no_fasta_rtree) {
341             # Remove no valid entries from id -> pid relation
342 0 0       0 $self->_delete_fasta_rtree(@blacklist) if @blacklist;
343             }
344              
345 20         525 return $indexed_fasta;
346             }
347              
348             sub _populate_fasta_tree {
349 20     20   35 my $self = shift;
350              
351             # If fasta_rtree has entries
352 20 50       705 unless ($self->_has_no_fasta_rtree) {
353             # Build parent -> child ids relation
354 0         0 my %fasta_tree;
355              
356             # Reverse fasta_rtree to pid -> \@ids
357 0         0 for my $pair ($self->_fasta_rtree_pairs) {
358 0         0 my ($id, $pid) = (@$pair);
359 0         0 push @{ $fasta_tree{$pid} } => $id;
  0         0  
360             }
361              
362 0         0 $self->_set_fasta_tree(%fasta_tree);
363             }
364             }
365              
366             sub _retrieve_expression_matrix {
367 0     0   0 my $self = shift;
368 0         0 my $expression = App::Sandy::DB::Handle::Expression->new;
369 0         0 return $expression->retrievedb($self->expression_matrix);
370             }
371              
372             sub _build_seqid_raffle {
373 20     20   35 my $self = shift;
374              
375             # Get the piece table
376 20         530 my $piece_table = $self->_piece_table;
377              
378             # The builded function
379 20         40 my $seqid_sub;
380              
381 20 50       580 if ($self->seqid_weight eq 'same') {
    50          
    50          
382 0     0   0 my ($keys, $weights) = $self->_populate_key_weight($piece_table, sub { 1 });
  0         0  
383              
384             # If weight == 1 means that there are 2 keys for
385             # the same seq_id.
386             # If weight == 2 means that there is only one key
387             # for the seq_id, so I double that key
388 0         0 for (my $i = 0; $i < @$weights; $i++) {
389 0 0       0 if ($weights->[$i] > 1) {
390 0         0 push @$keys => $keys->[$i];
391             }
392             }
393              
394 0         0 my $keys_size = scalar @$keys;
395 0     0   0 $seqid_sub = sub { $keys->[int(rand($keys_size))] };
  0         0  
396             } elsif ($self->seqid_weight eq 'count') {
397             # Catch expression-matrix entry from database
398 0         0 my $indexed_file = $self->_retrieve_expression_matrix;
399              
400             # Catch indexed fasta
401 0         0 my $indexed_fasta = $self->_fasta;
402              
403             # Validate expression_matrix
404 0         0 for my $id (keys %$indexed_file) {
405             # If not exists into indexed_fasta, it must then exist into fasta_tree
406 0 0 0     0 unless (exists $piece_table->{$id} || $self->_exists_fasta_tree($id)) {
407 0         0 log_msg sprintf ":: Ignoring seqid '%s' from expression-matrix '%s': It is not found into the indexed fasta"
408             => $id, $self->expression_matrix;
409 0         0 delete $indexed_file->{$id};
410             }
411             }
412              
413 0 0       0 unless (%$indexed_file) {
414 0         0 die sprintf "No valid seqid entry of the expression-matrix '%s' is recorded into the indexed fasta\n"
415             => $self->expression_matrix;
416             }
417              
418 0         0 my (%ptable_ind, %ptable_cluster);
419              
420             # Split indexed_file seq_ids between those
421             # into piece_table and those that represents a cluster
422             # of seq_ids as in gene -> transcript relationship
423 0         0 for my $seq_id (keys %$indexed_file) {
424 0 0       0 if (exists $piece_table->{$seq_id}) {
425 0         0 $ptable_ind{$seq_id} = $piece_table->{$seq_id};
426              
427             } else {
428 0         0 my $ids = $self->_get_fasta_tree($seq_id);
429              
430             # Bug catcher
431 0 0       0 unless (@$ids) {
432 0         0 croak "seq_id '$seq_id' not found into piece_table";
433             }
434              
435 0         0 $ptable_cluster{$seq_id} = $ids;
436             }
437             }
438              
439             # Let's calculate the weight taking in acount
440             # the size increase/decrease
441             my $calc_ind_weight = sub {
442 0     0   0 my ($seq_id, $type) = @_;
443              
444 0         0 my $counts = $indexed_file->{$seq_id};
445 0         0 my $size = $piece_table->{$seq_id}{$type}{size};
446 0         0 my $fasta_size = $indexed_fasta->{$seq_id}{size};
447              
448             # Correct the weight according to the
449             # genomic variation change by the ratio
450             # between the table size and fasta size
451 0         0 my $factor = $size / $fasta_size;
452              
453 0         0 return $counts * $factor;
454 0         0 };
455              
456 0         0 my ($keys, $weights);
457              
458 0 0       0 if (%ptable_ind) {
459 0         0 ($keys, $weights) = $self->_populate_key_weight(\%ptable_ind,
460             $calc_ind_weight);
461             }
462              
463             # If there are seq_id cluster like, then its is
464             # time to calculate these weights
465 0         0 for my $seq_id (sort keys %ptable_cluster) {
466 0         0 my %ptable;
467              
468             # Slice piece_table hash
469 0         0 my $ids = $ptable_cluster{$seq_id};
470 0         0 @ptable{@$ids} = @$piece_table{@$ids};
471              
472             # total size among all ids of cluster
473 0         0 my %total;
474              
475             # Calculate the total size by type
476 0         0 for my $type_h (values %ptable) {
477 0         0 for my $type (keys %$type_h) {
478 0         0 $total{$type} += $type_h->{$type}{size};
479             }
480             }
481              
482             # Calculate the weight taking in acount the size increase/decrease
483             # and the ratio between the total size by type and the table size.
484             # The problem here is that I must divide the 'counts' for some 'seq_id'
485             # among all ids that belong to it
486             my $calc_cluster_weight = sub {
487 0     0   0 my ($id, $type) = @_;
488              
489 0         0 my $counts = $indexed_file->{$seq_id};
490 0         0 my $size = $piece_table->{$id}{$type}{size};
491 0         0 my $fasta_size = $indexed_fasta->{$id}{size};
492              
493             # Divide the counts among all ids
494 0         0 my $ratio = $size / $total{$type};
495              
496             # Correct the weight according to the size
497 0         0 my $factor = $size / $fasta_size;
498              
499 0         0 return $counts * $factor * $ratio;
500 0         0 };
501              
502 0         0 my ($k, $w) = $self->_populate_key_weight(\%ptable,
503             $calc_cluster_weight);
504              
505 0         0 push @$keys => @$k;
506 0         0 push @$weights => @$w;
507             }
508              
509 0 0 0     0 unless (@$keys && @$weights) {
510 0         0 croak "No keys weights have been set";
511             }
512              
513             # It is very necessary in order
514             # to avoid truncation of numbers
515             # between zero and one
516 0         0 $self->_round_weight($weights);
517              
518 0         0 my $raffler = App::Sandy::WeightedRaffle->new(
519             'weights' => $weights,
520             'keys' => $keys
521             );
522              
523 0     0   0 $seqid_sub = sub { $raffler->weighted_raffle };
  0         0  
524             } elsif ($self->seqid_weight eq 'length') {
525             my $calc_weight = sub {
526 100     100   210 my ($seq_id, $type) = @_;
527 100         205 return $piece_table->{$seq_id}{$type}{size};
528 20         115 };
529              
530 20         70 my ($keys, $weights) = $self->_populate_key_weight($piece_table,
531             $calc_weight);
532              
533             # Just in case ...
534 20         70 $self->_round_weight($weights);
535              
536 20         720 my $raffler = App::Sandy::WeightedRaffle->new(
537             weights => $weights,
538             keys => $keys
539             );
540              
541 20     1710   155 $seqid_sub = sub { $raffler->weighted_raffle };
  1710         6155  
542             } else {
543 0         0 croak sprintf "Unknown option '%s' for seqid_weight\n",
544             $self->seqid_weight;
545             }
546              
547 20         560 return $seqid_sub;
548             }
549              
550             sub _round_weight {
551 20     20   40 my ($self, $weights) = @_;
552              
553 20         75 my $min = min @$weights;
554              
555 20 50       55 if ($min <= 0) {
556 0         0 croak "min weight le to zero: $min";
557             }
558              
559 20 50       95 my $factor = $min < 1
560             ? (1 / $min)
561             : 1;
562              
563 20         45 for my $weight (@$weights) {
564 100         195 $weight = int($weight * $factor + 0.5);
565             }
566             }
567              
568             sub _populate_key_weight {
569 20     20   55 my ($self, $piece_table, $calc_weight) = @_;
570              
571 20         35 my (@keys, @weights);
572              
573             # It needs to be sorted in order to the
574             # seed works
575 20         125 for my $seq_id (sort keys %$piece_table) {
576 100         160 my $type_h = $piece_table->{$seq_id};
577              
578             # If there is no alternative seq_id, then
579             # set a factor to correct the size.
580             # It is necessary because the seq_ids with
581             # alternative and reference will double its
582             # own coverage
583 100 50       200 my $factor = scalar keys %$type_h == 1
584             ? 2
585             : 1;
586              
587 100         205 for my $type (sort keys %$type_h) {
588              
589 100         330 my %key = (
590             'seq_id' => $seq_id,
591             'type' => $type
592             );
593              
594 100         185 my $weight = $calc_weight->($seq_id, $type);
595              
596 100         195 push @keys => \%key;
597 100         295 push @weights => $weight * $factor;
598             }
599             }
600              
601 20         65 return (\@keys, \@weights);
602             }
603              
604             sub _build_genomic_variation_names {
605 20     20   25 my $self = shift;
606 20 50       555 if ($self->genomic_variation) {
607 0         0 return sprintf "[%s]", => join ", ", @{ $self->genomic_variation };
  0         0  
608             }
609             }
610              
611             sub _retrieve_genomic_variation {
612 0     0   0 my $self = shift;
613 0         0 my $variation = App::Sandy::DB::Handle::Variation->new;
614 0         0 return $variation->retrievedb($self->genomic_variation);
615             }
616              
617             sub _build_piece_table {
618 20     20   35 my $self = shift;
619              
620 20         625 my $genomic_variation = $self->_genomic_variation_names;
621 20         35 my $indexed_snv;
622              
623             # Retrieve genomic variation if the user provided it
624 20 50       55 if (defined $genomic_variation) {
625 0         0 $indexed_snv = $self->_retrieve_genomic_variation;
626 0         0 log_msg ":: Validate genomic variation '$genomic_variation' against indexed fasta ...";
627 0         0 $self->_validate_indexed_snv_against_fasta($indexed_snv);
628             }
629              
630             # Catch index fasta
631 20         505 my $indexed_fasta = $self->_fasta;
632              
633             # Build piece table
634 20         40 my %piece_table;
635              
636             # Let's construct the piece_table
637 20         80 log_msg ":: Build piece table ...";
638              
639 20         85 while (my ($seq_id, $fasta_h) = each %$indexed_fasta) {
640 100         210 my $seq = \$fasta_h->{seq};
641 100         3605 my $std_seq_id = $self->_get_seqname($seq_id);
642              
643             # Initialize piece tables for $seq_id ref
644 100         2575 $piece_table{$seq_id}{ref}{table} = App::Sandy::PieceTable->new(orig => $seq);
645              
646             # If there is indexed_snv for seq_id, then construct the piece table with it
647 100 50 33     440 if (defined $indexed_snv && defined $indexed_snv->{$std_seq_id}) {
648 0         0 my $snvs = $indexed_snv->{$std_seq_id};
649              
650             # Filter only the homozygotic snvs to feed reference seq_id
651 0         0 my @snvs_homo = grep { $_->{plo} eq 'HO' } @$snvs;
  0         0  
652              
653 0 0       0 if (@snvs_homo) {
654             # Populate reference seq_id
655 0         0 $self->_populate_piece_table($piece_table{$seq_id}{ref}{table}, \@snvs_homo);
656             }
657              
658             # Initialize piece tables for $seq_id alt
659 0         0 $piece_table{$seq_id}{alt}{table} = App::Sandy::PieceTable->new(orig => $seq);
660              
661             # Populate alternative seq_id
662 0         0 $self->_populate_piece_table($piece_table{$seq_id}{alt}{table}, $snvs);
663             }
664             }
665              
666             # Initialize the logical offsets and valodate the
667             # new size due to the genomic variation
668              
669 20         40 my @blacklist;
670              
671 20         70 for my $seq_id (keys %piece_table) {
672 100         190 my $type_h = delete $piece_table{$seq_id};
673              
674 100         300 for my $type (keys %$type_h) {
675 100         160 my $table_h = delete $type_h->{$type};
676 100         175 my $table = $table_h->{table};
677              
678             # Initialize the logical offset
679 100         495 $table->calculate_logical_offset;
680              
681             # Get the new size
682 100         2660 my $new_size = $table->logical_len;
683              
684 100 50       2515 unless ($self->truncate) {
685 100         2440 my $class = ref $self->seq;
686              
687 100 100       310 if ($class eq 'App::Sandy::Seq::SingleEnd') {
    50          
688 50 50       1185 if ($new_size < $self->seq->read_mean) {
689 0         0 log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required read-mean";
690 0         0 next;
691             }
692             } elsif ($class eq 'App::Sandy::Seq::PairedEnd') {
693 50 50       1175 if ($new_size < $self->seq->fragment_mean) {
694 0         0 log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required fragment mean";
695 0         0 next;
696             }
697             } else {
698 0         0 die "No valid options for 'seq'";
699             }
700             }
701              
702             # If all's right
703 100         240 $table_h->{size} = $new_size;
704 100         230 $type_h->{$type} = $table_h;
705             }
706              
707             # if there is at least one type,
708             # then return it to the piece_table
709 100 50       275 if (%$type_h) {
710 100         215 $piece_table{$seq_id} = $type_h;
711              
712             # else, just remove it!
713             } else {
714 0         0 push @blacklist => $seq_id;
715             }
716             }
717              
718 20 50       60 unless (%piece_table) {
719 0         0 die "All fasta entries were removed due to deletions. ",
720             "Please, verify the genomic variation '$genomic_variation'\n";
721             }
722              
723             # If fasta_rtree has entries
724 20 50       795 unless ($self->_has_no_fasta_rtree) {
725             # Remove no valid entries from id -> pid relation
726 0 0       0 $self->_delete_fasta_rtree(@blacklist) if @blacklist;
727             }
728              
729             # Make the id -> pid relationship
730 20         80 $self->_populate_fasta_tree;
731              
732             # HASH -> SEQ_ID -> @(REF @ALT) -> @(TABLE SIZE)
733 20         615 return \%piece_table;
734             }
735              
736             sub _populate_piece_table {
737 0     0   0 my ($self, $table, $snvs) = @_;
738              
739 0         0 for my $snv (@$snvs) {
740             # If there is an ID, make sure that it is not a comma, colon
741             # separated list. Else, make sure to keep the ref/alt length
742             # to max 25+25+1=51
743             my $annot = defined $snv->{id} && $snv->{id} ne '.'
744             ? sprintf "%d:%s" => $snv->{pos} + 1, (split(/[,;]/, $snv->{id}))[0]
745 0 0 0     0 : sprintf "%d:%.25s/%.25s" => $snv->{pos} + 1, $snv->{ref}, $snv->{alt};
746              
747             # Insertion
748 0 0       0 if ($snv->{ref} eq '-') {
    0          
749 0         0 $table->insert(\$snv->{alt}, $snv->{pos}, $annot);
750              
751             # Deletion
752             } elsif ($snv->{alt} eq '-') {
753 0         0 $table->delete($snv->{pos}, length $snv->{ref}, $annot);
754              
755             # Change
756             } else {
757 0         0 $table->change(\$snv->{alt}, $snv->{pos}, length $snv->{ref}, $annot);
758             }
759             }
760             }
761              
762             sub _validate_indexed_snv_against_fasta {
763 0     0   0 my ($self, $indexed_snv) = @_;
764              
765 0         0 my $indexed_fasta = $self->_fasta;
766 0         0 my $genomic_variation = $self->_genomic_variation_names;
767              
768 0         0 for my $std_seq_id (keys %$indexed_snv) {
769 0         0 my $snvs = delete $indexed_snv->{$std_seq_id};
770 0         0 my $seq_id = $self->_get_seqname($std_seq_id);
771              
772 0 0 0     0 unless (defined $seq_id && exists $indexed_fasta->{$seq_id}) {
773 0         0 next;
774             }
775              
776 0         0 my $seq = \$indexed_fasta->{$seq_id}{seq};
777 0         0 my $size = $indexed_fasta->{$seq_id}{size};
778              
779 0         0 my @saved_snvs;
780              
781 0         0 for my $snv (@$snvs) {
782             # Insertions may accur until one base after the
783             # end of the sequence, not more
784 0 0 0     0 if (($snv->{ref} eq '-' && $snv->{pos} > $size) || ($snv->{ref} ne '-' && $snv->{pos} >= $size)) {
    0 0        
      0        
785             log_msg sprintf ":: In validating '%s': Position, %s/%s at %s:%d, outside fasta sequence",
786 0         0 $genomic_variation, $snv->{ref}, $snv->{alt}, $seq_id, $snv->{pos} + 1;
787              
788             # Next snv
789 0         0 next;
790             # Deletions and changes. Just verify if the reference exists
791             } elsif ($snv->{ref} ne '-') {
792 0         0 my $ref = substr $$seq, $snv->{pos}, length($snv->{ref});
793              
794 0 0       0 if (uc($ref) ne uc($snv->{ref})) {
795             log_msg sprintf ":: In validating '%s': Not found reference '%s' at fasta position %s:%d",
796 0         0 $genomic_variation, $snv->{ref}, $seq_id, $snv->{pos} + 1;
797              
798             # Next snv
799 0         0 next;
800             }
801             }
802              
803 0         0 push @saved_snvs => $snv;
804             }
805              
806 0 0       0 if (@saved_snvs) {
807 0         0 $indexed_snv->{$std_seq_id} = [@saved_snvs];
808             }
809             }
810             }
811              
812             sub _calculate_number_of_reads {
813 8     8   34 my $self = shift;
814 8         19 my $number_of_reads;
815              
816 8 50       239 if ($self->count_loops_by eq 'coverage') {
    0          
817             # It is needed to calculate the genome size
818 8         246 my $fasta = $self->_fasta;
819 8         36 my $fasta_size = 0;
820 8         19 $fasta_size += $fasta->{$_}{size} for keys %{ $fasta };
  8         60  
821 8         230 $number_of_reads = int(($fasta_size * $self->coverage) / $self->seq->read_mean);
822             # In case it is paired-end read, divide the number of reads by 2 because
823             # App::Sandy::Seq::PairedEnd class returns 2 reads at time
824 8 100       203 $number_of_reads = int($number_of_reads / 2)
825             if ref($self->seq) eq 'App::Sandy::Seq::PairedEnd';
826             } elsif ($self->count_loops_by eq 'number-of-reads') {
827 0         0 $number_of_reads = $self->number_of_reads;
828             } else {
829 0         0 croak sprintf "Unknown option '%s' for calculating the number of reads\n",
830             $self->count_loops_by;
831             }
832              
833             # Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size
834 8 50       32 if ($number_of_reads <= 0) {
835 0         0 die "The computed number of reads is equal to zero.\n" .
836             "It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n";
837             }
838              
839 8         27 return $number_of_reads;
840             }
841              
842             sub _set_seed {
843 4     4   67 my ($self, $inc) = @_;
844 4 50       336 my $seed = defined $inc ? $self->seed + $inc : $self->seed;
845 4         63 srand($seed);
846 4         123 require Math::Random;
847 4         178 Math::Random::random_set_seed_from_phrase($seed);
848             }
849              
850             sub _calculate_parent_count {
851 4     4   22 my ($self, $counter_ref) = @_;
852 4 50       235 return if $self->_has_no_fasta_rtree;
853              
854 0         0 my %parent_count;
855              
856 0         0 while (my ($id, $count) = each %$counter_ref) {
857 0         0 my $pid = $self->_get_fasta_rtree($id);
858 0 0       0 $parent_count{$pid} += $count if defined $pid;
859             }
860              
861 0         0 return \%parent_count;
862             }
863              
864             sub run_simulation {
865 8     8 0 4225 my $self = shift;
866 8         313 my $piece_table = $self->_piece_table;
867              
868             # Calculate the number of reads to be generated
869 8         55 my $number_of_reads = $self->_calculate_number_of_reads;
870              
871             # Function that returns strand by strand_bias
872 8         246 my $strand = $self->_strand;
873              
874             # Function that returns seqid by seqid_weight
875 8         261 my $seqid = $self->_seqid_raffle;
876              
877             # genome or transcriptome?
878 8         251 my $simulation = $self->argv->[0];
879              
880             # Count file to be generated
881 8 50       264 my $count_file = $simulation eq 'transcriptome'
882             ? $self->prefix . '_abundance.tsv'
883             : $self->prefix . '_coverage.tsv';
884              
885             # Main files
886 8         217 my %files = (
887             bam => [
888             $self->prefix . '.bam'
889             ],
890             sam => [
891             $self->prefix . '.sam'
892             ],
893             single_fastq => [
894             $self->prefix . '_R1_001.fastq'
895             ],
896             single_fastq_gz => [
897             $self->prefix . '_R1_001.fastq.gz'
898             ],
899             join_paired_fastq => [
900             $self->prefix . '.fastq'
901             ],
902             join_paired_fastq_gz => [
903             $self->prefix . '.fastq.gz'
904             ],
905             paired_fastq => [
906             $self->prefix . '_R1_001.fastq',
907             $self->prefix . '_R2_001.fastq'
908             ],
909             paired_fastq_gz => [
910             $self->prefix . '_R1_001.fastq.gz',
911             $self->prefix . '_R2_001.fastq.gz'
912             ]
913             );
914              
915             # Set the file class in order to know
916             # how to deal with all files options
917 8         216 my $seq_class = ref $self->seq;
918 8         272 my $output_format = $self->output_format;
919 8         24 my $file_class;
920              
921             # This mess is necessary to catch the
922             # right value into the %files hash
923 8 50       98 if ($output_format =~ /(sam|bam)/) {
    50          
924 0         0 $file_class = $output_format;
925             } elsif ($output_format =~ /fastq/) {
926 8 100       33 if ($seq_class eq 'App::Sandy::Seq::SingleEnd') {
    50          
927 5         15 $file_class = 'single_fastq';
928             } elsif ($seq_class eq 'App::Sandy::Seq::PairedEnd') {
929 3         9 $file_class = 'paired_fastq';
930 3 50       126 $file_class = "join_$file_class" if $self->join_paired_ends;
931             } else {
932 0         0 croak "Something wrong with the seq class: $seq_class";
933             }
934 8 50       32 if ($output_format eq 'fastq.gz') {
935 0         0 $file_class .= '_gz';
936             }
937             } else {
938 0         0 croak "Something wrong with the output format: $output_format";
939             }
940              
941             # Forks
942 8         250 my $number_of_jobs = $self->jobs;
943 8         162 my $pm = Parallel::ForkManager->new($number_of_jobs);
944              
945             # Parent child pids
946 8         25425 my $parent_pid = $$;
947 8         21 my @child_pid;
948              
949             # Temporary files tracker
950             my @tmp_files;
951              
952             # Run in parent right after creating child process
953             $pm->run_on_start(
954             sub {
955 10     10   18802 my $pid = shift;
956 10         259 push @child_pid => $pid;
957             }
958 8         72 );
959              
960             # Count the overall cumulative number of reads for each seqid
961 8         59 my %counters;
962              
963             # Run in parent right after finishing child process
964             $pm->run_on_finish(
965             sub {
966 8     8   24032709 my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $counter_ref) = @_;
967 8         101 while (my ($seqid, $count) = each %$counter_ref) {
968 40         254 $counters{$seqid} += $count;
969             }
970             }
971 8         61 );
972              
973 8 50       141 log_msg sprintf ":: Creating %d child %s ...",
974             $number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs";
975              
976 8         40 for my $tid (1..$number_of_jobs) {
977             #-------------------------------------------------------------------------------
978             # Inside parent
979             #-------------------------------------------------------------------------------
980 14         773 log_msg ":: Creating job $tid ...";
981 14         33 my @files_t = map { "$_.${parent_pid}.part$tid" } @{ $files{$file_class} };
  19         107  
  14         224  
982 14         103 push @tmp_files => @files_t;
983 14 100       77 my $pid = $pm->start and next;
984              
985             #-------------------------------------------------------------------------------
986             # Inside child
987             #-------------------------------------------------------------------------------
988             # Intelace child/parent processes
989 4         42813 my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => [$parent_pid]);
990              
991             # Set child seed
992 4         173 $self->_set_seed($tid);
993              
994             # Calculate the number of reads to this job and correct this local index
995             # to the global index
996 4         249 my $number_of_reads_t = int($number_of_reads/$number_of_jobs);
997 4         36 my $last_read_idx = $number_of_reads_t * $tid;
998 4         91 my $idx = $last_read_idx - $number_of_reads_t + 1;
999              
1000             # If it is the last job, make it work on the leftover reads of int() truncation
1001 4 100       115 $last_read_idx += $number_of_reads % $number_of_jobs
1002             if $tid == $number_of_jobs;
1003              
1004 4         220 log_msg " => Job $tid: Working on sequences from $idx to $last_read_idx";
1005              
1006             # Create temporary files
1007 4         116 log_msg " => Job $tid: Creating temporary file: @files_t";
1008              
1009             # And here we go ...
1010 4         51 my @fhs;
1011              
1012             # Set the right filehandle format
1013 4 50       189 if ($output_format =~ /^(sam|fastq)$/) {
    0          
    0          
1014 4         41 @fhs = map { $self->with_open_w($_, 0) } @files_t;
  6         249  
1015             } elsif ($output_format eq 'fastq.gz') {
1016 0         0 @fhs = map { $self->with_open_w($_, $self->compression_level) } @files_t;
  0         0  
1017             } elsif ($output_format eq 'bam') {
1018 0         0 @fhs = map { $self->with_open_bam_w($_, $self->compression_level) } @files_t;
  0         0  
1019             } else {
1020 0         0 croak "Something wrong with the output format: $file_class";
1021             }
1022              
1023             # sprint_seq gives two entries for paired-emd, so
1024             # if it is a bam|sam|join-paired-ends, it is necessary
1025             # to copy the filehandle in order to print both entries
1026             # to the same file
1027 4 50 66     149 if ($seq_class eq 'App::Sandy::Seq::PairedEnd'
1028             && $file_class =~ /(sam|bam|join)/) {
1029 0         0 $fhs[1] = $fhs[0];
1030             }
1031              
1032             # Count the cumulative number of reads for each seqid
1033 4         34 my %counter;
1034              
1035             # If the output format is 'bam|sam' and it is the first job, then
1036             # write the header
1037 4 50 33     134 if ($output_format =~ /^(sam|bam)$/ && $tid == 1) {
1038 0         0 my $header_ref = $self->gen_sam_header($self->argv);
1039 0         0 print {$fhs[0]} "$$header_ref";
  0         0  
1040             }
1041              
1042             # Run simulation in child
1043 4   66     373 for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) {
1044 1710         4612 my $id = $seqid->();
1045 1710         5566 my $ptable = $piece_table->{$id->{seq_id}}{$id->{type}};
1046 1710         2945 my @seq_entry;
1047             try {
1048             @seq_entry = $self->sprint_seq($tid, $i, $id->{seq_id}, $id->{type},
1049 1710     1710   122749 $ptable->{table}, $ptable->{size}, $strand->());
1050             } catch {
1051 0     0   0 die "Not defined entry for seqid '>$id->{seq_id}' at job $tid: $_";
1052             } finally {
1053 1710 50   1710   38761 unless (@_) {
1054 1710         5237 for my $fh_idx (0..$#fhs) {
1055 2280         4898 $counter{$id->{seq_id}}++;
1056 2280         3259 print {$fhs[$fh_idx]} "${$seq_entry[$fh_idx]}";
  2280         4241  
  2280         10194  
1057             }
1058             }
1059 1710         14362 };
1060             }
1061              
1062 4         154 log_msg " => Job $tid: Writing and closing file: @files_t";
1063              
1064             # Close temporary files
1065             # Get index from @files_t in order to avoid
1066             # close the same filehandle twice - When the
1067             # position 1-N is a copy
1068 4         16 for my $fh_idx (0..$#files_t) {
1069 6         402 close $fhs[$fh_idx];
1070             }
1071              
1072             # If it is a bam and it is the last loop, then
1073             # write a eof marker
1074 4 50 33     41 if ($output_format eq 'bam' && $tid == $number_of_jobs) {
1075 0         0 $self->gen_eof_marker($files_t[0]);
1076             }
1077              
1078             # Child exit
1079 4         34 log_msg " => Job $tid is finished";
1080 4         189 $pm->finish(0, \%counter);
1081             }
1082              
1083             # Back to parent
1084             # Interlace parent/child(s) processes
1085 4         1311 my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => \@child_pid);
1086 4         51 $pm->wait_all_children;
1087              
1088 4 50       354 if ($sig->signal_catched) {
1089 0         0 log_msg ":: Termination signal received!";
1090             }
1091              
1092 4         94 log_msg ":: Saving the work ...";
1093              
1094             # Concatenate all temporary files
1095 4         26 log_msg ":: Concatenate all temporary files";
1096              
1097             # Save time. Rename tmp_file (1,2)
1098 4         8 for my $file (@{ $files{$file_class} }) {
  4         46  
1099 5         36 my $tmp = shift @tmp_files;
1100 5         66 log_msg " => Concatenating $tmp to $file ...";
1101 5 50       308 rename $tmp => $file
1102             or die "Cannot create '$file': $!\n";
1103             }
1104              
1105             # Append to renamed tmp files
1106 4         16 my @fh = map { $self->with_open_a($_) } @{ $files{$file_class} };
  5         122  
  4         35  
1107              
1108 4         37 for my $i (0..$#tmp_files) {
1109 5         21 my $fh_idx = $i % scalar @fh;
1110              
1111 5         85 log_msg " => Concatenating $tmp_files[$i] to $files{$file_class}[$fh_idx] ...";
1112 5 50       90 cat $tmp_files[$i] => $fh[$fh_idx]
1113             or die "Cannot concatenate $tmp_files[$i] to $files{$file_class}[$fh_idx]: $!\n";
1114              
1115             # Clean up the mess
1116 5 50       13730 unlink $tmp_files[$i]
1117             or die "Cannot remove temporary file '$tmp_files[$i]': $!\n";
1118             }
1119              
1120             # Close files
1121 4         35 log_msg ":: Writing and closing output file: @{ $files{$file_class} }";
  4         63  
1122 4         20 for my $fh_idx (0..$#fh) {
1123 5 50       146 close $fh[$fh_idx]
1124             or die "Cannot write file $files{$file_class}[$fh_idx]: $!\n";
1125             }
1126              
1127             # Save counts
1128 4         24 log_msg ":: Saving count file";
1129 4         64 my $count_fh = $self->with_open_w($count_file, 0);
1130              
1131             # It is necessary to correct the abundance according to
1132             # fragment sequencing end
1133 4         11 my $count_factor = 1;
1134 4 50 33     252 if ($self->count_loops_by eq 'number-of-reads'
1135             && ref($self->seq) eq 'App::Sandy::Seq::PairedEnd') {
1136 0         0 $count_factor = 2;
1137             }
1138              
1139 4         39 log_msg " => Writing counts to $count_file ...";
1140 4         35 for my $id (sort keys %counters) {
1141 20         109 printf {$count_fh} "%s\t%d\n" => $id,
1142 20         31 int($counters{$id} / $count_factor);
1143             }
1144              
1145             # Just in case, calculate 'gene' like expression
1146 4         98 my $parent_count = $self->_calculate_parent_count(\%counters);
1147              
1148 4         21 for my $id (sort keys %$parent_count) {
1149 0         0 printf {$count_fh} "%s\t%d\n" => $id,
1150 0         0 int($parent_count->{$id} / $count_factor);
1151             }
1152              
1153             # Close $count_file
1154 4         42 log_msg ":; Writing and closing $count_file ...";
1155 4 50       314 close $count_fh
1156             or die "Cannot write file $count_file: $!\n";
1157             }
1158              
1159             __END__
1160              
1161             =pod
1162              
1163             =encoding UTF-8
1164              
1165             =head1 NAME
1166              
1167             App::Sandy::Simulator - Class responsible to make the simulation
1168              
1169             =head1 VERSION
1170              
1171             version 0.22
1172              
1173             =head1 AUTHORS
1174              
1175             =over 4
1176              
1177             =item *
1178              
1179             Thiago L. A. Miller <tmiller@mochsl.org.br>
1180              
1181             =item *
1182              
1183             J. Leonel Buzzo <lbuzzo@mochsl.org.br>
1184              
1185             =item *
1186              
1187             Felipe R. C. dos Santos <fsantos@mochsl.org.br>
1188              
1189             =item *
1190              
1191             Helena B. Conceição <hconceicao@mochsl.org.br>
1192              
1193             =item *
1194              
1195             Gabriela Guardia <gguardia@mochsl.org.br>
1196              
1197             =item *
1198              
1199             Fernanda Orpinelli <forpinelli@mochsl.org.br>
1200              
1201             =item *
1202              
1203             Pedro A. F. Galante <pgalante@mochsl.org.br>
1204              
1205             =back
1206              
1207             =head1 COPYRIGHT AND LICENSE
1208              
1209             This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital.
1210              
1211             This is free software, licensed under:
1212              
1213             The GNU General Public License, Version 3, June 2007
1214              
1215             =cut