File Coverage

lib/App/Sandy/Simulator.pm

Criterion	Covered	Total	%
statement	277	447	61.9
branch	72	174	41.3
condition	11	48	22.9
subroutine	32	44	72.7
pod	0	2	0.0
total	392	715	54.8

line	stmt	bran	cond	sub	pod	time	code
1							package App::Sandy::Simulator;
2							# ABSTRACT: Class responsible to make the simulation
3
4	6			6		24205	use App::Sandy::Base 'class';
	6					13
	6					46
5	6			6		435	use App::Sandy::Seq::SingleEnd;
	6					433
	6					160
6	6			6		450	use App::Sandy::Seq::PairedEnd;
	6					470
	6					229
7	6			6		3125	use App::Sandy::InterlaceProcesses;
	6					2739
	6					279
8	6			6		3697	use App::Sandy::WeightedRaffle;
	6					2724
	6					299
9	6			6		548	use App::Sandy::PieceTable;
	6					438
	6					177
10	6			6		3188	use App::Sandy::DB::Handle::Expression;
	6					2702
	6					282
11	6			6		3807	use App::Sandy::DB::Handle::Variation;
	6					2609
	6					293
12	6			6		68	use List::Util 'min';
	6					17
	6					460
13	6			6		5336	use File::Cat 'cat';
	6					3380
	6					400
14	6			6		3475	use Parallel::ForkManager;
	6					340548
	6					48314
15
16							with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/;
17
18							our $VERSION = '0.22'; # VERSION
19
20							has 'argv' => (
21							is => 'ro',
22							isa => 'ArrayRef[Str]',
23							required => 1
24							);
25
26							has 'truncate' => (
27							is => 'ro',
28							isa => 'Bool',
29							required => 1
30							);
31
32							has 'seed' => (
33							is => 'ro',
34							isa => 'Int',
35							required => 1
36							);
37
38							has 'jobs' => (
39							is => 'ro',
40							isa => 'My:IntGt0',
41							required => 1
42							);
43
44							has 'prefix' => (
45							is => 'ro',
46							isa => 'Str',
47							required => 1
48							);
49
50							has 'join_paired_ends' => (
51							is => 'ro',
52							isa => 'Bool',
53							required => 1
54							);
55
56							has 'output_format' => (
57							is => 'ro',
58							isa => 'My:Format',
59							required => 1
60							);
61
62							has 'compression_level' => (
63							is => 'ro',
64							isa => 'My:Level',
65							required => 1
66							);
67
68							has 'fasta_file' => (
69							is => 'ro',
70							isa => 'My:Fasta',
71							required => 1
72							);
73
74							has 'coverage' => (
75							is => 'ro',
76							isa => 'My:NumGt0',
77							required => 0
78							);
79
80							has 'number_of_reads' => (
81							is => 'ro',
82							isa => 'My:IntGt0',
83							required => 0
84							);
85
86							has 'count_loops_by' => (
87							is => 'ro',
88							isa => 'My:CountLoopBy',
89							required => 1
90							);
91
92							has 'strand_bias' => (
93							is => 'ro',
94							isa => 'My:StrandBias',
95							required => 1
96							);
97
98							has 'seqid_weight' => (
99							is => 'ro',
100							isa => 'My:SeqIdWeight',
101							required => 1
102							);
103
104							has 'expression_matrix' => (
105							is => 'ro',
106							isa => 'Str',
107							required => 0
108							);
109
110							has 'genomic_variation' => (
111							is => 'ro',
112							isa => 'ArrayRef[Str]',
113							required => 0
114							);
115
116							has '_genomic_variation_names' => (
117							is => 'ro',
118							isa => 'Maybe[Str]',
119							builder => '_build_genomic_variation_names',
120							lazy_build => 1
121							);
122
123							has 'seq' => (
124							is => 'ro',
125							isa => 'App::Sandy::Seq::SingleEnd | App::Sandy::Seq::PairedEnd',
126							required => 1,
127							handles => [ qw{ sprint_seq gen_sam_header gen_eof_marker } ]
128							);
129
130							has '_fasta' => (
131							is => 'ro',
132							isa => 'My:IdxFasta',
133							builder => '_build_fasta',
134							lazy_build => 1
135							);
136
137							has '_fasta_tree' => (
138							traits => ['Hash'],
139							is => 'ro',
140							isa => 'HashRef[ArrayRef]',
141							default => sub { {} },
142							handles => {
143							_set_fasta_tree => 'set',
144							_get_fasta_tree => 'get',
145							_exists_fasta_tree => 'exists',
146							_fasta_tree_pairs => 'kv',
147							_has_no_fasta_tree => 'is_empty'
148							}
149							);
150
151							has '_fasta_rtree' => (
152							traits => ['Hash'],
153							is => 'ro',
154							isa => 'HashRef[Str]',
155							default => sub { {} },
156							handles => {
157							_set_fasta_rtree => 'set',
158							_get_fasta_rtree => 'get',
159							_delete_fasta_rtree => 'delete',
160							_exists_fasta_rtree => 'exists',
161							_fasta_rtree_pairs => 'kv',
162							_has_no_fasta_rtree => 'is_empty'
163							}
164							);
165
166							has '_seqname' => (
167							traits => ['Hash'],
168							is => 'ro',
169							isa => 'HashRef[Str]',
170							default => sub { {} },
171							handles => {
172							_set_seqname => 'set',
173							_get_seqname => 'get'
174							}
175							);
176
177							has '_piece_table' => (
178							is => 'ro',
179							isa => 'HashRef[HashRef[My:PieceTable]]',
180							builder => '_build_piece_table',
181							lazy_build => 1
182							);
183
184							has '_strand' => (
185							is => 'ro',
186							isa => 'CodeRef',
187							builder => '_build_strand',
188							lazy_build => 1
189							);
190
191							has '_seqid_raffle' => (
192							is => 'ro',
193							isa => 'CodeRef',
194							builder => '_build_seqid_raffle',
195							lazy_build => 1
196							);
197
198							sub BUILD {
199	20			20	0	1105	my $self = shift;
200
201							# If seqid_weight is 'count', then expression_matrix must be defined
202	20	50	33			620	if ($self->seqid_weight eq 'count' and not defined $self->expression_matrix) {
203	0					0	croak "seqid_weight=count requires a expression_matrix\n";
204							}
205
206							# If count_loops_by is 'coverage', then coverage must be defined. Else if
207							# it is equal to 'number_of_reads', then number_of_reads must be defined
208	20	50	33			535	if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) {
		50	33
209	0					0	croak "count_loops_by=coverage requires a coverage number\n";
210							} elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) {
211	0					0	croak "count_loops_by=number_of_reads requires a number_of_reads number\n";
212							}
213
214							## Just to ensure that the lazy attributes are built before &new returns
215	20					550	$self->_piece_table;
216	20					550	$self->_seqid_raffle;
217	20					515	$self->_fasta;
218	20					510	$self->_strand;
219							}
220
221							sub _build_strand {
222	20			20		40	my $self = shift;
223	20					40	my $strand_sub;
224
225	20	50				600	if ($self->strand_bias eq 'plus') {
		50
		50
226	0			0		0	$strand_sub = sub {1};
	0					0
227							} elsif ($self->strand_bias eq 'minus') {
228	0			0		0	$strand_sub = sub {0};
	0					0
229							} elsif ($self->strand_bias eq 'random') {
230	20			1710		80	$strand_sub = sub { int(rand(2)) };
	1710					7756
231							} else {
232	0					0	croak sprintf "Unknown option '%s' for strand bias\n",
233							$self->strand_bias;
234							}
235
236	20					495	return $strand_sub;
237							}
238
239							sub _index_fasta {
240	20			20		45	my $self = shift;
241	20					505	my $fasta = $self->fasta_file;
242
243	20					100	my $fh = $self->with_open_r($fasta);
244
245							# indexed_fasta = ID => { seq, size }
246	20					75	my %indexed_fasta;
247
248							# >ID|PID as in gencode transcripts
249							my %fasta_rtree;
250	20					0	my $id;
251
252	20					570	while (<$fh>) {
253	860					1290	chomp;
254	860	50				1540	next if /^;/;
255	860	100				1540	if (/^>/) {
256	100					315	my @fields = split /\|/;
257	100					230	$id = $fields[0];
258	100					280	$id =~ s/^>//;
259	100					325	$id =~ s/^\s+|\s+$//g;
260
261							# Seq ID standardization in order to manage comparisons
262							# between chr1, Chr1, CHR1, 1, etc.
263	100					355	my $std_id = $self->with_std_seqid($id);
264	100					3735	$self->_set_seqname(
265							$id => $std_id,
266							$std_id => $id
267							);
268
269							# It is necessary to catch gene -> transcript relation
270							# TODO: Make a hash trait for indexed fasta
271	100	50				610	if (defined $fields[1]) {
272	0					0	my $pid = $fields[1];
273	0					0	$pid =~ s/^\s+|\s+$//g;
274	0					0	$fasta_rtree{$id} = $pid;
275							}
276							} else {
277	760	50				1290	die "Error reading fasta file '$fasta': Not defined id"
278							unless defined $id;
279	760					2270	$indexed_fasta{$id}{seq} .= $_;
280							}
281							}
282
283	20					115	for (keys %indexed_fasta) {
284	100					255	$indexed_fasta{$_}{size} = length $indexed_fasta{$_}{seq};
285							}
286
287	20	50				65	unless (%indexed_fasta) {
288	0					0	die "Error parsing '$fasta'. Maybe the file is empty\n";
289							}
290
291							$fh->close
292	20	50				135	or die "Cannot close file $fasta: $!\n";
293
294	20	50				420	$self->_set_fasta_rtree(%fasta_rtree) if %fasta_rtree;
295	20					110	return \%indexed_fasta;
296							}
297
298							sub _build_fasta {
299	20			20		40	my $self = shift;
300	20					520	my $fasta = $self->fasta_file;
301
302	20					120	log_msg ":: Indexing fasta file '$fasta' ...";
303	20					55	my $indexed_fasta = $self->_index_fasta;
304
305							# Validate genome about the read size required
306	20					120	log_msg ":: Validating fasta file '$fasta' ...";
307							# Entries to remove
308	20					35	my @blacklist;
309
310	20	50				695	unless ($self->truncate) {
311	20					75	for my $id (keys %$indexed_fasta) {
312	100					190	my $index_size = $indexed_fasta->{$id}{size};
313	100					2455	my $class = ref $self->seq;
314
315	100	100				280	if ($class eq 'App::Sandy::Seq::SingleEnd') {
		50
316	50					1170	my $read_mean = $self->seq->read_mean;
317	50	50				135	if ($index_size < $read_mean) {
318	0					0	log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required read mean ($read_mean)";
319	0					0	delete $indexed_fasta->{$id};
320	0					0	push @blacklist => $id;
321							}
322							} elsif ($class eq 'App::Sandy::Seq::PairedEnd') {
323	50					1355	my $fragment_mean = $self->seq->fragment_mean;
324	50	50				130	if ($index_size < $fragment_mean) {
325	0					0	log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)";
326	0					0	delete $indexed_fasta->{$id};
327	0					0	push @blacklist => $id;
328							}
329							} else {
330	0					0	croak "Unknown option '$class' for sequencing type\n";
331							}
332							}
333							}
334
335	20	50				65	unless (%$indexed_fasta) {
336	0					0	die sprintf "Fasta file '%s' has no valid entry\n" => $self->fasta_file;
337							}
338
339							# If fasta_rtree has entries
340	20	50				715	unless ($self->_has_no_fasta_rtree) {
341							# Remove invalid entries from the id -> pid relation
342	0	0				0	$self->_delete_fasta_rtree(@blacklist) if @blacklist;
343							}
344
345	20					525	return $indexed_fasta;
346							}
347
348							sub _populate_fasta_tree {
349	20			20		35	my $self = shift;
350
351							# If fasta_rtree has entries
352	20	50				705	unless ($self->_has_no_fasta_rtree) {
353							# Build parent -> child ids relation
354	0					0	my %fasta_tree;
355
356							# Reverse fasta_rtree to pid -> \@ids
357	0					0	for my $pair ($self->_fasta_rtree_pairs) {
358	0					0	my ($id, $pid) = (@$pair);
359	0					0	push @{ $fasta_tree{$pid} } => $id;
	0					0
360							}
361
362	0					0	$self->_set_fasta_tree(%fasta_tree);
363							}
364							}
365
366							sub _retrieve_expression_matrix {
367	0			0		0	my $self = shift;
368	0					0	my $expression = App::Sandy::DB::Handle::Expression->new;
369	0					0	return $expression->retrievedb($self->expression_matrix);
370							}
371
372							sub _build_seqid_raffle {
373	20			20		35	my $self = shift;
374
375							# Get the piece table
376	20					530	my $piece_table = $self->_piece_table;
377
378							# The built function
379	20					40	my $seqid_sub;
380
381	20	50				580	if ($self->seqid_weight eq 'same') {
		50
		50
382	0			0		0	my ($keys, $weights) = $self->_populate_key_weight($piece_table, sub { 1 });
	0					0
383
384							# If weight == 1 means that there are 2 keys for
385							# the same seq_id.
386							# If weight == 2 means that there is only one key
387							# for the seq_id, so I double that key
388	0					0	for (my $i = 0; $i < @$weights; $i++) {
389	0	0				0	if ($weights->[$i] > 1) {
390	0					0	push @$keys => $keys->[$i];
391							}
392							}
393
394	0					0	my $keys_size = scalar @$keys;
395	0			0		0	$seqid_sub = sub { $keys->[int(rand($keys_size))] };
	0					0
396							} elsif ($self->seqid_weight eq 'count') {
397							# Catch expression-matrix entry from database
398	0					0	my $indexed_file = $self->_retrieve_expression_matrix;
399
400							# Catch indexed fasta
401	0					0	my $indexed_fasta = $self->_fasta;
402
403							# Validate expression_matrix
404	0					0	for my $id (keys %$indexed_file) {
405							# If it does not exist in the indexed fasta (piece table), it must then exist in fasta_tree
406	0	0	0			0	unless (exists $piece_table->{$id} || $self->_exists_fasta_tree($id)) {
407	0					0	log_msg sprintf ":: Ignoring seqid '%s' from expression-matrix '%s': It is not found into the indexed fasta"
408							=> $id, $self->expression_matrix;
409	0					0	delete $indexed_file->{$id};
410							}
411							}
412
413	0	0				0	unless (%$indexed_file) {
414	0					0	die sprintf "No valid seqid entry of the expression-matrix '%s' is recorded into the indexed fasta\n"
415							=> $self->expression_matrix;
416							}
417
418	0					0	my (%ptable_ind, %ptable_cluster);
419
420							# Split indexed_file seq_ids between those that are
421							# in piece_table and those that represent a cluster
422							# of seq_ids, as in a gene -> transcript relationship
423	0					0	for my $seq_id (keys %$indexed_file) {
424	0	0				0	if (exists $piece_table->{$seq_id}) {
425	0					0	$ptable_ind{$seq_id} = $piece_table->{$seq_id};
426
427							} else {
428	0					0	my $ids = $self->_get_fasta_tree($seq_id);
429
430							# Bug catcher
431	0	0				0	unless (@$ids) {
432	0					0	croak "seq_id '$seq_id' not found into piece_table";
433							}
434
435	0					0	$ptable_cluster{$seq_id} = $ids;
436							}
437							}
438
439							# Let's calculate the weight taking into account
440							# the size increase/decrease
441							my $calc_ind_weight = sub {
442	0			0		0	my ($seq_id, $type) = @_;
443
444	0					0	my $counts = $indexed_file->{$seq_id};
445	0					0	my $size = $piece_table->{$seq_id}{$type}{size};
446	0					0	my $fasta_size = $indexed_fasta->{$seq_id}{size};
447
448							# Correct the weight according to the
449							# genomic variation change by the ratio
450							# between the table size and fasta size
451	0					0	my $factor = $size / $fasta_size;
452
453	0					0	return $counts * $factor;
454	0					0	};
455
456	0					0	my ($keys, $weights);
457
458	0	0				0	if (%ptable_ind) {
459	0					0	($keys, $weights) = $self->_populate_key_weight(\%ptable_ind,
460							$calc_ind_weight);
461							}
462
463							# If there are cluster-like seq_ids, then it is
464							# time to calculate their weights
465	0					0	for my $seq_id (sort keys %ptable_cluster) {
466	0					0	my %ptable;
467
468							# Slice piece_table hash
469	0					0	my $ids = $ptable_cluster{$seq_id};
470	0					0	@ptable{@$ids} = @$piece_table{@$ids};
471
472							# total size among all ids of cluster
473	0					0	my %total;
474
475							# Calculate the total size by type
476	0					0	for my $type_h (values %ptable) {
477	0					0	for my $type (keys %$type_h) {
478	0					0	$total{$type} += $type_h->{$type}{size};
479							}
480							}
481
482							# Calculate the weight taking into account the size increase/decrease
483							# and the ratio between the total size by type and the table size.
484							# The problem here is that I must divide the 'counts' for some 'seq_id'
485							# among all ids that belong to it
486							my $calc_cluster_weight = sub {
487	0			0		0	my ($id, $type) = @_;
488
489	0					0	my $counts = $indexed_file->{$seq_id};
490	0					0	my $size = $piece_table->{$id}{$type}{size};
491	0					0	my $fasta_size = $indexed_fasta->{$id}{size};
492
493							# Divide the counts among all ids
494	0					0	my $ratio = $size / $total{$type};
495
496							# Correct the weight according to the size
497	0					0	my $factor = $size / $fasta_size;
498
499	0					0	return $counts * $factor * $ratio;
500	0					0	};
501
502	0					0	my ($k, $w) = $self->_populate_key_weight(\%ptable,
503							$calc_cluster_weight);
504
505	0					0	push @$keys => @$k;
506	0					0	push @$weights => @$w;
507							}
508
509	0	0	0			0	unless (@$keys && @$weights) {
510	0					0	croak "No keys weights have been set";
511							}
512
513							# It is very necessary in order
514							# to avoid truncation of numbers
515							# between zero and one
516	0					0	$self->_round_weight($weights);
517
518	0					0	my $raffler = App::Sandy::WeightedRaffle->new(
519							'weights' => $weights,
520							'keys' => $keys
521							);
522
523	0			0		0	$seqid_sub = sub { $raffler->weighted_raffle };
	0					0
524							} elsif ($self->seqid_weight eq 'length') {
525							my $calc_weight = sub {
526	100			100		210	my ($seq_id, $type) = @_;
527	100					205	return $piece_table->{$seq_id}{$type}{size};
528	20					115	};
529
530	20					70	my ($keys, $weights) = $self->_populate_key_weight($piece_table,
531							$calc_weight);
532
533							# Just in case ...
534	20					70	$self->_round_weight($weights);
535
536	20					720	my $raffler = App::Sandy::WeightedRaffle->new(
537							weights => $weights,
538							keys => $keys
539							);
540
541	20			1710		155	$seqid_sub = sub { $raffler->weighted_raffle };
	1710					6155
542							} else {
543	0					0	croak sprintf "Unknown option '%s' for seqid_weight\n",
544							$self->seqid_weight;
545							}
546
547	20					560	return $seqid_sub;
548							}
549
550							sub _round_weight {
551	20			20		40	my ($self, $weights) = @_;
552
553	20					75	my $min = min @$weights;
554
555	20	50				55	if ($min <= 0) {
556	0					0	croak "min weight le to zero: $min";
557							}
558
559	20	50				95	my $factor = $min < 1
560							? (1 / $min)
561							: 1;
562
563	20					45	for my $weight (@$weights) {
564	100					195	$weight = int($weight * $factor + 0.5);
565							}
566							}
567
568							sub _populate_key_weight {
569	20			20		55	my ($self, $piece_table, $calc_weight) = @_;
570
571	20					35	my (@keys, @weights);
572
573							# It needs to be sorted in order for the
574							# seed to work
575	20					125	for my $seq_id (sort keys %$piece_table) {
576	100					160	my $type_h = $piece_table->{$seq_id};
577
578							# If there is no alternative seq_id, then
579							# set a factor to correct the size.
580							# It is necessary because the seq_ids with
581							# alternative and reference will double its
582							# own coverage
583	100	50				200	my $factor = scalar keys %$type_h == 1
584							? 2
585							: 1;
586
587	100					205	for my $type (sort keys %$type_h) {
588
589	100					330	my %key = (
590							'seq_id' => $seq_id,
591							'type' => $type
592							);
593
594	100					185	my $weight = $calc_weight->($seq_id, $type);
595
596	100					195	push @keys => \%key;
597	100					295	push @weights => $weight * $factor;
598							}
599							}
600
601	20					65	return (\@keys, \@weights);
602							}
603
604							sub _build_genomic_variation_names {
605	20			20		25	my $self = shift;
606	20	50				555	if ($self->genomic_variation) {
607	0					0	return sprintf "[%s]", => join ", ", @{ $self->genomic_variation };
	0					0
608							}
609							}
610
611							sub _retrieve_genomic_variation {
612	0			0		0	my $self = shift;
613	0					0	my $variation = App::Sandy::DB::Handle::Variation->new;
614	0					0	return $variation->retrievedb($self->genomic_variation);
615							}
616
617							sub _build_piece_table {
618	20			20		35	my $self = shift;
619
620	20					625	my $genomic_variation = $self->_genomic_variation_names;
621	20					35	my $indexed_snv;
622
623							# Retrieve genomic variation if the user provided it
624	20	50				55	if (defined $genomic_variation) {
625	0					0	$indexed_snv = $self->_retrieve_genomic_variation;
626	0					0	log_msg ":: Validate genomic variation '$genomic_variation' against indexed fasta ...";
627	0					0	$self->_validate_indexed_snv_against_fasta($indexed_snv);
628							}
629
630							# Catch index fasta
631	20					505	my $indexed_fasta = $self->_fasta;
632
633							# Build piece table
634	20					40	my %piece_table;
635
636							# Let's construct the piece_table
637	20					80	log_msg ":: Build piece table ...";
638
639	20					85	while (my ($seq_id, $fasta_h) = each %$indexed_fasta) {
640	100					210	my $seq = \$fasta_h->{seq};
641	100					3605	my $std_seq_id = $self->_get_seqname($seq_id);
642
643							# Initialize piece tables for $seq_id ref
644	100					2575	$piece_table{$seq_id}{ref}{table} = App::Sandy::PieceTable->new(orig => $seq);
645
646							# If there is indexed_snv for seq_id, then construct the piece table with it
647	100	50	33			440	if (defined $indexed_snv && defined $indexed_snv->{$std_seq_id}) {
648	0					0	my $snvs = $indexed_snv->{$std_seq_id};
649
650							# Filter only the homozygotic snvs to feed reference seq_id
651	0					0	my @snvs_homo = grep { $_->{plo} eq 'HO' } @$snvs;
	0					0
652
653	0	0				0	if (@snvs_homo) {
654							# Populate reference seq_id
655	0					0	$self->_populate_piece_table($piece_table{$seq_id}{ref}{table}, \@snvs_homo);
656							}
657
658							# Initialize piece tables for $seq_id alt
659	0					0	$piece_table{$seq_id}{alt}{table} = App::Sandy::PieceTable->new(orig => $seq);
660
661							# Populate alternative seq_id
662	0					0	$self->_populate_piece_table($piece_table{$seq_id}{alt}{table}, $snvs);
663							}
664							}
665
666							# Initialize the logical offsets and validate the
667							# new size due to the genomic variation
668
669	20					40	my @blacklist;
670
671	20					70	for my $seq_id (keys %piece_table) {
672	100					190	my $type_h = delete $piece_table{$seq_id};
673
674	100					300	for my $type (keys %$type_h) {
675	100					160	my $table_h = delete $type_h->{$type};
676	100					175	my $table = $table_h->{table};
677
678							# Initialize the logical offset
679	100					495	$table->calculate_logical_offset;
680
681							# Get the new size
682	100					2660	my $new_size = $table->logical_len;
683
684	100	50				2515	unless ($self->truncate) {
685	100					2440	my $class = ref $self->seq;
686
687	100	100				310	if ($class eq 'App::Sandy::Seq::SingleEnd') {
		50
688	50	50				1185	if ($new_size < $self->seq->read_mean) {
689	0					0	log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required read-mean";
690	0					0	next;
691							}
692							} elsif ($class eq 'App::Sandy::Seq::PairedEnd') {
693	50	50				1175	if ($new_size < $self->seq->fragment_mean) {
694	0					0	log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required fragment mean";
695	0					0	next;
696							}
697							} else {
698	0					0	die "No valid options for 'seq'";
699							}
700							}
701
702							# If all's right
703	100					240	$table_h->{size} = $new_size;
704	100					230	$type_h->{$type} = $table_h;
705							}
706
707							# if there is at least one type,
708							# then return it to the piece_table
709	100	50				275	if (%$type_h) {
710	100					215	$piece_table{$seq_id} = $type_h;
711
712							# else, just remove it!
713							} else {
714	0					0	push @blacklist => $seq_id;
715							}
716							}
717
718	20	50				60	unless (%piece_table) {
719	0					0	die "All fasta entries were removed due to deletions. ",
720							"Please, verify the genomic variation '$genomic_variation'\n";
721							}
722
723							# If fasta_rtree has entries
724	20	50				795	unless ($self->_has_no_fasta_rtree) {
725							# Remove invalid entries from the id -> pid relation
726	0	0				0	$self->_delete_fasta_rtree(@blacklist) if @blacklist;
727							}
728
729							# Make the id -> pid relationship
730	20					80	$self->_populate_fasta_tree;
731
732							# HASH -> SEQ_ID -> @(REF @ALT) -> @(TABLE SIZE)
733	20					615	return \%piece_table;
734							}
735
736							sub _populate_piece_table {
737	0			0		0	my ($self, $table, $snvs) = @_;
738
739	0					0	for my $snv (@$snvs) {
740							# If there is an ID, make sure that it is not a comma, colon
741							# separated list. Else, make sure to keep the ref/alt length
742							# to max 25+25+1=51
743							my $annot = defined $snv->{id} && $snv->{id} ne '.'
744							? sprintf "%d:%s" => $snv->{pos} + 1, (split(/[,;]/, $snv->{id}))[0]
745	0	0	0			0	: sprintf "%d:%.25s/%.25s" => $snv->{pos} + 1, $snv->{ref}, $snv->{alt};
746
747							# Insertion
748	0	0				0	if ($snv->{ref} eq '-') {
		0
749	0					0	$table->insert(\$snv->{alt}, $snv->{pos}, $annot);
750
751							# Deletion
752							} elsif ($snv->{alt} eq '-') {
753	0					0	$table->delete($snv->{pos}, length $snv->{ref}, $annot);
754
755							# Change
756							} else {
757	0					0	$table->change(\$snv->{alt}, $snv->{pos}, length $snv->{ref}, $annot);
758							}
759							}
760							}
761
762							sub _validate_indexed_snv_against_fasta {
763	0			0		0	my ($self, $indexed_snv) = @_;
764
765	0					0	my $indexed_fasta = $self->_fasta;
766	0					0	my $genomic_variation = $self->_genomic_variation_names;
767
768	0					0	for my $std_seq_id (keys %$indexed_snv) {
769	0					0	my $snvs = delete $indexed_snv->{$std_seq_id};
770	0					0	my $seq_id = $self->_get_seqname($std_seq_id);
771
772	0	0	0			0	unless (defined $seq_id && exists $indexed_fasta->{$seq_id}) {
773	0					0	next;
774							}
775
776	0					0	my $seq = \$indexed_fasta->{$seq_id}{seq};
777	0					0	my $size = $indexed_fasta->{$seq_id}{size};
778
779	0					0	my @saved_snvs;
780
781	0					0	for my $snv (@$snvs) {
782							# Insertions may occur up to one base after the
783							# end of the sequence, but no further
784	0	0	0			0	if (($snv->{ref} eq '-' && $snv->{pos} > $size) || ($snv->{ref} ne '-' && $snv->{pos} >= $size)) {
		0	0
			0
785							log_msg sprintf ":: In validating '%s': Position, %s/%s at %s:%d, outside fasta sequence",
786	0					0	$genomic_variation, $snv->{ref}, $snv->{alt}, $seq_id, $snv->{pos} + 1;
787
788							# Next snv
789	0					0	next;
790							# Deletions and changes. Just verify if the reference exists
791							} elsif ($snv->{ref} ne '-') {
792	0					0	my $ref = substr $$seq, $snv->{pos}, length($snv->{ref});
793
794	0	0				0	if (uc($ref) ne uc($snv->{ref})) {
795							log_msg sprintf ":: In validating '%s': Not found reference '%s' at fasta position %s:%d",
796	0					0	$genomic_variation, $snv->{ref}, $seq_id, $snv->{pos} + 1;
797
798							# Next snv
799	0					0	next;
800							}
801							}
802
803	0					0	push @saved_snvs => $snv;
804							}
805
806	0	0				0	if (@saved_snvs) {
807	0					0	$indexed_snv->{$std_seq_id} = [@saved_snvs];
808							}
809							}
810							}
811
812							sub _calculate_number_of_reads {
813	8			8		34	my $self = shift;
814	8					19	my $number_of_reads;
815
816	8	50				239	if ($self->count_loops_by eq 'coverage') {
		0
817							# It is needed to calculate the genome size
818	8					246	my $fasta = $self->_fasta;
819	8					36	my $fasta_size = 0;
820	8					19	$fasta_size += $fasta->{$_}{size} for keys %{ $fasta };
	8					60
821	8					230	$number_of_reads = int(($fasta_size * $self->coverage) / $self->seq->read_mean);
822							# In case of paired-end reads, divide the number of reads by 2 because
823							# App::Sandy::Seq::PairedEnd class returns 2 reads at a time
824	8	100				203	$number_of_reads = int($number_of_reads / 2)
825							if ref($self->seq) eq 'App::Sandy::Seq::PairedEnd';
826							} elsif ($self->count_loops_by eq 'number-of-reads') {
827	0					0	$number_of_reads = $self->number_of_reads;
828							} else {
829	0					0	croak sprintf "Unknown option '%s' for calculating the number of reads\n",
830							$self->count_loops_by;
831							}
832
833							# Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size
834	8	50				32	if ($number_of_reads <= 0) {
835	0					0	die "The computed number of reads is equal to zero.\n" .
836							"It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n";
837							}
838
839	8					27	return $number_of_reads;
840							}
841
842							sub _set_seed {
843	4			4		67	my ($self, $inc) = @_;
844	4	50				336	my $seed = defined $inc ? $self->seed + $inc : $self->seed;
845	4					63	srand($seed);
846	4					123	require Math::Random;
847	4					178	Math::Random::random_set_seed_from_phrase($seed);
848							}
849
850							sub _calculate_parent_count {
851	4			4		22	my ($self, $counter_ref) = @_;
852	4	50				235	return if $self->_has_no_fasta_rtree;
853
854	0					0	my %parent_count;
855
856	0					0	while (my ($id, $count) = each %$counter_ref) {
857	0					0	my $pid = $self->_get_fasta_rtree($id);
858	0	0				0	$parent_count{$pid} += $count if defined $pid;
859							}
860
861	0					0	return \%parent_count;
862							}
863
864							sub run_simulation {
865	8			8	0	4225	my $self = shift;
866	8					313	my $piece_table = $self->_piece_table;
867
868							# Calculate the number of reads to be generated
869	8					55	my $number_of_reads = $self->_calculate_number_of_reads;
870
871							# Function that returns strand by strand_bias
872	8					246	my $strand = $self->_strand;
873
874							# Function that returns seqid by seqid_weight
875	8					261	my $seqid = $self->_seqid_raffle;
876
877							# genome or transcriptome?
878	8					251	my $simulation = $self->argv->[0];
879
880							# Count file to be generated
881	8	50				264	my $count_file = $simulation eq 'transcriptome'
882							? $self->prefix . '_abundance.tsv'
883							: $self->prefix . '_coverage.tsv';
884
885							# Main files
886	8					217	my %files = (
887							bam => [
888							$self->prefix . '.bam'
889							],
890							sam => [
891							$self->prefix . '.sam'
892							],
893							single_fastq => [
894							$self->prefix . '_R1_001.fastq'
895							],
896							single_fastq_gz => [
897							$self->prefix . '_R1_001.fastq.gz'
898							],
899							join_paired_fastq => [
900							$self->prefix . '.fastq'
901							],
902							join_paired_fastq_gz => [
903							$self->prefix . '.fastq.gz'
904							],
905							paired_fastq => [
906							$self->prefix . '_R1_001.fastq',
907							$self->prefix . '_R2_001.fastq'
908							],
909							paired_fastq_gz => [
910							$self->prefix . '_R1_001.fastq.gz',
911							$self->prefix . '_R2_001.fastq.gz'
912							]
913							);
914
915							# Set the file class in order to know
916							# how to deal with all files options
917	8					216	my $seq_class = ref $self->seq;
918	8					272	my $output_format = $self->output_format;
919	8					24	my $file_class;
920
921							# This mess is necessary to catch the
922							# right value into the %files hash
923	8	50				98	if ($output_format =~ /(sam|bam)/) {
		50
924	0					0	$file_class = $output_format;
925							} elsif ($output_format =~ /fastq/) {
926	8	100				33	if ($seq_class eq 'App::Sandy::Seq::SingleEnd') {
		50
927	5					15	$file_class = 'single_fastq';
928							} elsif ($seq_class eq 'App::Sandy::Seq::PairedEnd') {
929	3					9	$file_class = 'paired_fastq';
930	3	50				126	$file_class = "join_$file_class" if $self->join_paired_ends;
931							} else {
932	0					0	croak "Something wrong with the seq class: $seq_class";
933							}
934	8	50				32	if ($output_format eq 'fastq.gz') {
935	0					0	$file_class .= '_gz';
936							}
937							} else {
938	0					0	croak "Something wrong with the output format: $output_format";
939							}
940
941							# Forks
942	8					250	my $number_of_jobs = $self->jobs;
943	8					162	my $pm = Parallel::ForkManager->new($number_of_jobs);
944
945							# Parent child pids
946	8					25425	my $parent_pid = $$;
947	8					21	my @child_pid;
948
949							# Temporary files tracker
950							my @tmp_files;
951
952							# Run in parent right after creating child process
953							$pm->run_on_start(
954							sub {
955	10			10		18802	my $pid = shift;
956	10					259	push @child_pid => $pid;
957							}
958	8					72	);
959
960							# Count the overall cumulative number of reads for each seqid
961	8					59	my %counters;
962
963							# Run in parent right after finishing child process
964							$pm->run_on_finish(
965							sub {
966	8			8		24032709	my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $counter_ref) = @_;
967	8					101	while (my ($seqid, $count) = each %$counter_ref) {
968	40					254	$counters{$seqid} += $count;
969							}
970							}
971	8					61	);
972
973	8	50				141	log_msg sprintf ":: Creating %d child %s ...",
974							$number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs";
975
976	8					40	for my $tid (1..$number_of_jobs) {
977							#-------------------------------------------------------------------------------
978							# Inside parent
979							#-------------------------------------------------------------------------------
980	14					773	log_msg ":: Creating job $tid ...";
981	14					33	my @files_t = map { "$_.${parent_pid}.part$tid" } @{ $files{$file_class} };
	19					107
	14					224
982	14					103	push @tmp_files => @files_t;
983	14	100				77	my $pid = $pm->start and next;
984
985							#-------------------------------------------------------------------------------
986							# Inside child
987							#-------------------------------------------------------------------------------
988							# Intelace child/parent processes
989	4					42813	my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => [$parent_pid]);
990
991							# Set child seed
992	4					173	$self->_set_seed($tid);
993
994							# Calculate the number of reads to this job and correct this local index
995							# to the global index
996	4					249	my $number_of_reads_t = int($number_of_reads/$number_of_jobs);
997	4					36	my $last_read_idx = $number_of_reads_t * $tid;
998	4					91	my $idx = $last_read_idx - $number_of_reads_t + 1;
999
1000							# If it is the last job, make it work on the leftover reads of int() truncation
1001	4	100				115	$last_read_idx += $number_of_reads % $number_of_jobs
1002							if $tid == $number_of_jobs;
1003
1004	4					220	log_msg " => Job $tid: Working on sequences from $idx to $last_read_idx";
1005
1006							# Create temporary files
1007	4					116	log_msg " => Job $tid: Creating temporary file: @files_t";
1008
1009							# And here we go ...
1010	4					51	my @fhs;
1011
1012							# Set the right filehandle format
1013	4	50				189	if ($output_format =~ /^(sam\|fastq)$/) {
		0
		0
1014	4					41	@fhs = map { $self->with_open_w($_, 0) } @files_t;
	6					249
1015							} elsif ($output_format eq 'fastq.gz') {
1016	0					0	@fhs = map { $self->with_open_w($_, $self->compression_level) } @files_t;
	0					0
1017							} elsif ($output_format eq 'bam') {
1018	0					0	@fhs = map { $self->with_open_bam_w($_, $self->compression_level) } @files_t;
	0					0
1019							} else {
1020	0					0	croak "Something wrong with the output format: $file_class";
1021							}
1022
1023							# sprint_seq gives two entries for paired-emd, so
1024							# if it is a bam\|sam\|join-paired-ends, it is necessary
1025							# to copy the filehandle in order to print both entries
1026							# to the same file
1027	4	50	66			149	if ($seq_class eq 'App::Sandy::Seq::PairedEnd'
1028							&& $file_class =~ /(sam\|bam\|join)/) {
1029	0					0	$fhs[1] = $fhs[0];
1030							}
1031
1032							# Count the cumulative number of reads for each seqid
1033	4					34	my %counter;
1034
1035							# If the output format is 'bam\|sam' and it is the first job, then
1036							# write the header
1037	4	50	33			134	if ($output_format =~ /^(sam\|bam)$/ && $tid == 1) {
1038	0					0	my $header_ref = $self->gen_sam_header($self->argv);
1039	0					0	print {$fhs[0]} "$$header_ref";
	0					0
1040							}
1041
1042							# Run simulation in child
1043	4		66			373	for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) {
1044	1710					4612	my $id = $seqid->();
1045	1710					5566	my $ptable = $piece_table->{$id->{seq_id}}{$id->{type}};
1046	1710					2945	my @seq_entry;
1047							try {
1048							@seq_entry = $self->sprint_seq($tid, $i, $id->{seq_id}, $id->{type},
1049	1710			1710		122749	$ptable->{table}, $ptable->{size}, $strand->());
1050							} catch {
1051	0			0		0	die "Not defined entry for seqid '>$id->{seq_id}' at job $tid: $_";
1052							} finally {
1053	1710	50		1710		38761	unless (@_) {
1054	1710					5237	for my $fh_idx (0..$#fhs) {
1055	2280					4898	$counter{$id->{seq_id}}++;
1056	2280					3259	print {$fhs[$fh_idx]} "${$seq_entry[$fh_idx]}";
	2280					4241
	2280					10194
1057							}
1058							}
1059	1710					14362	};
1060							}
1061
1062	4					154	log_msg " => Job $tid: Writing and closing file: @files_t";
1063
1064							# Close temporary files
1065							# Get index from @files_t in order to avoid
1066							# close the same filehandle twice - When the
1067							# position 1-N is a copy
1068	4					16	for my $fh_idx (0..$#files_t) {
1069	6					402	close $fhs[$fh_idx];
1070							}
1071
1072							# If it is a bam and it is the last loop, then
1073							# write a eof marker
1074	4	50	33			41	if ($output_format eq 'bam' && $tid == $number_of_jobs) {
1075	0					0	$self->gen_eof_marker($files_t[0]);
1076							}
1077
1078							# Child exit
1079	4					34	log_msg " => Job $tid is finished";
1080	4					189	$pm->finish(0, \%counter);
1081							}
1082
1083							# Back to parent
1084							# Interlace parent/child(s) processes
1085	4					1311	my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => \@child_pid);
1086	4					51	$pm->wait_all_children;
1087
1088	4	50				354	if ($sig->signal_catched) {
1089	0					0	log_msg ":: Termination signal received!";
1090							}
1091
1092	4					94	log_msg ":: Saving the work ...";
1093
1094							# Concatenate all temporary files
1095	4					26	log_msg ":: Concatenate all temporary files";
1096
1097							# Save time. Rename tmp_file (1,2)
1098	4					8	for my $file (@{ $files{$file_class} }) {
	4					46
1099	5					36	my $tmp = shift @tmp_files;
1100	5					66	log_msg " => Concatenating $tmp to $file ...";
1101	5	50				308	rename $tmp => $file
1102							or die "Cannot create '$file': $!\n";
1103							}
1104
1105							# Append to renamed tmp files
1106	4					16	my @fh = map { $self->with_open_a($_) } @{ $files{$file_class} };
	5					122
	4					35
1107
1108	4					37	for my $i (0..$#tmp_files) {
1109	5					21	my $fh_idx = $i % scalar @fh;
1110
1111	5					85	log_msg " => Concatenating $tmp_files[$i] to $files{$file_class}[$fh_idx] ...";
1112	5	50				90	cat $tmp_files[$i] => $fh[$fh_idx]
1113							or die "Cannot concatenate $tmp_files[$i] to $files{$file_class}[$fh_idx]: $!\n";
1114
1115							# Clean up the mess
1116	5	50				13730	unlink $tmp_files[$i]
1117							or die "Cannot remove temporary file '$tmp_files[$i]': $!\n";
1118							}
1119
1120							# Close files
1121	4					35	log_msg ":: Writing and closing output file: @{ $files{$file_class} }";
	4					63
1122	4					20	for my $fh_idx (0..$#fh) {
1123	5	50				146	close $fh[$fh_idx]
1124							or die "Cannot write file $files{$file_class}[$fh_idx]: $!\n";
1125							}
1126
1127							# Save counts
1128	4					24	log_msg ":: Saving count file";
1129	4					64	my $count_fh = $self->with_open_w($count_file, 0);
1130
1131							# It is necessary to correct the abundance according to
1132							# fragment sequencing end
1133	4					11	my $count_factor = 1;
1134	4	50	33			252	if ($self->count_loops_by eq 'number-of-reads'
1135							&& ref($self->seq) eq 'App::Sandy::Seq::PairedEnd') {
1136	0					0	$count_factor = 2;
1137							}
1138
1139	4					39	log_msg " => Writing counts to $count_file ...";
1140	4					35	for my $id (sort keys %counters) {
1141	20					109	printf {$count_fh} "%s\t%d\n" => $id,
1142	20					31	int($counters{$id} / $count_factor);
1143							}
1144
1145							# Just in case, calculate 'gene' like expression
1146	4					98	my $parent_count = $self->_calculate_parent_count(\%counters);
1147
1148	4					21	for my $id (sort keys %$parent_count) {
1149	0					0	printf {$count_fh} "%s\t%d\n" => $id,
1150	0					0	int($parent_count->{$id} / $count_factor);
1151							}
1152
1153							# Close $count_file
1154	4					42	log_msg ":; Writing and closing $count_file ...";
1155	4	50				314	close $count_fh
1156							or die "Cannot write file $count_file: $!\n";
1157							}
1158
1159							__END__
1160
1161							=pod
1162
1163							=encoding UTF-8
1164
1165							=head1 NAME
1166
1167							App::Sandy::Simulator - Class responsible to make the simulation
1168
1169							=head1 VERSION
1170
1171							version 0.22
1172
1173							=head1 AUTHORS
1174
1175							=over 4
1176
1177							=item *
1178
1179							Thiago L. A. Miller <tmiller@mochsl.org.br>
1180
1181							=item *
1182
1183							J. Leonel Buzzo <lbuzzo@mochsl.org.br>
1184
1185							=item *
1186
1187							Felipe R. C. dos Santos <fsantos@mochsl.org.br>
1188
1189							=item *
1190
1191							Helena B. Conceição <hconceicao@mochsl.org.br>
1192
1193							=item *
1194
1195							Gabriela Guardia <gguardia@mochsl.org.br>
1196
1197							=item *
1198
1199							Fernanda Orpinelli <forpinelli@mochsl.org.br>
1200
1201							=item *
1202
1203							Pedro A. F. Galante <pgalante@mochsl.org.br>
1204
1205							=back
1206
1207							=head1 COPYRIGHT AND LICENSE
1208
1209							This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital.
1210
1211							This is free software, licensed under:
1212
1213							The GNU General Public License, Version 3, June 2007
1214
1215							=cut