line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::Simulator; |
2
|
|
|
|
|
|
|
# ABSTRACT: Class responsible to make the simulation |
3
|
|
|
|
|
|
|
|
4
|
6
|
|
|
6
|
|
24205
|
use App::Sandy::Base 'class'; |
|
6
|
|
|
|
|
13
|
|
|
6
|
|
|
|
|
46
|
|
5
|
6
|
|
|
6
|
|
435
|
use App::Sandy::Seq::SingleEnd; |
|
6
|
|
|
|
|
433
|
|
|
6
|
|
|
|
|
160
|
|
6
|
6
|
|
|
6
|
|
450
|
use App::Sandy::Seq::PairedEnd; |
|
6
|
|
|
|
|
470
|
|
|
6
|
|
|
|
|
229
|
|
7
|
6
|
|
|
6
|
|
3125
|
use App::Sandy::InterlaceProcesses; |
|
6
|
|
|
|
|
2739
|
|
|
6
|
|
|
|
|
279
|
|
8
|
6
|
|
|
6
|
|
3697
|
use App::Sandy::WeightedRaffle; |
|
6
|
|
|
|
|
2724
|
|
|
6
|
|
|
|
|
299
|
|
9
|
6
|
|
|
6
|
|
548
|
use App::Sandy::PieceTable; |
|
6
|
|
|
|
|
438
|
|
|
6
|
|
|
|
|
177
|
|
10
|
6
|
|
|
6
|
|
3188
|
use App::Sandy::DB::Handle::Expression; |
|
6
|
|
|
|
|
2702
|
|
|
6
|
|
|
|
|
282
|
|
11
|
6
|
|
|
6
|
|
3807
|
use App::Sandy::DB::Handle::Variation; |
|
6
|
|
|
|
|
2609
|
|
|
6
|
|
|
|
|
293
|
|
12
|
6
|
|
|
6
|
|
68
|
use List::Util 'min'; |
|
6
|
|
|
|
|
17
|
|
|
6
|
|
|
|
|
460
|
|
13
|
6
|
|
|
6
|
|
5336
|
use File::Cat 'cat'; |
|
6
|
|
|
|
|
3380
|
|
|
6
|
|
|
|
|
400
|
|
14
|
6
|
|
|
6
|
|
3475
|
use Parallel::ForkManager; |
|
6
|
|
|
|
|
340548
|
|
|
6
|
|
|
|
|
48314
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.22'; # VERSION |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
has 'argv' => ( |
21
|
|
|
|
|
|
|
is => 'ro', |
22
|
|
|
|
|
|
|
isa => 'ArrayRef[Str]', |
23
|
|
|
|
|
|
|
required => 1 |
24
|
|
|
|
|
|
|
); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
has 'truncate' => ( |
27
|
|
|
|
|
|
|
is => 'ro', |
28
|
|
|
|
|
|
|
isa => 'Bool', |
29
|
|
|
|
|
|
|
required => 1 |
30
|
|
|
|
|
|
|
); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
has 'seed' => ( |
33
|
|
|
|
|
|
|
is => 'ro', |
34
|
|
|
|
|
|
|
isa => 'Int', |
35
|
|
|
|
|
|
|
required => 1 |
36
|
|
|
|
|
|
|
); |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
has 'jobs' => ( |
39
|
|
|
|
|
|
|
is => 'ro', |
40
|
|
|
|
|
|
|
isa => 'My:IntGt0', |
41
|
|
|
|
|
|
|
required => 1 |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
has 'prefix' => ( |
45
|
|
|
|
|
|
|
is => 'ro', |
46
|
|
|
|
|
|
|
isa => 'Str', |
47
|
|
|
|
|
|
|
required => 1 |
48
|
|
|
|
|
|
|
); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
has 'join_paired_ends' => ( |
51
|
|
|
|
|
|
|
is => 'ro', |
52
|
|
|
|
|
|
|
isa => 'Bool', |
53
|
|
|
|
|
|
|
required => 1 |
54
|
|
|
|
|
|
|
); |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
has 'output_format' => ( |
57
|
|
|
|
|
|
|
is => 'ro', |
58
|
|
|
|
|
|
|
isa => 'My:Format', |
59
|
|
|
|
|
|
|
required => 1 |
60
|
|
|
|
|
|
|
); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
has 'compression_level' => ( |
63
|
|
|
|
|
|
|
is => 'ro', |
64
|
|
|
|
|
|
|
isa => 'My:Level', |
65
|
|
|
|
|
|
|
required => 1 |
66
|
|
|
|
|
|
|
); |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
has 'fasta_file' => ( |
69
|
|
|
|
|
|
|
is => 'ro', |
70
|
|
|
|
|
|
|
isa => 'My:Fasta', |
71
|
|
|
|
|
|
|
required => 1 |
72
|
|
|
|
|
|
|
); |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
has 'coverage' => ( |
75
|
|
|
|
|
|
|
is => 'ro', |
76
|
|
|
|
|
|
|
isa => 'My:NumGt0', |
77
|
|
|
|
|
|
|
required => 0 |
78
|
|
|
|
|
|
|
); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
has 'number_of_reads' => ( |
81
|
|
|
|
|
|
|
is => 'ro', |
82
|
|
|
|
|
|
|
isa => 'My:IntGt0', |
83
|
|
|
|
|
|
|
required => 0 |
84
|
|
|
|
|
|
|
); |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
has 'count_loops_by' => ( |
87
|
|
|
|
|
|
|
is => 'ro', |
88
|
|
|
|
|
|
|
isa => 'My:CountLoopBy', |
89
|
|
|
|
|
|
|
required => 1 |
90
|
|
|
|
|
|
|
); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
has 'strand_bias' => ( |
93
|
|
|
|
|
|
|
is => 'ro', |
94
|
|
|
|
|
|
|
isa => 'My:StrandBias', |
95
|
|
|
|
|
|
|
required => 1 |
96
|
|
|
|
|
|
|
); |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
has 'seqid_weight' => ( |
99
|
|
|
|
|
|
|
is => 'ro', |
100
|
|
|
|
|
|
|
isa => 'My:SeqIdWeight', |
101
|
|
|
|
|
|
|
required => 1 |
102
|
|
|
|
|
|
|
); |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
has 'expression_matrix' => ( |
105
|
|
|
|
|
|
|
is => 'ro', |
106
|
|
|
|
|
|
|
isa => 'Str', |
107
|
|
|
|
|
|
|
required => 0 |
108
|
|
|
|
|
|
|
); |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
has 'genomic_variation' => ( |
111
|
|
|
|
|
|
|
is => 'ro', |
112
|
|
|
|
|
|
|
isa => 'ArrayRef[Str]', |
113
|
|
|
|
|
|
|
required => 0 |
114
|
|
|
|
|
|
|
); |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
has '_genomic_variation_names' => ( |
117
|
|
|
|
|
|
|
is => 'ro', |
118
|
|
|
|
|
|
|
isa => 'Maybe[Str]', |
119
|
|
|
|
|
|
|
builder => '_build_genomic_variation_names', |
120
|
|
|
|
|
|
|
lazy_build => 1 |
121
|
|
|
|
|
|
|
); |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
has 'seq' => ( |
124
|
|
|
|
|
|
|
is => 'ro', |
125
|
|
|
|
|
|
|
isa => 'App::Sandy::Seq::SingleEnd | App::Sandy::Seq::PairedEnd', |
126
|
|
|
|
|
|
|
required => 1, |
127
|
|
|
|
|
|
|
handles => [ qw{ sprint_seq gen_sam_header gen_eof_marker } ] |
128
|
|
|
|
|
|
|
); |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
has '_fasta' => ( |
131
|
|
|
|
|
|
|
is => 'ro', |
132
|
|
|
|
|
|
|
isa => 'My:IdxFasta', |
133
|
|
|
|
|
|
|
builder => '_build_fasta', |
134
|
|
|
|
|
|
|
lazy_build => 1 |
135
|
|
|
|
|
|
|
); |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
has '_fasta_tree' => ( |
138
|
|
|
|
|
|
|
traits => ['Hash'], |
139
|
|
|
|
|
|
|
is => 'ro', |
140
|
|
|
|
|
|
|
isa => 'HashRef[ArrayRef]', |
141
|
|
|
|
|
|
|
default => sub { {} }, |
142
|
|
|
|
|
|
|
handles => { |
143
|
|
|
|
|
|
|
_set_fasta_tree => 'set', |
144
|
|
|
|
|
|
|
_get_fasta_tree => 'get', |
145
|
|
|
|
|
|
|
_exists_fasta_tree => 'exists', |
146
|
|
|
|
|
|
|
_fasta_tree_pairs => 'kv', |
147
|
|
|
|
|
|
|
_has_no_fasta_tree => 'is_empty' |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
); |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
has '_fasta_rtree' => ( |
152
|
|
|
|
|
|
|
traits => ['Hash'], |
153
|
|
|
|
|
|
|
is => 'ro', |
154
|
|
|
|
|
|
|
isa => 'HashRef[Str]', |
155
|
|
|
|
|
|
|
default => sub { {} }, |
156
|
|
|
|
|
|
|
handles => { |
157
|
|
|
|
|
|
|
_set_fasta_rtree => 'set', |
158
|
|
|
|
|
|
|
_get_fasta_rtree => 'get', |
159
|
|
|
|
|
|
|
_delete_fasta_rtree => 'delete', |
160
|
|
|
|
|
|
|
_exists_fasta_rtree => 'exists', |
161
|
|
|
|
|
|
|
_fasta_rtree_pairs => 'kv', |
162
|
|
|
|
|
|
|
_has_no_fasta_rtree => 'is_empty' |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
); |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
has '_seqname' => ( |
167
|
|
|
|
|
|
|
traits => ['Hash'], |
168
|
|
|
|
|
|
|
is => 'ro', |
169
|
|
|
|
|
|
|
isa => 'HashRef[Str]', |
170
|
|
|
|
|
|
|
default => sub { {} }, |
171
|
|
|
|
|
|
|
handles => { |
172
|
|
|
|
|
|
|
_set_seqname => 'set', |
173
|
|
|
|
|
|
|
_get_seqname => 'get' |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
); |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
has '_piece_table' => ( |
178
|
|
|
|
|
|
|
is => 'ro', |
179
|
|
|
|
|
|
|
isa => 'HashRef[HashRef[My:PieceTable]]', |
180
|
|
|
|
|
|
|
builder => '_build_piece_table', |
181
|
|
|
|
|
|
|
lazy_build => 1 |
182
|
|
|
|
|
|
|
); |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
has '_strand' => ( |
185
|
|
|
|
|
|
|
is => 'ro', |
186
|
|
|
|
|
|
|
isa => 'CodeRef', |
187
|
|
|
|
|
|
|
builder => '_build_strand', |
188
|
|
|
|
|
|
|
lazy_build => 1 |
189
|
|
|
|
|
|
|
); |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
has '_seqid_raffle' => ( |
192
|
|
|
|
|
|
|
is => 'ro', |
193
|
|
|
|
|
|
|
isa => 'CodeRef', |
194
|
|
|
|
|
|
|
builder => '_build_seqid_raffle', |
195
|
|
|
|
|
|
|
lazy_build => 1 |
196
|
|
|
|
|
|
|
); |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub BUILD { |
199
|
20
|
|
|
20
|
0
|
1105
|
my $self = shift; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# If seqid_weight is 'count', then expression_matrix must be defined |
202
|
20
|
50
|
33
|
|
|
620
|
if ($self->seqid_weight eq 'count' and not defined $self->expression_matrix) { |
203
|
0
|
|
|
|
|
0
|
croak "seqid_weight=count requires a expression_matrix\n"; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# If count_loops_by is 'coverage', then coverage must be defined. Else if |
207
|
|
|
|
|
|
|
# it is equal to 'number_of_reads', then number_of_reads must be defined |
208
|
20
|
50
|
33
|
|
|
535
|
if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) { |
|
|
50
|
33
|
|
|
|
|
209
|
0
|
|
|
|
|
0
|
croak "count_loops_by=coverage requires a coverage number\n"; |
210
|
|
|
|
|
|
|
} elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) { |
211
|
0
|
|
|
|
|
0
|
croak "count_loops_by=number_of_reads requires a number_of_reads number\n"; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
## Just to ensure that the lazy attributes are built before &new returns |
215
|
20
|
|
|
|
|
550
|
$self->_piece_table; |
216
|
20
|
|
|
|
|
550
|
$self->_seqid_raffle; |
217
|
20
|
|
|
|
|
515
|
$self->_fasta; |
218
|
20
|
|
|
|
|
510
|
$self->_strand; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub _build_strand { |
222
|
20
|
|
|
20
|
|
40
|
my $self = shift; |
223
|
20
|
|
|
|
|
40
|
my $strand_sub; |
224
|
|
|
|
|
|
|
|
225
|
20
|
50
|
|
|
|
600
|
if ($self->strand_bias eq 'plus') { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
226
|
0
|
|
|
0
|
|
0
|
$strand_sub = sub {1}; |
|
0
|
|
|
|
|
0
|
|
227
|
|
|
|
|
|
|
} elsif ($self->strand_bias eq 'minus') { |
228
|
0
|
|
|
0
|
|
0
|
$strand_sub = sub {0}; |
|
0
|
|
|
|
|
0
|
|
229
|
|
|
|
|
|
|
} elsif ($self->strand_bias eq 'random') { |
230
|
20
|
|
|
1710
|
|
80
|
$strand_sub = sub { int(rand(2)) }; |
|
1710
|
|
|
|
|
7756
|
|
231
|
|
|
|
|
|
|
} else { |
232
|
0
|
|
|
|
|
0
|
croak sprintf "Unknown option '%s' for strand bias\n", |
233
|
|
|
|
|
|
|
$self->strand_bias; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
20
|
|
|
|
|
495
|
return $strand_sub; |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
sub _index_fasta { |
240
|
20
|
|
|
20
|
|
45
|
my $self = shift; |
241
|
20
|
|
|
|
|
505
|
my $fasta = $self->fasta_file; |
242
|
|
|
|
|
|
|
|
243
|
20
|
|
|
|
|
100
|
my $fh = $self->with_open_r($fasta); |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# indexed_genome = ID => (seq, len) |
246
|
20
|
|
|
|
|
75
|
my %indexed_fasta; |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# >ID|PID as in gencode transcripts |
249
|
|
|
|
|
|
|
my %fasta_rtree; |
250
|
20
|
|
|
|
|
0
|
my $id; |
251
|
|
|
|
|
|
|
|
252
|
20
|
|
|
|
|
570
|
while (<$fh>) { |
253
|
860
|
|
|
|
|
1290
|
chomp; |
254
|
860
|
50
|
|
|
|
1540
|
next if /^;/; |
255
|
860
|
100
|
|
|
|
1540
|
if (/^>/) { |
256
|
100
|
|
|
|
|
315
|
my @fields = split /\|/; |
257
|
100
|
|
|
|
|
230
|
$id = $fields[0]; |
258
|
100
|
|
|
|
|
280
|
$id =~ s/^>//; |
259
|
100
|
|
|
|
|
325
|
$id =~ s/^\s+|\s+$//g; |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Seq ID standardization in order to manage comparations |
262
|
|
|
|
|
|
|
# between chr1, Chr1, CHR1, 1 etc; |
263
|
100
|
|
|
|
|
355
|
my $std_id = $self->with_std_seqid($id); |
264
|
100
|
|
|
|
|
3735
|
$self->_set_seqname( |
265
|
|
|
|
|
|
|
$id => $std_id, |
266
|
|
|
|
|
|
|
$std_id => $id |
267
|
|
|
|
|
|
|
); |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# It is necessary to catch gene -> transcript relation |
270
|
|
|
|
|
|
|
# # TODO: Make a hash tarit for indexed fasta |
271
|
100
|
50
|
|
|
|
610
|
if (defined $fields[1]) { |
272
|
0
|
|
|
|
|
0
|
my $pid = $fields[1]; |
273
|
0
|
|
|
|
|
0
|
$pid =~ s/^\s+|\s+$//g; |
274
|
0
|
|
|
|
|
0
|
$fasta_rtree{$id} = $pid; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
} else { |
277
|
760
|
50
|
|
|
|
1290
|
die "Error reading fasta file '$fasta': Not defined id" |
278
|
|
|
|
|
|
|
unless defined $id; |
279
|
760
|
|
|
|
|
2270
|
$indexed_fasta{$id}{seq} .= $_; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
20
|
|
|
|
|
115
|
for (keys %indexed_fasta) { |
284
|
100
|
|
|
|
|
255
|
$indexed_fasta{$_}{size} = length $indexed_fasta{$_}{seq}; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
20
|
50
|
|
|
|
65
|
unless (%indexed_fasta) { |
288
|
0
|
|
|
|
|
0
|
die "Error parsing '$fasta'. Maybe the file is empty\n"; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
$fh->close |
292
|
20
|
50
|
|
|
|
135
|
or die "Cannot close file $fasta: $!\n"; |
293
|
|
|
|
|
|
|
|
294
|
20
|
50
|
|
|
|
420
|
$self->_set_fasta_rtree(%fasta_rtree) if %fasta_rtree; |
295
|
20
|
|
|
|
|
110
|
return \%indexed_fasta; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
sub _build_fasta { |
299
|
20
|
|
|
20
|
|
40
|
my $self = shift; |
300
|
20
|
|
|
|
|
520
|
my $fasta = $self->fasta_file; |
301
|
|
|
|
|
|
|
|
302
|
20
|
|
|
|
|
120
|
log_msg ":: Indexing fasta file '$fasta' ..."; |
303
|
20
|
|
|
|
|
55
|
my $indexed_fasta = $self->_index_fasta; |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# Validate genome about the read size required |
306
|
20
|
|
|
|
|
120
|
log_msg ":: Validating fasta file '$fasta' ..."; |
307
|
|
|
|
|
|
|
# Entries to remove |
308
|
20
|
|
|
|
|
35
|
my @blacklist; |
309
|
|
|
|
|
|
|
|
310
|
20
|
50
|
|
|
|
695
|
unless ($self->truncate) { |
311
|
20
|
|
|
|
|
75
|
for my $id (keys %$indexed_fasta) { |
312
|
100
|
|
|
|
|
190
|
my $index_size = $indexed_fasta->{$id}{size}; |
313
|
100
|
|
|
|
|
2455
|
my $class = ref $self->seq; |
314
|
|
|
|
|
|
|
|
315
|
100
|
100
|
|
|
|
280
|
if ($class eq 'App::Sandy::Seq::SingleEnd') { |
|
|
50
|
|
|
|
|
|
316
|
50
|
|
|
|
|
1170
|
my $read_mean = $self->seq->read_mean; |
317
|
50
|
50
|
|
|
|
135
|
if ($index_size < $read_mean) { |
318
|
0
|
|
|
|
|
0
|
log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required read mean ($read_mean)"; |
319
|
0
|
|
|
|
|
0
|
delete $indexed_fasta->{$id}; |
320
|
0
|
|
|
|
|
0
|
push @blacklist => $id; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
} elsif ($class eq 'App::Sandy::Seq::PairedEnd') { |
323
|
50
|
|
|
|
|
1355
|
my $fragment_mean = $self->seq->fragment_mean; |
324
|
50
|
50
|
|
|
|
130
|
if ($index_size < $fragment_mean) { |
325
|
0
|
|
|
|
|
0
|
log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)"; |
326
|
0
|
|
|
|
|
0
|
delete $indexed_fasta->{$id}; |
327
|
0
|
|
|
|
|
0
|
push @blacklist => $id; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
} else { |
330
|
0
|
|
|
|
|
0
|
croak "Unknown option '$class' for sequencing type\n"; |
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
20
|
50
|
|
|
|
65
|
unless (%$indexed_fasta) { |
336
|
0
|
|
|
|
|
0
|
die sprintf "Fasta file '%s' has no valid entry\n" => $self->fasta_file; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# If fasta_rtree has entries |
340
|
20
|
50
|
|
|
|
715
|
unless ($self->_has_no_fasta_rtree) { |
341
|
|
|
|
|
|
|
# Remove no valid entries from id -> pid relation |
342
|
0
|
0
|
|
|
|
0
|
$self->_delete_fasta_rtree(@blacklist) if @blacklist; |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
20
|
|
|
|
|
525
|
return $indexed_fasta; |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
sub _populate_fasta_tree { |
349
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# If fasta_rtree has entries |
352
|
20
|
50
|
|
|
|
705
|
unless ($self->_has_no_fasta_rtree) { |
353
|
|
|
|
|
|
|
# Build parent -> child ids relation |
354
|
0
|
|
|
|
|
0
|
my %fasta_tree; |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# Reverse fasta_rtree to pid -> \@ids |
357
|
0
|
|
|
|
|
0
|
for my $pair ($self->_fasta_rtree_pairs) { |
358
|
0
|
|
|
|
|
0
|
my ($id, $pid) = (@$pair); |
359
|
0
|
|
|
|
|
0
|
push @{ $fasta_tree{$pid} } => $id; |
|
0
|
|
|
|
|
0
|
|
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
|
362
|
0
|
|
|
|
|
0
|
$self->_set_fasta_tree(%fasta_tree); |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
sub _retrieve_expression_matrix { |
367
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
368
|
0
|
|
|
|
|
0
|
my $expression = App::Sandy::DB::Handle::Expression->new; |
369
|
0
|
|
|
|
|
0
|
return $expression->retrievedb($self->expression_matrix); |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
sub _build_seqid_raffle { |
373
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# Get the piece table |
376
|
20
|
|
|
|
|
530
|
my $piece_table = $self->_piece_table; |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
# The builded function |
379
|
20
|
|
|
|
|
40
|
my $seqid_sub; |
380
|
|
|
|
|
|
|
|
381
|
20
|
50
|
|
|
|
580
|
if ($self->seqid_weight eq 'same') { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
382
|
0
|
|
|
0
|
|
0
|
my ($keys, $weights) = $self->_populate_key_weight($piece_table, sub { 1 }); |
|
0
|
|
|
|
|
0
|
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# If weight == 1 means that there are 2 keys for |
385
|
|
|
|
|
|
|
# the same seq_id. |
386
|
|
|
|
|
|
|
# If weight == 2 means that there is only one key |
387
|
|
|
|
|
|
|
# for the seq_id, so I double that key |
388
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < @$weights; $i++) { |
389
|
0
|
0
|
|
|
|
0
|
if ($weights->[$i] > 1) { |
390
|
0
|
|
|
|
|
0
|
push @$keys => $keys->[$i]; |
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
0
|
my $keys_size = scalar @$keys; |
395
|
0
|
|
|
0
|
|
0
|
$seqid_sub = sub { $keys->[int(rand($keys_size))] }; |
|
0
|
|
|
|
|
0
|
|
396
|
|
|
|
|
|
|
} elsif ($self->seqid_weight eq 'count') { |
397
|
|
|
|
|
|
|
# Catch expression-matrix entry from database |
398
|
0
|
|
|
|
|
0
|
my $indexed_file = $self->_retrieve_expression_matrix; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# Catch indexed fasta |
401
|
0
|
|
|
|
|
0
|
my $indexed_fasta = $self->_fasta; |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# Validate expression_matrix |
404
|
0
|
|
|
|
|
0
|
for my $id (keys %$indexed_file) { |
405
|
|
|
|
|
|
|
# If not exists into indexed_fasta, it must then exist into fasta_tree |
406
|
0
|
0
|
0
|
|
|
0
|
unless (exists $piece_table->{$id} || $self->_exists_fasta_tree($id)) { |
407
|
0
|
|
|
|
|
0
|
log_msg sprintf ":: Ignoring seqid '%s' from expression-matrix '%s': It is not found into the indexed fasta" |
408
|
|
|
|
|
|
|
=> $id, $self->expression_matrix; |
409
|
0
|
|
|
|
|
0
|
delete $indexed_file->{$id}; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
0
|
0
|
|
|
|
0
|
unless (%$indexed_file) { |
414
|
0
|
|
|
|
|
0
|
die sprintf "No valid seqid entry of the expression-matrix '%s' is recorded into the indexed fasta\n" |
415
|
|
|
|
|
|
|
=> $self->expression_matrix; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
0
|
my (%ptable_ind, %ptable_cluster); |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
# Split indexed_file seq_ids between those |
421
|
|
|
|
|
|
|
# into piece_table and those that represents a cluster |
422
|
|
|
|
|
|
|
# of seq_ids as in gene -> transcript relationship |
423
|
0
|
|
|
|
|
0
|
for my $seq_id (keys %$indexed_file) { |
424
|
0
|
0
|
|
|
|
0
|
if (exists $piece_table->{$seq_id}) { |
425
|
0
|
|
|
|
|
0
|
$ptable_ind{$seq_id} = $piece_table->{$seq_id}; |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
} else { |
428
|
0
|
|
|
|
|
0
|
my $ids = $self->_get_fasta_tree($seq_id); |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Bug catcher |
431
|
0
|
0
|
|
|
|
0
|
unless (@$ids) { |
432
|
0
|
|
|
|
|
0
|
croak "seq_id '$seq_id' not found into piece_table"; |
433
|
|
|
|
|
|
|
} |
434
|
|
|
|
|
|
|
|
435
|
0
|
|
|
|
|
0
|
$ptable_cluster{$seq_id} = $ids; |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
# Let's calculate the weight taking in acount |
440
|
|
|
|
|
|
|
# the size increase/decrease |
441
|
|
|
|
|
|
|
my $calc_ind_weight = sub { |
442
|
0
|
|
|
0
|
|
0
|
my ($seq_id, $type) = @_; |
443
|
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
0
|
my $counts = $indexed_file->{$seq_id}; |
445
|
0
|
|
|
|
|
0
|
my $size = $piece_table->{$seq_id}{$type}{size}; |
446
|
0
|
|
|
|
|
0
|
my $fasta_size = $indexed_fasta->{$seq_id}{size}; |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
# Correct the weight according to the |
449
|
|
|
|
|
|
|
# genomic variation change by the ratio |
450
|
|
|
|
|
|
|
# between the table size and fasta size |
451
|
0
|
|
|
|
|
0
|
my $factor = $size / $fasta_size; |
452
|
|
|
|
|
|
|
|
453
|
0
|
|
|
|
|
0
|
return $counts * $factor; |
454
|
0
|
|
|
|
|
0
|
}; |
455
|
|
|
|
|
|
|
|
456
|
0
|
|
|
|
|
0
|
my ($keys, $weights); |
457
|
|
|
|
|
|
|
|
458
|
0
|
0
|
|
|
|
0
|
if (%ptable_ind) { |
459
|
0
|
|
|
|
|
0
|
($keys, $weights) = $self->_populate_key_weight(\%ptable_ind, |
460
|
|
|
|
|
|
|
$calc_ind_weight); |
461
|
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# If there are seq_id cluster like, then its is |
464
|
|
|
|
|
|
|
# time to calculate these weights |
465
|
0
|
|
|
|
|
0
|
for my $seq_id (sort keys %ptable_cluster) { |
466
|
0
|
|
|
|
|
0
|
my %ptable; |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
# Slice piece_table hash |
469
|
0
|
|
|
|
|
0
|
my $ids = $ptable_cluster{$seq_id}; |
470
|
0
|
|
|
|
|
0
|
@ptable{@$ids} = @$piece_table{@$ids}; |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
# total size among all ids of cluster |
473
|
0
|
|
|
|
|
0
|
my %total; |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
# Calculate the total size by type |
476
|
0
|
|
|
|
|
0
|
for my $type_h (values %ptable) { |
477
|
0
|
|
|
|
|
0
|
for my $type (keys %$type_h) { |
478
|
0
|
|
|
|
|
0
|
$total{$type} += $type_h->{$type}{size}; |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# Calculate the weight taking in acount the size increase/decrease |
483
|
|
|
|
|
|
|
# and the ratio between the total size by type and the table size. |
484
|
|
|
|
|
|
|
# The problem here is that I must divide the 'counts' for some 'seq_id' |
485
|
|
|
|
|
|
|
# among all ids that belong to it |
486
|
|
|
|
|
|
|
my $calc_cluster_weight = sub { |
487
|
0
|
|
|
0
|
|
0
|
my ($id, $type) = @_; |
488
|
|
|
|
|
|
|
|
489
|
0
|
|
|
|
|
0
|
my $counts = $indexed_file->{$seq_id}; |
490
|
0
|
|
|
|
|
0
|
my $size = $piece_table->{$id}{$type}{size}; |
491
|
0
|
|
|
|
|
0
|
my $fasta_size = $indexed_fasta->{$id}{size}; |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Divide the counts among all ids |
494
|
0
|
|
|
|
|
0
|
my $ratio = $size / $total{$type}; |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
# Correct the weight according to the size |
497
|
0
|
|
|
|
|
0
|
my $factor = $size / $fasta_size; |
498
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
0
|
return $counts * $factor * $ratio; |
500
|
0
|
|
|
|
|
0
|
}; |
501
|
|
|
|
|
|
|
|
502
|
0
|
|
|
|
|
0
|
my ($k, $w) = $self->_populate_key_weight(\%ptable, |
503
|
|
|
|
|
|
|
$calc_cluster_weight); |
504
|
|
|
|
|
|
|
|
505
|
0
|
|
|
|
|
0
|
push @$keys => @$k; |
506
|
0
|
|
|
|
|
0
|
push @$weights => @$w; |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
|
509
|
0
|
0
|
0
|
|
|
0
|
unless (@$keys && @$weights) { |
510
|
0
|
|
|
|
|
0
|
croak "No keys weights have been set"; |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
# It is very necessary in order |
514
|
|
|
|
|
|
|
# to avoid truncation of numbers |
515
|
|
|
|
|
|
|
# between zero and one |
516
|
0
|
|
|
|
|
0
|
$self->_round_weight($weights); |
517
|
|
|
|
|
|
|
|
518
|
0
|
|
|
|
|
0
|
my $raffler = App::Sandy::WeightedRaffle->new( |
519
|
|
|
|
|
|
|
'weights' => $weights, |
520
|
|
|
|
|
|
|
'keys' => $keys |
521
|
|
|
|
|
|
|
); |
522
|
|
|
|
|
|
|
|
523
|
0
|
|
|
0
|
|
0
|
$seqid_sub = sub { $raffler->weighted_raffle }; |
|
0
|
|
|
|
|
0
|
|
524
|
|
|
|
|
|
|
} elsif ($self->seqid_weight eq 'length') { |
525
|
|
|
|
|
|
|
my $calc_weight = sub { |
526
|
100
|
|
|
100
|
|
210
|
my ($seq_id, $type) = @_; |
527
|
100
|
|
|
|
|
205
|
return $piece_table->{$seq_id}{$type}{size}; |
528
|
20
|
|
|
|
|
115
|
}; |
529
|
|
|
|
|
|
|
|
530
|
20
|
|
|
|
|
70
|
my ($keys, $weights) = $self->_populate_key_weight($piece_table, |
531
|
|
|
|
|
|
|
$calc_weight); |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# Just in case ... |
534
|
20
|
|
|
|
|
70
|
$self->_round_weight($weights); |
535
|
|
|
|
|
|
|
|
536
|
20
|
|
|
|
|
720
|
my $raffler = App::Sandy::WeightedRaffle->new( |
537
|
|
|
|
|
|
|
weights => $weights, |
538
|
|
|
|
|
|
|
keys => $keys |
539
|
|
|
|
|
|
|
); |
540
|
|
|
|
|
|
|
|
541
|
20
|
|
|
1710
|
|
155
|
$seqid_sub = sub { $raffler->weighted_raffle }; |
|
1710
|
|
|
|
|
6155
|
|
542
|
|
|
|
|
|
|
} else { |
543
|
0
|
|
|
|
|
0
|
croak sprintf "Unknown option '%s' for seqid_weight\n", |
544
|
|
|
|
|
|
|
$self->seqid_weight; |
545
|
|
|
|
|
|
|
} |
546
|
|
|
|
|
|
|
|
547
|
20
|
|
|
|
|
560
|
return $seqid_sub; |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
sub _round_weight { |
551
|
20
|
|
|
20
|
|
40
|
my ($self, $weights) = @_; |
552
|
|
|
|
|
|
|
|
553
|
20
|
|
|
|
|
75
|
my $min = min @$weights; |
554
|
|
|
|
|
|
|
|
555
|
20
|
50
|
|
|
|
55
|
if ($min <= 0) { |
556
|
0
|
|
|
|
|
0
|
croak "min weight le to zero: $min"; |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
|
559
|
20
|
50
|
|
|
|
95
|
my $factor = $min < 1 |
560
|
|
|
|
|
|
|
? (1 / $min) |
561
|
|
|
|
|
|
|
: 1; |
562
|
|
|
|
|
|
|
|
563
|
20
|
|
|
|
|
45
|
for my $weight (@$weights) { |
564
|
100
|
|
|
|
|
195
|
$weight = int($weight * $factor + 0.5); |
565
|
|
|
|
|
|
|
} |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
sub _populate_key_weight { |
569
|
20
|
|
|
20
|
|
55
|
my ($self, $piece_table, $calc_weight) = @_; |
570
|
|
|
|
|
|
|
|
571
|
20
|
|
|
|
|
35
|
my (@keys, @weights); |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# It needs to be sorted in order to the |
574
|
|
|
|
|
|
|
# seed works |
575
|
20
|
|
|
|
|
125
|
for my $seq_id (sort keys %$piece_table) { |
576
|
100
|
|
|
|
|
160
|
my $type_h = $piece_table->{$seq_id}; |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# If there is no alternative seq_id, then |
579
|
|
|
|
|
|
|
# set a factor to correct the size. |
580
|
|
|
|
|
|
|
# It is necessary because the seq_ids with |
581
|
|
|
|
|
|
|
# alternative and reference will double its |
582
|
|
|
|
|
|
|
# own coverage |
583
|
100
|
50
|
|
|
|
200
|
my $factor = scalar keys %$type_h == 1 |
584
|
|
|
|
|
|
|
? 2 |
585
|
|
|
|
|
|
|
: 1; |
586
|
|
|
|
|
|
|
|
587
|
100
|
|
|
|
|
205
|
for my $type (sort keys %$type_h) { |
588
|
|
|
|
|
|
|
|
589
|
100
|
|
|
|
|
330
|
my %key = ( |
590
|
|
|
|
|
|
|
'seq_id' => $seq_id, |
591
|
|
|
|
|
|
|
'type' => $type |
592
|
|
|
|
|
|
|
); |
593
|
|
|
|
|
|
|
|
594
|
100
|
|
|
|
|
185
|
my $weight = $calc_weight->($seq_id, $type); |
595
|
|
|
|
|
|
|
|
596
|
100
|
|
|
|
|
195
|
push @keys => \%key; |
597
|
100
|
|
|
|
|
295
|
push @weights => $weight * $factor; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
} |
600
|
|
|
|
|
|
|
|
601
|
20
|
|
|
|
|
65
|
return (\@keys, \@weights); |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
sub _build_genomic_variation_names { |
605
|
20
|
|
|
20
|
|
25
|
my $self = shift; |
606
|
20
|
50
|
|
|
|
555
|
if ($self->genomic_variation) { |
607
|
0
|
|
|
|
|
0
|
return sprintf "[%s]", => join ", ", @{ $self->genomic_variation }; |
|
0
|
|
|
|
|
0
|
|
608
|
|
|
|
|
|
|
} |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
sub _retrieve_genomic_variation { |
612
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
613
|
0
|
|
|
|
|
0
|
my $variation = App::Sandy::DB::Handle::Variation->new; |
614
|
0
|
|
|
|
|
0
|
return $variation->retrievedb($self->genomic_variation); |
615
|
|
|
|
|
|
|
} |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
sub _build_piece_table { |
618
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
619
|
|
|
|
|
|
|
|
620
|
20
|
|
|
|
|
625
|
my $genomic_variation = $self->_genomic_variation_names; |
621
|
20
|
|
|
|
|
35
|
my $indexed_snv; |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# Retrieve genomic variation if the user provided it |
624
|
20
|
50
|
|
|
|
55
|
if (defined $genomic_variation) { |
625
|
0
|
|
|
|
|
0
|
$indexed_snv = $self->_retrieve_genomic_variation; |
626
|
0
|
|
|
|
|
0
|
log_msg ":: Validate genomic variation '$genomic_variation' against indexed fasta ..."; |
627
|
0
|
|
|
|
|
0
|
$self->_validate_indexed_snv_against_fasta($indexed_snv); |
628
|
|
|
|
|
|
|
} |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# Catch index fasta |
631
|
20
|
|
|
|
|
505
|
my $indexed_fasta = $self->_fasta; |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
# Build piece table |
634
|
20
|
|
|
|
|
40
|
my %piece_table; |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
# Let's construct the piece_table |
637
|
20
|
|
|
|
|
80
|
log_msg ":: Build piece table ..."; |
638
|
|
|
|
|
|
|
|
639
|
20
|
|
|
|
|
85
|
while (my ($seq_id, $fasta_h) = each %$indexed_fasta) { |
640
|
100
|
|
|
|
|
210
|
my $seq = \$fasta_h->{seq}; |
641
|
100
|
|
|
|
|
3605
|
my $std_seq_id = $self->_get_seqname($seq_id); |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
# Initialize piece tables for $seq_id ref |
644
|
100
|
|
|
|
|
2575
|
$piece_table{$seq_id}{ref}{table} = App::Sandy::PieceTable->new(orig => $seq); |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
# If there is indexed_snv for seq_id, then construct the piece table with it |
647
|
100
|
50
|
33
|
|
|
440
|
if (defined $indexed_snv && defined $indexed_snv->{$std_seq_id}) { |
648
|
0
|
|
|
|
|
0
|
my $snvs = $indexed_snv->{$std_seq_id}; |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# Filter only the homozygotic snvs to feed reference seq_id |
651
|
0
|
|
|
|
|
0
|
my @snvs_homo = grep { $_->{plo} eq 'HO' } @$snvs; |
|
0
|
|
|
|
|
0
|
|
652
|
|
|
|
|
|
|
|
653
|
0
|
0
|
|
|
|
0
|
if (@snvs_homo) { |
654
|
|
|
|
|
|
|
# Populate reference seq_id |
655
|
0
|
|
|
|
|
0
|
$self->_populate_piece_table($piece_table{$seq_id}{ref}{table}, \@snvs_homo); |
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
# Initialize piece tables for $seq_id alt |
659
|
0
|
|
|
|
|
0
|
$piece_table{$seq_id}{alt}{table} = App::Sandy::PieceTable->new(orig => $seq); |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
# Populate alternative seq_id |
662
|
0
|
|
|
|
|
0
|
$self->_populate_piece_table($piece_table{$seq_id}{alt}{table}, $snvs); |
663
|
|
|
|
|
|
|
} |
664
|
|
|
|
|
|
|
} |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
# Initialize the logical offsets and valodate the |
667
|
|
|
|
|
|
|
# new size due to the genomic variation |
668
|
|
|
|
|
|
|
|
669
|
20
|
|
|
|
|
40
|
my @blacklist; |
670
|
|
|
|
|
|
|
|
671
|
20
|
|
|
|
|
70
|
for my $seq_id (keys %piece_table) { |
672
|
100
|
|
|
|
|
190
|
my $type_h = delete $piece_table{$seq_id}; |
673
|
|
|
|
|
|
|
|
674
|
100
|
|
|
|
|
300
|
for my $type (keys %$type_h) { |
675
|
100
|
|
|
|
|
160
|
my $table_h = delete $type_h->{$type}; |
676
|
100
|
|
|
|
|
175
|
my $table = $table_h->{table}; |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# Initialize the logical offset |
679
|
100
|
|
|
|
|
495
|
$table->calculate_logical_offset; |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# Get the new size |
682
|
100
|
|
|
|
|
2660
|
my $new_size = $table->logical_len; |
683
|
|
|
|
|
|
|
|
684
|
100
|
50
|
|
|
|
2515
|
unless ($self->truncate) { |
685
|
100
|
|
|
|
|
2440
|
my $class = ref $self->seq; |
686
|
|
|
|
|
|
|
|
687
|
100
|
100
|
|
|
|
310
|
if ($class eq 'App::Sandy::Seq::SingleEnd') { |
|
|
50
|
|
|
|
|
|
688
|
50
|
50
|
|
|
|
1185
|
if ($new_size < $self->seq->read_mean) { |
689
|
0
|
|
|
|
|
0
|
log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required read-mean"; |
690
|
0
|
|
|
|
|
0
|
next; |
691
|
|
|
|
|
|
|
} |
692
|
|
|
|
|
|
|
} elsif ($class eq 'App::Sandy::Seq::PairedEnd') { |
693
|
50
|
50
|
|
|
|
1175
|
if ($new_size < $self->seq->fragment_mean) { |
694
|
0
|
|
|
|
|
0
|
log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required fragment mean"; |
695
|
0
|
|
|
|
|
0
|
next; |
696
|
|
|
|
|
|
|
} |
697
|
|
|
|
|
|
|
} else { |
698
|
0
|
|
|
|
|
0
|
die "No valid options for 'seq'"; |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
} |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
# If all's right |
703
|
100
|
|
|
|
|
240
|
$table_h->{size} = $new_size; |
704
|
100
|
|
|
|
|
230
|
$type_h->{$type} = $table_h; |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# if there is at least one type, |
708
|
|
|
|
|
|
|
# then return it to the piece_table |
709
|
100
|
50
|
|
|
|
275
|
if (%$type_h) { |
710
|
100
|
|
|
|
|
215
|
$piece_table{$seq_id} = $type_h; |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
# else, just remove it! |
713
|
|
|
|
|
|
|
} else { |
714
|
0
|
|
|
|
|
0
|
push @blacklist => $seq_id; |
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
} |
717
|
|
|
|
|
|
|
|
718
|
20
|
50
|
|
|
|
60
|
unless (%piece_table) { |
719
|
0
|
|
|
|
|
0
|
die "All fasta entries were removed due to deletions. ", |
720
|
|
|
|
|
|
|
"Please, verify the genomic variation '$genomic_variation'\n"; |
721
|
|
|
|
|
|
|
} |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
# If fasta_rtree has entries |
724
|
20
|
50
|
|
|
|
795
|
unless ($self->_has_no_fasta_rtree) { |
725
|
|
|
|
|
|
|
# Remove no valid entries from id -> pid relation |
726
|
0
|
0
|
|
|
|
0
|
$self->_delete_fasta_rtree(@blacklist) if @blacklist; |
727
|
|
|
|
|
|
|
} |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# Make the id -> pid relationship |
730
|
20
|
|
|
|
|
80
|
$self->_populate_fasta_tree; |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# HASH -> SEQ_ID -> @(REF @ALT) -> @(TABLE SIZE) |
733
|
20
|
|
|
|
|
615
|
return \%piece_table; |
734
|
|
|
|
|
|
|
} |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
# Apply a list of variants to a piece table.
#
#   $table - object that answers insert/delete/change
#   $snvs  - array ref of hashes with the keys pos, ref, alt and
#            (optionally) id
#
# A variant whose 'ref' is '-' is an insertion, one whose 'alt' is '-'
# is a deletion, anything else is a change. Every call carries an
# annotation string built from the 1-based position plus either the
# first id of a comma/semicolon separated id list or "ref/alt".
sub _populate_piece_table {
    my ($self, $table, $snvs) = @_;

    for my $variant (@$snvs) {
        my $pos    = $variant->{pos};
        my $has_id = defined $variant->{id} && $variant->{id} ne '.';

        # With an id, keep only the first entry of the list. Without
        # one, cap ref and alt at 25 characters each (25+25+1=51).
        my $label;
        if ($has_id) {
            $label = sprintf("%d:%s", $pos + 1, (split(/[,;]/, $variant->{id}))[0]);
        }
        else {
            $label = sprintf("%d:%.25s/%.25s", $pos + 1, $variant->{ref}, $variant->{alt});
        }

        # Insertion
        if ($variant->{ref} eq '-') {
            $table->insert(\$variant->{alt}, $pos, $label);
            next;
        }

        # Deletion
        if ($variant->{alt} eq '-') {
            $table->delete($pos, length($variant->{ref}), $label);
            next;
        }

        # Change
        $table->change(\$variant->{alt}, $pos, length($variant->{ref}), $label);
    }
}
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# Filter $indexed_snv in place, keeping only the variants that can be
# applied to the fasta returned by $self->_fasta.
#
# For every key of $indexed_snv the list is removed, the key is mapped
# through $self->_get_seqname, and the entry is put back only if that
# name exists in the fasta and at least one variant survives:
#   - a variant positioned outside the sequence is logged and dropped
#     (insertions are allowed at one base past the end);
#   - a deletion/change whose 'ref' does not match the fasta at that
#     position (case-insensitive) is logged and dropped.
sub _validate_indexed_snv_against_fasta {
    my ($self, $indexed_snv) = @_;

    my $fasta_of      = $self->_fasta;
    my $variation_tag = $self->_genomic_variation_names;

    for my $std_seq_id (keys %$indexed_snv) {
        # Take the list out; it only goes back if something survives
        my $candidates = delete $indexed_snv->{$std_seq_id};
        my $seq_id     = $self->_get_seqname($std_seq_id);

        next unless defined $seq_id && exists $fasta_of->{$seq_id};

        my $seq_ref = \$fasta_of->{$seq_id}{seq};
        my $seq_len = $fasta_of->{$seq_id}{size};

        my @kept;

        for my $snv (@$candidates) {
            my $is_insertion = $snv->{ref} eq '-';

            # Insertions may occur until one base after the end of the
            # sequence, not more; everything else must start inside it
            my $beyond_end = $is_insertion
                ? $snv->{pos} >  $seq_len
                : $snv->{pos} >= $seq_len;

            if ($beyond_end) {
                log_msg(sprintf(":: In validating '%s': Position, %s/%s at %s:%d, outside fasta sequence",
                    $variation_tag, $snv->{ref}, $snv->{alt}, $seq_id, $snv->{pos} + 1));
                next;
            }

            # Deletions and changes: just verify that the reference exists
            unless ($is_insertion) {
                my $found = substr($$seq_ref, $snv->{pos}, length($snv->{ref}));

                if (uc($found) ne uc($snv->{ref})) {
                    log_msg(sprintf(":: In validating '%s': Not found reference '%s' at fasta position %s:%d",
                        $variation_tag, $snv->{ref}, $seq_id, $snv->{pos} + 1));
                    next;
                }
            }

            push @kept, $snv;
        }

        $indexed_snv->{$std_seq_id} = [@kept] if @kept;
    }
}
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
# Work out how many reads the simulation has to generate.
#
# Depending on $self->count_loops_by:
#   'coverage'        - sum of all fasta entry sizes times the coverage,
#                       divided by the read mean; halved when the seq
#                       object is an App::Sandy::Seq::PairedEnd
#   'number-of-reads' - the value of $self->number_of_reads
#   anything else     - croaks
#
# Dies when the resulting number is not positive; otherwise returns it.
sub _calculate_number_of_reads {
    my $self = shift;

    my $mode = $self->count_loops_by;
    my $total;

    if ($mode eq 'number-of-reads') {
        $total = $self->number_of_reads;
    }
    elsif ($mode eq 'coverage') {
        # The genome size is needed: add up every entry
        my $genome_len = 0;
        $genome_len += $_->{size} for values %{ $self->_fasta };

        $total = int($genome_len * $self->coverage / $self->seq->read_mean);

        # In case it is paired-end read, divide the number of reads by 2
        # because App::Sandy::Seq::PairedEnd class returns 2 reads at time
        if (ref($self->seq) eq 'App::Sandy::Seq::PairedEnd') {
            $total = int($total / 2);
        }
    }
    else {
        croak(sprintf("Unknown option '%s' for calculating the number of reads\n", $mode));
    }

    # Maybe the total is zero. It may occur due to the low coverage
    # and/or fasta_file size
    die "The computed number of reads is equal to zero.\n"
        . "It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n"
        if $total <= 0;

    return $total;
}
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# Seed both random number generators used by the simulation.
#
#   $inc - optional increment added to $self->seed (run_simulation
#          passes the job number)
#
# Seeds Perl's builtin generator through srand and Math::Random through
# random_set_seed_from_phrase, both with the same value.
sub _set_seed {
    my ($self, $inc) = @_;

    my $seed = $self->seed;
    $seed += $inc if defined $inc;

    srand($seed);

    # Loaded on demand, right before it is needed
    require Math::Random;
    return Math::Random::random_set_seed_from_phrase($seed);
}
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
# Fold per-id counts into per-parent counts.
#
#   $counter_ref - hash ref: id => count
#
# Returns nothing when $self->_has_no_fasta_rtree is true. Otherwise
# returns a hash ref (parent id => summed count) built by looking each
# id up with $self->_get_fasta_rtree; ids without a parent are ignored.
sub _calculate_parent_count {
    my ($self, $counter_ref) = @_;

    return if $self->_has_no_fasta_rtree;

    my %total_of;

    for my $child_id (keys %$counter_ref) {
        my $parent_id = $self->_get_fasta_rtree($child_id);
        next unless defined $parent_id;

        $total_of{$parent_id} += $counter_ref->{$child_id};
    }

    return \%total_of;
}
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
# Run the simulation and write the output and count files.
#
# Steps, in order:
#   1. Pick the output file names for the current output format and
#      seq class (single-end, paired-end, joined paired-end).
#   2. Fork $self->jobs children through Parallel::ForkManager. Each
#      child seeds its generators with the job number, writes its share
#      of the reads to its own "<file>.<parent pid>.part<job>" files and
#      hands its per-seqid counters back to the parent via finish().
#   3. In the parent, wait for the children, rename the first job's
#      temporary files to the final names and append the remaining
#      temporary files to them, removing each one afterwards.
#   4. Write the count file ("<prefix>_abundance.tsv" for a
#      transcriptome simulation, "<prefix>_coverage.tsv" otherwise),
#      followed by the per-parent counts when there are any.
#
# Dies (or croaks) on an unknown seq class/output format and on any
# failure to rename, concatenate, remove or close a file.
sub run_simulation {
    my $self = shift;
    my $piece_table = $self->_piece_table;

    # Calculate the number of reads to be generated
    my $number_of_reads = $self->_calculate_number_of_reads;

    # Function that returns strand by strand_bias
    my $strand = $self->_strand;

    # Function that returns seqid by seqid_weight
    my $seqid = $self->_seqid_raffle;

    # genome or transcriptome?
    my $simulation = $self->argv->[0];

    # Count file to be generated
    my $count_file = $simulation eq 'transcriptome'
        ? $self->prefix . '_abundance.tsv'
        : $self->prefix . '_coverage.tsv';

    # Main files
    my %files = (
        bam => [
            $self->prefix . '.bam'
        ],
        sam => [
            $self->prefix . '.sam'
        ],
        single_fastq => [
            $self->prefix . '_R1_001.fastq'
        ],
        single_fastq_gz => [
            $self->prefix . '_R1_001.fastq.gz'
        ],
        join_paired_fastq => [
            $self->prefix . '.fastq'
        ],
        join_paired_fastq_gz => [
            $self->prefix . '.fastq.gz'
        ],
        paired_fastq => [
            $self->prefix . '_R1_001.fastq',
            $self->prefix . '_R2_001.fastq'
        ],
        paired_fastq_gz => [
            $self->prefix . '_R1_001.fastq.gz',
            $self->prefix . '_R2_001.fastq.gz'
        ]
    );

    # Set the file class in order to know
    # how to deal with all files options
    my $seq_class = ref $self->seq;
    my $output_format = $self->output_format;
    my $file_class;

    # This mess is necessary to catch the
    # right value into the %files hash
    if ($output_format =~ /(sam|bam)/) {
        $file_class = $output_format;
    } elsif ($output_format =~ /fastq/) {
        if ($seq_class eq 'App::Sandy::Seq::SingleEnd') {
            $file_class = 'single_fastq';
        } elsif ($seq_class eq 'App::Sandy::Seq::PairedEnd') {
            $file_class = 'paired_fastq';
            $file_class = "join_$file_class" if $self->join_paired_ends;
        } else {
            croak "Something wrong with the seq class: $seq_class";
        }
        if ($output_format eq 'fastq.gz') {
            $file_class .= '_gz';
        }
    } else {
        croak "Something wrong with the output format: $output_format";
    }

    # Forks
    my $number_of_jobs = $self->jobs;
    my $pm = Parallel::ForkManager->new($number_of_jobs);

    # Parent child pids
    my $parent_pid = $$;
    my @child_pid;

    # Temporary files tracker. Filled in the parent, in job order, so
    # that the first entries are always the files of job 1
    my @tmp_files;

    # Run in parent right after creating child process
    $pm->run_on_start(
        sub {
            my $pid = shift;
            push @child_pid => $pid;
        }
    );

    # Count the overall cumulative number of reads for each seqid
    my %counters;

    # Run in parent right after finishing child process
    $pm->run_on_finish(
        sub {
            my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $counter_ref) = @_;
            while (my ($seqid, $count) = each %$counter_ref) {
                $counters{$seqid} += $count;
            }
        }
    );

    log_msg sprintf ":: Creating %d child %s ...",
        $number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs";

    for my $tid (1..$number_of_jobs) {
        #-------------------------------------------------------------------------------
        # Inside parent
        #-------------------------------------------------------------------------------
        log_msg ":: Creating job $tid ...";
        my @files_t = map { "$_.${parent_pid}.part$tid" } @{ $files{$file_class} };
        push @tmp_files => @files_t;
        my $pid = $pm->start and next;

        #-------------------------------------------------------------------------------
        # Inside child
        #-------------------------------------------------------------------------------
        # Interlace child/parent processes
        my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => [$parent_pid]);

        # Set child seed
        $self->_set_seed($tid);

        # Calculate the number of reads to this job and correct this local index
        # to the global index
        my $number_of_reads_t = int($number_of_reads/$number_of_jobs);
        my $last_read_idx = $number_of_reads_t * $tid;
        my $idx = $last_read_idx - $number_of_reads_t + 1;

        # If it is the last job, make it work on the leftover reads of int() truncation
        $last_read_idx += $number_of_reads % $number_of_jobs
            if $tid == $number_of_jobs;

        log_msg " => Job $tid: Working on sequences from $idx to $last_read_idx";

        # Create temporary files
        log_msg " => Job $tid: Creating temporary file: @files_t";

        # And here we go ...
        my @fhs;

        # Set the right filehandle format
        if ($output_format =~ /^(sam|fastq)$/) {
            @fhs = map { $self->with_open_w($_, 0) } @files_t;
        } elsif ($output_format eq 'fastq.gz') {
            @fhs = map { $self->with_open_w($_, $self->compression_level) } @files_t;
        } elsif ($output_format eq 'bam') {
            @fhs = map { $self->with_open_bam_w($_, $self->compression_level) } @files_t;
        } else {
            # Report the value that was actually tested above
            croak "Something wrong with the output format: $output_format";
        }

        # sprint_seq gives two entries for paired-end, so
        # if it is a bam|sam|join-paired-ends, it is necessary
        # to copy the filehandle in order to print both entries
        # to the same file
        if ($seq_class eq 'App::Sandy::Seq::PairedEnd'
            && $file_class =~ /(sam|bam|join)/) {
            $fhs[1] = $fhs[0];
        }

        # Count the cumulative number of reads for each seqid
        my %counter;

        # If the output format is 'bam|sam' and it is the first job, then
        # write the header
        if ($output_format =~ /^(sam|bam)$/ && $tid == 1) {
            my $header_ref = $self->gen_sam_header($self->argv);
            print {$fhs[0]} "$$header_ref";
        }

        # Run simulation in child
        for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) {
            my $id = $seqid->();
            my $ptable = $piece_table->{$id->{seq_id}}{$id->{type}};
            my @seq_entry;
            try {
                @seq_entry = $self->sprint_seq($tid, $i, $id->{seq_id}, $id->{type},
                    $ptable->{table}, $ptable->{size}, $strand->());
            } catch {
                die "Not defined entry for seqid '>$id->{seq_id}' at job $tid: $_";
            } finally {
                # @_ is empty only when the try block succeeded
                unless (@_) {
                    for my $fh_idx (0..$#fhs) {
                        $counter{$id->{seq_id}}++;
                        print {$fhs[$fh_idx]} "${$seq_entry[$fh_idx]}";
                    }
                }
            };
        }

        log_msg " => Job $tid: Writing and closing file: @files_t";

        # Close temporary files
        # Get index from @files_t in order to avoid
        # close the same filehandle twice - When the
        # position 1-N is a copy.
        # Buffered write errors only show up at close, so check it
        for my $fh_idx (0..$#files_t) {
            close $fhs[$fh_idx]
                or die "Cannot write file $files_t[$fh_idx]: $!\n";
        }

        # If it is a bam and it is the last loop, then
        # write an eof marker
        if ($output_format eq 'bam' && $tid == $number_of_jobs) {
            $self->gen_eof_marker($files_t[0]);
        }

        # Child exit
        log_msg " => Job $tid is finished";
        $pm->finish(0, \%counter);
    }

    # Back to parent
    # Interlace parent/child(s) processes
    my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => \@child_pid);
    $pm->wait_all_children;

    if ($sig->signal_catched) {
        log_msg ":: Termination signal received!";
    }

    log_msg ":: Saving the work ...";

    # Concatenate all temporary files
    log_msg ":: Concatenate all temporary files";

    # Save time. Rename tmp_file (1,2)
    for my $file (@{ $files{$file_class} }) {
        my $tmp = shift @tmp_files;
        log_msg " => Concatenating $tmp to $file ...";
        rename $tmp => $file
            or die "Cannot create '$file': $!\n";
    }

    # Append to renamed tmp files
    my @fh = map { $self->with_open_a($_) } @{ $files{$file_class} };

    for my $i (0..$#tmp_files) {
        # The remaining temporary files cycle through the output files
        # in the same order they were created for each job
        my $fh_idx = $i % scalar @fh;

        log_msg " => Concatenating $tmp_files[$i] to $files{$file_class}[$fh_idx] ...";
        cat $tmp_files[$i] => $fh[$fh_idx]
            or die "Cannot concatenate $tmp_files[$i] to $files{$file_class}[$fh_idx]: $!\n";

        # Clean up the mess
        unlink $tmp_files[$i]
            or die "Cannot remove temporary file '$tmp_files[$i]': $!\n";
    }

    # Close files
    log_msg ":: Writing and closing output file: @{ $files{$file_class} }";
    for my $fh_idx (0..$#fh) {
        close $fh[$fh_idx]
            or die "Cannot write file $files{$file_class}[$fh_idx]: $!\n";
    }

    # Save counts
    log_msg ":: Saving count file";
    my $count_fh = $self->with_open_w($count_file, 0);

    # It is necessary to correct the abundance according to
    # fragment sequencing end
    my $count_factor = 1;
    if ($self->count_loops_by eq 'number-of-reads'
        && ref($self->seq) eq 'App::Sandy::Seq::PairedEnd') {
        $count_factor = 2;
    }

    log_msg " => Writing counts to $count_file ...";
    for my $id (sort keys %counters) {
        printf {$count_fh} "%s\t%d\n" => $id,
            int($counters{$id} / $count_factor);
    }

    # Just in case, calculate 'gene' like expression.
    # _calculate_parent_count returns nothing when there is no
    # fasta_rtree, so fall back to an empty hash explicitly
    my $parent_count = $self->_calculate_parent_count(\%counters) || {};

    for my $id (sort keys %$parent_count) {
        printf {$count_fh} "%s\t%d\n" => $id,
            int($parent_count->{$id} / $count_factor);
    }

    # Close $count_file
    log_msg ":: Writing and closing $count_file ...";
    close $count_fh
        or die "Cannot write file $count_file: $!\n";
}
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
__END__ |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
=pod |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
=encoding UTF-8 |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
=head1 NAME |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
App::Sandy::Simulator - Class responsible to make the simulation |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=head1 VERSION |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
version 0.22 |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
=head1 AUTHORS |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
=over 4 |
1176
|
|
|
|
|
|
|
|
1177
|
|
|
|
|
|
|
=item * |
1178
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
=item * |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
=item * |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
1188
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
=item * |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
1192
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
=item * |
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
=item * |
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
=item * |
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=back |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital. |
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
This is free software, licensed under: |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
=cut |