| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package App::Sandy::Simulator; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Class responsible to make the simulation |
|
3
|
|
|
|
|
|
|
|
|
4
|
6
|
|
|
6
|
|
24205
|
use App::Sandy::Base 'class'; |
|
|
6
|
|
|
|
|
13
|
|
|
|
6
|
|
|
|
|
46
|
|
|
5
|
6
|
|
|
6
|
|
435
|
use App::Sandy::Seq::SingleEnd; |
|
|
6
|
|
|
|
|
433
|
|
|
|
6
|
|
|
|
|
160
|
|
|
6
|
6
|
|
|
6
|
|
450
|
use App::Sandy::Seq::PairedEnd; |
|
|
6
|
|
|
|
|
470
|
|
|
|
6
|
|
|
|
|
229
|
|
|
7
|
6
|
|
|
6
|
|
3125
|
use App::Sandy::InterlaceProcesses; |
|
|
6
|
|
|
|
|
2739
|
|
|
|
6
|
|
|
|
|
279
|
|
|
8
|
6
|
|
|
6
|
|
3697
|
use App::Sandy::WeightedRaffle; |
|
|
6
|
|
|
|
|
2724
|
|
|
|
6
|
|
|
|
|
299
|
|
|
9
|
6
|
|
|
6
|
|
548
|
use App::Sandy::PieceTable; |
|
|
6
|
|
|
|
|
438
|
|
|
|
6
|
|
|
|
|
177
|
|
|
10
|
6
|
|
|
6
|
|
3188
|
use App::Sandy::DB::Handle::Expression; |
|
|
6
|
|
|
|
|
2702
|
|
|
|
6
|
|
|
|
|
282
|
|
|
11
|
6
|
|
|
6
|
|
3807
|
use App::Sandy::DB::Handle::Variation; |
|
|
6
|
|
|
|
|
2609
|
|
|
|
6
|
|
|
|
|
293
|
|
|
12
|
6
|
|
|
6
|
|
68
|
use List::Util 'min'; |
|
|
6
|
|
|
|
|
17
|
|
|
|
6
|
|
|
|
|
460
|
|
|
13
|
6
|
|
|
6
|
|
5336
|
use File::Cat 'cat'; |
|
|
6
|
|
|
|
|
3380
|
|
|
|
6
|
|
|
|
|
400
|
|
|
14
|
6
|
|
|
6
|
|
3475
|
use Parallel::ForkManager; |
|
|
6
|
|
|
|
|
340548
|
|
|
|
6
|
|
|
|
|
48314
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.22'; # VERSION |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
has 'argv' => ( |
|
21
|
|
|
|
|
|
|
is => 'ro', |
|
22
|
|
|
|
|
|
|
isa => 'ArrayRef[Str]', |
|
23
|
|
|
|
|
|
|
required => 1 |
|
24
|
|
|
|
|
|
|
); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
has 'truncate' => ( |
|
27
|
|
|
|
|
|
|
is => 'ro', |
|
28
|
|
|
|
|
|
|
isa => 'Bool', |
|
29
|
|
|
|
|
|
|
required => 1 |
|
30
|
|
|
|
|
|
|
); |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
has 'seed' => ( |
|
33
|
|
|
|
|
|
|
is => 'ro', |
|
34
|
|
|
|
|
|
|
isa => 'Int', |
|
35
|
|
|
|
|
|
|
required => 1 |
|
36
|
|
|
|
|
|
|
); |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
has 'jobs' => ( |
|
39
|
|
|
|
|
|
|
is => 'ro', |
|
40
|
|
|
|
|
|
|
isa => 'My:IntGt0', |
|
41
|
|
|
|
|
|
|
required => 1 |
|
42
|
|
|
|
|
|
|
); |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
has 'prefix' => ( |
|
45
|
|
|
|
|
|
|
is => 'ro', |
|
46
|
|
|
|
|
|
|
isa => 'Str', |
|
47
|
|
|
|
|
|
|
required => 1 |
|
48
|
|
|
|
|
|
|
); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
has 'join_paired_ends' => ( |
|
51
|
|
|
|
|
|
|
is => 'ro', |
|
52
|
|
|
|
|
|
|
isa => 'Bool', |
|
53
|
|
|
|
|
|
|
required => 1 |
|
54
|
|
|
|
|
|
|
); |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
has 'output_format' => ( |
|
57
|
|
|
|
|
|
|
is => 'ro', |
|
58
|
|
|
|
|
|
|
isa => 'My:Format', |
|
59
|
|
|
|
|
|
|
required => 1 |
|
60
|
|
|
|
|
|
|
); |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
has 'compression_level' => ( |
|
63
|
|
|
|
|
|
|
is => 'ro', |
|
64
|
|
|
|
|
|
|
isa => 'My:Level', |
|
65
|
|
|
|
|
|
|
required => 1 |
|
66
|
|
|
|
|
|
|
); |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
has 'fasta_file' => ( |
|
69
|
|
|
|
|
|
|
is => 'ro', |
|
70
|
|
|
|
|
|
|
isa => 'My:Fasta', |
|
71
|
|
|
|
|
|
|
required => 1 |
|
72
|
|
|
|
|
|
|
); |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
has 'coverage' => ( |
|
75
|
|
|
|
|
|
|
is => 'ro', |
|
76
|
|
|
|
|
|
|
isa => 'My:NumGt0', |
|
77
|
|
|
|
|
|
|
required => 0 |
|
78
|
|
|
|
|
|
|
); |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
has 'number_of_reads' => ( |
|
81
|
|
|
|
|
|
|
is => 'ro', |
|
82
|
|
|
|
|
|
|
isa => 'My:IntGt0', |
|
83
|
|
|
|
|
|
|
required => 0 |
|
84
|
|
|
|
|
|
|
); |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
has 'count_loops_by' => ( |
|
87
|
|
|
|
|
|
|
is => 'ro', |
|
88
|
|
|
|
|
|
|
isa => 'My:CountLoopBy', |
|
89
|
|
|
|
|
|
|
required => 1 |
|
90
|
|
|
|
|
|
|
); |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
has 'strand_bias' => ( |
|
93
|
|
|
|
|
|
|
is => 'ro', |
|
94
|
|
|
|
|
|
|
isa => 'My:StrandBias', |
|
95
|
|
|
|
|
|
|
required => 1 |
|
96
|
|
|
|
|
|
|
); |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
has 'seqid_weight' => ( |
|
99
|
|
|
|
|
|
|
is => 'ro', |
|
100
|
|
|
|
|
|
|
isa => 'My:SeqIdWeight', |
|
101
|
|
|
|
|
|
|
required => 1 |
|
102
|
|
|
|
|
|
|
); |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
has 'expression_matrix' => ( |
|
105
|
|
|
|
|
|
|
is => 'ro', |
|
106
|
|
|
|
|
|
|
isa => 'Str', |
|
107
|
|
|
|
|
|
|
required => 0 |
|
108
|
|
|
|
|
|
|
); |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
has 'genomic_variation' => ( |
|
111
|
|
|
|
|
|
|
is => 'ro', |
|
112
|
|
|
|
|
|
|
isa => 'ArrayRef[Str]', |
|
113
|
|
|
|
|
|
|
required => 0 |
|
114
|
|
|
|
|
|
|
); |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
has '_genomic_variation_names' => ( |
|
117
|
|
|
|
|
|
|
is => 'ro', |
|
118
|
|
|
|
|
|
|
isa => 'Maybe[Str]', |
|
119
|
|
|
|
|
|
|
builder => '_build_genomic_variation_names', |
|
120
|
|
|
|
|
|
|
lazy_build => 1 |
|
121
|
|
|
|
|
|
|
); |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
has 'seq' => ( |
|
124
|
|
|
|
|
|
|
is => 'ro', |
|
125
|
|
|
|
|
|
|
isa => 'App::Sandy::Seq::SingleEnd | App::Sandy::Seq::PairedEnd', |
|
126
|
|
|
|
|
|
|
required => 1, |
|
127
|
|
|
|
|
|
|
handles => [ qw{ sprint_seq gen_sam_header gen_eof_marker } ] |
|
128
|
|
|
|
|
|
|
); |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
has '_fasta' => ( |
|
131
|
|
|
|
|
|
|
is => 'ro', |
|
132
|
|
|
|
|
|
|
isa => 'My:IdxFasta', |
|
133
|
|
|
|
|
|
|
builder => '_build_fasta', |
|
134
|
|
|
|
|
|
|
lazy_build => 1 |
|
135
|
|
|
|
|
|
|
); |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
has '_fasta_tree' => ( |
|
138
|
|
|
|
|
|
|
traits => ['Hash'], |
|
139
|
|
|
|
|
|
|
is => 'ro', |
|
140
|
|
|
|
|
|
|
isa => 'HashRef[ArrayRef]', |
|
141
|
|
|
|
|
|
|
default => sub { {} }, |
|
142
|
|
|
|
|
|
|
handles => { |
|
143
|
|
|
|
|
|
|
_set_fasta_tree => 'set', |
|
144
|
|
|
|
|
|
|
_get_fasta_tree => 'get', |
|
145
|
|
|
|
|
|
|
_exists_fasta_tree => 'exists', |
|
146
|
|
|
|
|
|
|
_fasta_tree_pairs => 'kv', |
|
147
|
|
|
|
|
|
|
_has_no_fasta_tree => 'is_empty' |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
); |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
has '_fasta_rtree' => ( |
|
152
|
|
|
|
|
|
|
traits => ['Hash'], |
|
153
|
|
|
|
|
|
|
is => 'ro', |
|
154
|
|
|
|
|
|
|
isa => 'HashRef[Str]', |
|
155
|
|
|
|
|
|
|
default => sub { {} }, |
|
156
|
|
|
|
|
|
|
handles => { |
|
157
|
|
|
|
|
|
|
_set_fasta_rtree => 'set', |
|
158
|
|
|
|
|
|
|
_get_fasta_rtree => 'get', |
|
159
|
|
|
|
|
|
|
_delete_fasta_rtree => 'delete', |
|
160
|
|
|
|
|
|
|
_exists_fasta_rtree => 'exists', |
|
161
|
|
|
|
|
|
|
_fasta_rtree_pairs => 'kv', |
|
162
|
|
|
|
|
|
|
_has_no_fasta_rtree => 'is_empty' |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
); |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
has '_seqname' => ( |
|
167
|
|
|
|
|
|
|
traits => ['Hash'], |
|
168
|
|
|
|
|
|
|
is => 'ro', |
|
169
|
|
|
|
|
|
|
isa => 'HashRef[Str]', |
|
170
|
|
|
|
|
|
|
default => sub { {} }, |
|
171
|
|
|
|
|
|
|
handles => { |
|
172
|
|
|
|
|
|
|
_set_seqname => 'set', |
|
173
|
|
|
|
|
|
|
_get_seqname => 'get' |
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
); |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
has '_piece_table' => ( |
|
178
|
|
|
|
|
|
|
is => 'ro', |
|
179
|
|
|
|
|
|
|
isa => 'HashRef[HashRef[My:PieceTable]]', |
|
180
|
|
|
|
|
|
|
builder => '_build_piece_table', |
|
181
|
|
|
|
|
|
|
lazy_build => 1 |
|
182
|
|
|
|
|
|
|
); |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
has '_strand' => ( |
|
185
|
|
|
|
|
|
|
is => 'ro', |
|
186
|
|
|
|
|
|
|
isa => 'CodeRef', |
|
187
|
|
|
|
|
|
|
builder => '_build_strand', |
|
188
|
|
|
|
|
|
|
lazy_build => 1 |
|
189
|
|
|
|
|
|
|
); |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
has '_seqid_raffle' => ( |
|
192
|
|
|
|
|
|
|
is => 'ro', |
|
193
|
|
|
|
|
|
|
isa => 'CodeRef', |
|
194
|
|
|
|
|
|
|
builder => '_build_seqid_raffle', |
|
195
|
|
|
|
|
|
|
lazy_build => 1 |
|
196
|
|
|
|
|
|
|
); |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub BUILD { |
|
199
|
20
|
|
|
20
|
0
|
1105
|
my $self = shift; |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# If seqid_weight is 'count', then expression_matrix must be defined |
|
202
|
20
|
50
|
33
|
|
|
620
|
if ($self->seqid_weight eq 'count' and not defined $self->expression_matrix) { |
|
203
|
0
|
|
|
|
|
0
|
croak "seqid_weight=count requires a expression_matrix\n"; |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# If count_loops_by is 'coverage', then coverage must be defined. Else if |
|
207
|
|
|
|
|
|
|
# it is equal to 'number_of_reads', then number_of_reads must be defined |
|
208
|
20
|
50
|
33
|
|
|
535
|
if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) { |
|
|
|
50
|
33
|
|
|
|
|
|
209
|
0
|
|
|
|
|
0
|
croak "count_loops_by=coverage requires a coverage number\n"; |
|
210
|
|
|
|
|
|
|
} elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) { |
|
211
|
0
|
|
|
|
|
0
|
croak "count_loops_by=number_of_reads requires a number_of_reads number\n"; |
|
212
|
|
|
|
|
|
|
} |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
## Just to ensure that the lazy attributes are built before &new returns |
|
215
|
20
|
|
|
|
|
550
|
$self->_piece_table; |
|
216
|
20
|
|
|
|
|
550
|
$self->_seqid_raffle; |
|
217
|
20
|
|
|
|
|
515
|
$self->_fasta; |
|
218
|
20
|
|
|
|
|
510
|
$self->_strand; |
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub _build_strand { |
|
222
|
20
|
|
|
20
|
|
40
|
my $self = shift; |
|
223
|
20
|
|
|
|
|
40
|
my $strand_sub; |
|
224
|
|
|
|
|
|
|
|
|
225
|
20
|
50
|
|
|
|
600
|
if ($self->strand_bias eq 'plus') { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
226
|
0
|
|
|
0
|
|
0
|
$strand_sub = sub {1}; |
|
|
0
|
|
|
|
|
0
|
|
|
227
|
|
|
|
|
|
|
} elsif ($self->strand_bias eq 'minus') { |
|
228
|
0
|
|
|
0
|
|
0
|
$strand_sub = sub {0}; |
|
|
0
|
|
|
|
|
0
|
|
|
229
|
|
|
|
|
|
|
} elsif ($self->strand_bias eq 'random') { |
|
230
|
20
|
|
|
1710
|
|
80
|
$strand_sub = sub { int(rand(2)) }; |
|
|
1710
|
|
|
|
|
7756
|
|
|
231
|
|
|
|
|
|
|
} else { |
|
232
|
0
|
|
|
|
|
0
|
croak sprintf "Unknown option '%s' for strand bias\n", |
|
233
|
|
|
|
|
|
|
$self->strand_bias; |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
|
|
236
|
20
|
|
|
|
|
495
|
return $strand_sub; |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
sub _index_fasta { |
|
240
|
20
|
|
|
20
|
|
45
|
my $self = shift; |
|
241
|
20
|
|
|
|
|
505
|
my $fasta = $self->fasta_file; |
|
242
|
|
|
|
|
|
|
|
|
243
|
20
|
|
|
|
|
100
|
my $fh = $self->with_open_r($fasta); |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# indexed_genome = ID => (seq, len) |
|
246
|
20
|
|
|
|
|
75
|
my %indexed_fasta; |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# >ID|PID as in gencode transcripts |
|
249
|
|
|
|
|
|
|
my %fasta_rtree; |
|
250
|
20
|
|
|
|
|
0
|
my $id; |
|
251
|
|
|
|
|
|
|
|
|
252
|
20
|
|
|
|
|
570
|
while (<$fh>) { |
|
253
|
860
|
|
|
|
|
1290
|
chomp; |
|
254
|
860
|
50
|
|
|
|
1540
|
next if /^;/; |
|
255
|
860
|
100
|
|
|
|
1540
|
if (/^>/) { |
|
256
|
100
|
|
|
|
|
315
|
my @fields = split /\|/; |
|
257
|
100
|
|
|
|
|
230
|
$id = $fields[0]; |
|
258
|
100
|
|
|
|
|
280
|
$id =~ s/^>//; |
|
259
|
100
|
|
|
|
|
325
|
$id =~ s/^\s+|\s+$//g; |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Seq ID standardization in order to manage comparations |
|
262
|
|
|
|
|
|
|
# between chr1, Chr1, CHR1, 1 etc; |
|
263
|
100
|
|
|
|
|
355
|
my $std_id = $self->with_std_seqid($id); |
|
264
|
100
|
|
|
|
|
3735
|
$self->_set_seqname( |
|
265
|
|
|
|
|
|
|
$id => $std_id, |
|
266
|
|
|
|
|
|
|
$std_id => $id |
|
267
|
|
|
|
|
|
|
); |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# It is necessary to catch gene -> transcript relation |
|
270
|
|
|
|
|
|
|
# # TODO: Make a hash tarit for indexed fasta |
|
271
|
100
|
50
|
|
|
|
610
|
if (defined $fields[1]) { |
|
272
|
0
|
|
|
|
|
0
|
my $pid = $fields[1]; |
|
273
|
0
|
|
|
|
|
0
|
$pid =~ s/^\s+|\s+$//g; |
|
274
|
0
|
|
|
|
|
0
|
$fasta_rtree{$id} = $pid; |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
} else { |
|
277
|
760
|
50
|
|
|
|
1290
|
die "Error reading fasta file '$fasta': Not defined id" |
|
278
|
|
|
|
|
|
|
unless defined $id; |
|
279
|
760
|
|
|
|
|
2270
|
$indexed_fasta{$id}{seq} .= $_; |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
20
|
|
|
|
|
115
|
for (keys %indexed_fasta) { |
|
284
|
100
|
|
|
|
|
255
|
$indexed_fasta{$_}{size} = length $indexed_fasta{$_}{seq}; |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
20
|
50
|
|
|
|
65
|
unless (%indexed_fasta) { |
|
288
|
0
|
|
|
|
|
0
|
die "Error parsing '$fasta'. Maybe the file is empty\n"; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
$fh->close |
|
292
|
20
|
50
|
|
|
|
135
|
or die "Cannot close file $fasta: $!\n"; |
|
293
|
|
|
|
|
|
|
|
|
294
|
20
|
50
|
|
|
|
420
|
$self->_set_fasta_rtree(%fasta_rtree) if %fasta_rtree; |
|
295
|
20
|
|
|
|
|
110
|
return \%indexed_fasta; |
|
296
|
|
|
|
|
|
|
} |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
sub _build_fasta { |
|
299
|
20
|
|
|
20
|
|
40
|
my $self = shift; |
|
300
|
20
|
|
|
|
|
520
|
my $fasta = $self->fasta_file; |
|
301
|
|
|
|
|
|
|
|
|
302
|
20
|
|
|
|
|
120
|
log_msg ":: Indexing fasta file '$fasta' ..."; |
|
303
|
20
|
|
|
|
|
55
|
my $indexed_fasta = $self->_index_fasta; |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# Validate genome about the read size required |
|
306
|
20
|
|
|
|
|
120
|
log_msg ":: Validating fasta file '$fasta' ..."; |
|
307
|
|
|
|
|
|
|
# Entries to remove |
|
308
|
20
|
|
|
|
|
35
|
my @blacklist; |
|
309
|
|
|
|
|
|
|
|
|
310
|
20
|
50
|
|
|
|
695
|
unless ($self->truncate) { |
|
311
|
20
|
|
|
|
|
75
|
for my $id (keys %$indexed_fasta) { |
|
312
|
100
|
|
|
|
|
190
|
my $index_size = $indexed_fasta->{$id}{size}; |
|
313
|
100
|
|
|
|
|
2455
|
my $class = ref $self->seq; |
|
314
|
|
|
|
|
|
|
|
|
315
|
100
|
100
|
|
|
|
280
|
if ($class eq 'App::Sandy::Seq::SingleEnd') { |
|
|
|
50
|
|
|
|
|
|
|
316
|
50
|
|
|
|
|
1170
|
my $read_mean = $self->seq->read_mean; |
|
317
|
50
|
50
|
|
|
|
135
|
if ($index_size < $read_mean) { |
|
318
|
0
|
|
|
|
|
0
|
log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required read mean ($read_mean)"; |
|
319
|
0
|
|
|
|
|
0
|
delete $indexed_fasta->{$id}; |
|
320
|
0
|
|
|
|
|
0
|
push @blacklist => $id; |
|
321
|
|
|
|
|
|
|
} |
|
322
|
|
|
|
|
|
|
} elsif ($class eq 'App::Sandy::Seq::PairedEnd') { |
|
323
|
50
|
|
|
|
|
1355
|
my $fragment_mean = $self->seq->fragment_mean; |
|
324
|
50
|
50
|
|
|
|
130
|
if ($index_size < $fragment_mean) { |
|
325
|
0
|
|
|
|
|
0
|
log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)"; |
|
326
|
0
|
|
|
|
|
0
|
delete $indexed_fasta->{$id}; |
|
327
|
0
|
|
|
|
|
0
|
push @blacklist => $id; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
} else { |
|
330
|
0
|
|
|
|
|
0
|
croak "Unknown option '$class' for sequencing type\n"; |
|
331
|
|
|
|
|
|
|
} |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
20
|
50
|
|
|
|
65
|
unless (%$indexed_fasta) { |
|
336
|
0
|
|
|
|
|
0
|
die sprintf "Fasta file '%s' has no valid entry\n" => $self->fasta_file; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# If fasta_rtree has entries |
|
340
|
20
|
50
|
|
|
|
715
|
unless ($self->_has_no_fasta_rtree) { |
|
341
|
|
|
|
|
|
|
# Remove no valid entries from id -> pid relation |
|
342
|
0
|
0
|
|
|
|
0
|
$self->_delete_fasta_rtree(@blacklist) if @blacklist; |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
20
|
|
|
|
|
525
|
return $indexed_fasta; |
|
346
|
|
|
|
|
|
|
} |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
sub _populate_fasta_tree { |
|
349
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# If fasta_rtree has entries |
|
352
|
20
|
50
|
|
|
|
705
|
unless ($self->_has_no_fasta_rtree) { |
|
353
|
|
|
|
|
|
|
# Build parent -> child ids relation |
|
354
|
0
|
|
|
|
|
0
|
my %fasta_tree; |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# Reverse fasta_rtree to pid -> \@ids |
|
357
|
0
|
|
|
|
|
0
|
for my $pair ($self->_fasta_rtree_pairs) { |
|
358
|
0
|
|
|
|
|
0
|
my ($id, $pid) = (@$pair); |
|
359
|
0
|
|
|
|
|
0
|
push @{ $fasta_tree{$pid} } => $id; |
|
|
0
|
|
|
|
|
0
|
|
|
360
|
|
|
|
|
|
|
} |
|
361
|
|
|
|
|
|
|
|
|
362
|
0
|
|
|
|
|
0
|
$self->_set_fasta_tree(%fasta_tree); |
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
} |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
sub _retrieve_expression_matrix { |
|
367
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
|
368
|
0
|
|
|
|
|
0
|
my $expression = App::Sandy::DB::Handle::Expression->new; |
|
369
|
0
|
|
|
|
|
0
|
return $expression->retrievedb($self->expression_matrix); |
|
370
|
|
|
|
|
|
|
} |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
sub _build_seqid_raffle { |
|
373
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# Get the piece table |
|
376
|
20
|
|
|
|
|
530
|
my $piece_table = $self->_piece_table; |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
# The builded function |
|
379
|
20
|
|
|
|
|
40
|
my $seqid_sub; |
|
380
|
|
|
|
|
|
|
|
|
381
|
20
|
50
|
|
|
|
580
|
if ($self->seqid_weight eq 'same') { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
382
|
0
|
|
|
0
|
|
0
|
my ($keys, $weights) = $self->_populate_key_weight($piece_table, sub { 1 }); |
|
|
0
|
|
|
|
|
0
|
|
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# If weight == 1 means that there are 2 keys for |
|
385
|
|
|
|
|
|
|
# the same seq_id. |
|
386
|
|
|
|
|
|
|
# If weight == 2 means that there is only one key |
|
387
|
|
|
|
|
|
|
# for the seq_id, so I double that key |
|
388
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < @$weights; $i++) { |
|
389
|
0
|
0
|
|
|
|
0
|
if ($weights->[$i] > 1) { |
|
390
|
0
|
|
|
|
|
0
|
push @$keys => $keys->[$i]; |
|
391
|
|
|
|
|
|
|
} |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
0
|
my $keys_size = scalar @$keys; |
|
395
|
0
|
|
|
0
|
|
0
|
$seqid_sub = sub { $keys->[int(rand($keys_size))] }; |
|
|
0
|
|
|
|
|
0
|
|
|
396
|
|
|
|
|
|
|
} elsif ($self->seqid_weight eq 'count') { |
|
397
|
|
|
|
|
|
|
# Catch expression-matrix entry from database |
|
398
|
0
|
|
|
|
|
0
|
my $indexed_file = $self->_retrieve_expression_matrix; |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# Catch indexed fasta |
|
401
|
0
|
|
|
|
|
0
|
my $indexed_fasta = $self->_fasta; |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# Validate expression_matrix |
|
404
|
0
|
|
|
|
|
0
|
for my $id (keys %$indexed_file) { |
|
405
|
|
|
|
|
|
|
# If not exists into indexed_fasta, it must then exist into fasta_tree |
|
406
|
0
|
0
|
0
|
|
|
0
|
unless (exists $piece_table->{$id} || $self->_exists_fasta_tree($id)) { |
|
407
|
0
|
|
|
|
|
0
|
log_msg sprintf ":: Ignoring seqid '%s' from expression-matrix '%s': It is not found into the indexed fasta" |
|
408
|
|
|
|
|
|
|
=> $id, $self->expression_matrix; |
|
409
|
0
|
|
|
|
|
0
|
delete $indexed_file->{$id}; |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
} |
|
412
|
|
|
|
|
|
|
|
|
413
|
0
|
0
|
|
|
|
0
|
unless (%$indexed_file) { |
|
414
|
0
|
|
|
|
|
0
|
die sprintf "No valid seqid entry of the expression-matrix '%s' is recorded into the indexed fasta\n" |
|
415
|
|
|
|
|
|
|
=> $self->expression_matrix; |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
0
|
my (%ptable_ind, %ptable_cluster); |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
# Split indexed_file seq_ids between those |
|
421
|
|
|
|
|
|
|
# into piece_table and those that represents a cluster |
|
422
|
|
|
|
|
|
|
# of seq_ids as in gene -> transcript relationship |
|
423
|
0
|
|
|
|
|
0
|
for my $seq_id (keys %$indexed_file) { |
|
424
|
0
|
0
|
|
|
|
0
|
if (exists $piece_table->{$seq_id}) { |
|
425
|
0
|
|
|
|
|
0
|
$ptable_ind{$seq_id} = $piece_table->{$seq_id}; |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
} else { |
|
428
|
0
|
|
|
|
|
0
|
my $ids = $self->_get_fasta_tree($seq_id); |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Bug catcher |
|
431
|
0
|
0
|
|
|
|
0
|
unless (@$ids) { |
|
432
|
0
|
|
|
|
|
0
|
croak "seq_id '$seq_id' not found into piece_table"; |
|
433
|
|
|
|
|
|
|
} |
|
434
|
|
|
|
|
|
|
|
|
435
|
0
|
|
|
|
|
0
|
$ptable_cluster{$seq_id} = $ids; |
|
436
|
|
|
|
|
|
|
} |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
# Let's calculate the weight taking in acount |
|
440
|
|
|
|
|
|
|
# the size increase/decrease |
|
441
|
|
|
|
|
|
|
my $calc_ind_weight = sub { |
|
442
|
0
|
|
|
0
|
|
0
|
my ($seq_id, $type) = @_; |
|
443
|
|
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
0
|
my $counts = $indexed_file->{$seq_id}; |
|
445
|
0
|
|
|
|
|
0
|
my $size = $piece_table->{$seq_id}{$type}{size}; |
|
446
|
0
|
|
|
|
|
0
|
my $fasta_size = $indexed_fasta->{$seq_id}{size}; |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
# Correct the weight according to the |
|
449
|
|
|
|
|
|
|
# genomic variation change by the ratio |
|
450
|
|
|
|
|
|
|
# between the table size and fasta size |
|
451
|
0
|
|
|
|
|
0
|
my $factor = $size / $fasta_size; |
|
452
|
|
|
|
|
|
|
|
|
453
|
0
|
|
|
|
|
0
|
return $counts * $factor; |
|
454
|
0
|
|
|
|
|
0
|
}; |
|
455
|
|
|
|
|
|
|
|
|
456
|
0
|
|
|
|
|
0
|
my ($keys, $weights); |
|
457
|
|
|
|
|
|
|
|
|
458
|
0
|
0
|
|
|
|
0
|
if (%ptable_ind) { |
|
459
|
0
|
|
|
|
|
0
|
($keys, $weights) = $self->_populate_key_weight(\%ptable_ind, |
|
460
|
|
|
|
|
|
|
$calc_ind_weight); |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# If there are seq_id cluster like, then its is |
|
464
|
|
|
|
|
|
|
# time to calculate these weights |
|
465
|
0
|
|
|
|
|
0
|
for my $seq_id (sort keys %ptable_cluster) { |
|
466
|
0
|
|
|
|
|
0
|
my %ptable; |
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
# Slice piece_table hash |
|
469
|
0
|
|
|
|
|
0
|
my $ids = $ptable_cluster{$seq_id}; |
|
470
|
0
|
|
|
|
|
0
|
@ptable{@$ids} = @$piece_table{@$ids}; |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
# total size among all ids of cluster |
|
473
|
0
|
|
|
|
|
0
|
my %total; |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
# Calculate the total size by type |
|
476
|
0
|
|
|
|
|
0
|
for my $type_h (values %ptable) { |
|
477
|
0
|
|
|
|
|
0
|
for my $type (keys %$type_h) { |
|
478
|
0
|
|
|
|
|
0
|
$total{$type} += $type_h->{$type}{size}; |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# Calculate the weight taking in acount the size increase/decrease |
|
483
|
|
|
|
|
|
|
# and the ratio between the total size by type and the table size. |
|
484
|
|
|
|
|
|
|
# The problem here is that I must divide the 'counts' for some 'seq_id' |
|
485
|
|
|
|
|
|
|
# among all ids that belong to it |
|
486
|
|
|
|
|
|
|
my $calc_cluster_weight = sub { |
|
487
|
0
|
|
|
0
|
|
0
|
my ($id, $type) = @_; |
|
488
|
|
|
|
|
|
|
|
|
489
|
0
|
|
|
|
|
0
|
my $counts = $indexed_file->{$seq_id}; |
|
490
|
0
|
|
|
|
|
0
|
my $size = $piece_table->{$id}{$type}{size}; |
|
491
|
0
|
|
|
|
|
0
|
my $fasta_size = $indexed_fasta->{$id}{size}; |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Divide the counts among all ids |
|
494
|
0
|
|
|
|
|
0
|
my $ratio = $size / $total{$type}; |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
# Correct the weight according to the size |
|
497
|
0
|
|
|
|
|
0
|
my $factor = $size / $fasta_size; |
|
498
|
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
0
|
return $counts * $factor * $ratio; |
|
500
|
0
|
|
|
|
|
0
|
}; |
|
501
|
|
|
|
|
|
|
|
|
502
|
0
|
|
|
|
|
0
|
my ($k, $w) = $self->_populate_key_weight(\%ptable, |
|
503
|
|
|
|
|
|
|
$calc_cluster_weight); |
|
504
|
|
|
|
|
|
|
|
|
505
|
0
|
|
|
|
|
0
|
push @$keys => @$k; |
|
506
|
0
|
|
|
|
|
0
|
push @$weights => @$w; |
|
507
|
|
|
|
|
|
|
} |
|
508
|
|
|
|
|
|
|
|
|
509
|
0
|
0
|
0
|
|
|
0
|
unless (@$keys && @$weights) { |
|
510
|
0
|
|
|
|
|
0
|
croak "No keys weights have been set"; |
|
511
|
|
|
|
|
|
|
} |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
# It is very necessary in order |
|
514
|
|
|
|
|
|
|
# to avoid truncation of numbers |
|
515
|
|
|
|
|
|
|
# between zero and one |
|
516
|
0
|
|
|
|
|
0
|
$self->_round_weight($weights); |
|
517
|
|
|
|
|
|
|
|
|
518
|
0
|
|
|
|
|
0
|
my $raffler = App::Sandy::WeightedRaffle->new( |
|
519
|
|
|
|
|
|
|
'weights' => $weights, |
|
520
|
|
|
|
|
|
|
'keys' => $keys |
|
521
|
|
|
|
|
|
|
); |
|
522
|
|
|
|
|
|
|
|
|
523
|
0
|
|
|
0
|
|
0
|
$seqid_sub = sub { $raffler->weighted_raffle }; |
|
|
0
|
|
|
|
|
0
|
|
|
524
|
|
|
|
|
|
|
} elsif ($self->seqid_weight eq 'length') { |
|
525
|
|
|
|
|
|
|
my $calc_weight = sub { |
|
526
|
100
|
|
|
100
|
|
210
|
my ($seq_id, $type) = @_; |
|
527
|
100
|
|
|
|
|
205
|
return $piece_table->{$seq_id}{$type}{size}; |
|
528
|
20
|
|
|
|
|
115
|
}; |
|
529
|
|
|
|
|
|
|
|
|
530
|
20
|
|
|
|
|
70
|
my ($keys, $weights) = $self->_populate_key_weight($piece_table, |
|
531
|
|
|
|
|
|
|
$calc_weight); |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# Just in case ... |
|
534
|
20
|
|
|
|
|
70
|
$self->_round_weight($weights); |
|
535
|
|
|
|
|
|
|
|
|
536
|
20
|
|
|
|
|
720
|
my $raffler = App::Sandy::WeightedRaffle->new( |
|
537
|
|
|
|
|
|
|
weights => $weights, |
|
538
|
|
|
|
|
|
|
keys => $keys |
|
539
|
|
|
|
|
|
|
); |
|
540
|
|
|
|
|
|
|
|
|
541
|
20
|
|
|
1710
|
|
155
|
$seqid_sub = sub { $raffler->weighted_raffle }; |
|
|
1710
|
|
|
|
|
6155
|
|
|
542
|
|
|
|
|
|
|
} else { |
|
543
|
0
|
|
|
|
|
0
|
croak sprintf "Unknown option '%s' for seqid_weight\n", |
|
544
|
|
|
|
|
|
|
$self->seqid_weight; |
|
545
|
|
|
|
|
|
|
} |
|
546
|
|
|
|
|
|
|
|
|
547
|
20
|
|
|
|
|
560
|
return $seqid_sub; |
|
548
|
|
|
|
|
|
|
} |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
sub _round_weight { |
|
551
|
20
|
|
|
20
|
|
40
|
my ($self, $weights) = @_; |
|
552
|
|
|
|
|
|
|
|
|
553
|
20
|
|
|
|
|
75
|
my $min = min @$weights; |
|
554
|
|
|
|
|
|
|
|
|
555
|
20
|
50
|
|
|
|
55
|
if ($min <= 0) { |
|
556
|
0
|
|
|
|
|
0
|
croak "min weight le to zero: $min"; |
|
557
|
|
|
|
|
|
|
} |
|
558
|
|
|
|
|
|
|
|
|
559
|
20
|
50
|
|
|
|
95
|
my $factor = $min < 1 |
|
560
|
|
|
|
|
|
|
? (1 / $min) |
|
561
|
|
|
|
|
|
|
: 1; |
|
562
|
|
|
|
|
|
|
|
|
563
|
20
|
|
|
|
|
45
|
for my $weight (@$weights) { |
|
564
|
100
|
|
|
|
|
195
|
$weight = int($weight * $factor + 0.5); |
|
565
|
|
|
|
|
|
|
} |
|
566
|
|
|
|
|
|
|
} |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
sub _populate_key_weight { |
|
569
|
20
|
|
|
20
|
|
55
|
my ($self, $piece_table, $calc_weight) = @_; |
|
570
|
|
|
|
|
|
|
|
|
571
|
20
|
|
|
|
|
35
|
my (@keys, @weights); |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# It needs to be sorted in order to the |
|
574
|
|
|
|
|
|
|
# seed works |
|
575
|
20
|
|
|
|
|
125
|
for my $seq_id (sort keys %$piece_table) { |
|
576
|
100
|
|
|
|
|
160
|
my $type_h = $piece_table->{$seq_id}; |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# If there is no alternative seq_id, then |
|
579
|
|
|
|
|
|
|
# set a factor to correct the size. |
|
580
|
|
|
|
|
|
|
# It is necessary because the seq_ids with |
|
581
|
|
|
|
|
|
|
# alternative and reference will double its |
|
582
|
|
|
|
|
|
|
# own coverage |
|
583
|
100
|
50
|
|
|
|
200
|
my $factor = scalar keys %$type_h == 1 |
|
584
|
|
|
|
|
|
|
? 2 |
|
585
|
|
|
|
|
|
|
: 1; |
|
586
|
|
|
|
|
|
|
|
|
587
|
100
|
|
|
|
|
205
|
for my $type (sort keys %$type_h) { |
|
588
|
|
|
|
|
|
|
|
|
589
|
100
|
|
|
|
|
330
|
my %key = ( |
|
590
|
|
|
|
|
|
|
'seq_id' => $seq_id, |
|
591
|
|
|
|
|
|
|
'type' => $type |
|
592
|
|
|
|
|
|
|
); |
|
593
|
|
|
|
|
|
|
|
|
594
|
100
|
|
|
|
|
185
|
my $weight = $calc_weight->($seq_id, $type); |
|
595
|
|
|
|
|
|
|
|
|
596
|
100
|
|
|
|
|
195
|
push @keys => \%key; |
|
597
|
100
|
|
|
|
|
295
|
push @weights => $weight * $factor; |
|
598
|
|
|
|
|
|
|
} |
|
599
|
|
|
|
|
|
|
} |
|
600
|
|
|
|
|
|
|
|
|
601
|
20
|
|
|
|
|
65
|
return (\@keys, \@weights); |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
sub _build_genomic_variation_names { |
|
605
|
20
|
|
|
20
|
|
25
|
my $self = shift; |
|
606
|
20
|
50
|
|
|
|
555
|
if ($self->genomic_variation) { |
|
607
|
0
|
|
|
|
|
0
|
return sprintf "[%s]", => join ", ", @{ $self->genomic_variation }; |
|
|
0
|
|
|
|
|
0
|
|
|
608
|
|
|
|
|
|
|
} |
|
609
|
|
|
|
|
|
|
} |
|
610
|
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
sub _retrieve_genomic_variation { |
|
612
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
|
613
|
0
|
|
|
|
|
0
|
my $variation = App::Sandy::DB::Handle::Variation->new; |
|
614
|
0
|
|
|
|
|
0
|
return $variation->retrievedb($self->genomic_variation); |
|
615
|
|
|
|
|
|
|
} |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
sub _build_piece_table { |
|
618
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
|
619
|
|
|
|
|
|
|
|
|
620
|
20
|
|
|
|
|
625
|
my $genomic_variation = $self->_genomic_variation_names; |
|
621
|
20
|
|
|
|
|
35
|
my $indexed_snv; |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# Retrieve genomic variation if the user provided it |
|
624
|
20
|
50
|
|
|
|
55
|
if (defined $genomic_variation) { |
|
625
|
0
|
|
|
|
|
0
|
$indexed_snv = $self->_retrieve_genomic_variation; |
|
626
|
0
|
|
|
|
|
0
|
log_msg ":: Validate genomic variation '$genomic_variation' against indexed fasta ..."; |
|
627
|
0
|
|
|
|
|
0
|
$self->_validate_indexed_snv_against_fasta($indexed_snv); |
|
628
|
|
|
|
|
|
|
} |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# Catch index fasta |
|
631
|
20
|
|
|
|
|
505
|
my $indexed_fasta = $self->_fasta; |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
# Build piece table |
|
634
|
20
|
|
|
|
|
40
|
my %piece_table; |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
# Let's construct the piece_table |
|
637
|
20
|
|
|
|
|
80
|
log_msg ":: Build piece table ..."; |
|
638
|
|
|
|
|
|
|
|
|
639
|
20
|
|
|
|
|
85
|
while (my ($seq_id, $fasta_h) = each %$indexed_fasta) { |
|
640
|
100
|
|
|
|
|
210
|
my $seq = \$fasta_h->{seq}; |
|
641
|
100
|
|
|
|
|
3605
|
my $std_seq_id = $self->_get_seqname($seq_id); |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
# Initialize piece tables for $seq_id ref |
|
644
|
100
|
|
|
|
|
2575
|
$piece_table{$seq_id}{ref}{table} = App::Sandy::PieceTable->new(orig => $seq); |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
# If there is indexed_snv for seq_id, then construct the piece table with it |
|
647
|
100
|
50
|
33
|
|
|
440
|
if (defined $indexed_snv && defined $indexed_snv->{$std_seq_id}) { |
|
648
|
0
|
|
|
|
|
0
|
my $snvs = $indexed_snv->{$std_seq_id}; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# Filter only the homozygotic snvs to feed reference seq_id |
|
651
|
0
|
|
|
|
|
0
|
my @snvs_homo = grep { $_->{plo} eq 'HO' } @$snvs; |
|
|
0
|
|
|
|
|
0
|
|
|
652
|
|
|
|
|
|
|
|
|
653
|
0
|
0
|
|
|
|
0
|
if (@snvs_homo) { |
|
654
|
|
|
|
|
|
|
# Populate reference seq_id |
|
655
|
0
|
|
|
|
|
0
|
$self->_populate_piece_table($piece_table{$seq_id}{ref}{table}, \@snvs_homo); |
|
656
|
|
|
|
|
|
|
} |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
# Initialize piece tables for $seq_id alt |
|
659
|
0
|
|
|
|
|
0
|
$piece_table{$seq_id}{alt}{table} = App::Sandy::PieceTable->new(orig => $seq); |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
# Populate alternative seq_id |
|
662
|
0
|
|
|
|
|
0
|
$self->_populate_piece_table($piece_table{$seq_id}{alt}{table}, $snvs); |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
} |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
# Initialize the logical offsets and valodate the |
|
667
|
|
|
|
|
|
|
# new size due to the genomic variation |
|
668
|
|
|
|
|
|
|
|
|
669
|
20
|
|
|
|
|
40
|
my @blacklist; |
|
670
|
|
|
|
|
|
|
|
|
671
|
20
|
|
|
|
|
70
|
for my $seq_id (keys %piece_table) { |
|
672
|
100
|
|
|
|
|
190
|
my $type_h = delete $piece_table{$seq_id}; |
|
673
|
|
|
|
|
|
|
|
|
674
|
100
|
|
|
|
|
300
|
for my $type (keys %$type_h) { |
|
675
|
100
|
|
|
|
|
160
|
my $table_h = delete $type_h->{$type}; |
|
676
|
100
|
|
|
|
|
175
|
my $table = $table_h->{table}; |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# Initialize the logical offset |
|
679
|
100
|
|
|
|
|
495
|
$table->calculate_logical_offset; |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# Get the new size |
|
682
|
100
|
|
|
|
|
2660
|
my $new_size = $table->logical_len; |
|
683
|
|
|
|
|
|
|
|
|
684
|
100
|
50
|
|
|
|
2515
|
unless ($self->truncate) { |
|
685
|
100
|
|
|
|
|
2440
|
my $class = ref $self->seq; |
|
686
|
|
|
|
|
|
|
|
|
687
|
100
|
100
|
|
|
|
310
|
if ($class eq 'App::Sandy::Seq::SingleEnd') { |
|
|
|
50
|
|
|
|
|
|
|
688
|
50
|
50
|
|
|
|
1185
|
if ($new_size < $self->seq->read_mean) { |
|
689
|
0
|
|
|
|
|
0
|
log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required read-mean"; |
|
690
|
0
|
|
|
|
|
0
|
next; |
|
691
|
|
|
|
|
|
|
} |
|
692
|
|
|
|
|
|
|
} elsif ($class eq 'App::Sandy::Seq::PairedEnd') { |
|
693
|
50
|
50
|
|
|
|
1175
|
if ($new_size < $self->seq->fragment_mean) { |
|
694
|
0
|
|
|
|
|
0
|
log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required fragment mean"; |
|
695
|
0
|
|
|
|
|
0
|
next; |
|
696
|
|
|
|
|
|
|
} |
|
697
|
|
|
|
|
|
|
} else { |
|
698
|
0
|
|
|
|
|
0
|
die "No valid options for 'seq'"; |
|
699
|
|
|
|
|
|
|
} |
|
700
|
|
|
|
|
|
|
} |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
# If all's right |
|
703
|
100
|
|
|
|
|
240
|
$table_h->{size} = $new_size; |
|
704
|
100
|
|
|
|
|
230
|
$type_h->{$type} = $table_h; |
|
705
|
|
|
|
|
|
|
} |
|
706
|
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# if there is at least one type, |
|
708
|
|
|
|
|
|
|
# then return it to the piece_table |
|
709
|
100
|
50
|
|
|
|
275
|
if (%$type_h) { |
|
710
|
100
|
|
|
|
|
215
|
$piece_table{$seq_id} = $type_h; |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
# else, just remove it! |
|
713
|
|
|
|
|
|
|
} else { |
|
714
|
0
|
|
|
|
|
0
|
push @blacklist => $seq_id; |
|
715
|
|
|
|
|
|
|
} |
|
716
|
|
|
|
|
|
|
} |
|
717
|
|
|
|
|
|
|
|
|
718
|
20
|
50
|
|
|
|
60
|
unless (%piece_table) { |
|
719
|
0
|
|
|
|
|
0
|
die "All fasta entries were removed due to deletions. ", |
|
720
|
|
|
|
|
|
|
"Please, verify the genomic variation '$genomic_variation'\n"; |
|
721
|
|
|
|
|
|
|
} |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
# If fasta_rtree has entries |
|
724
|
20
|
50
|
|
|
|
795
|
unless ($self->_has_no_fasta_rtree) { |
|
725
|
|
|
|
|
|
|
# Remove no valid entries from id -> pid relation |
|
726
|
0
|
0
|
|
|
|
0
|
$self->_delete_fasta_rtree(@blacklist) if @blacklist; |
|
727
|
|
|
|
|
|
|
} |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# Make the id -> pid relationship |
|
730
|
20
|
|
|
|
|
80
|
$self->_populate_fasta_tree; |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# HASH -> SEQ_ID -> @(REF @ALT) -> @(TABLE SIZE) |
|
733
|
20
|
|
|
|
|
615
|
return \%piece_table; |
|
734
|
|
|
|
|
|
|
} |
|
735
|
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
sub _populate_piece_table { |
|
737
|
0
|
|
|
0
|
|
0
|
my ($self, $table, $snvs) = @_; |
|
738
|
|
|
|
|
|
|
|
|
739
|
0
|
|
|
|
|
0
|
for my $snv (@$snvs) { |
|
740
|
|
|
|
|
|
|
# If there is an ID, make sure that it is not a comma, colon |
|
741
|
|
|
|
|
|
|
# separated list. Else, make sure to keep the ref/alt length |
|
742
|
|
|
|
|
|
|
# to max 25+25+1=51 |
|
743
|
|
|
|
|
|
|
my $annot = defined $snv->{id} && $snv->{id} ne '.' |
|
744
|
|
|
|
|
|
|
? sprintf "%d:%s" => $snv->{pos} + 1, (split(/[,;]/, $snv->{id}))[0] |
|
745
|
0
|
0
|
0
|
|
|
0
|
: sprintf "%d:%.25s/%.25s" => $snv->{pos} + 1, $snv->{ref}, $snv->{alt}; |
|
746
|
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
# Insertion |
|
748
|
0
|
0
|
|
|
|
0
|
if ($snv->{ref} eq '-') { |
|
|
|
0
|
|
|
|
|
|
|
749
|
0
|
|
|
|
|
0
|
$table->insert(\$snv->{alt}, $snv->{pos}, $annot); |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
# Deletion |
|
752
|
|
|
|
|
|
|
} elsif ($snv->{alt} eq '-') { |
|
753
|
0
|
|
|
|
|
0
|
$table->delete($snv->{pos}, length $snv->{ref}, $annot); |
|
754
|
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
# Change |
|
756
|
|
|
|
|
|
|
} else { |
|
757
|
0
|
|
|
|
|
0
|
$table->change(\$snv->{alt}, $snv->{pos}, length $snv->{ref}, $annot); |
|
758
|
|
|
|
|
|
|
} |
|
759
|
|
|
|
|
|
|
} |
|
760
|
|
|
|
|
|
|
} |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
sub _validate_indexed_snv_against_fasta { |
|
763
|
0
|
|
|
0
|
|
0
|
my ($self, $indexed_snv) = @_; |
|
764
|
|
|
|
|
|
|
|
|
765
|
0
|
|
|
|
|
0
|
my $indexed_fasta = $self->_fasta; |
|
766
|
0
|
|
|
|
|
0
|
my $genomic_variation = $self->_genomic_variation_names; |
|
767
|
|
|
|
|
|
|
|
|
768
|
0
|
|
|
|
|
0
|
for my $std_seq_id (keys %$indexed_snv) { |
|
769
|
0
|
|
|
|
|
0
|
my $snvs = delete $indexed_snv->{$std_seq_id}; |
|
770
|
0
|
|
|
|
|
0
|
my $seq_id = $self->_get_seqname($std_seq_id); |
|
771
|
|
|
|
|
|
|
|
|
772
|
0
|
0
|
0
|
|
|
0
|
unless (defined $seq_id && exists $indexed_fasta->{$seq_id}) { |
|
773
|
0
|
|
|
|
|
0
|
next; |
|
774
|
|
|
|
|
|
|
} |
|
775
|
|
|
|
|
|
|
|
|
776
|
0
|
|
|
|
|
0
|
my $seq = \$indexed_fasta->{$seq_id}{seq}; |
|
777
|
0
|
|
|
|
|
0
|
my $size = $indexed_fasta->{$seq_id}{size}; |
|
778
|
|
|
|
|
|
|
|
|
779
|
0
|
|
|
|
|
0
|
my @saved_snvs; |
|
780
|
|
|
|
|
|
|
|
|
781
|
0
|
|
|
|
|
0
|
for my $snv (@$snvs) { |
|
782
|
|
|
|
|
|
|
# Insertions may accur until one base after the |
|
783
|
|
|
|
|
|
|
# end of the sequence, not more |
|
784
|
0
|
0
|
0
|
|
|
0
|
if (($snv->{ref} eq '-' && $snv->{pos} > $size) || ($snv->{ref} ne '-' && $snv->{pos} >= $size)) { |
|
|
|
0
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
785
|
|
|
|
|
|
|
log_msg sprintf ":: In validating '%s': Position, %s/%s at %s:%d, outside fasta sequence", |
|
786
|
0
|
|
|
|
|
0
|
$genomic_variation, $snv->{ref}, $snv->{alt}, $seq_id, $snv->{pos} + 1; |
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# Next snv |
|
789
|
0
|
|
|
|
|
0
|
next; |
|
790
|
|
|
|
|
|
|
# Deletions and changes. Just verify if the reference exists |
|
791
|
|
|
|
|
|
|
} elsif ($snv->{ref} ne '-') { |
|
792
|
0
|
|
|
|
|
0
|
my $ref = substr $$seq, $snv->{pos}, length($snv->{ref}); |
|
793
|
|
|
|
|
|
|
|
|
794
|
0
|
0
|
|
|
|
0
|
if (uc($ref) ne uc($snv->{ref})) { |
|
795
|
|
|
|
|
|
|
log_msg sprintf ":: In validating '%s': Not found reference '%s' at fasta position %s:%d", |
|
796
|
0
|
|
|
|
|
0
|
$genomic_variation, $snv->{ref}, $seq_id, $snv->{pos} + 1; |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
# Next snv |
|
799
|
0
|
|
|
|
|
0
|
next; |
|
800
|
|
|
|
|
|
|
} |
|
801
|
|
|
|
|
|
|
} |
|
802
|
|
|
|
|
|
|
|
|
803
|
0
|
|
|
|
|
0
|
push @saved_snvs => $snv; |
|
804
|
|
|
|
|
|
|
} |
|
805
|
|
|
|
|
|
|
|
|
806
|
0
|
0
|
|
|
|
0
|
if (@saved_snvs) { |
|
807
|
0
|
|
|
|
|
0
|
$indexed_snv->{$std_seq_id} = [@saved_snvs]; |
|
808
|
|
|
|
|
|
|
} |
|
809
|
|
|
|
|
|
|
} |
|
810
|
|
|
|
|
|
|
} |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
sub _calculate_number_of_reads { |
|
813
|
8
|
|
|
8
|
|
34
|
my $self = shift; |
|
814
|
8
|
|
|
|
|
19
|
my $number_of_reads; |
|
815
|
|
|
|
|
|
|
|
|
816
|
8
|
50
|
|
|
|
239
|
if ($self->count_loops_by eq 'coverage') { |
|
|
|
0
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
# It is needed to calculate the genome size |
|
818
|
8
|
|
|
|
|
246
|
my $fasta = $self->_fasta; |
|
819
|
8
|
|
|
|
|
36
|
my $fasta_size = 0; |
|
820
|
8
|
|
|
|
|
19
|
$fasta_size += $fasta->{$_}{size} for keys %{ $fasta }; |
|
|
8
|
|
|
|
|
60
|
|
|
821
|
8
|
|
|
|
|
230
|
$number_of_reads = int(($fasta_size * $self->coverage) / $self->seq->read_mean); |
|
822
|
|
|
|
|
|
|
# In case it is paired-end read, divide the number of reads by 2 because |
|
823
|
|
|
|
|
|
|
# App::Sandy::Seq::PairedEnd class returns 2 reads at time |
|
824
|
8
|
100
|
|
|
|
203
|
$number_of_reads = int($number_of_reads / 2) |
|
825
|
|
|
|
|
|
|
if ref($self->seq) eq 'App::Sandy::Seq::PairedEnd'; |
|
826
|
|
|
|
|
|
|
} elsif ($self->count_loops_by eq 'number-of-reads') { |
|
827
|
0
|
|
|
|
|
0
|
$number_of_reads = $self->number_of_reads; |
|
828
|
|
|
|
|
|
|
} else { |
|
829
|
0
|
|
|
|
|
0
|
croak sprintf "Unknown option '%s' for calculating the number of reads\n", |
|
830
|
|
|
|
|
|
|
$self->count_loops_by; |
|
831
|
|
|
|
|
|
|
} |
|
832
|
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
# Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size |
|
834
|
8
|
50
|
|
|
|
32
|
if ($number_of_reads <= 0) { |
|
835
|
0
|
|
|
|
|
0
|
die "The computed number of reads is equal to zero.\n" . |
|
836
|
|
|
|
|
|
|
"It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n"; |
|
837
|
|
|
|
|
|
|
} |
|
838
|
|
|
|
|
|
|
|
|
839
|
8
|
|
|
|
|
27
|
return $number_of_reads; |
|
840
|
|
|
|
|
|
|
} |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
sub _set_seed { |
|
843
|
4
|
|
|
4
|
|
67
|
my ($self, $inc) = @_; |
|
844
|
4
|
50
|
|
|
|
336
|
my $seed = defined $inc ? $self->seed + $inc : $self->seed; |
|
845
|
4
|
|
|
|
|
63
|
srand($seed); |
|
846
|
4
|
|
|
|
|
123
|
require Math::Random; |
|
847
|
4
|
|
|
|
|
178
|
Math::Random::random_set_seed_from_phrase($seed); |
|
848
|
|
|
|
|
|
|
} |
|
849
|
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
sub _calculate_parent_count { |
|
851
|
4
|
|
|
4
|
|
22
|
my ($self, $counter_ref) = @_; |
|
852
|
4
|
50
|
|
|
|
235
|
return if $self->_has_no_fasta_rtree; |
|
853
|
|
|
|
|
|
|
|
|
854
|
0
|
|
|
|
|
0
|
my %parent_count; |
|
855
|
|
|
|
|
|
|
|
|
856
|
0
|
|
|
|
|
0
|
while (my ($id, $count) = each %$counter_ref) { |
|
857
|
0
|
|
|
|
|
0
|
my $pid = $self->_get_fasta_rtree($id); |
|
858
|
0
|
0
|
|
|
|
0
|
$parent_count{$pid} += $count if defined $pid; |
|
859
|
|
|
|
|
|
|
} |
|
860
|
|
|
|
|
|
|
|
|
861
|
0
|
|
|
|
|
0
|
return \%parent_count; |
|
862
|
|
|
|
|
|
|
} |
|
863
|
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
sub run_simulation { |
|
865
|
8
|
|
|
8
|
0
|
4225
|
my $self = shift; |
|
866
|
8
|
|
|
|
|
313
|
my $piece_table = $self->_piece_table; |
|
867
|
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
# Calculate the number of reads to be generated |
|
869
|
8
|
|
|
|
|
55
|
my $number_of_reads = $self->_calculate_number_of_reads; |
|
870
|
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# Function that returns strand by strand_bias |
|
872
|
8
|
|
|
|
|
246
|
my $strand = $self->_strand; |
|
873
|
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
# Function that returns seqid by seqid_weight |
|
875
|
8
|
|
|
|
|
261
|
my $seqid = $self->_seqid_raffle; |
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
# genome or transcriptome? |
|
878
|
8
|
|
|
|
|
251
|
my $simulation = $self->argv->[0]; |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
# Count file to be generated |
|
881
|
8
|
50
|
|
|
|
264
|
my $count_file = $simulation eq 'transcriptome' |
|
882
|
|
|
|
|
|
|
? $self->prefix . '_abundance.tsv' |
|
883
|
|
|
|
|
|
|
: $self->prefix . '_coverage.tsv'; |
|
884
|
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
# Main files |
|
886
|
8
|
|
|
|
|
217
|
my %files = ( |
|
887
|
|
|
|
|
|
|
bam => [ |
|
888
|
|
|
|
|
|
|
$self->prefix . '.bam' |
|
889
|
|
|
|
|
|
|
], |
|
890
|
|
|
|
|
|
|
sam => [ |
|
891
|
|
|
|
|
|
|
$self->prefix . '.sam' |
|
892
|
|
|
|
|
|
|
], |
|
893
|
|
|
|
|
|
|
single_fastq => [ |
|
894
|
|
|
|
|
|
|
$self->prefix . '_R1_001.fastq' |
|
895
|
|
|
|
|
|
|
], |
|
896
|
|
|
|
|
|
|
single_fastq_gz => [ |
|
897
|
|
|
|
|
|
|
$self->prefix . '_R1_001.fastq.gz' |
|
898
|
|
|
|
|
|
|
], |
|
899
|
|
|
|
|
|
|
join_paired_fastq => [ |
|
900
|
|
|
|
|
|
|
$self->prefix . '.fastq' |
|
901
|
|
|
|
|
|
|
], |
|
902
|
|
|
|
|
|
|
join_paired_fastq_gz => [ |
|
903
|
|
|
|
|
|
|
$self->prefix . '.fastq.gz' |
|
904
|
|
|
|
|
|
|
], |
|
905
|
|
|
|
|
|
|
paired_fastq => [ |
|
906
|
|
|
|
|
|
|
$self->prefix . '_R1_001.fastq', |
|
907
|
|
|
|
|
|
|
$self->prefix . '_R2_001.fastq' |
|
908
|
|
|
|
|
|
|
], |
|
909
|
|
|
|
|
|
|
paired_fastq_gz => [ |
|
910
|
|
|
|
|
|
|
$self->prefix . '_R1_001.fastq.gz', |
|
911
|
|
|
|
|
|
|
$self->prefix . '_R2_001.fastq.gz' |
|
912
|
|
|
|
|
|
|
] |
|
913
|
|
|
|
|
|
|
); |
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
# Set the file class in order to know |
|
916
|
|
|
|
|
|
|
# how to deal with all files options |
|
917
|
8
|
|
|
|
|
216
|
my $seq_class = ref $self->seq; |
|
918
|
8
|
|
|
|
|
272
|
my $output_format = $self->output_format; |
|
919
|
8
|
|
|
|
|
24
|
my $file_class; |
|
920
|
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
# This mess is necessary to catch the |
|
922
|
|
|
|
|
|
|
# right value into the %files hash |
|
923
|
8
|
50
|
|
|
|
98
|
if ($output_format =~ /(sam|bam)/) { |
|
|
|
50
|
|
|
|
|
|
|
924
|
0
|
|
|
|
|
0
|
$file_class = $output_format; |
|
925
|
|
|
|
|
|
|
} elsif ($output_format =~ /fastq/) { |
|
926
|
8
|
100
|
|
|
|
33
|
if ($seq_class eq 'App::Sandy::Seq::SingleEnd') { |
|
|
|
50
|
|
|
|
|
|
|
927
|
5
|
|
|
|
|
15
|
$file_class = 'single_fastq'; |
|
928
|
|
|
|
|
|
|
} elsif ($seq_class eq 'App::Sandy::Seq::PairedEnd') { |
|
929
|
3
|
|
|
|
|
9
|
$file_class = 'paired_fastq'; |
|
930
|
3
|
50
|
|
|
|
126
|
$file_class = "join_$file_class" if $self->join_paired_ends; |
|
931
|
|
|
|
|
|
|
} else { |
|
932
|
0
|
|
|
|
|
0
|
croak "Something wrong with the seq class: $seq_class"; |
|
933
|
|
|
|
|
|
|
} |
|
934
|
8
|
50
|
|
|
|
32
|
if ($output_format eq 'fastq.gz') { |
|
935
|
0
|
|
|
|
|
0
|
$file_class .= '_gz'; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
|
|
|
|
|
|
} else { |
|
938
|
0
|
|
|
|
|
0
|
croak "Something wrong with the output format: $output_format"; |
|
939
|
|
|
|
|
|
|
} |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
# Forks |
|
942
|
8
|
|
|
|
|
250
|
my $number_of_jobs = $self->jobs; |
|
943
|
8
|
|
|
|
|
162
|
my $pm = Parallel::ForkManager->new($number_of_jobs); |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
# Parent child pids |
|
946
|
8
|
|
|
|
|
25425
|
my $parent_pid = $$; |
|
947
|
8
|
|
|
|
|
21
|
my @child_pid; |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
# Temporary files tracker |
|
950
|
|
|
|
|
|
|
my @tmp_files; |
|
951
|
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
# Run in parent right after creating child process |
|
953
|
|
|
|
|
|
|
$pm->run_on_start( |
|
954
|
|
|
|
|
|
|
sub { |
|
955
|
10
|
|
|
10
|
|
18802
|
my $pid = shift; |
|
956
|
10
|
|
|
|
|
259
|
push @child_pid => $pid; |
|
957
|
|
|
|
|
|
|
} |
|
958
|
8
|
|
|
|
|
72
|
); |
|
959
|
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# Count the overall cumulative number of reads for each seqid |
|
961
|
8
|
|
|
|
|
59
|
my %counters; |
|
962
|
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
# Run in parent right after finishing child process |
|
964
|
|
|
|
|
|
|
$pm->run_on_finish( |
|
965
|
|
|
|
|
|
|
sub { |
|
966
|
8
|
|
|
8
|
|
24032709
|
my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $counter_ref) = @_; |
|
967
|
8
|
|
|
|
|
101
|
while (my ($seqid, $count) = each %$counter_ref) { |
|
968
|
40
|
|
|
|
|
254
|
$counters{$seqid} += $count; |
|
969
|
|
|
|
|
|
|
} |
|
970
|
|
|
|
|
|
|
} |
|
971
|
8
|
|
|
|
|
61
|
); |
|
972
|
|
|
|
|
|
|
|
|
973
|
8
|
50
|
|
|
|
141
|
log_msg sprintf ":: Creating %d child %s ...", |
|
974
|
|
|
|
|
|
|
$number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs"; |
|
975
|
|
|
|
|
|
|
|
|
976
|
8
|
|
|
|
|
40
|
for my $tid (1..$number_of_jobs) { |
|
977
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
978
|
|
|
|
|
|
|
# Inside parent |
|
979
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
980
|
14
|
|
|
|
|
773
|
log_msg ":: Creating job $tid ..."; |
|
981
|
14
|
|
|
|
|
33
|
my @files_t = map { "$_.${parent_pid}.part$tid" } @{ $files{$file_class} }; |
|
|
19
|
|
|
|
|
107
|
|
|
|
14
|
|
|
|
|
224
|
|
|
982
|
14
|
|
|
|
|
103
|
push @tmp_files => @files_t; |
|
983
|
14
|
100
|
|
|
|
77
|
my $pid = $pm->start and next; |
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
986
|
|
|
|
|
|
|
# Inside child |
|
987
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
988
|
|
|
|
|
|
|
# Intelace child/parent processes |
|
989
|
4
|
|
|
|
|
42813
|
my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => [$parent_pid]); |
|
990
|
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
# Set child seed |
|
992
|
4
|
|
|
|
|
173
|
$self->_set_seed($tid); |
|
993
|
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
# Calculate the number of reads to this job and correct this local index |
|
995
|
|
|
|
|
|
|
# to the global index |
|
996
|
4
|
|
|
|
|
249
|
my $number_of_reads_t = int($number_of_reads/$number_of_jobs); |
|
997
|
4
|
|
|
|
|
36
|
my $last_read_idx = $number_of_reads_t * $tid; |
|
998
|
4
|
|
|
|
|
91
|
my $idx = $last_read_idx - $number_of_reads_t + 1; |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
# If it is the last job, make it work on the leftover reads of int() truncation |
|
1001
|
4
|
100
|
|
|
|
115
|
$last_read_idx += $number_of_reads % $number_of_jobs |
|
1002
|
|
|
|
|
|
|
if $tid == $number_of_jobs; |
|
1003
|
|
|
|
|
|
|
|
|
1004
|
4
|
|
|
|
|
220
|
log_msg " => Job $tid: Working on sequences from $idx to $last_read_idx"; |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
# Create temporary files |
|
1007
|
4
|
|
|
|
|
116
|
log_msg " => Job $tid: Creating temporary file: @files_t"; |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
# And here we go ... |
|
1010
|
4
|
|
|
|
|
51
|
my @fhs; |
|
1011
|
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
# Set the right filehandle format |
|
1013
|
4
|
50
|
|
|
|
189
|
if ($output_format =~ /^(sam|fastq)$/) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1014
|
4
|
|
|
|
|
41
|
@fhs = map { $self->with_open_w($_, 0) } @files_t; |
|
|
6
|
|
|
|
|
249
|
|
|
1015
|
|
|
|
|
|
|
} elsif ($output_format eq 'fastq.gz') { |
|
1016
|
0
|
|
|
|
|
0
|
@fhs = map { $self->with_open_w($_, $self->compression_level) } @files_t; |
|
|
0
|
|
|
|
|
0
|
|
|
1017
|
|
|
|
|
|
|
} elsif ($output_format eq 'bam') { |
|
1018
|
0
|
|
|
|
|
0
|
@fhs = map { $self->with_open_bam_w($_, $self->compression_level) } @files_t; |
|
|
0
|
|
|
|
|
0
|
|
|
1019
|
|
|
|
|
|
|
} else { |
|
1020
|
0
|
|
|
|
|
0
|
croak "Something wrong with the output format: $file_class"; |
|
1021
|
|
|
|
|
|
|
} |
|
1022
|
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
# sprint_seq gives two entries for paired-emd, so |
|
1024
|
|
|
|
|
|
|
# if it is a bam|sam|join-paired-ends, it is necessary |
|
1025
|
|
|
|
|
|
|
# to copy the filehandle in order to print both entries |
|
1026
|
|
|
|
|
|
|
# to the same file |
|
1027
|
4
|
50
|
66
|
|
|
149
|
if ($seq_class eq 'App::Sandy::Seq::PairedEnd' |
|
1028
|
|
|
|
|
|
|
&& $file_class =~ /(sam|bam|join)/) { |
|
1029
|
0
|
|
|
|
|
0
|
$fhs[1] = $fhs[0]; |
|
1030
|
|
|
|
|
|
|
} |
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
# Count the cumulative number of reads for each seqid |
|
1033
|
4
|
|
|
|
|
34
|
my %counter; |
|
1034
|
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
# If the output format is 'bam|sam' and it is the first job, then |
|
1036
|
|
|
|
|
|
|
# write the header |
|
1037
|
4
|
50
|
33
|
|
|
134
|
if ($output_format =~ /^(sam|bam)$/ && $tid == 1) { |
|
1038
|
0
|
|
|
|
|
0
|
my $header_ref = $self->gen_sam_header($self->argv); |
|
1039
|
0
|
|
|
|
|
0
|
print {$fhs[0]} "$$header_ref"; |
|
|
0
|
|
|
|
|
0
|
|
|
1040
|
|
|
|
|
|
|
} |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
# Run simulation in child |
|
1043
|
4
|
|
66
|
|
|
373
|
for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) { |
|
1044
|
1710
|
|
|
|
|
4612
|
my $id = $seqid->(); |
|
1045
|
1710
|
|
|
|
|
5566
|
my $ptable = $piece_table->{$id->{seq_id}}{$id->{type}}; |
|
1046
|
1710
|
|
|
|
|
2945
|
my @seq_entry; |
|
1047
|
|
|
|
|
|
|
try { |
|
1048
|
|
|
|
|
|
|
@seq_entry = $self->sprint_seq($tid, $i, $id->{seq_id}, $id->{type}, |
|
1049
|
1710
|
|
|
1710
|
|
122749
|
$ptable->{table}, $ptable->{size}, $strand->()); |
|
1050
|
|
|
|
|
|
|
} catch { |
|
1051
|
0
|
|
|
0
|
|
0
|
die "Not defined entry for seqid '>$id->{seq_id}' at job $tid: $_"; |
|
1052
|
|
|
|
|
|
|
} finally { |
|
1053
|
1710
|
50
|
|
1710
|
|
38761
|
unless (@_) { |
|
1054
|
1710
|
|
|
|
|
5237
|
for my $fh_idx (0..$#fhs) { |
|
1055
|
2280
|
|
|
|
|
4898
|
$counter{$id->{seq_id}}++; |
|
1056
|
2280
|
|
|
|
|
3259
|
print {$fhs[$fh_idx]} "${$seq_entry[$fh_idx]}"; |
|
|
2280
|
|
|
|
|
4241
|
|
|
|
2280
|
|
|
|
|
10194
|
|
|
1057
|
|
|
|
|
|
|
} |
|
1058
|
|
|
|
|
|
|
} |
|
1059
|
1710
|
|
|
|
|
14362
|
}; |
|
1060
|
|
|
|
|
|
|
} |
|
1061
|
|
|
|
|
|
|
|
|
1062
|
4
|
|
|
|
|
154
|
log_msg " => Job $tid: Writing and closing file: @files_t"; |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
# Close temporary files |
|
1065
|
|
|
|
|
|
|
# Get index from @files_t in order to avoid |
|
1066
|
|
|
|
|
|
|
# close the same filehandle twice - When the |
|
1067
|
|
|
|
|
|
|
# position 1-N is a copy |
|
1068
|
4
|
|
|
|
|
16
|
for my $fh_idx (0..$#files_t) { |
|
1069
|
6
|
|
|
|
|
402
|
close $fhs[$fh_idx]; |
|
1070
|
|
|
|
|
|
|
} |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
# If it is a bam and it is the last loop, then |
|
1073
|
|
|
|
|
|
|
# write a eof marker |
|
1074
|
4
|
50
|
33
|
|
|
41
|
if ($output_format eq 'bam' && $tid == $number_of_jobs) { |
|
1075
|
0
|
|
|
|
|
0
|
$self->gen_eof_marker($files_t[0]); |
|
1076
|
|
|
|
|
|
|
} |
|
1077
|
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
# Child exit |
|
1079
|
4
|
|
|
|
|
34
|
log_msg " => Job $tid is finished"; |
|
1080
|
4
|
|
|
|
|
189
|
$pm->finish(0, \%counter); |
|
1081
|
|
|
|
|
|
|
} |
|
1082
|
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
# Back to parent |
|
1084
|
|
|
|
|
|
|
# Interlace parent/child(s) processes |
|
1085
|
4
|
|
|
|
|
1311
|
my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => \@child_pid); |
|
1086
|
4
|
|
|
|
|
51
|
$pm->wait_all_children; |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
4
|
50
|
|
|
|
354
|
if ($sig->signal_catched) { |
|
1089
|
0
|
|
|
|
|
0
|
log_msg ":: Termination signal received!"; |
|
1090
|
|
|
|
|
|
|
} |
|
1091
|
|
|
|
|
|
|
|
|
1092
|
4
|
|
|
|
|
94
|
log_msg ":: Saving the work ..."; |
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
# Concatenate all temporary files |
|
1095
|
4
|
|
|
|
|
26
|
log_msg ":: Concatenate all temporary files"; |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
# Save time. Rename tmp_file (1,2) |
|
1098
|
4
|
|
|
|
|
8
|
for my $file (@{ $files{$file_class} }) { |
|
|
4
|
|
|
|
|
46
|
|
|
1099
|
5
|
|
|
|
|
36
|
my $tmp = shift @tmp_files; |
|
1100
|
5
|
|
|
|
|
66
|
log_msg " => Concatenating $tmp to $file ..."; |
|
1101
|
5
|
50
|
|
|
|
308
|
rename $tmp => $file |
|
1102
|
|
|
|
|
|
|
or die "Cannot create '$file': $!\n"; |
|
1103
|
|
|
|
|
|
|
} |
|
1104
|
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
# Append to renamed tmp files |
|
1106
|
4
|
|
|
|
|
16
|
my @fh = map { $self->with_open_a($_) } @{ $files{$file_class} }; |
|
|
5
|
|
|
|
|
122
|
|
|
|
4
|
|
|
|
|
35
|
|
|
1107
|
|
|
|
|
|
|
|
|
1108
|
4
|
|
|
|
|
37
|
for my $i (0..$#tmp_files) { |
|
1109
|
5
|
|
|
|
|
21
|
my $fh_idx = $i % scalar @fh; |
|
1110
|
|
|
|
|
|
|
|
|
1111
|
5
|
|
|
|
|
85
|
log_msg " => Concatenating $tmp_files[$i] to $files{$file_class}[$fh_idx] ..."; |
|
1112
|
5
|
50
|
|
|
|
90
|
cat $tmp_files[$i] => $fh[$fh_idx] |
|
1113
|
|
|
|
|
|
|
or die "Cannot concatenate $tmp_files[$i] to $files{$file_class}[$fh_idx]: $!\n"; |
|
1114
|
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
# Clean up the mess |
|
1116
|
5
|
50
|
|
|
|
13730
|
unlink $tmp_files[$i] |
|
1117
|
|
|
|
|
|
|
or die "Cannot remove temporary file '$tmp_files[$i]': $!\n"; |
|
1118
|
|
|
|
|
|
|
} |
|
1119
|
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
# Close files |
|
1121
|
4
|
|
|
|
|
35
|
log_msg ":: Writing and closing output file: @{ $files{$file_class} }"; |
|
|
4
|
|
|
|
|
63
|
|
|
1122
|
4
|
|
|
|
|
20
|
for my $fh_idx (0..$#fh) { |
|
1123
|
5
|
50
|
|
|
|
146
|
close $fh[$fh_idx] |
|
1124
|
|
|
|
|
|
|
or die "Cannot write file $files{$file_class}[$fh_idx]: $!\n"; |
|
1125
|
|
|
|
|
|
|
} |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
# Save counts |
|
1128
|
4
|
|
|
|
|
24
|
log_msg ":: Saving count file"; |
|
1129
|
4
|
|
|
|
|
64
|
my $count_fh = $self->with_open_w($count_file, 0); |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
# It is necessary to correct the abundance according to |
|
1132
|
|
|
|
|
|
|
# fragment sequencing end |
|
1133
|
4
|
|
|
|
|
11
|
my $count_factor = 1; |
|
1134
|
4
|
50
|
33
|
|
|
252
|
if ($self->count_loops_by eq 'number-of-reads' |
|
1135
|
|
|
|
|
|
|
&& ref($self->seq) eq 'App::Sandy::Seq::PairedEnd') { |
|
1136
|
0
|
|
|
|
|
0
|
$count_factor = 2; |
|
1137
|
|
|
|
|
|
|
} |
|
1138
|
|
|
|
|
|
|
|
|
1139
|
4
|
|
|
|
|
39
|
log_msg " => Writing counts to $count_file ..."; |
|
1140
|
4
|
|
|
|
|
35
|
for my $id (sort keys %counters) { |
|
1141
|
20
|
|
|
|
|
109
|
printf {$count_fh} "%s\t%d\n" => $id, |
|
1142
|
20
|
|
|
|
|
31
|
int($counters{$id} / $count_factor); |
|
1143
|
|
|
|
|
|
|
} |
|
1144
|
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
# Just in case, calculate 'gene' like expression |
|
1146
|
4
|
|
|
|
|
98
|
my $parent_count = $self->_calculate_parent_count(\%counters); |
|
1147
|
|
|
|
|
|
|
|
|
1148
|
4
|
|
|
|
|
21
|
for my $id (sort keys %$parent_count) { |
|
1149
|
0
|
|
|
|
|
0
|
printf {$count_fh} "%s\t%d\n" => $id, |
|
1150
|
0
|
|
|
|
|
0
|
int($parent_count->{$id} / $count_factor); |
|
1151
|
|
|
|
|
|
|
} |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
# Close $count_file |
|
1154
|
4
|
|
|
|
|
42
|
log_msg ":; Writing and closing $count_file ..."; |
|
1155
|
4
|
50
|
|
|
|
314
|
close $count_fh |
|
1156
|
|
|
|
|
|
|
or die "Cannot write file $count_file: $!\n"; |
|
1157
|
|
|
|
|
|
|
} |
|
1158
|
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
__END__ |
|
1160
|
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
=pod |
|
1162
|
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
=encoding UTF-8 |
|
1164
|
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
=head1 NAME |
|
1166
|
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
App::Sandy::Simulator - Class responsible to make the simulation |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=head1 VERSION |
|
1170
|
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
version 0.22 |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
=head1 AUTHORS |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
=over 4 |
|
1176
|
|
|
|
|
|
|
|
|
1177
|
|
|
|
|
|
|
=item * |
|
1178
|
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
=item * |
|
1182
|
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
|
1184
|
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
=item * |
|
1186
|
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
|
1188
|
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
=item * |
|
1190
|
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
|
1192
|
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
=item * |
|
1194
|
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
|
1196
|
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
=item * |
|
1198
|
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
|
1200
|
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
=item * |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
|
1204
|
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=back |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Teaching and Research Institute from SÃrio-Libanês Hospital. |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
This is free software, licensed under: |
|
1212
|
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
|
1214
|
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
=cut |