line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::Command::Genome; |
2
|
|
|
|
|
|
|
# ABSTRACT: simulate command class. Simulate genome sequencing |
3
|
|
|
|
|
|
|
|
4
|
1
|
|
|
1
|
|
1541
|
use App::Sandy::Base 'class'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
6
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
extends 'App::Sandy::CLI::Command'; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
with 'App::Sandy::Role::Digest'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.22'; # VERSION |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# default_opt
#
# Returns the built-in default option values for this command as a flat
# list of (option-name => default-value) pairs. The list is the sub's
# implicit return value (there is no explicit `return`), and the pairs are
# emitted in the order written below.
#
# NOTE(review): how the caller merges these defaults with user-supplied
# options is not visible here -- presumably handled by the parent class
# App::Sandy::CLI::Command; confirm there.
sub default_opt { |
13
|
0
|
|
|
0
|
0
|
|
'paired-end-id' => '%i.%U:%c:%F:%X-%Z', |
14
|
|
|
|
|
|
|
'single-end-id' => '%i.%U:%c:%s:%t-%n', |
15
|
|
|
|
|
|
|
# `time` is evaluated on every call, so the default seed is the epoch time
# at which default_opt is invoked.
'seed' => time, |
16
|
|
|
|
|
|
|
'verbose' => 0, |
17
|
|
|
|
|
|
|
'prefix' => 'out', |
18
|
|
|
|
|
|
|
'output-dir' => '.', |
19
|
|
|
|
|
|
|
'jobs' => 1, |
20
|
|
|
|
|
|
|
'count-loops-by' => 'coverage', |
21
|
|
|
|
|
|
|
'coverage' => 8, |
22
|
|
|
|
|
|
|
'strand-bias' => 'random', |
23
|
|
|
|
|
|
|
'seqid-weight' => 'length', |
24
|
|
|
|
|
|
|
'sequencing-type' => 'paired-end', |
25
|
|
|
|
|
|
|
'fragment-mean' => 300, |
26
|
|
|
|
|
|
|
'fragment-stdd' => 50, |
27
|
|
|
|
|
|
|
'sequencing-error' => 0.001, |
28
|
|
|
|
|
|
|
'read-mean' => 100, |
29
|
|
|
|
|
|
|
'read-stdd' => 0, |
30
|
|
|
|
|
|
|
'quality-profile' => 'poisson', |
31
|
|
|
|
|
|
|
'join-paired-ends' => 0, |
32
|
|
|
|
|
|
|
'output-format' => 'fastq.gz', |
33
|
|
|
|
|
|
|
'compression-level' => 6 |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# rm_opt
#
# Returns a fixed list of four option names: 'strand-bias',
# 'number-of-reads', 'seqid-weight' and 'expression-matrix'. The list is the
# sub's implicit return value.
#
# NOTE(review): judging by the name, these are presumably options inherited
# from the parent command that are removed/hidden for genome simulation --
# confirm against App::Sandy::CLI::Command. Two of them ('strand-bias',
# 'seqid-weight') are also given fixed values in default_opt.
sub rm_opt { |
37
|
0
|
|
|
0
|
0
|
|
'strand-bias', |
38
|
|
|
|
|
|
|
'number-of-reads', |
39
|
|
|
|
|
|
|
'seqid-weight', |
40
|
|
|
|
|
|
|
'expression-matrix' |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
__END__ |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=pod |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=encoding UTF-8 |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 NAME |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
App::Sandy::Command::Genome - simulate command class. Simulate genome sequencing |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 VERSION |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
version 0.22 |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 SYNOPSIS |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sandy genome [options] <fasta-file> |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Arguments: |
62
|
|
|
|
|
|
|
a fasta-file |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Options: |
65
|
|
|
|
|
|
|
-h, --help brief help message |
66
|
|
|
|
|
|
|
-u, --man full documentation |
67
|
|
|
|
|
|
|
-v, --verbose print log messages |
68
|
|
|
|
|
|
|
-p, --prefix prefix output [default:"out"] |
69
|
|
|
|
|
|
|
-o, --output-dir output directory [default:"."] |
70
|
|
|
|
|
|
|
-O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"] |
71
|
|
|
|
|
|
|
-1, --join-paired-ends merge R1 and R2 outputs in one file |
72
|
|
|
|
|
|
|
-x, --compression-level speed compression: "1" - compress faster, |
73
|
|
|
|
|
|
|
"9" - compress better [default:"6"; Integer] |
74
|
|
|
|
|
|
|
-i, --append-id append to the defined template id [Format] |
75
|
|
|
|
|
|
|
-I, --id overlap the default template id [Format] |
76
|
|
|
|
|
|
|
-j, --jobs number of jobs [default:"1"; Integer] |
77
|
|
|
|
|
|
|
-s, --seed set the seed of the base generator |
78
|
|
|
|
|
|
|
[default:"time()"; Integer] |
79
|
|
|
|
|
|
|
-c, --coverage genome coverage [default:"8", Number] |
80
|
|
|
|
|
|
|
-t, --sequencing-type single-end or paired-end reads |
81
|
|
|
|
|
|
|
[default:"paired-end"] |
82
|
|
|
|
|
|
|
-q, --quality-profile sequencing system profiles from quality |
83
|
|
|
|
|
|
|
database [default:"poisson"] |
84
|
|
|
|
|
|
|
-e, --sequencing-error sequencing error rate for poisson |
85
|
|
|
|
|
|
|
[default:"0.001"; Number] |
86
|
|
|
|
|
|
|
-m, --read-mean read mean size for poisson |
87
|
|
|
|
|
|
|
[default:"100"; Integer] |
88
|
|
|
|
|
|
|
-d, --read-stdd read standard deviation size for poisson |
89
|
|
|
|
|
|
|
[default:"0"; Integer] |
90
|
|
|
|
|
|
|
-M, --fragment-mean the fragment mean size for paired-end reads |
91
|
|
|
|
|
|
|
[default:"300"; Integer] |
92
|
|
|
|
|
|
|
-D, --fragment-stdd the fragment standard deviation size for |
93
|
|
|
|
|
|
|
paired-end reads [default:"50"; Integer] |
94
|
|
|
|
|
|
|
-a, --genomic-variation a list of genomic variation entries from |
95
|
|
|
|
|
|
|
variation database. This option may be passed |
96
|
|
|
|
|
|
|
multiple times [default:"none"] |
97
|
|
|
|
|
|
|
-A, --genomic-variation-regex a list of perl-like regex to match genomic |
98
|
|
|
|
|
|
|
variation entries in variation database. |
99
|
|
|
|
|
|
|
This option may be passed multiple times |
100
|
|
|
|
|
|
|
[default:"none"] |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head1 DESCRIPTION |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Simulate genome sequencing. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head1 OPTIONS |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=over 8 |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=item B<--help> |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Prints a brief help message and exits. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=item B<--man> |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Prints the manual page and exits. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=item B<--verbose> |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Prints log information to standard error |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item B<--prefix> |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Concatenates the prefix to the output-file name. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item B<--output-dir> |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Creates output-file inside output-dir. If output-dir |
129
|
|
|
|
|
|
|
does not exist, it is created recursively |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=item B<--output-format> |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Choose the output format. Available options are: |
134
|
|
|
|
|
|
|
I<bam>, I<sam>, I<fastq.gz>, I<fastq>. |
135
|
|
|
|
|
|
|
For I<bam> option, B<--append-id> is ignored, considering |
136
|
|
|
|
|
|
|
that the sequence identifier is split on blank characters, so |
137
|
|
|
|
|
|
|
just the first field is included into the query name column |
138
|
|
|
|
|
|
|
(first column). |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=item B<--join-paired-ends> |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
By default, paired-end reads are put into two different files, |
143
|
|
|
|
|
|
|
I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs |
144
|
|
|
|
|
|
|
together, she can pass this option. |
145
|
|
|
|
|
|
|
If the B<--id> does not have the escape character %R, it is |
146
|
|
|
|
|
|
|
automatically included right after the first field (blank separated values) |
147
|
|
|
|
|
|
|
as in I<id/%R> - which resolves to I<id/1> or I<id/2>. |
148
|
|
|
|
|
|
|
It is necessary to distinguish which read is R1/R2 |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item B<--compression-level> |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Regulates the speed of compression using the specified digit (between 1 and 9), |
153
|
|
|
|
|
|
|
where "1" indicates the fastest compression method (less compression) and "9" |
154
|
|
|
|
|
|
|
indicates the slowest compression method (best compression). The default |
155
|
|
|
|
|
|
|
compression level is "6" |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=item B<--append-id> |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Append string template to the defined template id. |
160
|
|
|
|
|
|
|
See B<Format> |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=item B<--id> |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Overlap the default defined template id: |
165
|
|
|
|
|
|
|
I<single-end> %i.%U:%c:%s:%t-%n and I<paired-end> %i.%U:%c:%F:%X-%Z |
165
|
|
|
|
|
|
|
e.g. SR123.1:chr1:P:1001-1101 |
167
|
|
|
|
|
|
|
See B<Format> |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item B<Format> |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works. |
172
|
|
|
|
|
|
|
That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid |
173
|
|
|
|
|
|
|
escape characters are: |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
B<Common escape characters> |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
178
|
|
|
|
|
|
|
Escape Meaning |
179
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
180
|
|
|
|
|
|
|
%i instrument id composed by SR + PID |
181
|
|
|
|
|
|
|
%I job slot number |
182
|
|
|
|
|
|
|
%q quality profile |
183
|
|
|
|
|
|
|
%e sequencing error |
184
|
|
|
|
|
|
|
%x sequencing error position |
185
|
|
|
|
|
|
|
%R read 1, or 2 if it is the paired-end mate |
186
|
|
|
|
|
|
|
%U read number |
187
|
|
|
|
|
|
|
%r read size |
188
|
|
|
|
|
|
|
%m read mean |
189
|
|
|
|
|
|
|
%d read standard deviation |
190
|
|
|
|
|
|
|
%c sequence id as chromosome, gene/transcript id |
191
|
|
|
|
|
|
|
%C sequence id type (reference or alternate non reference allele) *** |
192
|
|
|
|
|
|
|
%s read strand |
193
|
|
|
|
|
|
|
%t read start position |
194
|
|
|
|
|
|
|
%n read end position |
195
|
|
|
|
|
|
|
%a read start position regarding reference genome *** |
196
|
|
|
|
|
|
|
%b read end position regarding reference genome *** |
197
|
|
|
|
|
|
|
%v genomic variation position *** |
198
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
199
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
B<Paired-end specific escape characters> |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
204
|
|
|
|
|
|
|
Escape Meaning |
205
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
206
|
|
|
|
|
|
|
%T mate read start position |
207
|
|
|
|
|
|
|
%N mate read end position |
208
|
|
|
|
|
|
|
%A mate read start position regarding reference genome *** |
209
|
|
|
|
|
|
|
%B mate read end position regarding reference genome *** |
210
|
|
|
|
|
|
|
%D distance between the paired-reads |
211
|
|
|
|
|
|
|
%M fragment mean |
212
|
|
|
|
|
|
|
%D fragment standard deviation |
213
|
|
|
|
|
|
|
%f fragment size |
214
|
|
|
|
|
|
|
%F fragment strand |
215
|
|
|
|
|
|
|
%S fragment start position |
216
|
|
|
|
|
|
|
%E fragment end position |
217
|
|
|
|
|
|
|
%X fragment start position regarding reference genome *** |
218
|
|
|
|
|
|
|
%Z fragment end position regarding reference genome *** |
219
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
220
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=item B<--jobs> |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Sets the number of child jobs to be created |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=item B<--seed> |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
Sets the seed of the base generator. The ability to set the seed is |
229
|
|
|
|
|
|
|
useful for those who want reproducible simulations. Pay attention to |
230
|
|
|
|
|
|
|
the number of jobs (--jobs) set, because each job receives a different |
231
|
|
|
|
|
|
|
seed calculated from the I<main seed>. So, for reproducibility, the |
232
|
|
|
|
|
|
|
same seed set before needs the same number of jobs set before as well. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=item B<--read-mean> |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Sets the read mean if quality-profile is equal to 'poisson'. The |
237
|
|
|
|
|
|
|
quality-profile from database overrides the read-size |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=item B<--read-stdd> |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Sets the read standard deviation if quality-profile is equal to |
242
|
|
|
|
|
|
|
'poisson'. The quality-profile from database overrides the read-stdd |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=item B<--coverage> |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Calculates the number of reads based on the genome |
247
|
|
|
|
|
|
|
coverage: number_of_reads = (sequence_size * coverage) / read_size. |
248
|
|
|
|
|
|
|
This is the default option for genome sequencing simulation |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item B<--sequencing-type> |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Sets the sequencing type to single-end or paired-end |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=item B<--fragment-mean> |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
257
|
|
|
|
|
|
|
fragment mean |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=item B<--fragment-stdd> |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
262
|
|
|
|
|
|
|
fragment standard deviation |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=item B<--sequencing-error> |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
Sets the sequencing error rate if quality-profile is equal to 'poisson'. |
267
|
|
|
|
|
|
|
Valid values are between zero and one |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=item B<--quality-profile> |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Sets the sequencing system profile for quality. The default value is a poisson |
272
|
|
|
|
|
|
|
distribution, but the user can choose among several profiles stored into the |
273
|
|
|
|
|
|
|
database or import his own data. |
274
|
|
|
|
|
|
|
See B<quality> command for more details |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=item B<--genomic-variation> |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Sets the genomic variation to be applied to the genome provided. By |
279
|
|
|
|
|
|
|
default no variation is included to the simulation, but the user has |
280
|
|
|
|
|
|
|
the power to point some entries from B<variation> database or index his |
281
|
|
|
|
|
|
|
own data. This option accepts a list with comma separated values |
282
|
|
|
|
|
|
|
and can be passed multiple times, which is useful in order to join |
283
|
|
|
|
|
|
|
various types of genomic variation into the same simulation. It is |
284
|
|
|
|
|
|
|
possible to combine this option with B<--genomic-variation-regex> |
285
|
|
|
|
|
|
|
See B<variation> command for the available list of genomic variation |
286
|
|
|
|
|
|
|
entries |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=item B<--genomic-variation-regex> |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
Applies a perl-regex to the variation database and selects all entries |
291
|
|
|
|
|
|
|
that match the pattern. This option accepts a list with comma separated |
292
|
|
|
|
|
|
|
values and can be passed multiple times. It is possible to combine this |
293
|
|
|
|
|
|
|
option with B<--genomic-variation> |
294
|
|
|
|
|
|
|
See B<variation> command for the available list of genomic variation |
295
|
|
|
|
|
|
|
entries |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=back |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=head1 AUTHORS |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=over 4 |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
=item * |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=item * |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=item * |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=item * |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=item * |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=item * |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=item * |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=back |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital. |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
This is free software, licensed under: |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=cut |