line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::Command::Genome; |
2
|
|
|
|
|
|
|
# ABSTRACT: simulate command class. Simulate genome sequencing |
3
|
|
|
|
|
|
|
|
4
|
1
|
|
|
1
|
|
1430
|
use App::Sandy::Base 'class'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
13
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
extends 'App::Sandy::CLI::Command'; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
with 'App::Sandy::Role::Digest'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.25'; # VERSION |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub default_opt { |
13
|
0
|
|
|
0
|
0
|
|
'paired-end-id' => '%i.%U:%c:%F:%X-%Z', |
14
|
|
|
|
|
|
|
'single-end-id' => '%i.%U:%c:%s:%t-%n', |
15
|
|
|
|
|
|
|
'seed' => time, |
16
|
|
|
|
|
|
|
'verbose' => 0, |
17
|
|
|
|
|
|
|
'prefix' => 'out', |
18
|
|
|
|
|
|
|
'output-dir' => '.', |
19
|
|
|
|
|
|
|
'jobs' => 1, |
20
|
|
|
|
|
|
|
'count-loops-by' => 'coverage', |
21
|
|
|
|
|
|
|
'coverage' => 8, |
22
|
|
|
|
|
|
|
'strand-bias' => 'random', |
23
|
|
|
|
|
|
|
'seqid-weight' => 'length', |
24
|
|
|
|
|
|
|
'sequencing-type' => 'paired-end', |
25
|
|
|
|
|
|
|
'fragment-mean' => 300, |
26
|
|
|
|
|
|
|
'fragment-stdd' => 50, |
27
|
|
|
|
|
|
|
'sequencing-error' => 0.001, |
28
|
|
|
|
|
|
|
'read-mean' => 100, |
29
|
|
|
|
|
|
|
'read-stdd' => 0, |
30
|
|
|
|
|
|
|
'quality-profile' => 'poisson', |
31
|
|
|
|
|
|
|
'join-paired-ends' => 0, |
32
|
|
|
|
|
|
|
'output-format' => 'fastq.gz', |
33
|
|
|
|
|
|
|
'compression-level' => 6 |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub rm_opt { |
37
|
0
|
|
|
0
|
0
|
|
'strand-bias', |
38
|
|
|
|
|
|
|
'number-of-reads', |
39
|
|
|
|
|
|
|
'seqid-weight', |
40
|
|
|
|
|
|
|
'expression-matrix' |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
__END__ |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=pod |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=encoding UTF-8 |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 NAME |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
App::Sandy::Command::Genome - simulate command class. Simulate genome sequencing |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 VERSION |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
version 0.25 |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 SYNOPSIS |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sandy genome [options] <fasta-file> |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Arguments: |
62
|
|
|
|
|
|
|
a fasta file |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Input/Output options: |
65
|
|
|
|
|
|
|
-h, --help brief help message |
66
|
|
|
|
|
|
|
-H, --man full documentation |
67
|
|
|
|
|
|
|
-v, --verbose print log messages |
68
|
|
|
|
|
|
|
-p, --prefix prefix output [default:"out"] |
69
|
|
|
|
|
|
|
-o, --output-dir output directory [default:"."] |
70
|
|
|
|
|
|
|
-O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"] |
71
|
|
|
|
|
|
|
-1, --join-paired-ends merge R1 and R2 outputs in one file |
72
|
|
|
|
|
|
|
-x, --compression-level speed compression: "1" - compress faster, |
73
|
|
|
|
|
|
|
"9" - compress better [default:"6"; Integer] |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
Runtime options: |
76
|
|
|
|
|
|
|
-j, --jobs number of jobs [default:"1"; Integer] |
77
|
|
|
|
|
|
|
-s, --seed set the seed of the base generator |
78
|
|
|
|
|
|
|
[default:"time()"; Integer] |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Sequence identifier options: |
81
|
|
|
|
|
|
|
-i, --append-id append to the defined template id [Format] |
82
|
|
|
|
|
|
|
-I, --id overlap the default template id [Format] |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Sequencing option: |
85
|
|
|
|
|
|
|
-q, --quality-profile sequencing system profiles from quality |
86
|
|
|
|
|
|
|
database [default:"poisson"] |
87
|
|
|
|
|
|
|
-e, --sequencing-error sequencing error rate for poisson |
88
|
|
|
|
|
|
|
[default:"0.001"; Number] |
89
|
|
|
|
|
|
|
-m, --read-mean read mean size for poisson |
90
|
|
|
|
|
|
|
[default:"100"; Integer] |
91
|
|
|
|
|
|
|
-d, --read-stdd read standard deviation size for poisson |
92
|
|
|
|
|
|
|
[default:"0"; Integer] |
93
|
|
|
|
|
|
|
-t, --sequencing-type single-end or paired-end reads |
94
|
|
|
|
|
|
|
[default:"paired-end"] |
95
|
|
|
|
|
|
|
-M, --fragment-mean the fragment mean size for paired-end reads |
96
|
|
|
|
|
|
|
[default:"300"; Integer] |
97
|
|
|
|
|
|
|
-D, --fragment-stdd the fragment standard deviation size for |
98
|
|
|
|
|
|
|
paired-end reads [default:"50"; Integer] |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Genome-specific options: |
101
|
|
|
|
|
|
|
-c, --coverage genome coverage [default:"8", Number] |
102
|
|
|
|
|
|
|
-a, --genomic-variation a list of genomic variation entries from |
103
|
|
|
|
|
|
|
variation database. This option may be passed |
104
|
|
|
|
|
|
|
multiple times [default:"none"] |
105
|
|
|
|
|
|
|
-A, --genomic-variation-regex a list of perl-like regex to match genomic |
106
|
|
|
|
|
|
|
variation entries in variation database. |
107
|
|
|
|
|
|
|
This option may be passed multiple times |
108
|
|
|
|
|
|
|
[default:"none"] |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 DESCRIPTION |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
This subcommand simulates genome sequencing reads taking into account the |
113
|
|
|
|
|
|
|
quality-profile and the genome-variation patterns, along with: raffle |
114
|
|
|
|
|
|
|
seed; coverage (depth); fragment mean and standard deviation; single-end |
115
|
|
|
|
|
|
|
(long and short fragments) and paired-end sequencing type; bam, sam, |
116
|
|
|
|
|
|
|
fastq.gz and fastq output formats and more. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head2 INPUT |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
I<sandy genome> expects as argument a fasta file with chromosome sequences. |
121
|
|
|
|
|
|
|
For example, L<the GENCODE human genome|https://www.gencodegenes.org/human/> |
122
|
|
|
|
|
|
|
GRCh38.p13 fasta file. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head2 OUTPUT |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
The output file generated will depend on the I<output-format> (fastq, bam), |
127
|
|
|
|
|
|
|
on the I<join-paired-ends> option (mate read pairs into a single file) and |
128
|
|
|
|
|
|
|
on the I<sequencing-type> (single-end, paired-end). A file with the simulated |
129
|
|
|
|
|
|
|
coverage (${prefix}_coverage.tsv) for each chromosome (read counts) also |
130
|
|
|
|
|
|
|
accompanies the output file. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 OPTIONS |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=over 8 |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=item B<--help> |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Print a brief help message and exits. |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=item B<--man> |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Prints the manual page and exits. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item B<--verbose> |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
Prints log information to standard error |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=item B<--prefix> |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
Concatenates the prefix to the output-file name. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=item B<--output-dir> |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
Creates output-file inside output-dir. If output-dir |
155
|
|
|
|
|
|
|
does not exist, it is created recursively |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=item B<--output-format> |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Choose the output format. Available options are: |
160
|
|
|
|
|
|
|
I<bam>, I<sam>, I<fastq.gz>, I<fastq>. |
161
|
|
|
|
|
|
|
For I<bam> option, B<--append-id> is ignored, considering |
162
|
|
|
|
|
|
|
that the sequence identifier is splitted by blank character, so |
163
|
|
|
|
|
|
|
just the first field is included into the query name column |
164
|
|
|
|
|
|
|
(first column). |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=item B<--join-paired-ends> |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
By default, paired-end reads are put into two different files, |
169
|
|
|
|
|
|
|
I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs |
170
|
|
|
|
|
|
|
together, she can pass this option. |
171
|
|
|
|
|
|
|
If the B<--id> does not have the escape character %R, it is |
172
|
|
|
|
|
|
|
automatically included right after the first field (blank separated values) |
173
|
|
|
|
|
|
|
as in I<id/%R> - which resolves to I<id/1> or I<id/2>. |
174
|
|
|
|
|
|
|
It is necessary to distinguish which read is R1/R2 |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=item B<--compression-level> |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Regulates the speed of compression using the specified digit (between 1 and 9), |
179
|
|
|
|
|
|
|
where "1" indicates the fastest compression method (less compression) and "9" |
180
|
|
|
|
|
|
|
indicates the slowest compression method (best compression). The default |
181
|
|
|
|
|
|
|
compression level is "6" |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item B<--append-id> |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Append string template to the defined template id. |
186
|
|
|
|
|
|
|
See B<Format> |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item B<--id> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Overlap the default defined template id: |
191
|
|
|
|
|
|
|
I<single-end> %i.%U_%c_%s_%t_%n and I<paired-end> %i.%U_%c_%s_%S_%E |
192
|
|
|
|
|
|
|
e.g. SR123.1_chr1_P_1001_1101 |
193
|
|
|
|
|
|
|
See B<Format> |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=item B<Format> |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works. |
198
|
|
|
|
|
|
|
That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid |
199
|
|
|
|
|
|
|
escape characteres are: |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
B<Common escape characters> |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
204
|
|
|
|
|
|
|
Escape Meaning |
205
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
206
|
|
|
|
|
|
|
%i instrument id composed by SR + PID |
207
|
|
|
|
|
|
|
%I job slot number |
208
|
|
|
|
|
|
|
%q quality profile |
209
|
|
|
|
|
|
|
%e sequencing error |
210
|
|
|
|
|
|
|
%x sequencing error position |
211
|
|
|
|
|
|
|
%R read 1, or 2 if it is the paired-end mate |
212
|
|
|
|
|
|
|
%U read number |
213
|
|
|
|
|
|
|
%r read size |
214
|
|
|
|
|
|
|
%m read mean |
215
|
|
|
|
|
|
|
%d read standard deviation |
216
|
|
|
|
|
|
|
%c sequence id as chromossome, gene/transcript id |
217
|
|
|
|
|
|
|
%C sequence id type (reference or alternate non reference allele) *** |
218
|
|
|
|
|
|
|
%s read strand |
219
|
|
|
|
|
|
|
%t read start position |
220
|
|
|
|
|
|
|
%n read end position |
221
|
|
|
|
|
|
|
%a read start position regarding reference genome *** |
222
|
|
|
|
|
|
|
%b read end position regarding reference genome *** |
223
|
|
|
|
|
|
|
%v genomic variation position *** |
224
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
225
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
B<Paired-end specific escape characters> |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
230
|
|
|
|
|
|
|
Escape Meaning |
231
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
232
|
|
|
|
|
|
|
%T mate read start position |
233
|
|
|
|
|
|
|
%N mate read end position |
234
|
|
|
|
|
|
|
%A mate read start position regarding reference genome *** |
235
|
|
|
|
|
|
|
%B mate read end position regarding reference genome *** |
236
|
|
|
|
|
|
|
%D distance between the paired-reads |
237
|
|
|
|
|
|
|
%M fragment mean |
238
|
|
|
|
|
|
|
%D fragment standard deviation |
239
|
|
|
|
|
|
|
%f fragment size |
240
|
|
|
|
|
|
|
%F fragment strand |
241
|
|
|
|
|
|
|
%S fragment start position |
242
|
|
|
|
|
|
|
%E fragment end position |
243
|
|
|
|
|
|
|
%X fragment start position regarding reference genome *** |
244
|
|
|
|
|
|
|
%Z fragment end position regarding reference genome *** |
245
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
246
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=item B<--jobs> |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
Sets the number of child jobs to be created |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item B<--seed> |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
Sets the seed of the base generator. The ability to set the seed is |
255
|
|
|
|
|
|
|
useful for those who want reproducible simulations. Pay attention to |
256
|
|
|
|
|
|
|
the number of jobs (--jobs) set, because each job receives a different |
257
|
|
|
|
|
|
|
seed calculated from the I<main seed>. So, for reproducibility, the |
258
|
|
|
|
|
|
|
same seed set before needs the same number of jobs set before as well. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=item B<--read-mean> |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Sets the read mean if quality-profile is equal to 'poisson'. The |
263
|
|
|
|
|
|
|
quality-profile from database overrides the read-size |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=item B<--read-stdd> |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Sets the read standard deviation if quality-profile is equal to |
268
|
|
|
|
|
|
|
'poisson'. The quality-profile from database overrides the read-stdd |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=item B<--coverage> |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
Calculates the number of reads based on the genome |
273
|
|
|
|
|
|
|
coverage: number_of_reads = (sequence_size * coverage) / read_size. |
274
|
|
|
|
|
|
|
This is the default option for genome sequencing simulation |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=item B<--sequencing-type> |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Sets the sequencing type to single-end or paired-end |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=item B<--fragment-mean> |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
283
|
|
|
|
|
|
|
fragment mean |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=item B<--fragment-stdd> |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
288
|
|
|
|
|
|
|
fragment standard deviation |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=item B<--sequencing-error> |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Sets the sequencing error rate if quality-profile is equal to 'poisson'. |
293
|
|
|
|
|
|
|
Valid values are between zero and one |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=item B<--quality-profile> |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Sets the sequencing system profile for quality. The default value is a poisson |
298
|
|
|
|
|
|
|
distribution, but the user can choose among several profiles stored into the |
299
|
|
|
|
|
|
|
database or import his own data. |
300
|
|
|
|
|
|
|
See B<quality> command for more details |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=item B<--genomic-variation> |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Sets the genomic variation to be applied on the genome feeded. By |
305
|
|
|
|
|
|
|
default no variation is included to the simulation, but the user has |
306
|
|
|
|
|
|
|
the power to point some entries from B<variation> database or index his |
307
|
|
|
|
|
|
|
own data. This option accepts a list with comma separated values |
308
|
|
|
|
|
|
|
and can be passed multiple times, which is useful in order to join |
309
|
|
|
|
|
|
|
various types of genomic variation into the same simulation. It is |
310
|
|
|
|
|
|
|
possible to combine this option with B<--genomic-variation-regex> |
311
|
|
|
|
|
|
|
See B<variation> command for the available list of genomic variation |
312
|
|
|
|
|
|
|
entries |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=item B<--genomic-variation-regex> |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Applies perl-regex in the variation database and selects all entryes |
317
|
|
|
|
|
|
|
that match the pattern. This option accepts a list with comma separated |
318
|
|
|
|
|
|
|
values and can be passed multiple times. It is possible to combine this |
319
|
|
|
|
|
|
|
option with B<--genomic-variation> |
320
|
|
|
|
|
|
|
See B<variation> command for the available list of genomic variation |
321
|
|
|
|
|
|
|
entries |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=back |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head1 EXAMPLES |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
The following command will produce two fastq.gz files (paired-end), with a |
328
|
|
|
|
|
|
|
coverage of 20x and a ${prefix}_coverage.tsv file with the read counts. |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
$ sandy genome \ |
331
|
|
|
|
|
|
|
--verbose \ |
332
|
|
|
|
|
|
|
--jobs=10 \ |
333
|
|
|
|
|
|
|
--sequencing-type=paired-end \ |
334
|
|
|
|
|
|
|
--coverage=20 hg38.fa 2> sim.log |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
or, equivalently: |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
$ sandy genome -v -j 10 -t paired-end -c 20 hg38.fa 2> sim.log |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
Using a seed for reproducibility. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
$ sandy genome --seed=1717 my_fasta.fa |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
To simulate reads with a specific quality-profile other than the default |
345
|
|
|
|
|
|
|
poisson: |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
$ sandy genome --quality-profile=hiseq_101 hg19.fa |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
To see the current list of available quality-profiles: |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$ sandy quality |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
And in order to learn how to add your custom quality-profile, see: |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
$ sandy quality add --help |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
Sequence identifiers (first lines of fastq entries) may be customized in output using |
358
|
|
|
|
|
|
|
a format string passed by the user. This format is a combination of literal and escaped |
359
|
|
|
|
|
|
|
characters, in a similar fashion to that used in C programming language’s printf function. |
360
|
|
|
|
|
|
|
For example, let’s simulate a paired-end sequencing and add the read length, read position |
361
|
|
|
|
|
|
|
and mate position into all sequence identifiers: |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
$ sandy genome \ |
364
|
|
|
|
|
|
|
--id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \ |
365
|
|
|
|
|
|
|
hg38.fa |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
In this case, results would be: |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
==> Into R1 |
370
|
|
|
|
|
|
|
@SR.1 read=chr6:979-880 mate=chr6:736-835 length=100 |
371
|
|
|
|
|
|
|
... |
372
|
|
|
|
|
|
|
==> Into R2 |
373
|
|
|
|
|
|
|
@SR.1 read=chr6:736-835 mate=chr6:979-880 length=100 |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
See B<Format> section for details. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
An important feature of B<Sandy> is to simulate a genome with genomic-variation. |
378
|
|
|
|
|
|
|
So to exemplify, let's simulate a genome which includes the fusion between the genes |
379
|
|
|
|
|
|
|
NPM1 and ALK: |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
$ sandy genome --genomic-variation=fusion_hg38_NPM1-ALK hg38.fa |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
To see the current list of available genomic-variation |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$ sandy variation |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
And in order to learn how to add your custom genomic-variation, see: |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
$ sandy variation add --help |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Putting all together: |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
$ sandy genome \ |
394
|
|
|
|
|
|
|
--verbose \ |
395
|
|
|
|
|
|
|
--jobs=10 \ |
396
|
|
|
|
|
|
|
--sequencing-type=paired-end \ |
397
|
|
|
|
|
|
|
--seed=1717 \ |
398
|
|
|
|
|
|
|
--quality-profile=hiseq_101 \ |
399
|
|
|
|
|
|
|
--id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \ |
400
|
|
|
|
|
|
|
--genomic-variation=fusion_hg38_NPM1-ALK \ |
401
|
|
|
|
|
|
|
hg38.fa 2> sim.log |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 AUTHORS |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=over 4 |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=item * |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=item * |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=item * |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=item * |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=item * |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Rodrigo Barreiro <rbarreiro@mochsl.org.br> |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=item * |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=item * |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=item * |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
Rafael Mercuri <rmercuri@mochsl.org.br> |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=item * |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
Rodrigo Barreiro <rbarreiro@mochsl.org.br> |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=item * |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=back |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital. |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
This is free software, licensed under: |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=cut |