| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package App::Sandy::Command::Transcriptome; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: simulate command class. Simulate transcriptome sequencing |
|
3
|
|
|
|
|
|
|
|
|
4
|
1
|
|
|
1
|
|
1849
|
use App::Sandy::Base 'class'; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
extends 'App::Sandy::CLI::Command'; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
with 'App::Sandy::Role::Digest'; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.25'; # VERSION |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub default_opt { |
|
13
|
0
|
|
|
0
|
0
|
|
'paired-end-id' => '%i.%U:%c %U', |
|
14
|
|
|
|
|
|
|
'single-end-id' => '%i.%U:%c %U', |
|
15
|
|
|
|
|
|
|
'seed' => time, |
|
16
|
|
|
|
|
|
|
'verbose' => 0, |
|
17
|
|
|
|
|
|
|
'prefix' => 'out', |
|
18
|
|
|
|
|
|
|
'output-dir' => '.', |
|
19
|
|
|
|
|
|
|
'jobs' => 1, |
|
20
|
|
|
|
|
|
|
'count-loops-by' => 'number-of-reads', |
|
21
|
|
|
|
|
|
|
'number-of-reads' => 1000000, |
|
22
|
|
|
|
|
|
|
'strand-bias' => 'minus', |
|
23
|
|
|
|
|
|
|
'seqid-weight' => 'length', |
|
24
|
|
|
|
|
|
|
'sequencing-type' => 'paired-end', |
|
25
|
|
|
|
|
|
|
'fragment-mean' => 300, |
|
26
|
|
|
|
|
|
|
'fragment-stdd' => 50, |
|
27
|
|
|
|
|
|
|
'sequencing-error' => 0.001, |
|
28
|
|
|
|
|
|
|
'read-mean' => 100, |
|
29
|
|
|
|
|
|
|
'read-stdd' => 0, |
|
30
|
|
|
|
|
|
|
'quality-profile' => 'poisson', |
|
31
|
|
|
|
|
|
|
'join-paired-ends' => 0, |
|
32
|
|
|
|
|
|
|
'output-format' => 'fastq.gz', |
|
33
|
|
|
|
|
|
|
'compression-level' => 6 |
|
34
|
|
|
|
|
|
|
} |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub rm_opt { |
|
37
|
0
|
|
|
0
|
0
|
|
'strand-bias', |
|
38
|
|
|
|
|
|
|
'coverage', |
|
39
|
|
|
|
|
|
|
'seqid-weight', |
|
40
|
|
|
|
|
|
|
'genomic-variation' |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
__END__ |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=pod |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=encoding UTF-8 |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 NAME |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
App::Sandy::Command::Transcriptome - simulate command class. Simulate transcriptome sequencing |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 VERSION |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
version 0.25 |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sandy transcriptome [options] <fasta-file> |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Arguments: |
|
62
|
|
|
|
|
|
|
a fasta file |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Input/Output options: |
|
65
|
|
|
|
|
|
|
-h, --help brief help message |
|
66
|
|
|
|
|
|
|
-H, --man full documentation |
|
67
|
|
|
|
|
|
|
-v, --verbose print log messages |
|
68
|
|
|
|
|
|
|
-p, --prefix prefix output [default:"out"] |
|
69
|
|
|
|
|
|
|
-o, --output-dir output directory [default:"."] |
|
70
|
|
|
|
|
|
|
-O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"] |
|
71
|
|
|
|
|
|
|
-1, --join-paired-ends merge R1 and R2 outputs in one file |
|
72
|
|
|
|
|
|
|
-x, --compression-level speed compression: "1" - compress faster, |
|
73
|
|
|
|
|
|
|
"9" - compress better [default:"6"; Integer] |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
Runtime options: |
|
76
|
|
|
|
|
|
|
-j, --jobs number of jobs [default:"1"; Integer] |
|
77
|
|
|
|
|
|
|
-s, --seed set the seed of the base generator |
|
78
|
|
|
|
|
|
|
[default:"time()"; Integer] |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Sequence identifier options: |
|
81
|
|
|
|
|
|
|
-i, --append-id append to the defined template id [Format] |
|
82
|
|
|
|
|
|
|
-I, --id overlap the default template id [Format] |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Sequencing option: |
|
85
|
|
|
|
|
|
|
-q, --quality-profile sequencing system profiles from quality |
|
86
|
|
|
|
|
|
|
database [default:"poisson"] |
|
87
|
|
|
|
|
|
|
-e, --sequencing-error sequencing error rate for poisson |
|
88
|
|
|
|
|
|
|
[default:"0.001"; Number] |
|
89
|
|
|
|
|
|
|
-m, --read-mean read mean size for poisson |
|
90
|
|
|
|
|
|
|
[default:"100"; Integer] |
|
91
|
|
|
|
|
|
|
-d, --read-stdd read standard deviation size for poisson |
|
92
|
|
|
|
|
|
|
[default:"0"; Integer] |
|
93
|
|
|
|
|
|
|
-t, --sequencing-type single-end or paired-end reads |
|
94
|
|
|
|
|
|
|
[default:"paired-end"] |
|
95
|
|
|
|
|
|
|
-M, --fragment-mean the fragment mean size for paired-end reads |
|
96
|
|
|
|
|
|
|
[default:"300"; Integer] |
|
97
|
|
|
|
|
|
|
-D, --fragment-stdd the fragment standard deviation size for |
|
98
|
|
|
|
|
|
|
paired-end reads [default:"50"; Integer] |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Transcriptome-specific options: |
|
101
|
|
|
|
|
|
|
-n, --number-of-reads set the number of reads |
|
102
|
|
|
|
|
|
|
[default:"1000000", Integer] |
|
103
|
|
|
|
|
|
|
-f, --expression-matrix an expression-matrix entry from database |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
This subcommand simulates transcriptome sequencing reads taking into account |
|
108
|
|
|
|
|
|
|
the quality-profile and the expression-matrix weights, along with: raffle |
|
109
|
|
|
|
|
|
|
seed; number of reads; fragment mean and standard deviation; single-end |
|
110
|
|
|
|
|
|
|
(long and short fragments) and paired-end sequencing type; bam, sam, |
|
111
|
|
|
|
|
|
|
fastq.gz and fastq output formats and more. |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 INPUT |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
I<sandy transcriptome> expects as argument a fasta file with transcript sequences. |
|
116
|
|
|
|
|
|
|
For example, L<the GENCODE human genome|https://www.gencodegenes.org/human/> |
|
117
|
|
|
|
|
|
|
transcript sequences and protein-coding transcript sequences. |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head2 OUTPUT |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
The output file generated will depend on the I<output-format> (fastq, bam), |
|
122
|
|
|
|
|
|
|
on the I<join-paired-ends> option (mate read pairs into a single file) and |
|
123
|
|
|
|
|
|
|
on the I<sequencing-type> (single-end, paired-end). One file with the simulated |
|
124
|
|
|
|
|
|
|
abundance (${prefix}_abundance_transcripts.tsv) per transcript and one file with |
|
125
|
|
|
|
|
|
|
the simulated abundance (${prefix}_abundance_genes.tsv) per gene (if the fasta |
|
126
|
|
|
|
|
|
|
file used has the relationship between gene and its transcripts at the header) |
|
127
|
|
|
|
|
|
|
will accompany the output file. |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head1 OPTIONS |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=over 8 |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item B<--help> |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Print a brief help message and exits. |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=item B<--man> |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Prints the manual page and exits. |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=item B<--verbose> |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Prints log information to standard error |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item B<--prefix> |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
Concatenates the prefix to the output-file name. |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item B<--output-dir> |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Creates output-file inside output-dir. If output-dir |
|
152
|
|
|
|
|
|
|
does not exist, it is created recursively |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item B<--output-format> |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Choose the output format. Available options are: |
|
157
|
|
|
|
|
|
|
I<bam>, I<sam>, I<fastq.gz>, I<fastq>. |
|
158
|
|
|
|
|
|
|
For I<bam> option, B<--append-id> is ignored, considering |
|
159
|
|
|
|
|
|
|
that the sequence identifier is splitted by blank character, so |
|
160
|
|
|
|
|
|
|
just the first field is included into the query name column |
|
161
|
|
|
|
|
|
|
(first column). |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=item B<--join-paired-ends> |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
By default, paired-end reads are put into two different files, |
|
166
|
|
|
|
|
|
|
I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs |
|
167
|
|
|
|
|
|
|
together, she can pass this option. |
|
168
|
|
|
|
|
|
|
If the B<--id> does not have the escape character %R, it is |
|
169
|
|
|
|
|
|
|
automatically included right after the first field (blank separated values) |
|
170
|
|
|
|
|
|
|
as in I<id/%R> - which resolves to I<id/1> or I<id/2>. |
|
171
|
|
|
|
|
|
|
It is necessary to distinguish which read is R1/R2 |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=item B<--compression-level> |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Regulates the speed of compression using the specified digit (between 1 and 9), |
|
176
|
|
|
|
|
|
|
where "1" indicates the fastest compression method (less compression) and "9" |
|
177
|
|
|
|
|
|
|
indicates the slowest compression method (best compression). The default |
|
178
|
|
|
|
|
|
|
compression level is "6" |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=item B<--append-id> |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
Append string template to the defined template id. |
|
183
|
|
|
|
|
|
|
See B<Format> |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=item B<--id> |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Overlap the default defined template id: |
|
188
|
|
|
|
|
|
|
I<single-end> %i.%U %U and I<paired-end> %i.%U %U |
|
189
|
|
|
|
|
|
|
e.g. SR123.1 1 |
|
190
|
|
|
|
|
|
|
See B<Format> |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=item B<Format> |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works. |
|
195
|
|
|
|
|
|
|
That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid |
|
196
|
|
|
|
|
|
|
escape characteres are: |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
B<Common escape characters> |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
|
201
|
|
|
|
|
|
|
Escape Meaning |
|
202
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
|
203
|
|
|
|
|
|
|
%i instrument id composed by SR + PID |
|
204
|
|
|
|
|
|
|
%I job slot number |
|
205
|
|
|
|
|
|
|
%q quality profile |
|
206
|
|
|
|
|
|
|
%e sequencing error |
|
207
|
|
|
|
|
|
|
%x sequencing error position |
|
208
|
|
|
|
|
|
|
%R read 1, or 2 if it is the paired-end mate |
|
209
|
|
|
|
|
|
|
%U read number |
|
210
|
|
|
|
|
|
|
%r read size |
|
211
|
|
|
|
|
|
|
%m read mean |
|
212
|
|
|
|
|
|
|
%d read standard deviation |
|
213
|
|
|
|
|
|
|
%c sequence id as chromossome, gene/transcript id |
|
214
|
|
|
|
|
|
|
%C sequence id type (reference or alternate non reference allele) *** |
|
215
|
|
|
|
|
|
|
%s read strand |
|
216
|
|
|
|
|
|
|
%t read start position |
|
217
|
|
|
|
|
|
|
%n read end position |
|
218
|
|
|
|
|
|
|
%a read start position regarding reference genome *** |
|
219
|
|
|
|
|
|
|
%b read end position regarding reference genome *** |
|
220
|
|
|
|
|
|
|
%v genomic variation position *** |
|
221
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
|
222
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
B<Paired-end specific escape characters> |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
|
227
|
|
|
|
|
|
|
Escape Meaning |
|
228
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
|
229
|
|
|
|
|
|
|
%T mate read start position |
|
230
|
|
|
|
|
|
|
%N mate read end position |
|
231
|
|
|
|
|
|
|
%A mate read start position regarding reference genome *** |
|
232
|
|
|
|
|
|
|
%B mate read end position regarding reference genome *** |
|
233
|
|
|
|
|
|
|
%D distance between the paired-reads |
|
234
|
|
|
|
|
|
|
%M fragment mean |
|
235
|
|
|
|
|
|
|
%D fragment standard deviation |
|
236
|
|
|
|
|
|
|
%f fragment size |
|
237
|
|
|
|
|
|
|
%F fragment strand |
|
238
|
|
|
|
|
|
|
%S fragment start position |
|
239
|
|
|
|
|
|
|
%E fragment end position |
|
240
|
|
|
|
|
|
|
%X fragment start position regarding reference genome *** |
|
241
|
|
|
|
|
|
|
%Z fragment end position regarding reference genome *** |
|
242
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
|
243
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=item B<--jobs> |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Sets the number of child jobs to be created |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=item B<--seed> |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Sets the seed of the base generator. The ability to set the seed is |
|
252
|
|
|
|
|
|
|
useful for those who want reproducible simulations. Pay attention to |
|
253
|
|
|
|
|
|
|
the number of jobs (--jobs) set, because each job receives a different |
|
254
|
|
|
|
|
|
|
seed calculated from the I<main seed>. So, for reproducibility, the |
|
255
|
|
|
|
|
|
|
same seed set before needs the same number of jobs set before as well. |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=item B<--read-mean> |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Sets the read mean if quality-profile is equal to 'poisson'. The |
|
260
|
|
|
|
|
|
|
quality-profile from database overrides the read-size |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=item B<--read-stdd> |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
Sets the read standard deviation if quality-profile is equal to |
|
265
|
|
|
|
|
|
|
'poisson'. The quality-profile from database overrides the read-stdd |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=item B<--number-of-reads> |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
Sets the number of reads desired for each fragment end. That means, |
|
270
|
|
|
|
|
|
|
it will be the number of reads for each pair - 1 x N reads for single-end |
|
271
|
|
|
|
|
|
|
and 2 x N reads for paired-end. This is the default option for transcriptome |
|
272
|
|
|
|
|
|
|
sequencing simulation |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=item B<--sequencing-type> |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Sets the sequencing type to single-end or paired-end |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=item B<--fragment-mean> |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
|
281
|
|
|
|
|
|
|
fragment mean |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=item B<--fragment-stdd> |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
|
286
|
|
|
|
|
|
|
fragment standard deviation |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=item B<--sequencing-error> |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
Sets the sequencing error rate if quality-profile is equal to 'poisson'. |
|
291
|
|
|
|
|
|
|
Valid values are between zero and one |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=item B<--quality-profile> |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
Sets the sequencing system profile for quality. The default value is a poisson |
|
296
|
|
|
|
|
|
|
distribution, but the user can choose among several profiles stored into the |
|
297
|
|
|
|
|
|
|
database or import his own data. |
|
298
|
|
|
|
|
|
|
See B<quality> command for more details |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=item B<--expression-matrix> |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
By default, the gene/transcript is raffled using its length as weight. If |
|
303
|
|
|
|
|
|
|
you choose an expression-matrix, then the raffle will be made based on the |
|
304
|
|
|
|
|
|
|
gene/transcript expression. |
|
305
|
|
|
|
|
|
|
The expression-matrix entries are found into the database. |
|
306
|
|
|
|
|
|
|
See B<expression> command for more details |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=back |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=head1 EXAMPLES |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
The command: |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
$ sandy transcriptome \ |
|
315
|
|
|
|
|
|
|
--verbose \ |
|
316
|
|
|
|
|
|
|
--jobs=5 \ |
|
317
|
|
|
|
|
|
|
--number-of-reads=5000000 \ |
|
318
|
|
|
|
|
|
|
--output-dir=my_results/ \ |
|
319
|
|
|
|
|
|
|
gencode.v43.transcripts.fa.gz 2> sim.log |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
or, equivalently: |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
$ sandy transcriptome \ |
|
324
|
|
|
|
|
|
|
-v -j 5 -n 5000000 -o my_results/ \ |
|
325
|
|
|
|
|
|
|
gencode.v43.transcripts.fa.gz 2> sim.log |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
will both generate two paired-end fastq files (R1 and R2) into my_results/ directory |
|
328
|
|
|
|
|
|
|
with 5000000 reads and two abundance files, one per transcripts and the other per genes. |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
By default the raffled bias is the transcript length, but the user can change this |
|
331
|
|
|
|
|
|
|
behavior by choosing an expression-matrix from the database: |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
$ sandy transcriptome -f brain_cortex gencode.v43.transcripts.fa.gz |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
To see the current list of available expression matrices: |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
$ sandy expression |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
And in order to learn how to add your custom expression-matrix, see: |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
$ sandy expression add --help |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
For reproducibility, the user can set the seed option and guarantee the reliability of all |
|
344
|
|
|
|
|
|
|
the raffles in a later simulation: |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
$ sandy expression --seed=1717 my_transcripts.fa |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
To simulate reads with a specific quality-profile other than the default |
|
349
|
|
|
|
|
|
|
poisson: |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$ sandy expression --quality-profile=hiseq_150 my_transcripts.fa |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
To see the current list of available quality-profiles: |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
$ sandy quality |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
And in order to learn how to add your custom quality-profile, see: |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
$ sandy quality add --help |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
Sequence identifiers (first lines of fastq entries) may be customized in output using |
|
362
|
|
|
|
|
|
|
a format string passed by the user. This format is a combination of literal and escaped |
|
363
|
|
|
|
|
|
|
characters, in a similar fashion to that used in C programming language’s printf function. |
|
364
|
|
|
|
|
|
|
For example, let’s simulate a paired-end sequencing and add the read length, read position |
|
365
|
|
|
|
|
|
|
and mate position into all sequence identifiers: |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
$ sandy expression --id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" my_genes.fa.gz |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
In this case, results would be: |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
==> Into R1 |
|
372
|
|
|
|
|
|
|
@SR.1 read=BRAF:979-880 mate=BRAF:736-835 length=100 |
|
373
|
|
|
|
|
|
|
... |
|
374
|
|
|
|
|
|
|
==> Into R2 |
|
375
|
|
|
|
|
|
|
@SR.1 read=BRAF:736-835 mate=BRAF:979-880 length=100 |
|
376
|
|
|
|
|
|
|
... |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
See B<Format> section for details. |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Putting all together: |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
$ sandy transcriptome \ |
|
383
|
|
|
|
|
|
|
--verbose \ |
|
384
|
|
|
|
|
|
|
--jobs=5 \ |
|
385
|
|
|
|
|
|
|
--number-of-reads=5000000 \ |
|
386
|
|
|
|
|
|
|
--output-dir=my_results/ \ |
|
387
|
|
|
|
|
|
|
--expression-matrix=brain_cortex \ |
|
388
|
|
|
|
|
|
|
--seed=1717 \ |
|
389
|
|
|
|
|
|
|
--quality-profile=hiseq_150 \ |
|
390
|
|
|
|
|
|
|
--id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \ |
|
391
|
|
|
|
|
|
|
gencode.v43.transcripts.fa.gz 2> sim.log |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=head1 AUTHORS |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=over 4 |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=item * |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=item * |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=item * |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=item * |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=item * |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
Rodrigo Barreiro <rbarreiro@mochsl.org.br> |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
=item * |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=item * |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=item * |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
Rafael Mercuri <rmercuri@mochsl.org.br> |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item * |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Rodrigo Barreiro <rbarreiro@mochsl.org.br> |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=item * |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
=back |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital. |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
This is free software, licensed under: |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=cut |