line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::Command::Transcriptome; |
2
|
|
|
|
|
|
|
# ABSTRACT: simulate command class. Simulate transcriptome sequencing |
3
|
|
|
|
|
|
|
|
4
|
1
|
|
|
1
|
|
4678
|
use App::Sandy::Base 'class'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
extends 'App::Sandy::CLI::Command'; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
with 'App::Sandy::Role::Digest'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.22'; # VERSION |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Default option values for the `transcriptome` command.
#
# Returns a flat list of (option-name => default-value) pairs. There is no
# explicit `return`: the list is the last expression evaluated, so it is the
# sub's return value. The sub does not read its arguments.
#
# NOTE(review): presumably consumed by the parent App::Sandy::CLI::Command
# class to seed the option hash before command-line parsing - confirm there.
sub default_opt { |
13
|
0
|
|
|
0
|
0
|
|
'paired-end-id' => '%i.%U:%c %U', |
14
|
|
|
|
|
|
|
'single-end-id' => '%i.%U:%c %U', |
15
|
|
|
|
|
|
|
# `time` is evaluated on every call, so the default seed is the epoch
# second at which default_opt runs rather than a fixed constant.
'seed' => time, |
16
|
|
|
|
|
|
|
'verbose' => 0, |
17
|
|
|
|
|
|
|
'prefix' => 'out', |
18
|
|
|
|
|
|
|
'output-dir' => '.', |
19
|
|
|
|
|
|
|
'jobs' => 1, |
20
|
|
|
|
|
|
|
'count-loops-by' => 'number-of-reads', |
21
|
|
|
|
|
|
|
'number-of-reads' => 1000000, |
22
|
|
|
|
|
|
|
'strand-bias' => 'minus', |
23
|
|
|
|
|
|
|
'seqid-weight' => 'length', |
24
|
|
|
|
|
|
|
'sequencing-type' => 'paired-end', |
25
|
|
|
|
|
|
|
'fragment-mean' => 300, |
26
|
|
|
|
|
|
|
'fragment-stdd' => 50, |
27
|
|
|
|
|
|
|
'sequencing-error' => 0.001, |
28
|
|
|
|
|
|
|
'read-mean' => 100, |
29
|
|
|
|
|
|
|
'read-stdd' => 0, |
30
|
|
|
|
|
|
|
'quality-profile' => 'poisson', |
31
|
|
|
|
|
|
|
'join-paired-ends' => 0, |
32
|
|
|
|
|
|
|
'output-format' => 'fastq.gz', |
33
|
|
|
|
|
|
|
'compression-level' => 6 |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Returns a list of four option names: 'strand-bias', 'coverage',
# 'seqid-weight' and 'genomic-variation'. As in default_opt, the list is the
# last expression evaluated and therefore the sub's return value.
#
# None of these four appears in this command's documented Options, while
# 'strand-bias' and 'seqid-weight' still receive fixed values in default_opt.
#
# NOTE(review): presumably these are options the parent command class must
# not expose on the command line for transcriptome simulation - confirm
# against App::Sandy::CLI::Command.
sub rm_opt { |
37
|
0
|
|
|
0
|
0
|
|
'strand-bias', |
38
|
|
|
|
|
|
|
'coverage', |
39
|
|
|
|
|
|
|
'seqid-weight', |
40
|
|
|
|
|
|
|
'genomic-variation' |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
__END__ |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=pod |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=encoding UTF-8 |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 NAME |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
App::Sandy::Command::Transcriptome - simulate command class. Simulate transcriptome sequencing |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 VERSION |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
version 0.22 |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 SYNOPSIS |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sandy transcriptome [options] <fasta-file> |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Arguments: |
62
|
|
|
|
|
|
|
a fasta-file |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Options: |
65
|
|
|
|
|
|
|
-h, --help brief help message |
66
|
|
|
|
|
|
|
-u, --man full documentation |
67
|
|
|
|
|
|
|
-v, --verbose print log messages |
68
|
|
|
|
|
|
|
-p, --prefix prefix output [default:"out"] |
69
|
|
|
|
|
|
|
-o, --output-dir output directory [default:"."] |
70
|
|
|
|
|
|
|
-O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"] |
71
|
|
|
|
|
|
|
-1, --join-paired-ends merge R1 and R2 outputs in one file |
72
|
|
|
|
|
|
|
-x, --compression-level speed compression: "1" - compress faster, |
73
|
|
|
|
|
|
|
"9" - compress better [default:"6"; Integer] |
74
|
|
|
|
|
|
|
-i, --append-id append to the defined template id [Format] |
75
|
|
|
|
|
|
|
-I, --id override the default template id [Format] |
76
|
|
|
|
|
|
|
-j, --jobs number of jobs [default:"1"; Integer] |
77
|
|
|
|
|
|
|
-s, --seed set the seed of the base generator |
78
|
|
|
|
|
|
|
[default:"time()"; Integer] |
79
|
|
|
|
|
|
|
-n, --number-of-reads set the number of reads |
80
|
|
|
|
|
|
|
[default:"1000000"; Integer] |
81
|
|
|
|
|
|
|
-t, --sequencing-type single-end or paired-end reads |
82
|
|
|
|
|
|
|
[default:"paired-end"] |
83
|
|
|
|
|
|
|
-q, --quality-profile sequencing system profiles from quality |
84
|
|
|
|
|
|
|
database [default:"poisson"] |
85
|
|
|
|
|
|
|
-e, --sequencing-error sequencing error rate for poisson |
86
|
|
|
|
|
|
|
[default:"0.001"; Number] |
87
|
|
|
|
|
|
|
-m, --read-mean read mean size for poisson |
88
|
|
|
|
|
|
|
[default:"100"; Integer] |
89
|
|
|
|
|
|
|
-d, --read-stdd read standard deviation size for poisson |
90
|
|
|
|
|
|
|
[default:"0"; Integer] |
91
|
|
|
|
|
|
|
-M, --fragment-mean the fragment mean size for paired-end reads |
92
|
|
|
|
|
|
|
[default:"300"; Integer] |
93
|
|
|
|
|
|
|
-D, --fragment-stdd the fragment standard deviation size for |
94
|
|
|
|
|
|
|
paired-end reads [default:"50"; Integer] |
95
|
|
|
|
|
|
|
-f, --expression-matrix an expression-matrix entry from database |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head1 DESCRIPTION |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Simulate transcriptome sequencing. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=head1 OPTIONS |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=over 8 |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=item B<--help> |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Prints a brief help message and exits. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=item B<--man> |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Prints the manual page and exits. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=item B<--verbose> |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Prints log information to standard error |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=item B<--prefix> |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Concatenates the prefix to the output-file name. |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=item B<--output-dir> |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
Creates output-file inside output-dir. If output-dir |
124
|
|
|
|
|
|
|
does not exist, it is created recursively |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item B<--output-format> |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Choose the output format. Available options are: |
129
|
|
|
|
|
|
|
I<bam>, I<sam>, I<fastq.gz>, I<fastq>. |
130
|
|
|
|
|
|
|
For I<bam> option, B<--append-id> is ignored, considering |
131
|
|
|
|
|
|
|
that the sequence identifier is split on blank characters, so |
132
|
|
|
|
|
|
|
just the first field is included into the query name column |
133
|
|
|
|
|
|
|
(first column). |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item B<--join-paired-ends> |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
By default, paired-end reads are put into two different files, |
138
|
|
|
|
|
|
|
I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs |
139
|
|
|
|
|
|
|
together, she can pass this option. |
140
|
|
|
|
|
|
|
If the B<--id> does not have the escape character %R, it is |
141
|
|
|
|
|
|
|
automatically included right after the first field (blank separated values) |
142
|
|
|
|
|
|
|
as in I<id/%R> - which resolves to I<id/1> or I<id/2>. |
143
|
|
|
|
|
|
|
It is necessary to distinguish which read is R1/R2 |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item B<--compression-level> |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
Regulates the speed of compression using the specified digit (between 1 and 9), |
148
|
|
|
|
|
|
|
where "1" indicates the fastest compression method (less compression) and "9" |
149
|
|
|
|
|
|
|
indicates the slowest compression method (best compression). The default |
150
|
|
|
|
|
|
|
compression level is "6" |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=item B<--append-id> |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
Append string template to the defined template id. |
155
|
|
|
|
|
|
|
See B<Format> |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=item B<--id> |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Overrides the default template id: |
160
|
|
|
|
|
|
|
I<single-end> %i.%U:%c %U and I<paired-end> %i.%U:%c %U |
161
|
|
|
|
|
|
|
e.g. SR123.1:ENST00000456328 1 |
162
|
|
|
|
|
|
|
See B<Format> |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item B<Format> |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works. |
167
|
|
|
|
|
|
|
That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid |
168
|
|
|
|
|
|
|
escape characters are: |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
B<Common escape characters> |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
173
|
|
|
|
|
|
|
Escape Meaning |
174
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
175
|
|
|
|
|
|
|
%i instrument id composed by SR + PID |
176
|
|
|
|
|
|
|
%I job slot number |
177
|
|
|
|
|
|
|
%q quality profile |
178
|
|
|
|
|
|
|
%e sequencing error |
179
|
|
|
|
|
|
|
%x sequencing error position |
180
|
|
|
|
|
|
|
%R read 1, or 2 if it is the paired-end mate |
181
|
|
|
|
|
|
|
%U read number |
182
|
|
|
|
|
|
|
%r read size |
183
|
|
|
|
|
|
|
%m read mean |
184
|
|
|
|
|
|
|
%d read standard deviation |
185
|
|
|
|
|
|
|
%c sequence id as chromosome, gene/transcript id |
186
|
|
|
|
|
|
|
%C sequence id type (reference or alternate non reference allele) *** |
187
|
|
|
|
|
|
|
%s read strand |
188
|
|
|
|
|
|
|
%t read start position |
189
|
|
|
|
|
|
|
%n read end position |
190
|
|
|
|
|
|
|
%a read start position regarding reference genome *** |
191
|
|
|
|
|
|
|
%b read end position regarding reference genome *** |
192
|
|
|
|
|
|
|
%v genomic variation position *** |
193
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
194
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
B<Paired-end specific escape characters> |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
199
|
|
|
|
|
|
|
Escape Meaning |
200
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
201
|
|
|
|
|
|
|
%T mate read start position |
202
|
|
|
|
|
|
|
%N mate read end position |
203
|
|
|
|
|
|
|
%A mate read start position regarding reference genome *** |
204
|
|
|
|
|
|
|
%B mate read end position regarding reference genome *** |
205
|
|
|
|
|
|
|
%D distance between the paired-reads |
206
|
|
|
|
|
|
|
%M fragment mean |
207
|
|
|
|
|
|
|
%D fragment standard deviation |
208
|
|
|
|
|
|
|
%f fragment size |
209
|
|
|
|
|
|
|
%F fragment strand |
210
|
|
|
|
|
|
|
%S fragment start position |
211
|
|
|
|
|
|
|
%E fragment end position |
212
|
|
|
|
|
|
|
%X fragment start position regarding reference genome *** |
213
|
|
|
|
|
|
|
%Z fragment end position regarding reference genome *** |
214
|
|
|
|
|
|
|
---------------------------------------------------------------------------- |
215
|
|
|
|
|
|
|
*** specific for genomic variation (genome simulation only) |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=item B<--jobs> |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Sets the number of child jobs to be created |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item B<--seed> |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Sets the seed of the base generator. The ability to set the seed is |
224
|
|
|
|
|
|
|
useful for those who want reproducible simulations. Pay attention to |
225
|
|
|
|
|
|
|
the number of jobs (--jobs) set, because each job receives a different |
226
|
|
|
|
|
|
|
seed calculated from the I<main seed>. So, for reproducibility, the |
227
|
|
|
|
|
|
|
same seed set before needs the same number of jobs set before as well. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=item B<--read-mean> |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Sets the read mean if quality-profile is equal to 'poisson'. The |
232
|
|
|
|
|
|
|
quality-profile from database overrides the read-mean |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=item B<--read-stdd> |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Sets the read standard deviation if quality-profile is equal to |
237
|
|
|
|
|
|
|
'poisson'. The quality-profile from database overrides the read-stdd |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=item B<--number-of-reads> |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Sets the number of reads desired for each fragment end. That means, |
242
|
|
|
|
|
|
|
it will be the number of reads for each pair - 1 x N reads for single-end |
243
|
|
|
|
|
|
|
and 2 x N reads for paired-end. This is the default option for transcriptome |
244
|
|
|
|
|
|
|
sequencing simulation |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item B<--sequencing-type> |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Sets the sequencing type to single-end or paired-end |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item B<--fragment-mean> |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
253
|
|
|
|
|
|
|
fragment mean |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=item B<--fragment-stdd> |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
If the sequencing-type is set to paired-end, it sets the |
258
|
|
|
|
|
|
|
fragment standard deviation |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=item B<--sequencing-error> |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Sets the sequencing error rate if quality-profile is equal to 'poisson'. |
263
|
|
|
|
|
|
|
Valid values are between zero and one |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=item B<--quality-profile> |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Sets the sequencing system profile for quality. The default value is a poisson |
268
|
|
|
|
|
|
|
distribution, but the user can choose among several profiles stored into the |
269
|
|
|
|
|
|
|
database or import his own data. |
270
|
|
|
|
|
|
|
See B<quality> command for more details |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=item B<--expression-matrix> |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
By default, the gene/transcript is raffled using its length as weight. If |
275
|
|
|
|
|
|
|
you choose an expression-matrix, then the raffle will be made based on the |
276
|
|
|
|
|
|
|
gene/transcript expression. |
277
|
|
|
|
|
|
|
The expression-matrix entries are found into the database. |
278
|
|
|
|
|
|
|
See B<expression> command for more details |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=back |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head1 AUTHORS |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=over 4 |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item * |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=item * |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=item * |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=item * |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=item * |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=item * |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=item * |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=back |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
This is free software, licensed under: |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=cut |