File Coverage

lib/App/Sandy/Command/Genome.pm

Criterion	Covered	Total	%
statement	3	5	60.0
branch			n/a
condition			n/a
subroutine	1	3	33.3
pod	0	2	0.0
total	4	10	40.0

line	stmt	sub	pod	time	code
1					package App::Sandy::Command::Genome;
2					# ABSTRACT: simulate command class. Simulate genome sequencing
3
4	1	1		1430	use App::Sandy::Base 'class';
	1			2
	1			13
5
6					extends 'App::Sandy::CLI::Command';
7
8					with 'App::Sandy::Role::Digest';
9
10					our $VERSION = '0.25'; # VERSION
11
12					sub default_opt {
13	0	0	0		'paired-end-id' => '%i.%U:%c:%F:%X-%Z',
14					'single-end-id' => '%i.%U:%c:%s:%t-%n',
15					'seed' => time,
16					'verbose' => 0,
17					'prefix' => 'out',
18					'output-dir' => '.',
19					'jobs' => 1,
20					'count-loops-by' => 'coverage',
21					'coverage' => 8,
22					'strand-bias' => 'random',
23					'seqid-weight' => 'length',
24					'sequencing-type' => 'paired-end',
25					'fragment-mean' => 300,
26					'fragment-stdd' => 50,
27					'sequencing-error' => 0.001,
28					'read-mean' => 100,
29					'read-stdd' => 0,
30					'quality-profile' => 'poisson',
31					'join-paired-ends' => 0,
32					'output-format' => 'fastq.gz',
33					'compression-level' => 6
34					}
35
36					sub rm_opt {
37	0	0	0		'strand-bias',
38					'number-of-reads',
39					'seqid-weight',
40					'expression-matrix'
41					}
42
43					__END__
44
45					=pod
46
47					=encoding UTF-8
48
49					=head1 NAME
50
51					App::Sandy::Command::Genome - simulate command class. Simulate genome sequencing
52
53					=head1 VERSION
54
55					version 0.25
56
57					=head1 SYNOPSIS
58
59					sandy genome [options] <fasta-file>
60
61					Arguments:
62					a fasta file
63
64					Input/Output options:
65					-h, --help brief help message
66					-H, --man full documentation
67					-v, --verbose print log messages
68					-p, --prefix prefix output [default:"out"]
69					-o, --output-dir output directory [default:"."]
70					-O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"]
71					-1, --join-paired-ends merge R1 and R2 outputs in one file
72					-x, --compression-level speed compression: "1" - compress faster,
73					"9" - compress better [default:"6"; Integer]
74
75					Runtime options:
76					-j, --jobs number of jobs [default:"1"; Integer]
77					-s, --seed set the seed of the base generator
78					[default:"time()"; Integer]
79
80					Sequence identifier options:
81					-i, --append-id append to the defined template id [Format]
82					-I, --id overlap the default template id [Format]
83
84					Sequencing option:
85					-q, --quality-profile sequencing system profiles from quality
86					database [default:"poisson"]
87					-e, --sequencing-error sequencing error rate for poisson
88					[default:"0.001"; Number]
89					-m, --read-mean read mean size for poisson
90					[default:"100"; Integer]
91					-d, --read-stdd read standard deviation size for poisson
92					[default:"0"; Integer]
93					-t, --sequencing-type single-end or paired-end reads
94					[default:"paired-end"]
95					-M, --fragment-mean the fragment mean size for paired-end reads
96					[default:"300"; Integer]
97					-D, --fragment-stdd the fragment standard deviation size for
98					paired-end reads [default:"50"; Integer]
99
100					Genome-specific options:
101					-c, --coverage genome coverage [default:"8", Number]
102					-a, --genomic-variation a list of genomic variation entries from
103					variation database. This option may be passed
104					multiple times [default:"none"]
105					-A, --genomic-variation-regex a list of perl-like regex to match genomic
106					variation entries in variation database.
107					This option may be passed multiple times
108					[default:"none"]
109
110					=head1 DESCRIPTION
111
112					This subcommand simulates genome sequencing reads taking into account the
113					quality-profile and the genome-variation patterns, along with: raffle
114					seed; coverage (depth); fragment mean and standard deviation; single-end
115					(long and short fragments) and paired-end sequencing type; bam, sam,
116					fastq.gz and fastq output formats and more.
117
118					=head2 INPUT
119
120					I<sandy genome> expects as argument a fasta file with chromosome sequences.
121					For example, L<the GENCODE human genome\|https://www.gencodegenes.org/human/>
122					GRCh38.p13 fasta file.
123
124					=head2 OUTPUT
125
126					The output file generated will depend on the I<output-format> (fastq, bam),
127					on the I<join-paired-ends> option (mate read pairs into a single file) and
128					on the I<sequencing-type> (single-end, paired-end). A file with the simulated
129					coverage (${prefix}_coverage.tsv) for each chromosome (read counts) also
130					accompanies the output file.
131
132					=head1 OPTIONS
133
134					=over 8
135
136					=item B<--help>
137
138					Print a brief help message and exits.
139
140					=item B<--man>
141
142					Prints the manual page and exits.
143
144					=item B<--verbose>
145
146					Prints log information to standard error
147
148					=item B<--prefix>
149
150					Concatenates the prefix to the output-file name.
151
152					=item B<--output-dir>
153
154					Creates output-file inside output-dir. If output-dir
155					does not exist, it is created recursively
156
157					=item B<--output-format>
158
159					Choose the output format. Available options are:
160					I<bam>, I<sam>, I<fastq.gz>, I<fastq>.
161					For I<bam> option, B<--append-id> is ignored, considering
162					that the sequence identifier is splitted by blank character, so
163					just the first field is included into the query name column
164					(first column).
165
166					=item B<--join-paired-ends>
167
168					By default, paired-end reads are put into two different files,
169					I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs
170					together, she can pass this option.
171					If the B<--id> does not have the escape character %R, it is
172					automatically included right after the first field (blank separated values)
173					as in I<id/%R> - which resolves to I<id/1> or I<id/2>.
174					It is necessary to distinguish which read is R1/R2
175
176					=item B<--compression-level>
177
178					Regulates the speed of compression using the specified digit (between 1 and 9),
179					where "1" indicates the fastest compression method (less compression) and "9"
180					indicates the slowest compression method (best compression). The default
181					compression level is "6"
182
183					=item B<--append-id>
184
185					Append string template to the defined template id.
186					See B<Format>
187
188					=item B<--id>
189
190					Overlap the default defined template id:
191					I<single-end> %i.%U_%c_%s_%t_%n and I<paired-end> %i.%U_%c_%s_%S_%E
192					e.g. SR123.1_chr1_P_1001_1101
193					See B<Format>
194
195					=item B<Format>
196
197					A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works.
198					That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid
199					escape characteres are:
200
201					B<Common escape characters>
202
203					----------------------------------------------------------------------------
204					Escape Meaning
205					----------------------------------------------------------------------------
206					%i instrument id composed by SR + PID
207					%I job slot number
208					%q quality profile
209					%e sequencing error
210					%x sequencing error position
211					%R read 1, or 2 if it is the paired-end mate
212					%U read number
213					%r read size
214					%m read mean
215					%d read standard deviation
216					%c sequence id as chromossome, gene/transcript id
217					%C sequence id type (reference or alternate non reference allele) ***
218					%s read strand
219					%t read start position
220					%n read end position
221					%a read start position regarding reference genome ***
222					%b read end position regarding reference genome ***
223					%v genomic variation position ***
224					----------------------------------------------------------------------------
225					*** specific for genomic variation (genome simulation only)
226
227					B<Paired-end specific escape characters>
228
229					----------------------------------------------------------------------------
230					Escape Meaning
231					----------------------------------------------------------------------------
232					%T mate read start position
233					%N mate read end position
234					%A mate read start position regarding reference genome ***
235					%B mate read end position regarding reference genome ***
236					%D distance between the paired-reads
237					%M fragment mean
238					%D fragment standard deviation
239					%f fragment size
240					%F fragment strand
241					%S fragment start position
242					%E fragment end position
243					%X fragment start position regarding reference genome ***
244					%Z fragment end position regarding reference genome ***
245					----------------------------------------------------------------------------
246					*** specific for genomic variation (genome simulation only)
247
248					=item B<--jobs>
249
250					Sets the number of child jobs to be created
251
252					=item B<--seed>
253
254					Sets the seed of the base generator. The ability to set the seed is
255					useful for those who want reproducible simulations. Pay attention to
256					the number of jobs (--jobs) set, because each job receives a different
257					seed calculated from the I<main seed>. So, for reproducibility, the
258					same seed set before needs the same number of jobs set before as well.
259
260					=item B<--read-mean>
261
262					Sets the read mean if quality-profile is equal to 'poisson'. The
263					quality-profile from database overrides the read-size
264
265					=item B<--read-stdd>
266
267					Sets the read standard deviation if quality-profile is equal to
268					'poisson'. The quality-profile from database overrides the read-stdd
269
270					=item B<--coverage>
271
272					Calculates the number of reads based on the genome
273					coverage: number_of_reads = (sequence_size * coverage) / read_size.
274					This is the default option for genome sequencing simulation
275
276					=item B<--sequencing-type>
277
278					Sets the sequencing type to single-end or paired-end
279
280					=item B<--fragment-mean>
281
282					If the sequencing-type is set to paired-end, it sets the
283					fragment mean
284
285					=item B<--fragment-stdd>
286
287					If the sequencing-type is set to paired-end, it sets the
288					fragment standard deviation
289
290					=item B<--sequencing-error>
291
292					Sets the sequencing error rate if quality-profile is equal to 'poisson'.
293					Valid values are between zero and one
294
295					=item B<--quality-profile>
296
297					Sets the sequencing system profile for quality. The default value is a poisson
298					distribution, but the user can choose among several profiles stored into the
299					database or import his own data.
300					See B<quality> command for more details
301
302					=item B<--genomic-variation>
303
304					Sets the genomic variation to be applied on the genome feeded. By
305					default no variation is included to the simulation, but the user has
306					the power to point some entries from B<variation> database or index his
307					own data. This option accepts a list with comma separated values
308					and can be passed multiple times, which is useful in order to join
309					various types of genomic variation into the same simulation. It is
310					possible to combine this option with B<--genomic-variation-regex>
311					See B<variation> command for the available list of genomic variation
312					entries
313
314					=item B<--genomic-variation-regex>
315
316					Applies perl-regex in the variation database and selects all entryes
317					that match the pattern. This option accepts a list with comma separated
318					values and can be passed multiple times. It is possible to combine this
319					option with B<--genomic-variation>
320					See B<variation> command for the available list of genomic variation
321					entries
322
323					=back
324
325					=head1 EXAMPLES
326
327					The following command will produce two fastq.gz files (paired-end), with a
328					coverage of 20x and a ${prefix}_coverage.tsv file with the read counts.
329
330					$ sandy genome \
331					--verbose \
332					--jobs=10 \
333					--sequencing-type=paired-end \
334					--coverage=20 hg38.fa 2> sim.log
335
336					or, equivalently:
337
338					$ sandy genome -v -j 10 -t paired-end -c 20 hg38.fa 2> sim.log
339
340					Using a seed for reproducibility.
341
342					$ sandy genome --seed=1717 my_fasta.fa
343
344					To simulate reads with a specific quality-profile other than the default
345					poisson:
346
347					$ sandy genome --quality-profile=hiseq_101 hg19.fa
348
349					To see the current list of available quality-profiles:
350
351					$ sandy quality
352
353					And in order to learn how to add your custom quality-profile, see:
354
355					$ sandy quality add --help
356
357					Sequence identifiers (first lines of fastq entries) may be customized in output using
358					a format string passed by the user. This format is a combination of literal and escaped
359					characters, in a similar fashion to that used in C programming language’s printf function.
360					For example, let’s simulate a paired-end sequencing and add the read length, read position
361					and mate position into all sequence identifiers:
362
363					$ sandy genome \
364					--id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \
365					hg38.fa
366
367					In this case, results would be:
368
369					==> Into R1
370					@SR.1 read=chr6:979-880 mate=chr6:736-835 length=100
371					...
372					==> Into R2
373					@SR.1 read=chr6:736-835 mate=chr6:979-880 length=100
374
375					See B<Format> section for details.
376
377					An important feature of B<Sandy> is to simulate a genome with genomic-variation.
378					So to exemplify, let's simulate a genome which includes the fusion between the genes
379					NPM1 and ALK:
380
381					$ sandy genome --genomic-variation=fusion_hg38_NPM1-ALK hg38.fa
382
383					To see the current list of available genomic-variation
384
385					$ sandy variation
386
387					And in order to learn how to add your custom genomic-variation, see:
388
389					$ sandy variation add --help
390
391					Putting all together:
392
393					$ sandy genome \
394					--verbose \
395					--jobs=10 \
396					--sequencing-type=paired-end \
397					--seed=1717 \
398					--quality-profile=hiseq_101 \
399					--id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \
400					--genomic-variation=fusion_hg38_NPM1-ALK \
401					hg38.fa 2> sim.log
402
403					=head1 AUTHORS
404
405					=over 4
406
407					=item *
408
409					Thiago L. A. Miller <tmiller@mochsl.org.br>
410
411					=item *
412
413					J. Leonel Buzzo <lbuzzo@mochsl.org.br>
414
415					=item *
416
417					Felipe R. C. dos Santos <fsantos@mochsl.org.br>
418
419					=item *
420
421					Helena B. Conceição <hconceicao@mochsl.org.br>
422
423					=item *
424
425					Rodrigo Barreiro <rbarreiro@mochsl.org.br>
426
427					=item *
428
429					Gabriela Guardia <gguardia@mochsl.org.br>
430
431					=item *
432
433					Fernanda Orpinelli <forpinelli@mochsl.org.br>
434
435					=item *
436
437					Rafael Mercuri <rmercuri@mochsl.org.br>
438
439					=item *
440
441					Rodrigo Barreiro <rbarreiro@mochsl.org.br>
442
443					=item *
444
445					Pedro A. F. Galante <pgalante@mochsl.org.br>
446
447					=back
448
449					=head1 COPYRIGHT AND LICENSE
450
451					This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital.
452
453					This is free software, licensed under:
454
455					The GNU General Public License, Version 3, June 2007
456
457					=cut