File Coverage

lib/App/Sandy/Command/Genome.pm
Criterion Covered Total %
statement 3 5 60.0
branch n/a
condition n/a
subroutine 1 3 33.3
pod 0 2 0.0
total 4 10 40.0


line stmt bran cond sub pod time code
1             package App::Sandy::Command::Genome;
2             # ABSTRACT: simulate command class. Simulate genome sequencing
3              
4 1     1   1430 use App::Sandy::Base 'class';
  1         2  
  1         13  
5              
6             extends 'App::Sandy::CLI::Command';
7              
8             with 'App::Sandy::Role::Digest';
9              
10             our $VERSION = '0.25'; # VERSION
11              
12             sub default_opt {
13 0     0 0   'paired-end-id' => '%i.%U:%c:%F:%X-%Z',
14             'single-end-id' => '%i.%U:%c:%s:%t-%n',
15             'seed' => time,
16             'verbose' => 0,
17             'prefix' => 'out',
18             'output-dir' => '.',
19             'jobs' => 1,
20             'count-loops-by' => 'coverage',
21             'coverage' => 8,
22             'strand-bias' => 'random',
23             'seqid-weight' => 'length',
24             'sequencing-type' => 'paired-end',
25             'fragment-mean' => 300,
26             'fragment-stdd' => 50,
27             'sequencing-error' => 0.001,
28             'read-mean' => 100,
29             'read-stdd' => 0,
30             'quality-profile' => 'poisson',
31             'join-paired-ends' => 0,
32             'output-format' => 'fastq.gz',
33             'compression-level' => 6
34             }
35              
36             sub rm_opt {
37 0     0 0   'strand-bias',
38             'number-of-reads',
39             'seqid-weight',
40             'expression-matrix'
41             }
42              
43             __END__
44              
45             =pod
46              
47             =encoding UTF-8
48              
49             =head1 NAME
50              
51             App::Sandy::Command::Genome - simulate command class. Simulate genome sequencing
52              
53             =head1 VERSION
54              
55             version 0.25
56              
57             =head1 SYNOPSIS
58              
59             sandy genome [options] <fasta-file>
60              
61             Arguments:
62             a fasta file
63              
64             Input/Output options:
65             -h, --help brief help message
66             -H, --man full documentation
67             -v, --verbose print log messages
68             -p, --prefix prefix output [default:"out"]
69             -o, --output-dir output directory [default:"."]
70             -O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"]
71             -1, --join-paired-ends merge R1 and R2 outputs in one file
72             -x, --compression-level speed compression: "1" - compress faster,
73             "9" - compress better [default:"6"; Integer]
74              
75             Runtime options:
76             -j, --jobs number of jobs [default:"1"; Integer]
77             -s, --seed set the seed of the base generator
78             [default:"time()"; Integer]
79              
80             Sequence identifier options:
81             -i, --append-id append to the defined template id [Format]
82             -I, --id overlap the default template id [Format]
83              
84             Sequencing option:
85             -q, --quality-profile sequencing system profiles from quality
86             database [default:"poisson"]
87             -e, --sequencing-error sequencing error rate for poisson
88             [default:"0.001"; Number]
89             -m, --read-mean read mean size for poisson
90             [default:"100"; Integer]
91             -d, --read-stdd read standard deviation size for poisson
92             [default:"0"; Integer]
93             -t, --sequencing-type single-end or paired-end reads
94             [default:"paired-end"]
95             -M, --fragment-mean the fragment mean size for paired-end reads
96             [default:"300"; Integer]
97             -D, --fragment-stdd the fragment standard deviation size for
98             paired-end reads [default:"50"; Integer]
99              
100             Genome-specific options:
101             -c, --coverage genome coverage [default:"8", Number]
102             -a, --genomic-variation a list of genomic variation entries from
103             variation database. This option may be passed
104             multiple times [default:"none"]
105             -A, --genomic-variation-regex a list of perl-like regex to match genomic
106             variation entries in variation database.
107             This option may be passed multiple times
108             [default:"none"]
109              
110             =head1 DESCRIPTION
111              
112             This subcommand simulates genome sequencing reads taking into account the
113             quality-profile and the genome-variation patterns, along with: raffle
114             seed; coverage (depth); fragment mean and standard deviation; single-end
115             (long and short fragments) and paired-end sequencing type; bam, sam,
116             fastq.gz and fastq output formats and more.
117              
118             =head2 INPUT
119              
120             I<sandy genome> expects as argument a fasta file with chromosome sequences.
121             For example, L<the GENCODE human genome|https://www.gencodegenes.org/human/>
122             GRCh38.p13 fasta file.
123              
124             =head2 OUTPUT
125              
126             The output file generated will depend on the I<output-format> (fastq, bam),
127             on the I<join-paired-ends> option (mate read pairs into a single file) and
128             on the I<sequencing-type> (single-end, paired-end). A file with the simulated
129             coverage (${prefix}_coverage.tsv) for each chromosome (read counts) also
130             accompanies the output file.
131              
132             =head1 OPTIONS
133              
134             =over 8
135              
136             =item B<--help>
137              
138             Print a brief help message and exits.
139              
140             =item B<--man>
141              
142             Prints the manual page and exits.
143              
144             =item B<--verbose>
145              
146             Prints log information to standard error
147              
148             =item B<--prefix>
149              
150             Concatenates the prefix to the output-file name.
151              
152             =item B<--output-dir>
153              
154             Creates output-file inside output-dir. If output-dir
155             does not exist, it is created recursively
156              
157             =item B<--output-format>
158              
159             Choose the output format. Available options are:
160             I<bam>, I<sam>, I<fastq.gz>, I<fastq>.
161             For I<bam> option, B<--append-id> is ignored, considering
162             that the sequence identifier is splitted by blank character, so
163             just the first field is included into the query name column
164             (first column).
165              
166             =item B<--join-paired-ends>
167              
168             By default, paired-end reads are put into two different files,
169             I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs
170             together, she can pass this option.
171             If the B<--id> does not have the escape character %R, it is
172             automatically included right after the first field (blank separated values)
173             as in I<id/%R> - which resolves to I<id/1> or I<id/2>.
174             It is necessary to distinguish which read is R1/R2
175              
176             =item B<--compression-level>
177              
178             Regulates the speed of compression using the specified digit (between 1 and 9),
179             where "1" indicates the fastest compression method (less compression) and "9"
180             indicates the slowest compression method (best compression). The default
181             compression level is "6"
182              
183             =item B<--append-id>
184              
185             Append string template to the defined template id.
186             See B<Format>
187              
188             =item B<--id>
189              
190             Overlap the default defined template id:
191             I<single-end> %i.%U_%c_%s_%t_%n and I<paired-end> %i.%U_%c_%s_%S_%E
192             e.g. SR123.1_chr1_P_1001_1101
193             See B<Format>
194              
195             =item B<Format>
196              
197             A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works.
198             That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid
199             escape characteres are:
200              
201             B<Common escape characters>
202              
203             ----------------------------------------------------------------------------
204             Escape Meaning
205             ----------------------------------------------------------------------------
206             %i instrument id composed by SR + PID
207             %I job slot number
208             %q quality profile
209             %e sequencing error
210             %x sequencing error position
211             %R read 1, or 2 if it is the paired-end mate
212             %U read number
213             %r read size
214             %m read mean
215             %d read standard deviation
216             %c sequence id as chromossome, gene/transcript id
217             %C sequence id type (reference or alternate non reference allele) ***
218             %s read strand
219             %t read start position
220             %n read end position
221             %a read start position regarding reference genome ***
222             %b read end position regarding reference genome ***
223             %v genomic variation position ***
224             ----------------------------------------------------------------------------
225             *** specific for genomic variation (genome simulation only)
226              
227             B<Paired-end specific escape characters>
228              
229             ----------------------------------------------------------------------------
230             Escape Meaning
231             ----------------------------------------------------------------------------
232             %T mate read start position
233             %N mate read end position
234             %A mate read start position regarding reference genome ***
235             %B mate read end position regarding reference genome ***
236             %D distance between the paired-reads
237             %M fragment mean
238             %D fragment standard deviation
239             %f fragment size
240             %F fragment strand
241             %S fragment start position
242             %E fragment end position
243             %X fragment start position regarding reference genome ***
244             %Z fragment end position regarding reference genome ***
245             ----------------------------------------------------------------------------
246             *** specific for genomic variation (genome simulation only)
247              
248             =item B<--jobs>
249              
250             Sets the number of child jobs to be created
251              
252             =item B<--seed>
253              
254             Sets the seed of the base generator. The ability to set the seed is
255             useful for those who want reproducible simulations. Pay attention to
256             the number of jobs (--jobs) set, because each job receives a different
257             seed calculated from the I<main seed>. So, for reproducibility, the
258             same seed set before needs the same number of jobs set before as well.
259              
260             =item B<--read-mean>
261              
262             Sets the read mean if quality-profile is equal to 'poisson'. The
263             quality-profile from database overrides the read-size
264              
265             =item B<--read-stdd>
266              
267             Sets the read standard deviation if quality-profile is equal to
268             'poisson'. The quality-profile from database overrides the read-stdd
269              
270             =item B<--coverage>
271              
272             Calculates the number of reads based on the genome
273             coverage: number_of_reads = (sequence_size * coverage) / read_size.
274             This is the default option for genome sequencing simulation
275              
276             =item B<--sequencing-type>
277              
278             Sets the sequencing type to single-end or paired-end
279              
280             =item B<--fragment-mean>
281              
282             If the sequencing-type is set to paired-end, it sets the
283             fragment mean
284              
285             =item B<--fragment-stdd>
286              
287             If the sequencing-type is set to paired-end, it sets the
288             fragment standard deviation
289              
290             =item B<--sequencing-error>
291              
292             Sets the sequencing error rate if quality-profile is equal to 'poisson'.
293             Valid values are between zero and one
294              
295             =item B<--quality-profile>
296              
297             Sets the sequencing system profile for quality. The default value is a poisson
298             distribution, but the user can choose among several profiles stored into the
299             database or import his own data.
300             See B<quality> command for more details
301              
302             =item B<--genomic-variation>
303              
304             Sets the genomic variation to be applied on the genome feeded. By
305             default no variation is included to the simulation, but the user has
306             the power to point some entries from B<variation> database or index his
307             own data. This option accepts a list with comma separated values
308             and can be passed multiple times, which is useful in order to join
309             various types of genomic variation into the same simulation. It is
310             possible to combine this option with B<--genomic-variation-regex>
311             See B<variation> command for the available list of genomic variation
312             entries
313              
314             =item B<--genomic-variation-regex>
315              
316             Applies perl-regex in the variation database and selects all entryes
317             that match the pattern. This option accepts a list with comma separated
318             values and can be passed multiple times. It is possible to combine this
319             option with B<--genomic-variation>
320             See B<variation> command for the available list of genomic variation
321             entries
322              
323             =back
324              
325             =head1 EXAMPLES
326              
327             The following command will produce two fastq.gz files (paired-end), with a
328             coverage of 20x and a ${prefix}_coverage.tsv file with the read counts.
329              
330             $ sandy genome \
331             --verbose \
332             --jobs=10 \
333             --sequencing-type=paired-end \
334             --coverage=20 hg38.fa 2> sim.log
335              
336             or, equivalently:
337              
338             $ sandy genome -v -j 10 -t paired-end -c 20 hg38.fa 2> sim.log
339              
340             Using a seed for reproducibility.
341              
342             $ sandy genome --seed=1717 my_fasta.fa
343              
344             To simulate reads with a specific quality-profile other than the default
345             poisson:
346              
347             $ sandy genome --quality-profile=hiseq_101 hg19.fa
348              
349             To see the current list of available quality-profiles:
350              
351             $ sandy quality
352              
353             And in order to learn how to add your custom quality-profile, see:
354              
355             $ sandy quality add --help
356              
357             Sequence identifiers (first lines of fastq entries) may be customized in output using
358             a format string passed by the user. This format is a combination of literal and escaped
359             characters, in a similar fashion to that used in C programming language’s printf function.
360             For example, let’s simulate a paired-end sequencing and add the read length, read position
361             and mate position into all sequence identifiers:
362              
363             $ sandy genome \
364             --id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \
365             hg38.fa
366              
367             In this case, results would be:
368              
369             ==> Into R1
370             @SR.1 read=chr6:979-880 mate=chr6:736-835 length=100
371             ...
372             ==> Into R2
373             @SR.1 read=chr6:736-835 mate=chr6:979-880 length=100
374              
375             See B<Format> section for details.
376              
377             An important feature of B<Sandy> is to simulate a genome with genomic-variation.
378             So to exemplify, let's simulate a genome which includes the fusion between the genes
379             NPM1 and ALK:
380              
381             $ sandy genome --genomic-variation=fusion_hg38_NPM1-ALK hg38.fa
382              
383             To see the current list of available genomic-variation
384              
385             $ sandy variation
386              
387             And in order to learn how to add your custom genomic-variation, see:
388              
389             $ sandy variation add --help
390              
391             Putting all together:
392              
393             $ sandy genome \
394             --verbose \
395             --jobs=10 \
396             --sequencing-type=paired-end \
397             --seed=1717 \
398             --quality-profile=hiseq_101 \
399             --id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \
400             --genomic-variation=fusion_hg38_NPM1-ALK \
401             hg38.fa 2> sim.log
402              
403             =head1 AUTHORS
404              
405             =over 4
406              
407             =item *
408              
409             Thiago L. A. Miller <tmiller@mochsl.org.br>
410              
411             =item *
412              
413             J. Leonel Buzzo <lbuzzo@mochsl.org.br>
414              
415             =item *
416              
417             Felipe R. C. dos Santos <fsantos@mochsl.org.br>
418              
419             =item *
420              
421             Helena B. Conceição <hconceicao@mochsl.org.br>
422              
423             =item *
424              
425             Rodrigo Barreiro <rbarreiro@mochsl.org.br>
426              
427             =item *
428              
429             Gabriela Guardia <gguardia@mochsl.org.br>
430              
431             =item *
432              
433             Fernanda Orpinelli <forpinelli@mochsl.org.br>
434              
435             =item *
436              
437             Rafael Mercuri <rmercuri@mochsl.org.br>
438              
439             =item *
440              
441             Rodrigo Barreiro <rbarreiro@mochsl.org.br>
442              
443             =item *
444              
445             Pedro A. F. Galante <pgalante@mochsl.org.br>
446              
447             =back
448              
449             =head1 COPYRIGHT AND LICENSE
450              
451             This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital.
452              
453             This is free software, licensed under:
454              
455             The GNU General Public License, Version 3, June 2007
456              
457             =cut