File Coverage

lib/App/Sandy/Command/Genome.pm

Criterion	Covered	Total	%
statement	3	5	60.0
branch			n/a
condition			n/a
subroutine	1	3	33.3
pod	0	2	0.0
total	4	10	40.0

line	stmt	sub	pod	time	code
1					package App::Sandy::Command::Genome;
2					# ABSTRACT: simulate command class. Simulate genome sequencing
3
4	1	1		1541	use App::Sandy::Base 'class';
	1			3
	1			6
5
6					extends 'App::Sandy::CLI::Command';
7
8					with 'App::Sandy::Role::Digest';
9
10					our $VERSION = '0.22'; # VERSION
11
12					sub default_opt {
13	0	0	0		'paired-end-id' => '%i.%U:%c:%F:%X-%Z',
14					'single-end-id' => '%i.%U:%c:%s:%t-%n',
15					'seed' => time,
16					'verbose' => 0,
17					'prefix' => 'out',
18					'output-dir' => '.',
19					'jobs' => 1,
20					'count-loops-by' => 'coverage',
21					'coverage' => 8,
22					'strand-bias' => 'random',
23					'seqid-weight' => 'length',
24					'sequencing-type' => 'paired-end',
25					'fragment-mean' => 300,
26					'fragment-stdd' => 50,
27					'sequencing-error' => 0.001,
28					'read-mean' => 100,
29					'read-stdd' => 0,
30					'quality-profile' => 'poisson',
31					'join-paired-ends' => 0,
32					'output-format' => 'fastq.gz',
33					'compression-level' => 6
34					}
35
36					sub rm_opt {
37	0	0	0		'strand-bias',
38					'number-of-reads',
39					'seqid-weight',
40					'expression-matrix'
41					}
42
43					__END__
44
45					=pod
46
47					=encoding UTF-8
48
49					=head1 NAME
50
51					App::Sandy::Command::Genome - simulate command class. Simulate genome sequencing
52
53					=head1 VERSION
54
55					version 0.22
56
57					=head1 SYNOPSIS
58
59					sandy genome [options] <fasta-file>
60
61					Arguments:
62					a fasta-file
63
64					Options:
65					-h, --help brief help message
66					-u, --man full documentation
67					-v, --verbose print log messages
68					-p, --prefix prefix output [default:"out"]
69					-o, --output-dir output directory [default:"."]
70					-O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"]
71					-1, --join-paired-ends merge R1 and R2 outputs in one file
72					-x, --compression-level speed compression: "1" - compress faster,
73					"9" - compress better [default:"6"; Integer]
74					-i, --append-id append to the defined template id [Format]
75					-I, --id override the default template id [Format]
76					-j, --jobs number of jobs [default:"1"; Integer]
77					-s, --seed set the seed of the base generator
78					[default:"time()"; Integer]
79					-c, --coverage genome coverage [default:"8"; Number]
80					-t, --sequencing-type single-end or paired-end reads
81					[default:"paired-end"]
82					-q, --quality-profile sequencing system profiles from quality
83					database [default:"poisson"]
84					-e, --sequencing-error sequencing error rate for poisson
85					[default:"0.001"; Number]
86					-m, --read-mean read mean size for poisson
87					[default:"100"; Integer]
88					-d, --read-stdd read standard deviation size for poisson
89					[default:"0"; Integer]
90					-M, --fragment-mean the fragment mean size for paired-end reads
91					[default:"300"; Integer]
92					-D, --fragment-stdd the fragment standard deviation size for
93					paired-end reads [default:"50"; Integer]
94					-a, --genomic-variation a list of genomic variation entries from
95					variation database. This option may be passed
96					multiple times [default:"none"]
97					-A, --genomic-variation-regex a list of perl-like regex to match genomic
98					variation entries in variation database.
99					This option may be passed multiple times
100					[default:"none"]
101
102					=head1 DESCRIPTION
103
104					Simulate genome sequencing.
105
106					=head1 OPTIONS
107
108					=over 8
109
110					=item B<--help>
111
112					Print a brief help message and exits.
113
114					=item B<--man>
115
116					Prints the manual page and exits.
117
118					=item B<--verbose>
119
120					Prints log information to standard error
121
122					=item B<--prefix>
123
124					Concatenates the prefix to the output-file name.
125
126					=item B<--output-dir>
127
128					Creates output-file inside output-dir. If output-dir
129					does not exist, it is created recursively
130
131					=item B<--output-format>
132
133					Choose the output format. Available options are:
134					I<bam>, I<sam>, I<fastq.gz>, I<fastq>.
135					For I<bam> option, B<--append-id> is ignored, considering
136					that the sequence identifier is split on blank characters, so
137					just the first field is included into the query name column
138					(first column).
139
140					=item B<--join-paired-ends>
141
142					By default, paired-end reads are put into two different files,
143					I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs
144					together, she can pass this option.
145					If the B<--id> does not have the escape character %R, it is
146					automatically included right after the first field (blank separated values)
147					as in I<id/%R> - which resolves to I<id/1> or I<id/2>.
148					It is necessary to distinguish which read is R1/R2
149
150					=item B<--compression-level>
151
152					Regulates the speed of compression using the specified digit (between 1 and 9),
153					where "1" indicates the fastest compression method (less compression) and "9"
154					indicates the slowest compression method (best compression). The default
155					compression level is "6"
156
157					=item B<--append-id>
158
159					Append string template to the defined template id.
160					See B<Format>
161
162					=item B<--id>
163
164					Override the default defined template id:
165					I<single-end> %i.%U:%c:%s:%t-%n and I<paired-end> %i.%U:%c:%F:%X-%Z
166					e.g. SR123.1:chr1:P:1001-1101
167					See B<Format>
168
169					=item B<Format>
170
171					A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works.
172					That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid
173					escape characters are:
174
175					B<Common escape characters>
176
177					----------------------------------------------------------------------------
178					Escape Meaning
179					----------------------------------------------------------------------------
180					%i instrument id composed by SR + PID
181					%I job slot number
182					%q quality profile
183					%e sequencing error
184					%x sequencing error position
185					%R read 1, or 2 if it is the paired-end mate
186					%U read number
187					%r read size
188					%m read mean
189					%d read standard deviation
190					%c sequence id as chromosome, gene/transcript id
191					%C sequence id type (reference or alternate non reference allele) ***
192					%s read strand
193					%t read start position
194					%n read end position
195					%a read start position regarding reference genome ***
196					%b read end position regarding reference genome ***
197					%v genomic variation position ***
198					----------------------------------------------------------------------------
199					*** specific for genomic variation (genome simulation only)
200
201					B<Paired-end specific escape characters>
202
203					----------------------------------------------------------------------------
204					Escape Meaning
205					----------------------------------------------------------------------------
206					%T mate read start position
207					%N mate read end position
208					%A mate read start position regarding reference genome ***
209					%B mate read end position regarding reference genome ***
210					%D distance between the paired-reads
211					%M fragment mean
212					%D fragment standard deviation
213					%f fragment size
214					%F fragment strand
215					%S fragment start position
216					%E fragment end position
217					%X fragment start position regarding reference genome ***
218					%Z fragment end position regarding reference genome ***
219					----------------------------------------------------------------------------
220					*** specific for genomic variation (genome simulation only)
221
222					=item B<--jobs>
223
224					Sets the number of child jobs to be created
225
226					=item B<--seed>
227
228					Sets the seed of the base generator. The ability to set the seed is
229					useful for those who want reproducible simulations. Pay attention to
230					the number of jobs (--jobs) set, because each job receives a different
231					seed calculated from the I<main seed>. So, for reproducibility, the
232					same seed set before needs the same number of jobs set before as well.
233
234					=item B<--read-mean>
235
236					Sets the read mean if quality-profile is equal to 'poisson'. The
237					quality-profile from database overrides the read-mean
238
239					=item B<--read-stdd>
240
241					Sets the read standard deviation if quality-profile is equal to
242					'poisson'. The quality-profile from database overrides the read-stdd
243
244					=item B<--coverage>
245
246					Calculates the number of reads based on the genome
247					coverage: number_of_reads = (sequence_size * coverage) / read_size.
248					This is the default option for genome sequencing simulation
249
250					=item B<--sequencing-type>
251
252					Sets the sequencing type to single-end or paired-end
253
254					=item B<--fragment-mean>
255
256					If the sequencing-type is set to paired-end, it sets the
257					fragment mean
258
259					=item B<--fragment-stdd>
260
261					If the sequencing-type is set to paired-end, it sets the
262					fragment standard deviation
263
264					=item B<--sequencing-error>
265
266					Sets the sequencing error rate if quality-profile is equal to 'poisson'.
267					Valid values are between zero and one
268
269					=item B<--quality-profile>
270
271					Sets the sequencing system profile for quality. The default value is a poisson
272					distribution, but the user can choose among several profiles stored in the
273					database or import his own data.
274					See B<quality> command for more details
275
276					=item B<--genomic-variation>
277
278					Sets the genomic variation to be applied to the input genome. By
279					default no variation is included in the simulation, but the user has
280					the power to point some entries from B<variation> database or index his
281					own data. This option accepts a list with comma separated values
282					and can be passed multiple times, which is useful in order to join
283					various types of genomic variation into the same simulation. It is
284					possible to combine this option with B<--genomic-variation-regex>
285					See B<variation> command for the available list of genomic variation
286					entries
287
288					=item B<--genomic-variation-regex>
289
290					Applies a Perl regex to the variation database and selects all entries
291					that match the pattern. This option accepts a list with comma separated
292					values and can be passed multiple times. It is possible to combine this
293					option with B<--genomic-variation>
294					See B<variation> command for the available list of genomic variation
295					entries
296
297					=back
298
299					=head1 AUTHORS
300
301					=over 4
302
303					=item *
304
305					Thiago L. A. Miller <tmiller@mochsl.org.br>
306
307					=item *
308
309					J. Leonel Buzzo <lbuzzo@mochsl.org.br>
310
311					=item *
312
313					Felipe R. C. dos Santos <fsantos@mochsl.org.br>
314
315					=item *
316
317					Helena B. Conceição <hconceicao@mochsl.org.br>
318
319					=item *
320
321					Gabriela Guardia <gguardia@mochsl.org.br>
322
323					=item *
324
325					Fernanda Orpinelli <forpinelli@mochsl.org.br>
326
327					=item *
328
329					Pedro A. F. Galante <pgalante@mochsl.org.br>
330
331					=back
332
333					=head1 COPYRIGHT AND LICENSE
334
335					This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital.
336
337					This is free software, licensed under:
338
339					The GNU General Public License, Version 3, June 2007
340
341					=cut