File Coverage

lib/App/Sandy/Command/Transcriptome.pm
Criterion Covered Total %
statement 3 5 60.0
branch n/a
condition n/a
subroutine 1 3 33.3
pod 0 2 0.0
total 4 10 40.0


line stmt bran cond sub pod time code
1             package App::Sandy::Command::Transcriptome;
2             # ABSTRACT: simulate command class. Simulate transcriptome sequencing
3              
4 1     1   1849 use App::Sandy::Base 'class';
  1         3  
  1         11  
5              
6             extends 'App::Sandy::CLI::Command';
7              
8             with 'App::Sandy::Role::Digest';
9              
10             our $VERSION = '0.25'; # VERSION
11              
12             sub default_opt {
13 0     0 0   'paired-end-id' => '%i.%U:%c %U',
14             'single-end-id' => '%i.%U:%c %U',
15             'seed' => time,
16             'verbose' => 0,
17             'prefix' => 'out',
18             'output-dir' => '.',
19             'jobs' => 1,
20             'count-loops-by' => 'number-of-reads',
21             'number-of-reads' => 1000000,
22             'strand-bias' => 'minus',
23             'seqid-weight' => 'length',
24             'sequencing-type' => 'paired-end',
25             'fragment-mean' => 300,
26             'fragment-stdd' => 50,
27             'sequencing-error' => 0.001,
28             'read-mean' => 100,
29             'read-stdd' => 0,
30             'quality-profile' => 'poisson',
31             'join-paired-ends' => 0,
32             'output-format' => 'fastq.gz',
33             'compression-level' => 6
34             }
35              
36             sub rm_opt {
37 0     0 0   'strand-bias',
38             'coverage',
39             'seqid-weight',
40             'genomic-variation'
41             }
42              
43             __END__
44              
45             =pod
46              
47             =encoding UTF-8
48              
49             =head1 NAME
50              
51             App::Sandy::Command::Transcriptome - simulate command class. Simulate transcriptome sequencing
52              
53             =head1 VERSION
54              
55             version 0.25
56              
57             =head1 SYNOPSIS
58              
59             sandy transcriptome [options] <fasta-file>
60              
61             Arguments:
62             a fasta file
63              
64             Input/Output options:
65             -h, --help brief help message
66             -H, --man full documentation
67             -v, --verbose print log messages
68             -p, --prefix prefix output [default:"out"]
69             -o, --output-dir output directory [default:"."]
70             -O, --output-format bam, sam, fastq.gz, fastq [default:"fastq.gz"]
71             -1, --join-paired-ends merge R1 and R2 outputs in one file
72             -x, --compression-level speed compression: "1" - compress faster,
73             "9" - compress better [default:"6"; Integer]
74              
75             Runtime options:
76             -j, --jobs number of jobs [default:"1"; Integer]
77             -s, --seed set the seed of the base generator
78             [default:"time()"; Integer]
79              
80             Sequence identifier options:
81             -i, --append-id append to the defined template id [Format]
82             -I, --id overlap the default template id [Format]
83              
84             Sequencing option:
85             -q, --quality-profile sequencing system profiles from quality
86             database [default:"poisson"]
87             -e, --sequencing-error sequencing error rate for poisson
88             [default:"0.001"; Number]
89             -m, --read-mean read mean size for poisson
90             [default:"100"; Integer]
91             -d, --read-stdd read standard deviation size for poisson
92             [default:"0"; Integer]
93             -t, --sequencing-type single-end or paired-end reads
94             [default:"paired-end"]
95             -M, --fragment-mean the fragment mean size for paired-end reads
96             [default:"300"; Integer]
97             -D, --fragment-stdd the fragment standard deviation size for
98             paired-end reads [default:"50"; Integer]
99              
100             Transcriptome-specific options:
101             -n, --number-of-reads set the number of reads
102             [default:"1000000", Integer]
103             -f, --expression-matrix an expression-matrix entry from database
104              
105             =head1 DESCRIPTION
106              
107             This subcommand simulates transcriptome sequencing reads taking into account
108             the quality-profile and the expression-matrix weights, along with: raffle
109             seed; number of reads; fragment mean and standard deviation; single-end
110             (long and short fragments) and paired-end sequencing type; bam, sam,
111             fastq.gz and fastq output formats and more.
112              
113             =head2 INPUT
114              
115             I<sandy transcriptome> expects as argument a fasta file with transcript sequences.
116             For example, L<the GENCODE human genome|https://www.gencodegenes.org/human/>
117             transcript sequences and protein-coding transcript sequences.
118              
119             =head2 OUTPUT
120              
121             The output file generated will depend on the I<output-format> (fastq, bam),
122             on the I<join-paired-ends> option (mate read pairs into a single file) and
123             on the I<sequencing-type> (single-end, paired-end). One file with the simulated
124             abundance (${prefix}_abundance_transcripts.tsv) per transcript and one file with
125             the simulated abundance (${prefix}_abundance_genes.tsv) per gene (if the fasta
126             file used has the relationship between gene and its transcripts at the header)
127             will accompany the output file.
128              
129             =head1 OPTIONS
130              
131             =over 8
132              
133             =item B<--help>
134              
135             Print a brief help message and exits.
136              
137             =item B<--man>
138              
139             Prints the manual page and exits.
140              
141             =item B<--verbose>
142              
143             Prints log information to standard error
144              
145             =item B<--prefix>
146              
147             Concatenates the prefix to the output-file name.
148              
149             =item B<--output-dir>
150              
151             Creates output-file inside output-dir. If output-dir
152             does not exist, it is created recursively
153              
154             =item B<--output-format>
155              
156             Choose the output format. Available options are:
157             I<bam>, I<sam>, I<fastq.gz>, I<fastq>.
158             For I<bam> option, B<--append-id> is ignored, considering
159             that the sequence identifier is splitted by blank character, so
160             just the first field is included into the query name column
161             (first column).
162              
163             =item B<--join-paired-ends>
164              
165             By default, paired-end reads are put into two different files,
166             I<prefix_R[12]_001.fastq(\.gz)?>. If the user wants both outputs
167             together, she can pass this option.
168             If the B<--id> does not have the escape character %R, it is
169             automatically included right after the first field (blank separated values)
170             as in I<id/%R> - which resolves to I<id/1> or I<id/2>.
171             It is necessary to distinguish which read is R1/R2
172              
173             =item B<--compression-level>
174              
175             Regulates the speed of compression using the specified digit (between 1 and 9),
176             where "1" indicates the fastest compression method (less compression) and "9"
177             indicates the slowest compression method (best compression). The default
178             compression level is "6"
179              
180             =item B<--append-id>
181              
182             Append string template to the defined template id.
183             See B<Format>
184              
185             =item B<--id>
186              
187             Overlap the default defined template id:
188             I<single-end> %i.%U %U and I<paired-end> %i.%U %U
189             e.g. SR123.1 1
190             See B<Format>
191              
192             =item B<Format>
193              
194             A string B<Format> is a combination of literal and escape characters similar to the way I<printf> works.
195             That way, the user has the freedom to customize the fastq sequence identifier to fit her needs. Valid
196             escape characteres are:
197              
198             B<Common escape characters>
199              
200             ----------------------------------------------------------------------------
201             Escape Meaning
202             ----------------------------------------------------------------------------
203             %i instrument id composed by SR + PID
204             %I job slot number
205             %q quality profile
206             %e sequencing error
207             %x sequencing error position
208             %R read 1, or 2 if it is the paired-end mate
209             %U read number
210             %r read size
211             %m read mean
212             %d read standard deviation
213             %c sequence id as chromossome, gene/transcript id
214             %C sequence id type (reference or alternate non reference allele) ***
215             %s read strand
216             %t read start position
217             %n read end position
218             %a read start position regarding reference genome ***
219             %b read end position regarding reference genome ***
220             %v genomic variation position ***
221             ----------------------------------------------------------------------------
222             *** specific for genomic variation (genome simulation only)
223              
224             B<Paired-end specific escape characters>
225              
226             ----------------------------------------------------------------------------
227             Escape Meaning
228             ----------------------------------------------------------------------------
229             %T mate read start position
230             %N mate read end position
231             %A mate read start position regarding reference genome ***
232             %B mate read end position regarding reference genome ***
233             %D distance between the paired-reads
234             %M fragment mean
235             %D fragment standard deviation
236             %f fragment size
237             %F fragment strand
238             %S fragment start position
239             %E fragment end position
240             %X fragment start position regarding reference genome ***
241             %Z fragment end position regarding reference genome ***
242             ----------------------------------------------------------------------------
243             *** specific for genomic variation (genome simulation only)
244              
245             =item B<--jobs>
246              
247             Sets the number of child jobs to be created
248              
249             =item B<--seed>
250              
251             Sets the seed of the base generator. The ability to set the seed is
252             useful for those who want reproducible simulations. Pay attention to
253             the number of jobs (--jobs) set, because each job receives a different
254             seed calculated from the I<main seed>. So, for reproducibility, the
255             same seed set before needs the same number of jobs set before as well.
256              
257             =item B<--read-mean>
258              
259             Sets the read mean if quality-profile is equal to 'poisson'. The
260             quality-profile from database overrides the read-size
261              
262             =item B<--read-stdd>
263              
264             Sets the read standard deviation if quality-profile is equal to
265             'poisson'. The quality-profile from database overrides the read-stdd
266              
267             =item B<--number-of-reads>
268              
269             Sets the number of reads desired for each fragment end. That means,
270             it will be the number of reads for each pair - 1 x N reads for single-end
271             and 2 x N reads for paired-end. This is the default option for transcriptome
272             sequencing simulation
273              
274             =item B<--sequencing-type>
275              
276             Sets the sequencing type to single-end or paired-end
277              
278             =item B<--fragment-mean>
279              
280             If the sequencing-type is set to paired-end, it sets the
281             fragment mean
282              
283             =item B<--fragment-stdd>
284              
285             If the sequencing-type is set to paired-end, it sets the
286             fragment standard deviation
287              
288             =item B<--sequencing-error>
289              
290             Sets the sequencing error rate if quality-profile is equal to 'poisson'.
291             Valid values are between zero and one
292              
293             =item B<--quality-profile>
294              
295             Sets the sequencing system profile for quality. The default value is a poisson
296             distribution, but the user can choose among several profiles stored into the
297             database or import his own data.
298             See B<quality> command for more details
299              
300             =item B<--expression-matrix>
301              
302             By default, the gene/transcript is raffled using its length as weight. If
303             you choose an expression-matrix, then the raffle will be made based on the
304             gene/transcript expression.
305             The expression-matrix entries are found into the database.
306             See B<expression> command for more details
307              
308             =back
309              
310             =head1 EXAMPLES
311              
312             The command:
313              
314             $ sandy transcriptome \
315             --verbose \
316             --jobs=5 \
317             --number-of-reads=5000000 \
318             --output-dir=my_results/ \
319             gencode.v43.transcripts.fa.gz 2> sim.log
320              
321             or, equivalently:
322              
323             $ sandy transcriptome \
324             -v -j 5 -n 5000000 -o my_results/ \
325             gencode.v43.transcripts.fa.gz 2> sim.log
326              
327             will both generate two paired-end fastq files (R1 and R2) into my_results/ directory
328             with 5000000 reads and two abundance files, one per transcripts and the other per genes.
329              
330             By default the raffled bias is the transcript length, but the user can change this
331             behavior by choosing an expression-matrix from the database:
332              
333             $ sandy transcriptome -f brain_cortex gencode.v43.transcripts.fa.gz
334              
335             To see the current list of available expression matrices:
336              
337             $ sandy expression
338              
339             And in order to learn how to add your custom expression-matrix, see:
340              
341             $ sandy expression add --help
342              
343             For reproducibility, the user can set the seed option and guarantee the reliability of all
344             the raffles in a later simulation:
345              
346             $ sandy expression --seed=1717 my_transcripts.fa
347              
348             To simulate reads with a specific quality-profile other than the default
349             poisson:
350              
351             $ sandy expression --quality-profile=hiseq_150 my_transcripts.fa
352              
353             To see the current list of available quality-profiles:
354              
355             $ sandy quality
356              
357             And in order to learn how to add your custom quality-profile, see:
358              
359             $ sandy quality add --help
360              
361             Sequence identifiers (first lines of fastq entries) may be customized in output using
362             a format string passed by the user. This format is a combination of literal and escaped
363             characters, in a similar fashion to that used in C programming language’s printf function.
364             For example, let’s simulate a paired-end sequencing and add the read length, read position
365             and mate position into all sequence identifiers:
366              
367             $ sandy expression --id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" my_genes.fa.gz
368              
369             In this case, results would be:
370              
371             ==> Into R1
372             @SR.1 read=BRAF:979-880 mate=BRAF:736-835 length=100
373             ...
374             ==> Into R2
375             @SR.1 read=BRAF:736-835 mate=BRAF:979-880 length=100
376             ...
377              
378             See B<Format> section for details.
379              
380             Putting all together:
381              
382             $ sandy transcriptome \
383             --verbose \
384             --jobs=5 \
385             --number-of-reads=5000000 \
386             --output-dir=my_results/ \
387             --expression-matrix=brain_cortex \
388             --seed=1717 \
389             --quality-profile=hiseq_150 \
390             --id="%i.%U read=%c:%t-%n mate=%c:%T-%N length=%r" \
391             gencode.v43.transcripts.fa.gz 2> sim.log
392              
393             =head1 AUTHORS
394              
395             =over 4
396              
397             =item *
398              
399             Thiago L. A. Miller <tmiller@mochsl.org.br>
400              
401             =item *
402              
403             J. Leonel Buzzo <lbuzzo@mochsl.org.br>
404              
405             =item *
406              
407             Felipe R. C. dos Santos <fsantos@mochsl.org.br>
408              
409             =item *
410              
411             Helena B. Conceição <hconceicao@mochsl.org.br>
412              
413             =item *
414              
415             Rodrigo Barreiro <rbarreiro@mochsl.org.br>
416              
417             =item *
418              
419             Gabriela Guardia <gguardia@mochsl.org.br>
420              
421             =item *
422              
423             Fernanda Orpinelli <forpinelli@mochsl.org.br>
424              
425             =item *
426              
427             Rafael Mercuri <rmercuri@mochsl.org.br>
428              
429             =item *
430              
431             Rodrigo Barreiro <rbarreiro@mochsl.org.br>
432              
433             =item *
434              
435             Pedro A. F. Galante <pgalante@mochsl.org.br>
436              
437             =back
438              
439             =head1 COPYRIGHT AND LICENSE
440              
441             This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital.
442              
443             This is free software, licensed under:
444              
445             The GNU General Public License, Version 3, June 2007
446              
447             =cut