File Coverage

blib/lib/BioX/Workflow/Plugin/Drake.pm
Criterion Covered Total %
statement 12 12 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 16 16 100.0


line stmt bran cond sub pod time code
1             package BioX::Workflow::Plugin::Drake;
2              
3             our $VERSION = '0.11';
4 1     1   22105 use Data::Dumper;
  1         10443  
  1         61  
5 1     1   893 use Data::Pairs;
  1         2430  
  1         42  
6              
7 1     1   729 use Moose::Role;
  1         477827  
  1         6  
8              
9 1     1   6441 use Interpolation E => 'eval';
  1         5019  
  1         6  
10              
11             =head1 NAME
12              
13             BioX::Workflow::Plugin::Drake - A very opinionated template based bioinformatics workflow writer for Drake.
14              
15             =head1 SYNOPSIS
16              
17             The main documentation for this module is at L<BioX::Workflow>. This module extends Workflow in order to add functionality for outputting workflows in drake format.
18              
19             biox-workflow.pl --workflow workflow.yml > workflow.drake
20             drake --workflow workflow.drake #with other functionality such as --jobs for asynchronous output, etc.
21              
22             List your plugins in your workflow.yml file
23              
24             ---
25             plugins:
26             - Drake
27             global:
28             - indir: /home/user/gemini
29             - outdir: /home/user/gemini/gemini-wrapper
30             - file_rule: (.vcf)$|(.vcf.gz)$
31             - infile:
32             - min: 1 ##IF USING MIN
33             #So On and So Forth
34              
35             More information about Drake can be found here L<https://github.com/Factual/drake>.
36              
37             =head2 Default Variables
38              
39             BioX::Workflow::Plugin::Drake assumes your INPUT/OUTPUT and indir/outdirs are
40             linked.
41              
42             This means the output from step1 is the input for step2.
43              
44             You can override this behavior by either declaring any of these values, or in the global
45             variables set auto_input: 0, disable automatic indir/outdir naming with
46             auto_name: 0, and disable automatically naming outdirectories by rule names with
47             enforce_struct: 0.
48              
49              
50             =head2 Example
51              
52             =head3 workflow.yml
53              
54             ---
55             plugins:
56             - Drake
57             global:
58             - indir: /home/user/workflow
59             - outdir: /home/user/workflow/output
60             - file_rule: (.csv)$
61             rules:
62             - backup:
63             local:
64             - INPUT: "{$self->indir}/{$sample}.csv"
65             - OUTPUT: "{$self->outdir}/{$sample}.csv"
66             - thing: "other thing"
67             process: |
68             cp $INPUT $OUTPUT
69             - grep_VARA:
70             local:
71             - OUTPUT: "{$self->outdir}/{$sample}.grep_VARA.csv"
72             process: |
73             echo "Working on {$self->{indir}}/{$sample.csv}"
74             grep -i "VARA" {$self->indir}/{$sample}.csv >> {$self->outdir}/{$sample}.grep_VARA.csv \
75             || touch {$self->OUTPUT}
76             - grep_VARB:
77             local:
78             - OUTPUT: "{$self->outdir}/{$sample}.grep_VARA.grep_VARB.csv"
79             process: |
80             grep -i "VARB" {$self->indir}/{$sample}.grep_VARA.csv >> {$self->outdir}/{$sample}.grep_VARA.grep_VARB.csv || touch {$self->OUTPUT}
81              
82             =head3 Notes on the drake.yml
83              
84             Drake will stop everything if your job returns with an exit code of anything
85             besides 0. For this reason we have the last command have a command1 || command2
86             syntax, so that even if we don't grep any "VARB" from the file the workflow
87             could continue.
88              
89             =head3 Run it with default setup
90              
91             biox-workflow.pl --workflow workflow.yml > workflow.full.drake
92              
93             =head3 Output with default setup
94              
95             I don't want to include the whole file, but you get the idea
96              
97             ;
98             ; Generated at: 2015-06-21T11:01:24
99             ; This file was generated with the following options
100             ; --workflow drake.yml
101             ; --min 1
102             ;
103              
104             ;
105             ; Samples: test1, test2
106             ;
107             ;
108             ; Starting Workflow
109             ;
110              
111             ;
112             ; Starting backup
113             ;
114              
115              
116             ;
117             ; Variables
118             ; Indir: /home/guests/jir2004/workflow
119             ; Outdir: /home/guests/jir2004/workflow/output/backup
120             ; Local Variables:
121             ; INPUT: {$self->indir}/{$sample}.csv
122             ; OUTPUT: {$self->outdir}/{$sample}.csv
123             ; thing: other thing
124             ;
125              
126             /home/guests/jir2004/workflow/output/backup/$[SAMPLE].csv <- /home/guests/jir2004/workflow/$[SAMPLE].csv
127             cp $INPUT $OUTPUT
128              
129              
130             ;
131             ; Ending backup
132             ;
133              
134              
135             ;
136             ; Starting grep_VARA
137             ;
138              
139              
140             Run drake
141              
142             drake --workflow workflow.full.drake
143              
144             The following steps will be run, in order:
145             1: /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv [timestamped]
146             2: /home/user/workflow/output/backup/test2.csv <- /home/user/workflow/test2.csv [timestamped]
147             3: /home/user/workflow/output/grep_vara/test1.grep_VARA.csv <- /home/user/workflow/output/backup/test1.csv [projected timestamped]
148             4: /home/user/workflow/output/grep_vara/test2.grep_VARA.csv <- /home/user/workflow/output/backup/test2.csv [projected timestamped]
149             5: /home/user/workflow/output/grep_varb/test1.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test1.grep_VARA.csv [projected timestamped]
150             6: /home/user/workflow/output/grep_varb/test2.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test2.grep_VARA.csv [projected timestamped]
151             Confirm? [y/n] y
152             Running 6 steps with concurrence of 1...
153              
154             --- 0. Running (timestamped): /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv
155             --- 0: /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv -> done in 0.02s
156              
157             --- 1. Running (timestamped): /home/user/workflow/output/backup/test2.csv <- /home/user/workflow/test2.csv
158             --- 1: /home/user/workflow/output/backup/test2.csv <- /home/user/workflow/test2.csv -> done in 0.01s
159              
160             --- 2. Running (timestamped): /home/user/workflow/output/grep_vara/test1.grep_VARA.csv <- /home/user/workflow/output/backup/test1.csv
161             Working on /home/user/workflow/output/backup/test1csv
162             --- 2: /home/user/workflow/output/grep_vara/test1.grep_VARA.csv <- /home/user/workflow/output/backup/test1.csv -> done in 0.01s
163              
164             --- 3. Running (timestamped): /home/user/workflow/output/grep_vara/test2.grep_VARA.csv <- /home/user/workflow/output/backup/test2.csv
165             Working on /home/user/workflow/output/backup/test2csv
166             --- 3: /home/user/workflow/output/grep_vara/test2.grep_VARA.csv <- /home/user/workflow/output/backup/test2.csv -> done in 0.01s
167              
168             --- 4. Running (timestamped): /home/user/workflow/output/grep_varb/test1.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test1.grep_VARA.csv
169             --- 4: /home/user/workflow/output/grep_varb/test1.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test1.grep_VARA.csv -> done in 0.01s
170              
171             --- 5. Running (timestamped): /home/user/workflow/output/grep_varb/test2.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test2.grep_VARA.csv
172             --- 5: /home/user/workflow/output/grep_varb/test2.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test2.grep_VARA.csv -> done in 0.08s
173             Done (6 steps run).
174              
175              
176             =head3 Run in minified mode
177              
178             As an alternative you can run this with the --min option, which instead of
179             printing out each workflow prints out only one, and creates a run-workflow.sh
180             which has all of your environmental variables.
181              
182             This option is preferable if running on an HPC cluster with many nodes.
183              
184             This WILL break with use of --resample, either local or global. You need to
185             split up your workflows as opposed to using the --resample option.
186              
187             biox-workflow.pl --workflow workflow.yml --min 1 > workflow.drake #This also creates the run-workflow.sh in the same directory
188             ./run-workflow.sh
189              
190             cat drake.log #Here is the log for the first run
191              
192             2015-06-21 14:02:47,543 INFO Running 3 steps with concurrence of 1...
193             2015-06-21 14:02:47,568 INFO
194             2015-06-21 14:02:47,570 INFO --- 0. Running (timestamped): /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv
195             2015-06-21 14:02:47,592 INFO --- 0: /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv -> done in 0.02s
196              
197             #So on and so forth
198              
199             If you look in the example directory you will see a few png files, these are outputs of the drake workflow.
200              
201             =cut
202              
203             =head1 Acknowledgements
204              
205             Before version 0.03
206              
207             This module was originally developed at and for Weill Cornell Medical
208             College in Qatar within ITS Advanced Computing Team. With approval from
209             WCMC-Q, this information was generalized and put on github, for which
210             the authors would like to express their gratitude.
211              
212             As of version 0.03:
213              
214             This module's continuing development is supported by NYU Abu Dhabi in the Center for Genomics and Systems Biology.
215             With approval from NYUAD, this information was generalized and put on bitbucket, for which
216             the authors would like to express their gratitude.
217              
218             =head1 Inline Code Documentation
219              
220             You shouldn't need these, but if you do here they are.
221              
222             =head2 Attributes
223              
224             =cut
225              
226             =head3 full
227              
228             Print the whole workflow hardcoded. This is the default
229              
230             =cut
231              
232             has 'full' => ( # Flag: print the whole workflow hardcoded (the default mode).
233             is => 'rw', # read-write: the "before 'run'" modifier clears it when min is set
234             isa => 'Bool',
235             default => 1,
236             );
237              
238             =head3 min
239              
240             Print the workflow as 2 files.
241              
242             Run the drake things
243              
244             drake --vars "SAMPLE=$sample" --workflow workflow.drake
245              
246             workflow.drake
247              
248             Our regular file
249              
250             =cut
251              
252             has 'min' => ( # Flag: minified mode (one workflow.drake plus a run-workflow.sh driver).
253             is => 'rw',
254             isa => 'Bool',
255             default => 0, # off unless requested
256             );
257              
258             =head2 Subroutines
259              
260             Subroutines
261              
262             =head3 before run
263              
264             Must initialize some variables
265              
266             =cut
267              
268             before 'run' => sub{ # Method modifier: set Drake-specific state before the consumer's run().
269             my($self) = shift;
270              
271             if($self->min){ # min overrides full: requesting min switches full off
272             $self->full(0);
273             }
274             $self->wait(0); # 'wait' is defined outside this file; presumably disables wait statements in the output - TODO confirm
275             $self->comment_char(';'); # the generated Drake output uses ';' as its comment prefix
276             };
277              
278             =head3 after get_samples
279              
280             Things to do if we decide to do a min version
281              
282             =cut
283              
284             after 'get_samples' => sub{ # In min mode: write run-workflow.sh (one drake call per sample), then swap samples for a placeholder.
285             my($self) = shift;
286              
287             return unless $self->min; # nothing to do unless minified output was requested
288              
289             open(my $fh, '>', 'run-workflow.sh') or die "Could not open file run-workflow.sh: $!\n"; # was 'die print ...', which died with print's return value instead of the message
290              
291             print $fh "#!/bin/bash\n\n";
292              
293             foreach my $sample (@{$self->samples}){ # one drake invocation per real sample name
294             print $fh <<EOF;
295             drake --vars "SAMPLE=$sample" --workflow workflow.drake
296             EOF
297             }
298              
299             close $fh or die "Could not close file run-workflow.sh: $!\n"; # buffered write errors only surface at close
300              
301             chmod 0777, 'run-workflow.sh'; # NOTE(review): 0777 leaves the script world-writable; 0755 may be enough - confirm
302              
303             $self->samples(["\$SAMPLE"]); # replace the sample list with the single literal placeholder $SAMPLE
304             };
305              
306             =head3 write_process
307              
308             Fill in the template with the process
309              
310             Ensure INPUT/OUTPUT exist
311              
312             Prettify the output a bit
313              
314             =cut
315              
316             before 'write_process' => sub{ # Method modifier: warn about missing INPUT/OUTPUT and tab-indent the process text.
317             my($self) = shift;
318              
319             $DB::single=2; # debugger breakpoint hint; only has an effect when running under perl -d
320             if((! $self->local_attr->exists('INPUT')) && ! $self->local_attr->exists('OUTPUT') ){ # true only when BOTH keys are absent
321             print "$self->{comment_char} There is no INPUT or OUTPUT!\n"; # printed as a comment line in the generated output
322             }
323              
324             #Make the formatting a bit prettier
325             my @tmp = split("\n", $self->process);
326             $self->process(join("\n\t", @tmp)); # every line after the first gets a leading tab
327             };
328              
329             before 'process_template' => sub { # Method modifier: prepend the "OUTPUT <- INPUT" header line to the process text.
330             my $self = shift;
331              
332             my $tmp = "{\$self->OUTPUT} <- {\$self->INPUT}\n\t"; # literal template text; the \$ escapes keep $self uninterpolated here
333             $DB::single=2; # debugger breakpoint hint; only has an effect when running under perl -d
334             if($self->min){
335             $tmp =~ s/\$SAMPLE/\$[SAMPLE]/g; # NOTE(review): $tmp as built above contains no literal $SAMPLE, so this looks like a no-op - confirm intent
336             }
337             my $newprocess = $tmp.$self->process; # header line first, then the existing process body
338             $self->process($newprocess);
339              
340             };
341              
342             1;
343             __END__
344              
345             =encoding utf-8
346              
347             =head1 AUTHOR
348              
349             Jillian Rowe E<lt>jillian.e.rowe@gmail.comE<gt>
350              
351             =head1 COPYRIGHT
352              
353             Copyright 2015- Jillian Rowe
354              
355             =head1 LICENSE
356              
357             This library is free software; you can redistribute it and/or modify
358             it under the same terms as Perl itself.
359              
360             =head1 SEE ALSO
361              
362             =cut