File Coverage

blib/lib/BioX/Workflow/Samples.pm
Criterion Covered Total %
statement 42 46 91.3
branch 4 6 66.6
condition 1 3 33.3
subroutine 7 7 100.0
pod 3 3 100.0
total 57 65 87.6


line stmt bran cond sub pod time code
1             package BioX::Workflow::Samples;
2              
3 2     2   1081 use File::Find::Rule;
  2         4  
  2         19  
4 2     2   98 use File::Basename;
  2         3  
  2         116  
5 2     2   766 use List::Uniq ':all';
  2         933  
  2         237  
6              
7 2     2   10 use Moose::Role;
  2         4  
  2         15  
8              
9             =head1 BioX::Workflow::Samples
10              
11             All the options for samples are here.
12              
13             =head2 Variables
14              
15             =head3 resample
16              
17             Boolean value: whether to look for new samples based on indir/file_rule, or not.
18              
19             Samples are found at the beginning of the workflow, based on the global indir variable and the file_find.
20              
21             Chances are you don't want to set resample to true. These files probably won't exist outside of the indirectory until the pipeline is run.
22              
23             One example of doing so, shown in the gemini.yml in the examples directory, is looking for uncompressed files, .vcf extension, compressing them, and
24             then resampling based on the .vcf.gz extension.
25              
26             =cut
27              
# resample: boolean flag, false by default. get_samples() re-runs the sample
# search only when this is true or no samples are set yet.
# Carries the NoGetopt trait.
has resample => (
    is        => 'rw',
    isa       => 'Bool',
    traits    => ['NoGetopt'],
    default   => 0,
    clearer   => 'clear_resample',
    predicate => 'has_resample',
);
36              
37             =head3 infiles
38              
39             Infiles to be processed
40              
41             =cut
42              
# infiles: array reference of the paths found by get_samples().
# No default; carries the NoGetopt trait.
has infiles => (
    is     => 'rw',
    isa    => 'ArrayRef',
    traits => ['NoGetopt'],
);
48              
49             =head2 find_by_dir
50              
51             Use this option when your sample names are by directory
52             The default is to find samples by filename
53              
54             /SAMPLE1
55             SAMPLE1_r1.fastq.gz
56             SAMPLE1_r2.fastq.gz
57             /SAMPLE2
58             SAMPLE2_r1.fastq.gz
59             SAMPLE2_r2.fastq.gz
60              
61             =cut
62              
# find_by_dir: boolean flag, false by default. When true, get_samples()
# searches for directories instead of files and uses their basenames.
has find_by_dir => (
    is            => 'rw',
    isa           => 'Bool',
    default       => 0,
    clearer       => 'clear_find_by_dir',
    predicate     => 'has_find_by_dir',
    documentation => q{Use this option when you sample names are directories},
);
71              
72             =head2 by_sample_outdir
73              
74             outdir/
75             /outdir/SAMPLE1
76             /rule1
77             /rule2
78             /rule3
79             /outdir/SAMPLE2
80             /rule1
81             /rule2
82             /rule3
83              
84             Instead of
85              
86             /outdir
87             /rule1
88             /rule2
89              
90             =cut
91              
# by_sample_outdir: boolean flag, false by default. Not read anywhere in this
# role; per its documentation string it requests per-sample output.
has by_sample_outdir => (
    is            => 'rw',
    isa           => 'Bool',
    default       => 0,
    predicate     => 'has_by_sample_outdir',
    clearer       => 'clear_by_sample_outdir',
    documentation => q{When you want your output by sample},
);
100              
101             =head3 samples
102              
103             Our samples to process. They are either found through file_rule, or passed as command line opts
104              
105             =cut
106              
# samples: array reference of sample names, empty by default.
# The Array trait delegates the helper methods listed under `handles`.
# Note that has_samples and count_samples both delegate to 'count', so
# has_samples returns the element count (true when non-empty).
has samples => (
    is       => 'rw',
    isa      => 'ArrayRef',
    traits   => ['Array'],
    required => 0,
    default  => sub { [] },
    handles  => {
        # Whole-list access
        all_samples    => 'elements',
        sorted_samples => 'sort',
        join_samples   => 'join',

        # Element access and mutation
        add_sample => 'push',
        get_sample => 'get',

        # Searching and transforming
        find_sample    => 'first',
        filter_samples => 'grep',
        map_samples    => 'map',

        # Size checks
        count_samples  => 'count',
        has_samples    => 'count',
        has_no_samples => 'is_empty',
    },
    documentation =>
        q{Supply samples on the command line as --samples sample1 --samples sample2, or find through file_rule.}
);
129              
130             =head3 sample
131              
132             Each time we get the sample we set it.
133              
134             =cut
135              
# sample: a single sample name as a string, empty by default.
# Carries the NoGetopt trait.
has sample => (
    is       => 'rw',
    isa      => 'Str',
    traits   => ['NoGetopt'],
    default  => '',
    required => 0,
);
143              
144             =head3 file_rule
145              
146             Rule to find files/samples
147              
148             =cut
149              
# file_rule: regular expression (held as a string) that get_samples() uses to
# select entries and that match_samples() uses to capture the sample name.
# Defaults to "(.*)".
has file_rule => (
    is        => 'rw',
    isa       => 'Str',
    default   => sub { "(.*)" },
    predicate => 'has_file_rule',
    clearer   => 'clear_file_rule',
);
157              
158             =head2 Subroutines
159              
160             =head3 get_samples
161              
162             Get basename of the files. Can add optional rules.
163              
164             sample.vcf.gz and sample.vcf would be sample if the file_rule is (.vcf)$|(.vcf.gz)$
165              
166             Also gets the full path to infiles
167              
168             Instead of doing
169              
170             foreach my $sample (@{ $self->samples }){
171             dostuff
172             }
173              
174             Could have
175              
176             foreach my $infile (@{ $self->infiles }){
177             dostuff
178             }
179              
180             =cut
181              
# Populate the `samples` and `infiles` attributes.
#
# If samples are already set and `resample` is false, the existing samples
# are only sorted and the method returns early (nothing is printed).
#
# Otherwise `indir` is searched one level deep for entries whose name matches
# `file_rule`:
#   * find_by_dir true  - directories are searched; each directory basename
#                         becomes a sample name.
#   * find_by_dir false - files are searched; the sample name is whatever
#                         match_samples() captures from the file name.
#                         Duplicates are collapsed.
#
# In both cases the sample names are sorted, stored in `samples`, the full
# paths are stored in `infiles`, and write_sample_meta() is called.
#
# Returns: nothing on the early-return path, otherwise the return value of
# write_sample_meta().
sub get_samples {
    my ($self) = @_;

    # Samples were already supplied and no re-scan was requested: just
    # normalise their order.
    if ( $self->has_samples && !$self->resample ) {
        my @sorted = $self->sorted_samples;
        $self->samples( \@sorted );
        return;
    }

    my $rule = $self->file_rule;
    my ( @whole, @basename );

    if ( $self->find_by_dir ) {

        # mindepth => 1 excludes the search directory itself, which
        # File::Find::Rule would otherwise return along with its
        # subdirectories. The previous attempt to filter it out ran on an
        # empty list (before the basenames were computed) and used a numeric
        # comparison on strings, so indir always leaked into the samples.
        @whole = find(
            directory => name => qr/$rule/,
            mindepth  => 1,
            maxdepth  => 1,
            in        => $self->indir,
        );

        @basename = map { basename($_) } @whole;
        @basename = sort(@basename);
    }
    else {
        @whole = find(
            file     => name => qr/$rule/,
            maxdepth => 1,
            in       => $self->indir,
        );

        # Several files can map to the same sample name, so de-duplicate
        # before sorting.
        @basename = map { $self->match_samples( $_, $rule ) } @whole;
        @basename = uniq(@basename);
        @basename = sort(@basename);
    }

    $self->samples( \@basename );
    $self->infiles( \@whole );

    $self->write_sample_meta;
}
223              
224             =head2 write_sample_meta
225              
226             Write the meta for samples
227              
228             =cut
229              
# Print a three-line summary of the current samples to the selected output
# handle, each line prefixed with the object's comment_char:
#
#   <comment_char>
#   <comment_char> Samples: a, b, c
#   <comment_char>
#
# Does nothing (and returns nothing) unless $self->verbose is true.
sub write_sample_meta {
    my ($self) = @_;

    return if !$self->verbose;

    my $prefix      = $self->{comment_char};
    my $sample_list = join( ", ", @{ $self->samples } );

    print "$prefix\n";
    print "$prefix Samples: ", $sample_list . "\n";
    print "$prefix\n";
}
241              
242              
243             =head2 match_samples
244              
245             Match samples based on regex written in file_rule
246              
247             =cut
248              
# Extract a sample name from a file path.
#
# Parameters:
#   $file - path to the file; only its file-name component is matched.
#   $text - regular expression (as a string) applied to the file name.
#
# Returns the first element of the match in list context: the first capture
# group when the pattern has groups, 1 when it matches without groups, and
# undef when the pattern does not match or the first group did not take part
# in the match.
sub match_samples {
    my ( $self, $file, $text ) = @_;

    # Use the explicit $file argument. The previous code parsed $_, which
    # only worked because the caller happened to invoke this inside map.
    my ($name) = fileparse($file);
    my ($m) = $name =~ qr/$text/;

    return $m;
}
259             1;