File Coverage

blib/lib/BioX/Workflow/Samples.pm

Criterion	Covered	Total	%
statement	42	46	91.3
branch	4	6	66.6
condition	1	3	33.3
subroutine	7	7	100.0
pod	3	3	100.0
total	57	65	87.6

line	stmt	bran	cond	sub	pod	time	code
package BioX::Workflow::Samples;

use strict;
use warnings;

use File::Find::Rule;
use File::Basename;
use List::Uniq ':all';

use Moose::Role;

=head1 BioX::Workflow::Samples

All the options for samples are here.

This role calls C<< $self->indir >> and C<< $self->verbose >> and reads
C<< $self->{comment_char} >>; none of them are declared here, so the consuming
class has to provide them.

=head2 Variables

=head3 resample

Boolean value: find the samples again, based on indir/file_rule, or not.

Samples are found at the beginning of the workflow, based on the global indir
variable and the file_rule.

Chances are you don't want to set resample to true. These files probably won't
exist outside of the indirectory until the pipeline is run.

One example of doing so, shown in the gemini.yml in the examples directory, is
looking for uncompressed files with the .vcf extension, compressing them, and
then resampling based on the .vcf.gz extension.

=cut

has 'resample' => (
    traits    => ['NoGetopt'],
    is        => 'rw',
    isa       => 'Bool',
    default   => 0,
    predicate => 'has_resample',
    clearer   => 'clear_resample',
);

=head3 infiles

Infiles to be processed. Filled in by get_samples with the full paths that were
found.

=cut

has 'infiles' => (
    traits => ['NoGetopt'],
    is     => 'rw',
    isa    => 'ArrayRef',
);

=head3 find_by_dir

Use this option when your sample names are directories.
The default is to find samples by filename.

    /SAMPLE1
        SAMPLE1_r1.fastq.gz
        SAMPLE1_r2.fastq.gz
    /SAMPLE2
        SAMPLE2_r1.fastq.gz
        SAMPLE2_r2.fastq.gz

=cut

has 'find_by_dir' => (
    is            => 'rw',
    isa           => 'Bool',
    default       => 0,
    documentation => q{Use this option when you sample names are directories},
    predicate     => 'has_find_by_dir',
    clearer       => 'clear_find_by_dir',
);

=head3 by_sample_outdir

Write the output by sample:

    outdir/
    /outdir/SAMPLE1
        /rule1
        /rule2
        /rule3
    /outdir/SAMPLE2
        /rule1
        /rule2
        /rule3

Instead of

    /outdir
        /rule1
        /rule2

=cut

has 'by_sample_outdir' => (
    is            => 'rw',
    isa           => 'Bool',
    default       => 0,
    documentation => q{When you want your output by sample},
    clearer       => 'clear_by_sample_outdir',
    predicate     => 'has_by_sample_outdir',
);

=head3 samples

Our samples to process. They are either found through file_rule, or passed as
command line opts.

=cut

has 'samples' => (
    traits   => ['Array'],
    is       => 'rw',
    isa      => 'ArrayRef',
    default  => sub { [] },
    required => 0,
    handles  => {
        all_samples    => 'elements',
        add_sample     => 'push',
        map_samples    => 'map',
        filter_samples => 'grep',
        find_sample    => 'first',
        get_sample     => 'get',
        join_samples   => 'join',
        count_samples  => 'count',
        has_samples    => 'count',
        has_no_samples => 'is_empty',
        sorted_samples => 'sort',
    },
    documentation =>
q{Supply samples on the command line as --samples sample1 --samples sample2, or find through file_rule.}
);

=head3 sample

Each time we get the sample we set it.

=cut

has 'sample' => (
    traits   => ['NoGetopt'],
    is       => 'rw',
    isa      => 'Str',
    required => 0,
    default  => '',
);

=head3 file_rule

Rule (a regular expression, given as a string) to find files/samples.
The default, C<(.*)>, matches everything.

=cut

has 'file_rule' => (
    is        => 'rw',
    isa       => 'Str',
    default   => sub { return "(.*)"; },
    clearer   => 'clear_file_rule',
    predicate => 'has_file_rule',
);

=head2 Subroutines

=head3 get_samples

Find the samples and store them, sorted, in C<samples>; the full paths that
were found are stored in C<infiles>.

If samples were already supplied and C<resample> is false, the existing
samples are only sorted and nothing is searched.

Otherwise C<indir> is searched one level deep using C<file_rule>:

=over 4

=item *

With C<find_by_dir>, every matching subdirectory of C<indir> is a sample, named
by its basename. C<indir> itself is never reported as a sample.

=item *

Without it, every matching file is passed to C<match_samples> and the unique
results are the samples. For example

    file_rule: (.*)\.vcf(\.gz)?$

turns both sample.vcf and sample.vcf.gz into the single sample C<sample>.

=back

Instead of doing

    foreach my $sample ( @{ $self->samples } ) {
        dostuff
    }

Could have

    foreach my $infile ( @{ $self->infiles } ) {
        dostuff
    }

=cut

sub get_samples {
    my ($self) = shift;
    my ( @whole, @basename );

    # Samples were handed to us and we were not asked to look again:
    # keep them, just make the order deterministic.
    if ( $self->has_samples && !$self->resample ) {
        my (@samples) = $self->sorted_samples;
        $self->samples( \@samples );
        return;
    }

    my $text = $self->file_rule;

    if ( $self->find_by_dir ) {

        # File::Find::Rule also reports the directory being searched, not
        # just its subdirectories. mindepth => 1 leaves that top level out,
        # so indir never shows up as a sample or as an infile.
        @whole = find(
            directory => name => qr/$text/,
            mindepth  => 1,
            maxdepth  => 1,
            in        => $self->indir
        );

        @basename = map { basename($_) } @whole;
        @basename = sort(@basename);
    }
    else {
        @whole = find(
            file     => name => qr/$text/,
            maxdepth => 1,
            in       => $self->indir
        );

        # Several files can belong to one sample, hence the uniq.
        @basename = map { $self->match_samples( $_, $text ) } @whole;
        @basename = uniq(@basename);
        @basename = sort(@basename);
    }

    $self->samples( \@basename );
    $self->infiles( \@whole );

    $self->write_sample_meta;
}

=head3 write_sample_meta

Write the meta for samples: when C<verbose> is true, print the comma separated
sample list, framed by lines holding only the comment character.

=cut

sub write_sample_meta {
    my $self = shift;

    return unless $self->verbose;

    print "$self->{comment_char}\n";
    print "$self->{comment_char} Samples: ",
      join( ", ", @{ $self->samples } ) . "\n";
    print "$self->{comment_char}\n";

}

=head3 match_samples

Match samples based on the regex written in file_rule.

Takes the path of a file and the rule as a string. The rule is matched against
the file's basename and the first value produced by the match, i.e. the first
capture group when the rule has one, is returned.

=cut

sub match_samples {
    my $self = shift;
    my $file = shift;
    my $text = shift;

    # Parse the argument we were given rather than whatever $_ happens to
    # hold in the caller.
    my @tmp = fileparse($file);
    my ($m) = $tmp[0] =~ qr/$text/;

    return $m;
}

1;