File Coverage

lib/Bio/MLST/CheckMultipleSpecies.pm
Criterion Covered Total %
statement 99 121 81.8
branch 20 44 45.4
condition 4 21 19.0
subroutine 14 14 100.0
pod 1 1 100.0
total 138 201 68.6


line stmt bran cond sub pod time code
1             package Bio::MLST::CheckMultipleSpecies;
2             $Bio::MLST::CheckMultipleSpecies::VERSION = '2.1.1630910';
3             # ABSTRACT: High throughput multilocus sequence typing (MLST) checking against several MLST databases.
4              
5              
6              
7 6     6   822605 use Moose;
  6         2569962  
  6         53  
8 6     6   48630 use Bio::MLST::Check;
  6         19  
  6         261  
9 6     6   2974 use Bio::MLST::Databases;
  6         27  
  6         330  
10 6     6   58 use Parallel::ForkManager;
  6         12  
  6         200  
11 6     6   39 use File::Temp;
  6         8  
  6         746  
12 6     6   96 use Cwd;
  6         16  
  6         381  
13 6     6   29 use Text::CSV;
  6         11  
  6         57  
14              
15             has 'species' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); # empty array searches against all databases
16             has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
17             has 'parallel_processes' => ( is => 'ro', isa => 'Int', default => 1 ); # max parallel processes
18             has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); # output search progress and number of matches
19             has 'report_all_mlst_db' => ( is => 'rw', isa => 'Bool', default => 0 ); # report all mlst databases searched
20             has 'report_lowest_st' => ( is => 'rw', isa => 'Bool', default => 0 );
21              
22             has 'raw_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
23             has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', required => 1 );
24             has 'blastn_exec' => ( is => 'ro', isa => 'Str', required => 1 );
25             has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
26             has 'spreadsheet_basename' => ( is => 'ro', isa => 'Str', default => 'mlst_results' );
27             has 'output_fasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); # output of fasta not supported
28             has 'output_phylip_files' => ( is => 'ro', isa => 'Bool', default => 0 ); # output of phylip not supported
29             has 'show_contamination_instead_of_alt_matches' => ( is => 'ro', isa => 'Bool', default => 1 );
30              
31             has '_species_list' => ( is => 'ro', isa => 'ArrayRef', lazy_build => 1 );
32             has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(DIR => getcwd, CLEANUP => 1); });
33              
34             sub _build__species_list
35             {
36 6     6   22 my($self) = @_;
37 6         6 my @species_list = @{$self->species};
  6         262  
38              
39             # if no species supplied then run vs all species
40 6 100       33 unless(@species_list)
41             {
42 1         48 my $mlst_databases = Bio::MLST::Databases->new(
43             base_directory => $self->base_directory,
44             );
45 1         2 @species_list = @{$mlst_databases->database_names};
  1         45  
46             }
47              
48 6         35 @species_list = sort { $a cmp $b } @species_list;
  9         25  
49              
50 6         281 return \@species_list;
51             }
52              
53             sub _check_input_files_exist
54             {
55 6     6   18 my($self) = @_;
56              
57 6         323 my $check = Bio::MLST::Check->new( raw_input_fasta_files => $self->raw_input_fasta_files,
58             species => '',
59             base_directory => '',
60             makeblastdb_exec => '',
61             blastn_exec => '',
62             output_directory => '' );
63              
64 6         52 return $check->input_fasta_files_exist;
65             }
66              
67             # print error message if phylip or fasta files requested
68             sub _check_fasta_phylip_options
69             {
70 6     6   388 my($self) = @_;
71              
72 6 100 66     321 return 1 unless ($self->output_fasta_files || $self->output_phylip_files);
73              
74 1         6 print qq[
75             The --output_fasta_files and --output_phylip_files options cannot be used when searching
76             against more than one MLST database as the alleles searched will differ between species.
77              
78             To output fasta and phylip files, please search against a single MLST database.\n\n];
79 1         4 return 0;
80             }
81              
82             sub _run_mlst_for_species_list
83             {
84 5     5   15 my ($self) = @_;
85              
86             # set parallel processes - if more species than processes then search input files in parallel.
87 5         240 my $parallel_process_total = $self->parallel_processes;
88 5 50       10 my $parallel_process_species = @{$self->_species_list} < $self->parallel_processes ? @{$self->_species_list} : $self->parallel_processes;
  5         220  
  0         0  
89 5 50       205 my $parallel_process_fa_file = int($self->parallel_processes/@{$self->_species_list}) ? int($self->parallel_processes/@{$self->_species_list}) : 1;
  5         185  
  5         200  
90              
91             # Run for each species - output to csv files named 0001,0002,etc.
92 5         200 my $pm = new Parallel::ForkManager($self->parallel_processes);
93 5         2070 for(my $i=1; $i <= @{$self->_species_list}; $i++)
  9         17456  
94             {
95 8 100       57 $pm->start and next; # fork here
96              
97 4         15040 my $spreadsheet_basename = sprintf "%04i",$i;
98 4         1048 my $species_name = $self->_species_list->[$i-1];
99 4 50       564 print qq[ Searching $species_name...\n] if $self->verbose;
100              
101 4         534 my $multiple_fastas = Bio::MLST::Check->new(
102             species => $species_name,
103             base_directory => $self->base_directory,
104             raw_input_fasta_files => $self->raw_input_fasta_files,
105             makeblastdb_exec => $self->makeblastdb_exec,
106             blastn_exec => $self->blastn_exec,
107             output_directory => $self->_working_directory->dirname(),
108             spreadsheet_basename => $spreadsheet_basename,
109             parallel_processes => $parallel_process_fa_file,
110             output_fasta_files => $self->output_fasta_files,
111             output_phylip_files => $self->output_phylip_files,
112             show_contamination_instead_of_alt_matches => $self->show_contamination_instead_of_alt_matches,
113             report_lowest_st => $self->report_lowest_st
114             );
115 4         178 $multiple_fastas->create_result_files;
116              
117 0         0 $pm->finish;
118             }
119 1         52 $pm->wait_all_children;
120 1 50       9017607 print qq[ Finished searching\n] if $self->verbose;
121              
122 1         15 return 1;
123             }
124              
125             sub _concatenate_result_files
126             {
127 1     1   10 my ($self) = @_;
128              
129 1         9 for my $file_type ('allele','genomic')
130             {
131             # open output filehandle and csv
132 2         123 my $result_file = $self->output_directory.'/'.$self->spreadsheet_basename.'.'.$file_type.'.csv';
133 2 50       243 open(my $fh_out, '>'.$result_file) or die "Can't open file: $result_file $!\n";
134 2         56 my $csv_out = Text::CSV->new({binary=>1, sep_char=>"\t", always_quote=>1, eol=>"\r\n"});
135            
136             # process temp result files 0001,0002,etc.
137 2         323 my $previous_positive_result = 0;
138 2         4 my $results_found_flag = 0;
139 2         17 for(my $i=1; $i <= @{$self->_species_list}; $i++)
  6         294  
140             {
141             # species and file naming
142 4         151 my $species_name = $self->_species_list->[$i-1];
143 4         15 $species_name =~ s/_/ /g;
144 4         155 my $working_dir = $self->_working_directory->dirname();
145 4         35 my $spreadsheet_basename = sprintf "%04i",$i;
146 4         18 my $result_file_temp = $working_dir.'/'.$spreadsheet_basename.'.'.$file_type.'.csv';
147              
148             # input csv and filehandle
149 4         34 my $csv_in = Text::CSV->new({binary=>1, sep_char=>"\t", eol=>"\r\n"});
150 4         327 my $fh_in;
151            
152             # results rows
153 4         13 my @header_row = ();
154 4         6 my @isolate_rows = ();
155 4         5 my @positive_rows = ();
156              
157             # parse temp results file
158 4         153 open($fh_in, $result_file_temp);
159 4         128 while(my $line = <$fh_in>)
160             {
161 0         0 $csv_in->parse($line);
162 0         0 my @row = $csv_in->fields();
163 0 0       0 next unless @row;
164              
165 0 0 0     0 if($row[0] eq 'Isolate' && $row[1] eq 'ST')
166             {
167 0         0 @header_row = @row;
168 0         0 next;
169             }
170            
171 0         0 push(@isolate_rows, \@row);
172             # filter results
173 0         0 for(my $i=4; $i<@row; $i++)
174             {
175 0 0       0 if($row[$i] ne 'U')
176             {
177 0         0 push(@positive_rows, \@row);
178 0         0 last;
179             }
180             }
181             }
182 4         12 close $fh_in;
183              
184             # Sort results by file name
185 4         19 @isolate_rows = sort{ $a->[0] cmp $b->[0] } @isolate_rows;
  0         0  
186 4         8 @positive_rows = sort{ $a->[0] cmp $b->[0] } @positive_rows;
  0         0  
187              
188             # output to final file
189 4 50       181 if($self->report_all_mlst_db)
    50          
190             {
191 0 0 0     0 $csv_out->print($fh_out,['']) if((@positive_rows || $previous_positive_result) && $i > 1); # blank row
      0        
192 0         0 $csv_out->print($fh_out,[$species_name,'matched '.scalar(@positive_rows).' of '.scalar(@isolate_rows).' files']);
193             }
194             elsif(@positive_rows)
195             {
196 0 0 0     0 $csv_out->print($fh_out,['']) if($previous_positive_result && $i > 1); # blank row
197 0         0 $csv_out->print($fh_out,[$species_name,'matched '.scalar(@positive_rows).' of '.scalar(@isolate_rows).' files']);
198             }
199              
200 4 50       18 if(@positive_rows)
201             {
202 0         0 $csv_out->print($fh_out,\@header_row);
203 0         0 for my $row (@positive_rows)
204             {
205 0         0 $csv_out->print($fh_out,$row);
206             }
207             }
208 4 50       13 $previous_positive_result = scalar(@positive_rows) ? 1:0;
209 4 50       12 $results_found_flag = 1 if scalar(@positive_rows);
210              
211 4 50 33     159 printf " %-40s %d/%d\n",$species_name,scalar(@positive_rows),scalar(@isolate_rows) if ($self->verbose && $file_type eq 'allele'); # verbose
212             }
213              
214             # no matches found
215 2 50 33     69 if(!$self->report_all_mlst_db && !$results_found_flag)
216             {
217 2         17 $csv_out->print($fh_out,['No matches found']);
218             }
219              
220 2         521 close $fh_out;
221             }
222              
223 1         3 return 1;
224             }
225              
226             sub create_result_files
227             {
228 5     5 1 20 my($self) = @_;
229              
230 5 50       35 exit 1 unless $self->_check_input_files_exist;
231 5 50       3045 exit 1 unless $self->_check_fasta_phylip_options;
232 5         35 $self->_run_mlst_for_species_list();
233 1         557 $self->_concatenate_result_files();
234              
235 1         14 return 1;
236             }
237              
238 6     6   8176 no Moose;
  6         7  
  6         37  
239             __PACKAGE__->meta->make_immutable;
240             1;
241              
242             __END__
243              
244             =pod
245              
246             =encoding UTF-8
247              
248             =head1 NAME
249              
250             Bio::MLST::CheckMultipleSpecies - High throughput multilocus sequence typing (MLST) checking against several MLST databases.
251              
252             =head1 VERSION
253              
254             version 2.1.1630910
255              
256             =head1 SYNOPSIS
257              
258             This is a wrapper for the Bio::MLST::Check module allowing MLST checking against several databases.
259              
260             The Bio::MLST::Check options to output a concatenated fasta file of allele matches or to output a
261             phylip alignment file are not supported as the loci for sequence typing will vary between species.
262             Including these options will give an error message requesting that the user refine their search.
263              
264             use Bio::MLST::CheckMultipleSpecies;
265            
266             my @fasta_files = ('isolate_one.fa', 'isolate_two.fa');
267             my @species_list = ('Clostridium diff', 'Streptococcus');
268            
269             my $mlst = Bio::MLST::CheckMultipleSpecies->new( species => \@species_list,
270             raw_input_fasta_files => \@fasta_files,
271             spreadsheet_basename => $spreadsheet_basename,
272             output_directory => $output_directory,
273             base_directory => $base_directory,
274             makeblastdb_exec => $makeblastdb_exec,
275             blastn_exec => $blastn_exec,
276             parallel_processes => $parallel_processes,
277             verbose => 0,);
278             $multiple_species->create_result_files;
279              
280             =head1 METHODS
281              
282             =head2 create_result_files
283              
284             Creates a spreadsheet of results.
285              
286             =head1 SEE ALSO
287              
288             =over 4
289              
290             =item *
291              
292             L<Bio::MLST::Check>
293              
294             =back
295              
296             =head1 AUTHOR
297              
298             Andrew J. Page <ap13@sanger.ac.uk>
299              
300             =head1 COPYRIGHT AND LICENSE
301              
302             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
303              
304             This is free software, licensed under:
305              
306             The GNU General Public License, Version 3, June 2007
307              
308             =cut