File Coverage

lib/Bio/MLST/CheckMultipleSpecies.pm
Criterion Covered Total %
statement 99 121 81.8
branch 20 44 45.4
condition 4 21 19.0
subroutine 14 14 100.0
pod 1 1 100.0
total 138 201 68.6


line stmt bran cond sub pod time code
1             package Bio::MLST::CheckMultipleSpecies;
2             $Bio::MLST::CheckMultipleSpecies::VERSION = '2.1.1706216';
3             # ABSTRACT: High throughput multilocus sequence typing (MLST) checking against several MLST databases.
4              
5              
6              
7 5     5   555770 use Moose;
  5         1615790  
  5         25  
8 5     5   25880 use Bio::MLST::Check;
  5         15  
  5         185  
9 5     5   2290 use Bio::MLST::Databases;
  5         10  
  5         160  
10 5     5   30 use Parallel::ForkManager;
  5         10  
  5         90  
11 5     5   20 use File::Temp;
  5         5  
  5         400  
12 5     5   20 use Cwd;
  5         10  
  5         210  
13 5     5   25 use Text::CSV;
  5         5  
  5         5185  
14              
15             has 'species' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); # empty array searches against all databases
16             has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
17             has 'parallel_processes' => ( is => 'ro', isa => 'Int', default => 1 ); # max parallel processes
18             has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); # output search progress and number of matches
19             has 'report_all_mlst_db' => ( is => 'rw', isa => 'Bool', default => 0 ); # report all mlst databases searched
20             has 'report_lowest_st' => ( is => 'rw', isa => 'Bool', default => 0 );
21              
22             has 'raw_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
23             has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', required => 1 );
24             has 'blastn_exec' => ( is => 'ro', isa => 'Str', required => 1 );
25             has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
26             has 'spreadsheet_basename' => ( is => 'ro', isa => 'Str', default => 'mlst_results' );
27             has 'output_fasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); # output of fasta not supported
28             has 'output_phylip_files' => ( is => 'ro', isa => 'Bool', default => 0 ); # output of phylip not supported
29             has 'show_contamination_instead_of_alt_matches' => ( is => 'ro', isa => 'Bool', default => 1 );
30              
31             has '_species_list' => ( is => 'ro', isa => 'ArrayRef', lazy_build => 1 );
32             has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(DIR => getcwd, CLEANUP => 1); });
33              
34             sub _build__species_list
35             {
36 6     6   11 my($self) = @_;
37 6         11 my @species_list = @{$self->species};
  6         127  
38              
39             # if no species supplied then run vs all species
40 6 100       17 unless(@species_list)
41             {
42 1         21 my $mlst_databases = Bio::MLST::Databases->new(
43             base_directory => $self->base_directory,
44             );
45 1         1 @species_list = @{$mlst_databases->database_names};
  1         24  
46             }
47              
48 6         31 @species_list = sort { $a cmp $b } @species_list;
  9         19  
49              
50 6         117 return \@species_list;
51             }
52              
53             sub _check_input_files_exist
54             {
55 6     6   12 my($self) = @_;
56              
57 6         372 my $check = Bio::MLST::Check->new( raw_input_fasta_files => $self->raw_input_fasta_files,
58             species => '',
59             base_directory => '',
60             makeblastdb_exec => '',
61             blastn_exec => '',
62             output_directory => '' );
63              
64 6         27 return $check->input_fasta_files_exist;
65             }
66              
67             # print error message if phylip or fasta files requested
68             sub _check_fasta_phylip_options
69             {
70 6     6   282 my($self) = @_;
71              
72 6 100 66     176 return 1 unless ($self->output_fasta_files || $self->output_phylip_files);
73              
74 1         4 print qq[
75             The --output_fasta_files and --output_phylip_files options cannot be used when searching
76             against more than one MLST database as the alleles searched will differ between species.
77              
78             To output fasta and phylip files, please search against a single MLST database.\n\n];
79 1         2 return 0;
80             }
81              
82             sub _run_mlst_for_species_list
83             {
84 5     5   10 my ($self) = @_;
85              
86             # set parallel processes - if more species than processes then search input files in parallel.
87 5         115 my $parallel_process_total = $self->parallel_processes;
88 5 50       5 my $parallel_process_species = @{$self->_species_list} < $self->parallel_processes ? @{$self->_species_list} : $self->parallel_processes;
  5         105  
  0         0  
89 5 50       95 my $parallel_process_fa_file = int($self->parallel_processes/@{$self->_species_list}) ? int($self->parallel_processes/@{$self->_species_list}) : 1;
  5         90  
  5         85  
90              
91             # Run for each species - output to csv files named 0001,0002,etc.
92 5         100 my $pm = new Parallel::ForkManager($self->parallel_processes);
93 5         1215 for(my $i=1; $i <= @{$self->_species_list}; $i++)
  9         6461  
94             {
95 8 100       26 $pm->start and next; # fork here
96              
97 4         5540 my $spreadsheet_basename = sprintf "%04i",$i;
98 4         568 my $species_name = $self->_species_list->[$i-1];
99 4 50       196 print qq[ Searching $species_name...\n] if $self->verbose;
100              
101 4         218 my $multiple_fastas = Bio::MLST::Check->new(
102             species => $species_name,
103             base_directory => $self->base_directory,
104             raw_input_fasta_files => $self->raw_input_fasta_files,
105             makeblastdb_exec => $self->makeblastdb_exec,
106             blastn_exec => $self->blastn_exec,
107             output_directory => $self->_working_directory->dirname(),
108             spreadsheet_basename => $spreadsheet_basename,
109             parallel_processes => $parallel_process_fa_file,
110             output_fasta_files => $self->output_fasta_files,
111             output_phylip_files => $self->output_phylip_files,
112             show_contamination_instead_of_alt_matches => $self->show_contamination_instead_of_alt_matches,
113             report_lowest_st => $self->report_lowest_st
114             );
115 4         96 $multiple_fastas->create_result_files;
116              
117 0         0 $pm->finish;
118             }
119 1         21 $pm->wait_all_children;
120 1 50       6006806 print qq[ Finished searching\n] if $self->verbose;
121              
122 1         12 return 1;
123             }
124              
125             sub _concatenate_result_files
126             {
127 1     1   4 my ($self) = @_;
128              
129 1         5 for my $file_type ('allele','genomic')
130             {
131             # open output filehandle and csv
132 2         59 my $result_file = $self->output_directory.'/'.$self->spreadsheet_basename.'.'.$file_type.'.csv';
133 2 50       153 open(my $fh_out, '>'.$result_file) or die "Can't open file: $result_file $!\n";
134 2         38 my $csv_out = Text::CSV->new({binary=>1, sep_char=>"\t", always_quote=>1, eol=>"\r\n"});
135            
136             # process temp result files 0001,0002,etc.
137 2         298 my $previous_positive_result = 0;
138 2         3 my $results_found_flag = 0;
139 2         4 for(my $i=1; $i <= @{$self->_species_list}; $i++)
  6         127  
140             {
141             # species and file naming
142 4         74 my $species_name = $self->_species_list->[$i-1];
143 4         7 $species_name =~ s/_/ /g;
144 4         80 my $working_dir = $self->_working_directory->dirname();
145 4         23 my $spreadsheet_basename = sprintf "%04i",$i;
146 4         11 my $result_file_temp = $working_dir.'/'.$spreadsheet_basename.'.'.$file_type.'.csv';
147              
148             # input csv and filehandle
149 4         24 my $csv_in = Text::CSV->new({binary=>1, sep_char=>"\t", eol=>"\r\n"});
150 4         281 my $fh_in;
151            
152             # results rows
153 4         7 my @header_row = ();
154 4         6 my @isolate_rows = ();
155 4         2 my @positive_rows = ();
156              
157             # parse temp results file
158 4         108 open($fh_in, $result_file_temp);
159 4         62 while(my $line = <$fh_in>)
160             {
161 0         0 $csv_in->parse($line);
162 0         0 my @row = $csv_in->fields();
163 0 0       0 next unless @row;
164              
165 0 0 0     0 if($row[0] eq 'Isolate' && $row[1] eq 'ST')
166             {
167 0         0 @header_row = @row;
168 0         0 next;
169             }
170            
171 0         0 push(@isolate_rows, \@row);
172             # filter results
173 0         0 for(my $i=4; $i<@row; $i++)
174             {
175 0 0       0 if($row[$i] ne 'U')
176             {
177 0         0 push(@positive_rows, \@row);
178 0         0 last;
179             }
180             }
181             }
182 4         11 close $fh_in;
183              
184             # Sort results by file name
185 4         8 @isolate_rows = sort{ $a->[0] cmp $b->[0] } @isolate_rows;
  0         0  
186 4         7 @positive_rows = sort{ $a->[0] cmp $b->[0] } @positive_rows;
  0         0  
187              
188             # output to final file
189 4 50       107 if($self->report_all_mlst_db)
    50          
190             {
191 0 0 0     0 $csv_out->print($fh_out,['']) if((@positive_rows || $previous_positive_result) && $i > 1); # blank row
      0        
192 0         0 $csv_out->print($fh_out,[$species_name,'matched '.scalar(@positive_rows).' of '.scalar(@isolate_rows).' files']);
193             }
194             elsif(@positive_rows)
195             {
196 0 0 0     0 $csv_out->print($fh_out,['']) if($previous_positive_result && $i > 1); # blank row
197 0         0 $csv_out->print($fh_out,[$species_name,'matched '.scalar(@positive_rows).' of '.scalar(@isolate_rows).' files']);
198             }
199              
200 4 50       11 if(@positive_rows)
201             {
202 0         0 $csv_out->print($fh_out,\@header_row);
203 0         0 for my $row (@positive_rows)
204             {
205 0         0 $csv_out->print($fh_out,$row);
206             }
207             }
208 4 50       12 $previous_positive_result = scalar(@positive_rows) ? 1:0;
209 4 50       8 $results_found_flag = 1 if scalar(@positive_rows);
210              
211 4 50 33     72 printf " %-40s %d/%d\n",$species_name,scalar(@positive_rows),scalar(@isolate_rows) if ($self->verbose && $file_type eq 'allele'); # verbose
212             }
213              
214             # no matches found
215 2 50 33     37 if(!$self->report_all_mlst_db && !$results_found_flag)
216             {
217 2         58 $csv_out->print($fh_out,['No matches found']);
218             }
219              
220 2         101 close $fh_out;
221             }
222              
223 1         2 return 1;
224             }
225              
226             sub create_result_files
227             {
228 5     5 1 5 my($self) = @_;
229              
230 5 50       20 exit 1 unless $self->_check_input_files_exist;
231 5 50       1785 exit 1 unless $self->_check_fasta_phylip_options;
232 5         20 $self->_run_mlst_for_species_list();
233 1         489 $self->_concatenate_result_files();
234              
235 1         13 return 1;
236             }
237              
238 5     5   30 no Moose;
  5         5  
  5         25  
239             __PACKAGE__->meta->make_immutable;
240             1;
241              
242             __END__
243              
244             =pod
245              
246             =encoding UTF-8
247              
248             =head1 NAME
249              
250             Bio::MLST::CheckMultipleSpecies - High throughput multilocus sequence typing (MLST) checking against several MLST databases.
251              
252             =head1 VERSION
253              
254             version 2.1.1706216
255              
256             =head1 SYNOPSIS
257              
258             This is a wrapper for the Bio::MLST::Check module allowing MLST checking against several databases.
259              
260             The Bio::MLST::Check options to output a concatenated fasta file of allele matches or to output a
261             phylip alignment file are not supported as the loci for sequence typing will vary between species.
262             Including these options will give an error message requesting that the user refine their search.
263              
264             use Bio::MLST::CheckMultipleSpecies;
265            
266             my @fasta_files = ('isolate_one.fa', 'isolate_two.fa');
267             my @species_list = ('Clostridium diff', 'Streptococcus');
268            
269             my $mlst = Bio::MLST::CheckMultipleSpecies->new( species => \@species_list,
270             raw_input_fasta_files => \@fasta_files,
271             spreadsheet_basename => $spreadsheet_basename,
272             output_directory => $output_directory,
273             base_directory => $base_directory,
274             makeblastdb_exec => $makeblastdb_exec,
275             blastn_exec => $blastn_exec,
276             parallel_processes => $parallel_processes,
277             verbose => 0,);
278             $multiple_species->create_result_files;
279              
280             =head1 METHODS
281              
282             =head2 create_result_files
283              
284             Creates a spreadsheet of results.
285              
286             =head1 SEE ALSO
287              
288             =over 4
289              
290             =item *
291              
292             L<Bio::MLST::Check>
293              
294             =back
295              
296             =head1 AUTHOR
297              
298             Andrew J. Page <ap13@sanger.ac.uk>
299              
300             =head1 COPYRIGHT AND LICENSE
301              
302             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
303              
304             This is free software, licensed under:
305              
306             The GNU General Public License, Version 3, June 2007
307              
308             =cut