File Coverage

lib/Bio/MLST/Blast/BlastN.pm
Criterion Covered Total %
statement 12 117 10.2
branch 0 22 0.0
condition 0 6 0.0
subroutine 4 16 25.0
pod n/a
total 16 161 9.9


line stmt bran cond sub pod time code
1             package Bio::MLST::Blast::BlastN;
2             # ABSTRACT: Wrapper around NCBI BlastN
3             $Bio::MLST::Blast::BlastN::VERSION = '2.1.1706216';
4              
5 12     12   569 use Moose;
  12         13  
  12         62  
6 12     12   50007 use Bio::MLST::Types;
  12         20  
  12         247  
7 12     12   43 use List::Util qw(reduce max min);
  12         12  
  12         11195  
8              
9             # input variables
10             has 'blast_database' => ( is => 'ro', isa => 'Str', required => 1 );
11             has 'query_file' => ( is => 'ro', isa => 'Str', required => 1 );
12             has 'word_sizes' => ( is => 'ro', isa => 'HashRef', required => 1 );
13             has 'exec' => ( is => 'ro', isa => 'Bio::MLST::Executable', default => 'blastn' );
14             has 'perc_identity' => ( is => 'ro', isa => 'Int', default => 0 );
15              
16             # Generated
17             has 'top_hit' => ( is => 'ro', isa => 'Maybe[HashRef]', lazy => 1, builder => '_build_top_hit' );
18              
19             sub _build_hit
20             {
21 0     0     my($self, $line) = @_;
22 0           chomp($line);
23 0           my @row = split(/\t/,$line);
24 0           my ($start, $end) = ($row[8], $row[9]);
25 0 0         ($start, $end, my $reverse) = $start <= $end ? ($start, $end, 0) : ($end, $start, 1);
26             return {
27 0           'allele_name' => $row[0],
28             'source_name' => $row[1],
29             'percentage_identity' => $row[2],
30             'sample_alignment_length' => $row[3],
31             'matches' => $row[12],
32             'source_start' => $start,
33             'source_end' => $end,
34             'reverse' => $reverse,
35             };
36             }
37              
38             sub _build_hits
39             {
40 0     0     my ($self, $blast_output_fh) = @_;
41 0           my @hits;
42 0           while(<$blast_output_fh>)
43             {
44 0           push(@hits, $self->_build_hit($_));
45             }
46 0           return \@hits;
47             }
48              
49             sub _filter_by_alignment_length
50             {
51             ###
52             # For each allele there is a minimum length of sequence it must be aligned
53             # against before it can be considered a match.
54             ###
55 0     0     my ($self, $hits, $word_sizes) = @_;
56 0           my @long_hits = grep { $_->{'sample_alignment_length'} >= $word_sizes->{$_->{'allele_name'}} } @$hits;
  0            
57 0           return \@long_hits;
58             }
59              
60             sub _filter_best_hits
61             {
62 0     0     my($self, $hits, $tollerance) = @_;
63 0 0         $tollerance = defined($tollerance) ? $tollerance : 2.0;
64 0           my @percentages = map { $_->{'percentage_identity'} } @$hits;
  0            
65 0           my $top_percentage = max @percentages;
66 0           my @top_hits = grep { $_->{'percentage_identity'} >= $top_percentage - $tollerance } @$hits;
  0            
67 0           return \@top_hits;
68             }
69              
70             sub _group_overlapping_hits
71             {
72             ###
73             # Hits can overlap, this groups hits which overlap and returns a reference to
74             # an array of references to these groups.
75             ###
76 0     0     my($self, $hits) = @_;
77 0           my @bins = ();
78 0           foreach my $hit (@$hits)
79             {
80 0           my $found_a_bin = 0;
81 0           foreach my $bin (@bins)
82             {
83             # check if hit is in bin
84 0 0 0       if (($hit->{'source_start'} >= $bin->{'start'}) and ($hit->{'source_end'} <= $bin->{'end'}))
    0 0        
85             {
86 0           push(@{$bin->{'hits'}}, $hit);
  0            
87 0           $found_a_bin = 1;
88 0           last;
89             }
90             # check if bin is in hit
91             elsif (($hit->{'source_start'} <= $bin->{'start'}) and ($hit->{'source_end'} >= $bin->{'end'}))
92             {
93 0           push(@{$bin->{'hits'}}, $hit);
  0            
94 0           $bin->{'start'} = $hit->{'source_start'};
95 0           $bin->{'end'} = $hit->{'source_end'};
96 0           $found_a_bin = 1;
97 0           last;
98             }
99             }
100             # If we've not found a bin for this hit, make a new one
101 0 0         if (!$found_a_bin)
102             {
103             my $new_bin = {
104             'start' => $hit->{'source_start'},
105 0           'end' => $hit->{'source_end'},
106             'hits' => [$hit]
107             };
108 0           push(@bins, $new_bin);
109             }
110             }
111 0           return \@bins;
112             }
113              
114             sub _merge_similar_bins
115             {
116             ###
117             # Some alleles differ from others due to indels at their beginning or end,
118             # this merges the bins if they have a lot of overlap
119             ###
120 0     0     my ($self, $bins_ref) = @_;
121 0           my @bins = sort { $a->{'start'} <=> $b->{'start'} } @$bins_ref;
  0            
122 0           my $previous_bin = shift @bins;
123 0           my @combined_bins = $previous_bin;
124 0           my $bin;
125 0           foreach $bin (@bins) {
126             # Check if there is any overlap between the new_bin and the previous_bin
127 0           my $overlap = max (0, ($previous_bin->{'end'} - $bin->{'start'}));
128 0           my $length = min(($bin->{'end'} - $bin->{'start'}), ($previous_bin->{'end'} - $previous_bin->{'start'})) + 1;
129 0           my $overlap_prop = $overlap / $length;
130 0 0         if ($overlap_prop > 0.9) {
131 0           $previous_bin->{'end'} = $bin->{'end'};
132 0           push(@{$previous_bin->{'hits'}}, @{$bin->{'hits'}});
  0            
  0            
133             } else {
134 0           push( @combined_bins, $bin);
135 0           $previous_bin = $bin;
136             }
137             }
138 0           return \@combined_bins;
139             }
140              
141             sub _bins_to_groups
142             {
143 0     0     my($self, $bins) = @_;
144 0           my @groups = map { $_->{hits} } @$bins;
  0            
145 0           return \@groups;
146             }
147              
148             sub _best_hit_in_group
149             {
150             ###
151             # The best hit has the greatest number of matching bases. If two hits have
152             # the same number of matching bases, the one with the greater
153             # percentage identity is selected.
154             ###
155 0     0     my($self, $hits) = @_;
156 0           my @lengths = map { $_->{'matches'} } @$hits;
  0            
157 0           my $max_length = max @lengths;
158 0           my @longest_hits = grep { $_->{'matches'} == $max_length } @$hits;
  0            
159              
160 0 0   0     my $best_hit = reduce { $a->{'percentage_identity'} > $b->{'percentage_identity'} ? $a : $b } @longest_hits;
  0            
161 0           return $best_hit;
162             }
163              
164             sub _blastn_cmd
165             {
166 0     0     my($self) = @_;
167 0           my $word_size = int(100/(100 - $self->perc_identity ));
168 0 0         $word_size = 11 if($word_size < 11);
169 0           my $outfmt = "\"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident\""; # standard format + n. identical base matches
170            
171 0           join(' ',($self->exec, '-task blastn', '-query', $self->query_file, '-db', $self->blast_database, '-outfmt', $outfmt, '-word_size', $word_size , '-perc_identity', $self->perc_identity ));
172             }
173              
174             sub _build_top_hit
175             {
176 0     0     my($self) = @_;
177 0           my $top_hit = {};
178 0           my @contaminants = ();
179 0           open(my $copy_stderr_fh, ">&STDERR"); open(STDERR, '>/dev/null'); # Redirect STDERR
  0            
180 0           open( my $blast_output_fh, '-|',$self->_blastn_cmd);
181 0           close(STDERR); open(STDERR, ">&", $copy_stderr_fh); # Restore STDERR
  0            
182              
183             # Find all of the best non-overlapping matches
184 0           my $hits = $self->_build_hits($blast_output_fh);
185 0           $hits = $self->_filter_by_alignment_length($hits, $self->word_sizes);
186 0           my $best_hits = $self->_filter_best_hits($hits);
187 0           my $bins = $self->_group_overlapping_hits($best_hits);
188 0           $bins = $self->_merge_similar_bins($bins);
189 0           my $groups = $self->_bins_to_groups($bins);
190              
191             # Find the best match
192 0           my @best_in_groups = map { $self->_best_hit_in_group($_) } @$groups;
  0            
193 0 0   0     $top_hit = reduce { $a->{'percentage_identity'} > $b->{'percentage_identity'} ? $a : $b } @best_in_groups;
  0            
194              
195 0 0         if (defined $top_hit)
196             {
197 0           $top_hit->{'percentage_identity'} = int($top_hit->{'percentage_identity'});
198 0           delete $top_hit->{'sample_alignment_length'};
199 0           delete $top_hit->{'matches'};
200             }
201             else {
202 0           $top_hit = {};
203             }
204 0 0         if ( scalar @best_in_groups > 1 )
205             {
206 0           $top_hit->{contamination} = \@best_in_groups;
207             }
208            
209 0           return $top_hit;
210             }
211              
212 12     12   60 no Moose;
  12         11  
  12         68  
213             __PACKAGE__->meta->make_immutable;
214             1;
215              
216             __END__
217              
218             =pod
219              
220             =encoding UTF-8
221              
222             =head1 NAME
223              
224             Bio::MLST::Blast::BlastN - Wrapper around NCBI BlastN
225              
226             =head1 VERSION
227              
228             version 2.1.1706216
229              
230             =head1 SYNOPSIS
231              
232             Wrapper around NCBI BlastN. Run NCBI blast and find the top hit.
233              
234             use Bio::MLST::Blast::BlastN;
235            
236             my $blast_database= Bio::MLST::Blast::BlastN->new(
237             blast_database => 'output_contigs',
238             query_file => 'alleles/adk.tfa',
239             word_size => 500,
240             exec => 'blastn'
241             );
242             $blast_database->top_hit();
243              
244             =head1 METHODS
245              
246             =head2 top_hit
247              
248             Returns a hash containing details about the top blast result.
249              
250             The attributes returned in the hash are:
251             allele_name
252             percentage_identity
253             source_name
254             source_start
255             source_end
256             reverse
257             contamination
258              
259             =head1 SEE ALSO
260              
261             =over 4
262              
263             =item *
264              
265             L<Bio::MLST::Blast::Database>
266              
267             =back
268              
269             =head1 AUTHOR
270              
271             Andrew J. Page <ap13@sanger.ac.uk>
272              
273             =head1 COPYRIGHT AND LICENSE
274              
275             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
276              
277             This is free software, licensed under:
278              
279             The GNU General Public License, Version 3, June 2007
280              
281             =cut