File Coverage

lib/Bio/MLST/Blast/BlastN.pm

Criterion	Covered	Total	%
statement	12	117	10.2
branch	0	22	0.0
condition	0	6	0.0
subroutine	4	16	25.0
pod			n/a
total	16	161	9.9

line	stmt	bran	cond	sub	time	code
1						package Bio::MLST::Blast::BlastN;
2						# ABSTRACT: Wrapper around NCBI BlastN
3						$Bio::MLST::Blast::BlastN::VERSION = '2.1.1630910';
4
5	13			13	1006	use Moose;
	13				32
	13				120
6	13			13	95555	use Bio::MLST::Types;
	13				27
	13				463
7	13			13	79	use List::Util qw(reduce max min);
	13				33
	13				20434
8
9						# input variables
10						has 'blast_database' => ( is => 'ro', isa => 'Str', required => 1 );
11						has 'query_file' => ( is => 'ro', isa => 'Str', required => 1 );
12						has 'word_sizes' => ( is => 'ro', isa => 'HashRef', required => 1 );
13						has 'exec' => ( is => 'ro', isa => 'Bio::MLST::Executable', default => 'blastn' );
14						has 'perc_identity' => ( is => 'ro', isa => 'Int', default => 0 );
15
16						# Generated
17						has 'top_hit' => ( is => 'ro', isa => 'Maybe[HashRef]', lazy => 1, builder => '_build_top_hit' );
18
19						sub _build_hit
20						{
21	0			0		my($self, $line) = @_;
22	0					chomp($line);
23	0					my @row = split(/\t/,$line);
24	0					my ($start, $end) = ($row[8], $row[9]);
25	0	0				($start, $end, my $reverse) = $start <= $end ? ($start, $end, 0) : ($end, $start, 1);
26						return {
27	0					'allele_name' => $row[0],
28						'source_name' => $row[1],
29						'percentage_identity' => $row[2],
30						'sample_alignment_length' => $row[3],
31						'matches' => $row[12],
32						'source_start' => $start,
33						'source_end' => $end,
34						'reverse' => $reverse,
35						};
36						}
37
38						sub _build_hits
39						{
40	0			0		my ($self, $blast_output_fh) = @_;
41	0					my @hits;
42	0					while(<$blast_output_fh>)
43						{
44	0					push(@hits, $self->_build_hit($_));
45						}
46	0					return \@hits;
47						}
48
49						sub _filter_by_alignment_length
50						{
51						###
52						# For each allele there is a minimum length of sequence it must be aligned
53						# against before it can be considered a match.
54						###
55	0			0		my ($self, $hits, $word_sizes) = @_;
56	0					my @long_hits = grep { $_->{'sample_alignment_length'} >= $word_sizes->{$_->{'allele_name'}} } @$hits;
	0
57	0					return \@long_hits;
58						}
59
60						sub _filter_best_hits
61						{
62	0			0		my($self, $hits, $tollerance) = @_;
63	0	0				$tollerance = defined($tollerance) ? $tollerance : 2.0;
64	0					my @percentages = map { $_->{'percentage_identity'} } @$hits;
	0
65	0					my $top_percentage = max @percentages;
66	0					my @top_hits = grep { $_->{'percentage_identity'} >= $top_percentage - $tollerance } @$hits;
	0
67	0					return \@top_hits;
68						}
69
70						sub _group_overlapping_hits
71						{
72						###
73						# Hits can overlap, this groups hits which overlap and returns a reference to
74						# an array of references to these groups.
75						###
76	0			0		my($self, $hits) = @_;
77	0					my @bins = ();
78	0					foreach my $hit (@$hits)
79						{
80	0					my $found_a_bin = 0;
81	0					foreach my $bin (@bins)
82						{
83						# check if hit is in bin
84	0	0	0			if (($hit->{'source_start'} >= $bin->{'start'}) and ($hit->{'source_end'} <= $bin->{'end'}))
		0	0
85						{
86	0					push(@{$bin->{'hits'}}, $hit);
	0
87	0					$found_a_bin = 1;
88	0					last;
89						}
90						# check if bin is in hit
91						elsif (($hit->{'source_start'} <= $bin->{'start'}) and ($hit->{'source_end'} >= $bin->{'end'}))
92						{
93	0					push(@{$bin->{'hits'}}, $hit);
	0
94	0					$bin->{'start'} = $hit->{'source_start'};
95	0					$bin->{'end'} = $hit->{'source_end'};
96	0					$found_a_bin = 1;
97	0					last;
98						}
99						}
100						# If we've not found a bin for this hit, make a new one
101	0	0				if (!$found_a_bin)
102						{
103						my $new_bin = {
104						'start' => $hit->{'source_start'},
105	0					'end' => $hit->{'source_end'},
106						'hits' => [$hit]
107						};
108	0					push(@bins, $new_bin);
109						}
110						}
111	0					return \@bins;
112						}
113
114						sub _merge_similar_bins
115						{
116						###
117						# Some alleles differ from others due to indels at their beginning or end,
118						# this merges the bins if they have a lot of overlap
119						###
120	0			0		my ($self, $bins_ref) = @_;
121	0					my @bins = sort { $a->{'start'} <=> $b->{'start'} } @$bins_ref;
	0
122	0					my $previous_bin = shift @bins;
123	0					my @combined_bins = $previous_bin;
124	0					my $bin;
125	0					foreach $bin (@bins) {
126						# Check if there is any overlap between the new_bin and the previous_bin
127	0					my $overlap = max (0, ($previous_bin->{'end'} - $bin->{'start'}));
128	0					my $length = min(($bin->{'end'} - $bin->{'start'}), ($previous_bin->{'end'} - $previous_bin->{'start'})) + 1;
129	0					my $overlap_prop = $overlap / $length;
130	0	0				if ($overlap_prop > 0.9) {
131	0					$previous_bin->{'end'} = $bin->{'end'};
132	0					push(@{$previous_bin->{'hits'}}, @{$bin->{'hits'}});
	0
	0
133						} else {
134	0					push( @combined_bins, $bin);
135	0					$previous_bin = $bin;
136						}
137						}
138	0					return \@combined_bins;
139						}
140
141						sub _bins_to_groups
142						{
143	0			0		my($self, $bins) = @_;
144	0					my @groups = map { $_->{hits} } @$bins;
	0
145	0					return \@groups;
146						}
147
148						sub _best_hit_in_group
149						{
150						###
151						# The best hit has the greatest number of matching bases. If two hits have
152						# the same number of matching bases, the one with the greater
153						# percentage identity is selected.
154						###
155	0			0		my($self, $hits) = @_;
156	0					my @lengths = map { $_->{'matches'} } @$hits;
	0
157	0					my $max_length = max @lengths;
158	0					my @longest_hits = grep { $_->{'matches'} == $max_length } @$hits;
	0
159
160	0	0		0		my $best_hit = reduce { $a->{'percentage_identity'} > $b->{'percentage_identity'} ? $a : $b } @longest_hits;
	0
161	0					return $best_hit;
162						}
163
164						sub _blastn_cmd
165						{
166	0			0		my($self) = @_;
167	0					my $word_size = int(100/(100 - $self->perc_identity ));
168	0	0				$word_size = 11 if($word_size < 11);
169	0					my $outfmt = "\"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident\""; # standard format + n. identical base matches
170
171	0					join(' ',($self->exec, '-task blastn', '-query', $self->query_file, '-db', $self->blast_database, '-outfmt', $outfmt, '-word_size', $word_size , '-perc_identity', $self->perc_identity ));
172						}
173
174						sub _build_top_hit
175						{
176	0			0		my($self) = @_;
177	0					my $top_hit = {};
178	0					my @contaminants = ();
179	0					open(my $copy_stderr_fh, ">&STDERR"); open(STDERR, '>/dev/null'); # Redirect STDERR
	0
180	0					open( my $blast_output_fh, '-\|',$self->_blastn_cmd);
181	0					close(STDERR); open(STDERR, ">&", $copy_stderr_fh); # Restore STDERR
	0
182
183						# Find all of the best non-overlapping matches
184	0					my $hits = $self->_build_hits($blast_output_fh);
185	0					$hits = $self->_filter_by_alignment_length($hits, $self->word_sizes);
186	0					my $best_hits = $self->_filter_best_hits($hits);
187	0					my $bins = $self->_group_overlapping_hits($best_hits);
188	0					$bins = $self->_merge_similar_bins($bins);
189	0					my $groups = $self->_bins_to_groups($bins);
190
191						# Find the best match
192	0					my @best_in_groups = map { $self->_best_hit_in_group($_) } @$groups;
	0
193	0	0		0		$top_hit = reduce { $a->{'percentage_identity'} > $b->{'percentage_identity'} ? $a : $b } @best_in_groups;
	0
194
195	0	0				if (defined $top_hit)
196						{
197	0					$top_hit->{'percentage_identity'} = int($top_hit->{'percentage_identity'});
198	0					delete $top_hit->{'sample_alignment_length'};
199	0					delete $top_hit->{'matches'};
200						}
201						else {
202	0					$top_hit = {};
203						}
204	0	0				if ( scalar @best_in_groups > 1 )
205						{
206	0					$top_hit->{contamination} = \@best_in_groups;
207						}
208
209	0					return $top_hit;
210						}
211
212	13			13	115	no Moose;
	13				21
	13				159
213						__PACKAGE__->meta->make_immutable;
214						1;
215
216						__END__
217
218						=pod
219
220						=encoding UTF-8
221
222						=head1 NAME
223
224						Bio::MLST::Blast::BlastN - Wrapper around NCBI BlastN
225
226						=head1 VERSION
227
228						version 2.1.1630910
229
230						=head1 SYNOPSIS
231
232						Wrapper around NCBI BlastN. Run NCBI blast and find the top hit.
233
234						use Bio::MLST::Blast::BlastN;
235
236						my $blast_database= Bio::MLST::Blast::BlastN->new(
237						blast_database => 'output_contigs',
238						query_file => 'alleles/adk.tfa',
239						word_size => 500,
240						exec => 'blastn'
241						);
242						$blast_database->top_hit();
243
244						=head1 METHODS
245
246						=head2 top_hit
247
248						Returns a hash containing details about the top blast result.
249
250						The attributes returned in the hash are:
251						allele_name
252						percentage_identity
253						source_name
254						source_start
255						source_end
256						reverse
257						contamination
258
259						=head1 SEE ALSO
260
261						=over 4
262
263						=item *
264
265						L<Bio::MLST::Blast::Database>
266
267						=back
268
269						=head1 AUTHOR
270
271						Andrew J. Page <ap13@sanger.ac.uk>
272
273						=head1 COPYRIGHT AND LICENSE
274
275						This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
276
277						This is free software, licensed under:
278
279						The GNU General Public License, Version 3, June 2007
280
281						=cut