File Coverage

lib/Bio/MLST/CompareAlleles.pm

Criterion	Covered	Total	%
statement	46	138	33.3
branch	0	34	0.0
condition	0	3	0.0
subroutine	14	23	60.8
pod	2	3	66.6
total	62	201	30.8

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MLST::CompareAlleles;
2							$Bio::MLST::CompareAlleles::VERSION = '2.1.1630910';
3							# ABSTRACT: Get a list of matching alleles between the sequence and database
4
5
6	12			12		300864	use Moose;
	12					843239
	12					106
7	12			12		79313	use File::Basename;
	12					25
	12					1433
8	12			12		10378	use Bio::SeqIO;
	12					735479
	12					484
9	12			12		7273	use Bio::Perl;
	12					2086632
	12					1732
10	12			12		6120	use Bio::MLST::Blast::Database;
	12					43
	12					621
11	12			12		6852	use Bio::MLST::Blast::BlastN;
	12					38
	12					528
12	12			12		117	use Bio::MLST::Types;
	12					17
	12					476
13	12			12		5841	use Bio::MLST::SequenceType;
	12					47
	12					24611
14
15							has 'sequence_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
16							has 'allele_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
17							has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', default => 'makeblastdb' );
18							has 'blastn_exec' => ( is => 'ro', isa => 'Str', default => 'blastn' );
19
20							has '_sequence_handle' => ( is => 'ro', isa => 'Bio::SeqIO::fasta', lazy => 1, builder => '_build__sequence_handle');
21							has '_blast_db_location_obj' => ( is => 'ro', isa => 'Bio::MLST::Blast::Database', lazy => 1, builder => '_build__blast_db_location_obj');
22							has '_blast_db_location' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__blast_db_location');
23
24							has 'matching_sequences' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_matching_sequences' );
25							has 'non_matching_sequences' => ( is => 'rw', isa => 'HashRef', default => sub {{}});
26							has 'contamination' => ( is => 'rw', isa => 'Bool', default => 0);
27							has 'contamination_alleles' => ( is => 'rw', isa => 'Maybe[Str]' );
28							has 'contamination_sequence_names' => ( is => 'rw', isa => 'Maybe[ArrayRef]' );
29							has 'new_st' => ( is => 'rw', isa => 'Bool', default => 0);
30							has '_absent_loci' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__absent_loci' );
31							has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
32
33							sub _build__blast_db_location
34							{
35	6			6		22	my ($self) = @_;
36	6					354	return $self->_blast_db_location_obj->location();
37							}
38
39							sub _build__blast_db_location_obj
40							{
41	6			6		22	my ($self) = @_;
42	6					286	return Bio::MLST::Blast::Database->new(fasta_file => $self->sequence_filename, exec => $self->makeblastdb_exec);
43							}
44
45
46							sub _build__sequence_handle
47							{
48	0			0		0	my ($self) = @_;
49	0					0	return Bio::SeqIO->new( -file => $self->sequence_filename , -format => 'Fasta');
50							}
51
52							sub sequence_filename_root
53							{
54	0			0	0	0	my ($self) = @_;
55	0					0	$self->_get_base_filename($self->sequence_filename);
56							}
57
58							sub found_sequence_names
59							{
60	2			2	1	13	my ($self) = @_;
61	2					5	my @sequence_names = sort(keys %{$self->matching_sequences});
	2					90
62	0					0	return \@sequence_names;
63							}
64
65							sub found_non_matching_sequence_names
66							{
67	0			0	1	0	my ($self) = @_;
68	0					0	my @sequence_names = sort(keys %{$self->non_matching_sequences});
	0					0
69	0					0	return \@sequence_names;
70							}
71
72
73							sub _word_sizes_for_given_allele_file
74							{
75	6			6		29	my ($self,$filename) = @_;
76	6					24	my %seq_lens;
77	6					171	my $seqio = Bio::SeqIO->new( -file => $filename , -format => 'Fasta');
78	6					19351	while( my $seq = $seqio->next_seq() ){
79	37					9739	$seq_lens{$seq->primary_id} = $seq->length;
80							}
81	6					558	return \%seq_lens;
82							}
83
84							sub _get_word_size_from_blast_hit {
85	0			0		0	my ( $self, $word_sizes, $blast_hit, $allele_filename ) = @_;
86
87							# return len of top blast hit allele, otherwise return len of first seq in allele file
88	0					0	my ($word_size, $first_seq);
89	0	0				0	if( defined $blast_hit->{allele_name} ){
90	0					0	$word_size = $word_sizes->{$blast_hit->{allele_name}};
91							}
92							else{
93	0					0	my $seqio = Bio::SeqIO->new( -file => $allele_filename, -format => 'Fasta' );
94	0					0	$word_size = $seqio->next_seq->length;
95							}
96
97	0					0	return $word_size;
98							}
99
100							sub _build_matching_sequences
101							{
102	6			6		29	my ($self) = @_;
103	6					37	my %matching_sequence_names;
104							my %non_matching_sequence_names;
105	0					0	my %missing_locus_names;
106
107	6					27	for my $allele_filename (@{$self->allele_filenames})
	6					335
108							{
109	6					106	my $word_sizes = $self->_word_sizes_for_given_allele_file($allele_filename);
110							# TODO: You'll never get matches or contamination noted if there is a SNP
111							# near the end of the allele. This is because we filter all matches which
112							# are shorter than the length of the allele in the profiles. This could
113							# mean that we're missing contamination of falsly noting matches against
114							# truncated alleles.
115	6					1215	my $blast_results = Bio::MLST::Blast::BlastN->new(
116							blast_database => $self->_blast_db_location,
117							query_file => $allele_filename,
118							word_sizes => $word_sizes,
119							exec => $self->blastn_exec
120							);
121	0						my %top_blast_hit = %{$blast_results->top_hit()};
	0
122
123							# possible missing locus
124	0	0					if(! %top_blast_hit)
125							{
126	0						my %absent_loci_type = %{$self->_absent_loci};
	0
127	0						my $allele = $self->_get_base_filename($allele_filename);
128	0	0					$missing_locus_names{$allele} = $absent_loci_type{$allele} if exists $absent_loci_type{$allele};
129							}
130
131	0						my $word_size = $self->_get_word_size_from_blast_hit($word_sizes, \%top_blast_hit, $allele_filename);
132
133							# unknown allele
134	0	0					if(! %top_blast_hit)
135							{
136	0						$non_matching_sequence_names{$self->_get_base_filename($allele_filename)} = $self->_pad_out_sequence("", $word_size);
137	0						next;
138							}
139
140							# more than 1 allele has a good match
141	0	0					if(defined($top_blast_hit{contamination}))
142							{
143	0						$self->contamination(1);
144	0						my $contaminants = $top_blast_hit{contamination};
145	0						my @contaminant_names = map { $_->{allele_name} } @$contaminants;
	0
146							# Add tilde to matches which are not 100%
147	0	0					my @contaminant_names_with_tilde = map { $_->{percentage_identity} == 100 ? $_->{allele_name} : "$_->{allele_name}~" } @$contaminants;
	0
148	0						my $contamination_alleles = join( ',', sort @contaminant_names_with_tilde );
149	0						$self->contamination_alleles( $contamination_alleles );
150	0						$self->_translate_contamination_names_into_sequence_types(\@contaminant_names, $top_blast_hit{allele_name});
151							}
152
153	0						$top_blast_hit{allele_name} =~ s![-_]+!-!g;
154
155	0	0					if($top_blast_hit{percentage_identity} == 100 )
156							{
157	0						$matching_sequence_names{$top_blast_hit{allele_name}} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
158							}
159							else
160							{
161							# If the top hit isn't 100%, add a tilde to the allele_name
162	0						my $name_with_tilde = "$top_blast_hit{allele_name}~";
163	0						$non_matching_sequence_names{$name_with_tilde} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
164	0						$self->new_st(1);
165							}
166							}
167
168							# deal with missing loci
169	0	0	0				if(%matching_sequence_names && %missing_locus_names)
170							{
171	0						for my $allele (keys %missing_locus_names)
172							{
173	0						delete $non_matching_sequence_names{$allele};
174	0						$matching_sequence_names{$allele.'-'.$missing_locus_names{$allele}} = '';
175							}
176							}
177
178							# set new ST flag
179	0	0					$self->new_st(1) if %non_matching_sequence_names;
180
181	0						$self->non_matching_sequences(\%non_matching_sequence_names);
182	0						return \%matching_sequence_names;
183							}
184
185							sub _translate_contamination_names_into_sequence_types
186							{
187	0			0			my ($self, $contamination_names, $main_allele_name) = @_;
188	0						my @contamination_sequence_types;
189
190	0						for my $allele_number (@{ $contamination_names})
	0
191							{
192	0	0					next if($main_allele_name eq $allele_number);
193	0						my $st = Bio::MLST::SequenceType->new(
194							profiles_filename => $self->profiles_filename,
195							matching_names => [$allele_number],
196							non_matching_names => []
197							);
198
199	0	0					if(defined($st->sequence_type()) )
200							{
201	0						push(@contamination_sequence_types, $st->sequence_type());
202							}
203							}
204
205	0						$self->contamination_sequence_names(\@contamination_sequence_types);
206							}
207
208
209							sub _get_blast_hit_sequence
210							{
211	0			0			my ($self, $contig_name, $start, $end, $word_size, $reverse_complement) = @_;
212	0						seek($self->_sequence_handle->_fh, 0,0);
213	0						while( my $input_sequence_obj = $self->_sequence_handle->next_seq() )
214							{
215	0	0					next if( $input_sequence_obj->id ne $contig_name);
216	0						my $sequence = $input_sequence_obj->subseq($start, $end);
217	0	0					if($reverse_complement)
218							{
219	0						my $reverse_sequence = revcom( $sequence );
220	0						$sequence = $reverse_sequence->{seq};
221							}
222
223	0						$sequence = $self->_pad_out_sequence($sequence, $word_size);
224	0						return $sequence;
225							}
226
227	0						return $self->_pad_out_sequence("", $word_size);
228							}
229
230							sub _get_base_filename
231							{
232	0			0			my($self, $filename) = @_;
233	0						my $filename_root = fileparse($filename, qr/\.[^.]*$/);
234	0						return $filename_root;
235							}
236
237							sub _pad_out_sequence
238							{
239	0			0			my($self, $input_sequence, $length_of_main_sequence) = @_;
240	0	0					return $input_sequence if(length($input_sequence) == $length_of_main_sequence);
241	0	0					if(length($input_sequence) > $length_of_main_sequence)
242							{
243	0						$input_sequence = substr($input_sequence,0,$length_of_main_sequence);
244							}
245	0	0					$input_sequence = "" if($input_sequence eq 'U');
246
247	0						for(my $i=length($input_sequence); $i < $length_of_main_sequence; $i++)
248							{
249	0						$input_sequence .= "N";
250							}
251	0						return $input_sequence;
252							}
253
254							sub _build__absent_loci
255							{
256	0			0			my( $self ) = @_;
257	0						my %absent_loci = ();
258
259	0						for my $allele_file (@{$self->allele_filenames})
	0
260							{
261	0						my $seq_io = Bio::SeqIO->new( -file => $allele_file , -format => 'Fasta');
262	0						while( my $seq = $seq_io->next_seq() )
263							{
264	0	0					if($seq->length == 0)
265							{
266	0						my($allele,$type) = split(/[-_]+/,$seq->id(),2);
267	0						$absent_loci{$allele} = $type;
268							}
269							}
270							}
271
272	0						return \%absent_loci;
273							}
274
275	12			12		141	no Moose;
	12					36
	12					123
276							__PACKAGE__->meta->make_immutable;
277							1;
278
279							__END__
280
281							=pod
282
283							=encoding UTF-8
284
285							=head1 NAME
286
287							Bio::MLST::CompareAlleles - Get a list of matching alleles between the sequence and database
288
289							=head1 VERSION
290
291							version 2.1.1630910
292
293							=head1 SYNOPSIS
294
295							Take in an assembly file in Fasta format, and a list of allele files (in multifasta format) and return a list of the alleles and IDs.
296
297							use Bio::MLST::CompareAlleles;
298
299							my $compare_alleles = Bio::MLST::CompareAlleles->new(
300
301							sequence_filename => 'contigs.fa',
302							allele_filenames => ['abc.tfa','efg.tfa']
303							);
304							$compare_alleles->found_sequence_names;
305							$compare_alleles->found_non_matching_sequence_names
306							$compare_alleles->matching_sequences;
307							$compare_alleles->non_matching_sequences
308
309							=head1 METHODS
310
311							=head2 found_sequence_names
312
313							Return a list of the sequence names which match.
314
315							=head2 found_non_matching_sequence_names
316
317							Return a list of the sequence names which dont match.
318
319							=head2 matching_sequences
320
321							Return a Hash containing the sequnces that match.
322
323							=head2 non_matching_sequences
324
325							Return a Hash containing the sequnces that dont match.
326
327							=head2 contamination
328
329							Flag which is set if more than one 100% match is found for a single locus.
330
331							=head2 new_st
332
333							Flag which is set if the results contain a novel combination of sequences or a new sequence.
334
335							=head1 SEE ALSO
336
337							=over 4
338
339							=item *
340
341							L<Bio::MLST::Check>
342
343							=back
344
345							=head1 AUTHOR
346
347							Andrew J. Page <ap13@sanger.ac.uk>
348
349							=head1 COPYRIGHT AND LICENSE
350
351							This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
352
353							This is free software, licensed under:
354
355							The GNU General Public License, Version 3, June 2007
356
357							=cut