File Coverage

lib/Bio/MLST/SequenceType.pm

Criterion	Covered	Total	%
statement	92	96	95.8
branch	28	34	82.3
condition	2	2	100.0
subroutine	13	14	92.8
pod	0	1	0.0
total	135	147	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MLST::SequenceType;
2							# ABSTRACT: Take in a list of matched alleles and look up the sequence type from the profile.
3							$Bio::MLST::SequenceType::VERSION = '2.1.1630910';
4
5	13			13		177473	use Data::Dumper;
	13					21
	13					1140
6	13			13		7524	use Text::CSV;
	13					121506
	13					91
7	13			13		627	use List::Util qw(min reduce);
	13					23
	13					1277
8
9	13			13		749	use Moose;
	13					438095
	13					123
10	13			13		81866	use Bio::MLST::Types;
	13					35
	13					490
11	13			13		6477	use Bio::MLST::FilterAlleles qw(is_metadata);
	13					31
	13					14855
12
13							has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
14							has 'matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15							has 'non_matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
16
17							has 'allele_to_number' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_allele_to_number' );
18							has '_profiles' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__profiles' );
19							has 'sequence_type' => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, builder => '_build_sequence_type' );
20
21							has 'nearest_sequence_type' => ( is => 'rw', isa => 'Maybe[Str]');
22							has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
23
24							sub sequence_type_or_nearest
25							{
26	0			0	0	0	my($self) = @_;
27	0	0				0	return $self->sequence_type if(defined($self->sequence_type));
28							# If there isn't a perfect match, add a tilde to the sequence type
29	0	0				0	return $self->nearest_sequence_type."~" if(defined($self->nearest_sequence_type));
30	0					0	return $self->nearest_sequence_type;
31							}
32
33							sub _build__profiles
34							{
35	8			8		10	my($self) = @_;
36	8	50				198	open(my $fh, $self->profiles_filename) or die "Couldnt open profile: ".$self->profiles_filename."\n";
37	8					61	my $csv_in = Text::CSV->new({sep_char=>"\t"});
38	8					581	my $profile = $csv_in->getline_all($fh);
39
40	8					16113	return $profile;
41							}
42
43							sub _build_allele_to_number
44							{
45	8			8		10	my($self) = @_;
46	8					8	my %allele_to_number;
47
48	8					9	for my $sequence_name (@{$self->non_matching_names})
	8					169
49							{
50	6					17	my @sequence_name_details = split(/[-_]/,$sequence_name);
51	6					9	my $num = pop @sequence_name_details;
52	6					10	my $name = join( "-", @sequence_name_details );
53	6					12	$allele_to_number{$name} = $num;
54							}
55
56	8					9	for my $sequence_name (@{$self->matching_names})
	8					161
57							{
58	19					45	my @sequence_name_details = split(/[-_]/,$sequence_name);
59	19					18	my $num = pop @sequence_name_details;
60	19					23	my $name = join( "-", @sequence_name_details );
61	19					40	$allele_to_number{$name} = $num;
62							}
63
64							#print "ALLELE TO NUMBER: ";
65							#print Dumper \%allele_to_number;
66
67	8					165	return \%allele_to_number;
68							}
69
70							sub _allele_numbers_similar
71							{
72	123			123		137	my($self, $number_a, $number_b) = @_;
73	123	100				315	if ($number_a eq $number_b) {
		100
		100
74	2					8	return 1;
75							} elsif ("$number_a~" eq $number_b) {
76	1					4	return 1;
77							} elsif ("$number_b~" eq $number_a) {
78	18					32	return 1;
79							} else {
80	102					312	return 0;
81							}
82							}
83
84							sub _build_sequence_type
85							{
86	8			8		9	my($self) = @_;
87
88	8					8	my @header_row = @{$self->_profiles->[0]};
	8					175
89
90	8					23	for(my $i=0; $i< @header_row; $i++)
91							{
92	50	100				104	next if(is_metadata($header_row[$i]) == 1);
93	28					36	$header_row[$i] =~ s!_!!g;
94	28					55	$header_row[$i] =~ s!-!!g;
95							}
96
97	8					18	my $num_loci = 0;
98	8					9	my %sequence_type_match_freq;
99							my %sequence_type_part_match_freq;
100
101	8					11	for(my $row = 1; $row < @{$self->_profiles}; $row++)
	62					1188
102							{
103	54					38	my @current_row = @{$self->_profiles->[$row]};
	54					1015
104	54					110	for(my $col = 0; $col< @current_row; $col++)
105							{
106	299	50				425	next unless(defined($header_row[$col]));
107	299	100				436	next if(is_metadata($header_row[$col]) == 1);
108	210	100				306	$num_loci++ if($row == 1);
109
110	210					4290	my $allele_number = $self->allele_to_number->{$header_row[$col]};
111	210	100				331	next if(!defined($allele_number) );
112	192	100				373	if ($allele_number eq $current_row[$col]) {
		100
113	75					217	$sequence_type_match_freq{$current_row[0]}++;
114							} elsif ($self->_allele_numbers_similar($allele_number, $current_row[$col])) {
115	17					47	$sequence_type_part_match_freq{$current_row[0]}++;
116							}
117							}
118							}
119
120	8					24	return $self->_get_sequence_type_or_set_nearest_match(\%sequence_type_match_freq,
121							\%sequence_type_part_match_freq,
122							$num_loci);
123							}
124
125							sub _get_sequence_type_or_set_nearest_match
126							{
127	8			8		12	my($self,$st_match_f, $st_part_match_f, $num_loci) = @_;
128	8					9	my %st_match_freq = %{$st_match_f};
	8					35
129
130							# Combine the frequencies of the perfect matches (%st_match_freq) and the
131							# partial matches ($st_part_match_f)
132	8					9	my %st_nearest_match_freq = %{$st_match_f};
	8					20
133	8					10	while (my($sequence_type, $freq) = each(%{$st_part_match_f})) {
	20					59
134	12		100			31	my $nearest_match_frequency = ( $st_nearest_match_freq{$sequence_type} \|\| 0 );
135	12					15	$st_nearest_match_freq{$sequence_type} = $nearest_match_frequency + $freq;
136							}
137
138							# if $num_loci is in $st_match_freq vals, return that, otherwise return lowest numbered sequence type
139	8					23	while (my($sequence_type, $freq) = each(%st_match_freq)) {
140	27	100				68	if ($freq == $num_loci) {
141	2					49	return $sequence_type;
142							}
143							}
144	6					5	my $best_sequence_type;
145	6	100				141	if ( $self->report_lowest_st ){
146
147	2					19	$best_sequence_type = min (keys %st_nearest_match_freq);
148							}
149							else {
150							# This reduce takes pairs of sequence types and compares them. It looks
151							# for the ST with the highest number of matching alleles; if two matches
152							# are just as good, it picks the ST with the smaller number.
153							$best_sequence_type = reduce {
154	12	100		12		26	if ( $st_nearest_match_freq{$a} > $st_nearest_match_freq{$b} ) {
		100
155	6					6	$a;
156							} elsif ( $st_nearest_match_freq{$a} < $st_nearest_match_freq{$b} ) {
157	3					4	$b;
158							} else {
159	3					13	min ($a, $b);
160							}
161	4					40	} keys %st_nearest_match_freq;
162							}
163
164	6					181	$self->nearest_sequence_type($best_sequence_type);
165	6					149	return undef;
166							}
167
168	13			13		96	no Moose;
	13					19
	13					127
169							__PACKAGE__->meta->make_immutable;
170							1;
171
172							__END__
173
174							=pod
175
176							=encoding UTF-8
177
178							=head1 NAME
179
180							Bio::MLST::SequenceType - Take in a list of matched alleles and look up the sequence type from the profile.
181
182							=head1 VERSION
183
184							version 2.1.1630910
185
186							=head1 SYNOPSIS
187
188							Take in a list of matched alleles and look up the sequence type from the profile.
189
190							use Bio::MLST::SequenceType;
191							my $st = Bio::MLST::SequenceType->new(
192							profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
193							matching_names => ['adk-2','purA-3','recA-1'],
194							non_matching_names => []
195							);
196							$st->sequence_type();
197
198							=head1 METHODS
199
200							=head2 allele_to_number
201
202							Maps the allele name to the corresponding locus sequence number.
203
204							=head2 sequence_type
205
206							Returns the sequence type (an integer).
207
208							=head2 nearest_sequence_type
209
210							Returns the nearest matching sequence type if there is no exact match, randomly chosen if there is more than 1 with equal identity.
211
212							=head1 AUTHOR
213
214							Andrew J. Page <ap13@sanger.ac.uk>
215
216							=head1 COPYRIGHT AND LICENSE
217
218							This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
219
220							This is free software, licensed under:
221
222							The GNU General Public License, Version 3, June 2007
223
224							=cut