File Coverage

blib/lib/String/Cluster/Hobohm.pm

Criterion	Covered	Total	%
statement	39	39	100.0
branch	7	8	87.5
condition	2	2	100.0
subroutine	9	9	100.0
pod	1	1	100.0
total	58	59	98.3

line	stmt	bran	cond	sub	pod	time	code
1							package String::Cluster::Hobohm;
2							$String::Cluster::Hobohm::VERSION = '0.141020';
3							# ABSTRACT: Cluster strings using the Hobohm algorithm
4
5	2			2		38628	use strict;
	2					6
	2					2227
6	2			2		26	use warnings;
	2					6
	2					90
7
8	2			2		2263	use Mouse;
	2					83905
	2					14
9	2			2		2560	use String::Cluster::Hobohm::Types 'Quotient';
	2					9
	2					14
10	2			2		569	use Carp 'croak';
	2					5
	2					142
11	2			2		1683	use Text::LevenshteinXS;
	2					15416
	2					840
12
13							# accept a list of strings or a closure that returns, stepwise, the
14							# strings (this prevents loading everything into memory in the case that
15							# the dataset is too big)
16
17							# Return, for now, a structure with the indices
18							# {
19							# index1 => [ index2, index3, index4 ]
20							# ...
21							# }
22
23							has similarity => ( is => 'ro', default => 0.62, isa => Quotient );
24
25							sub cluster {
26	3			3	1	100	my ($self, $sequences) = @_;
27	3	100				25	defined $sequences or croak "Need sequences as argument";
28
29	2					4	my @clusters;
30
31	2					8	foreach my $sequence (@$sequences) {
32
33	6					22	my $cluster_id = $self->_is_similar( $sequence, \@clusters );
34
35	6	100				16	if ( defined $cluster_id ) {
36	2					4	push @{ $clusters[ $cluster_id ] }, \$sequence;
	2					8
37							}
38							else {
39	4					14	push @clusters, [ \$sequence ]
40							}
41							}
42
43	2					11	return \@clusters;
44							}
45
46							sub _is_similar {
47	6			6		12	my ($self, $sequence, $clusters ) = @_;
48
49	6					17	foreach my $i (0 .. $#$clusters) {
50	4					14	my $similarity = $self->_similarity(\$sequence, $clusters->[$i][0]);
51
52	4	100				23	return $i if $similarity >= $self->similarity;
53							}
54
55	4					11	return;
56							}
57
58							sub _similarity {
59	4			4		12	my $self = shift;
60
61	4					9	my @seqs = map { $$_ } @_;
	8					19
62
63	4					18	my $distance = Text::LevenshteinXS::distance(@seqs);
64	4	50				16	defined $distance or croak "unable to compute distance";
65
66	4		100			24	return 1 - $distance / (length( $seqs[0] ) \|\| 1);
67							}
68
69							__PACKAGE__->meta->make_immutable;
70
71							__END__