line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package String::Cluster::Hobohm; |
2
|
|
|
|
|
|
|
$String::Cluster::Hobohm::VERSION = '0.141020'; |
3
|
|
|
|
|
|
|
# ABSTRACT: Cluster strings using the Hobohm algorithm |
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
38628
|
use strict; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
2227
|
|
6
|
2
|
|
|
2
|
|
26
|
use warnings; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
90
|
|
7
|
|
|
|
|
|
|
|
8
|
2
|
|
|
2
|
|
2263
|
use Mouse; |
|
2
|
|
|
|
|
83905
|
|
|
2
|
|
|
|
|
14
|
|
9
|
2
|
|
|
2
|
|
2560
|
use String::Cluster::Hobohm::Types 'Quotient'; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
14
|
|
10
|
2
|
|
|
2
|
|
569
|
use Carp 'croak'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
142
|
|
11
|
2
|
|
|
2
|
|
1683
|
use Text::LevenshteinXS; |
|
2
|
|
|
|
|
15416
|
|
|
2
|
|
|
|
|
840
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# accept a list of strings or a closure that returns, stepwise, the |
14
|
|
|
|
|
|
|
# strings (this prevents loading everything into memory in the case that |
15
|
|
|
|
|
|
|
# the dataset is too big) |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Return, for now, a structure with the indices |
18
|
|
|
|
|
|
|
# { |
19
|
|
|
|
|
|
|
# index1 => [ index2, index3, index4 ] |
20
|
|
|
|
|
|
|
# ... |
21
|
|
|
|
|
|
|
# } |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Read-only threshold used when deciding cluster membership: _is_similar
# accepts a cluster when the computed score is >= this value.  Defaults to
# 0.62.  The Quotient constraint is imported from
# String::Cluster::Hobohm::Types -- presumably a number between 0 and 1;
# confirm against that module.
has similarity => (
    is      => 'ro',
    isa     => Quotient,
    default => 0.62,
);
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Group strings into clusters in a single pass over the input.
#
# Arguments:
#   $sequences - array reference of strings to cluster.
#
# Returns an array reference of clusters.  Each cluster is an array
# reference of SCALAR references; the references point at the elements of
# @$sequences themselves (the loop variable aliases them), so the strings are
# not copied into the result.  The first entry of a cluster is the string
# that founded it, and it is the one later strings are compared against.
#
# Croaks with "Need sequences as argument" when $sequences is undefined.
sub cluster {
    my ( $self, $sequences ) = @_;

    croak "Need sequences as argument" unless defined $sequences;

    my @groups;

    for my $string ( @{$sequences} ) {
        my $index = $self->_is_similar( $string, \@groups );

        if ( !defined $index ) {
            # No existing cluster is close enough: start a new one with
            # this string as its first member.
            push @groups, [ \$string ];
            next;
        }

        push @{ $groups[$index] }, \$string;
    }

    return \@groups;
}
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Locate the first cluster that $sequence may join.
#
# Arguments:
#   $sequence - the candidate string (received by value).
#   $clusters - array reference of clusters, each an array reference whose
#               first element is a SCALAR reference to the cluster's
#               first member.
#
# Clusters are examined in order; only the first member of each one is
# compared.  Returns the index of the first cluster whose score (as
# computed by _similarity) is >= $self->similarity, or nothing (bare
# return) when no cluster qualifies or there are no clusters yet.
sub _is_similar {
    my ( $self, $sequence, $clusters ) = @_;

    my $candidate = \$sequence;

    for my $index ( 0 .. $#{$clusters} ) {
        my $representative = $clusters->[$index][0];
        my $score = $self->_similarity( $candidate, $representative );

        if ( $score >= $self->similarity ) {
            return $index;
        }
    }

    return;
}
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Score how alike two strings are, based on their Levenshtein distance.
#
# Arguments (after the invocant): SCALAR references to the strings; they are
# dereferenced and handed to Text::LevenshteinXS::distance in the order
# given.
#
# Returns 1 - distance / length(first string), with a divisor of 1 when the
# first string has no length.  Because only the first string's length is
# used, the score depends on argument order, and it is negative whenever the
# distance exceeds that length.
#
# Croaks with "unable to compute distance" when the distance comes back
# undefined.
sub _similarity {
    my ( $self, @refs ) = @_;

    my @strings;
    push @strings, ${$_} for @refs;

    my $edits = Text::LevenshteinXS::distance(@strings);
    croak "unable to compute distance" unless defined $edits;

    # Guard against dividing by zero for an empty first string.
    my $divisor = length $strings[0];
    $divisor = 1 if !$divisor;

    return 1 - $edits / $divisor;
}
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
__END__ |