File Coverage

blib/lib/Algorithm/SetSimilarity/WeightedJaccard.pm

Criterion	Covered	Total	%
statement	153	176	86.9
branch	61	82	74.3
condition	29	48	60.4
subroutine	13	14	92.8
pod	0	8	0.0
total	256	328	78.0

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::SetSimilarity::WeightedJaccard;
2
3	1			1		35392	use 5.008005;
	1					6
	1					210
4	1			1		10	use strict;
	1					2
	1					44
5	1			1		21	use warnings;
	1					3
	1					67
6
7							our $VERSION = "0.0.0_03";
8
9	1			1		6	use Scalar::Util;
	1					1
	1					2756
10
11							sub new {
12	11			11	0	23540	my ($class, $param) = @_;
13	11					22	my %hash = ();
14	11	50				44	$hash{"is_sorted"} = $param->{is_sorted} if (exists $param->{is_sorted});
15	11	100				35	$hash{"data_type"} = $param->{data_type} if (exists $param->{data_type});
16	11					50	bless \%hash, $class;
17							}
18
19							sub _swap_set_ascending_order {
20	2			2		632	my ($self, $set1, $set2) = @_;
21	2	100				7	if (scalar(keys(%$set1)) > scalar(keys(%$set2))) {
22	1					3	my $tmp_ref = $set1;
23	1					3	$set1 = $set2;
24	1					3	$set2 = $tmp_ref;
25							}
26	2	50				8	return wantarray ? ($set1, $set2) : [$set1, $set2];
27							}
28
29							sub _swap_set_descending_order {
30	76			76		748	my ($self, $set1, $set2) = @_;
31	76	100				223	if (scalar(keys(%$set1)) < scalar(keys(%$set2))) {
32	1					3	my $tmp_ref = $set1;
33	1					22	$set1 = $set2;
34	1					3	$set2 = $tmp_ref;
35							}
36	76	50				254	return wantarray ? ($set1, $set2) : [$set1, $set2];
37							}
38
39							sub estimate_data_type {
40	7			7	0	2781	my ($self, $set1, $set2) = @_;
41	7					11	my $is_estimate = 0;
42	7					13	my ($type1, $type2) = ("string","string");
43
44	7					23	foreach my $key1 (keys %$set1) {
45	7	100				37	if (Scalar::Util::looks_like_number($key1)) {
46	2					3	$type1 = "number";
47							}
48	7					11	last;
49							}
50	7					23	foreach my $key2 (keys %$set2) {
51	6	100				19	if (Scalar::Util::looks_like_number($key2)) {
52	2					5	$type2 = "number";
53							}
54	6					8	last;
55							}
56
57	7	100				20	if ($type1 eq $type2) {
58	5					12	$self->{data_type} = $type1;
59	5					8	$is_estimate = 1;
60							}
61	7					18	return $is_estimate;
62							}
63
64							sub get_key_list_from_hash {
65	0			0	0	0	my ($self, $set) = @_;
66	0					0	my $s = scalar(keys(%$set));
67	0					0	my $list;
68	0	0				0	if ($self->{data_type} eq "number") {
69	0	0				0	if ($s > 1) {
70	0					0	@{$list} = sort {$set->{$a} <=> $set->{$b}} (keys %$set);
	0					0
	0					0
71							} else {
72	0					0	@{$list} = keys %$list;
	0					0
73							}
74							} else {
75	0	0				0	if ($s > 1) {
76	0					0	@{$list} = sort {$set->{$a} cmp $set->{$b}} (keys %$set);
	0					0
	0					0
77							} else {
78	0					0	@{$list} = keys %$set;
	0					0
79							}
80							}
81	0					0	return $list;
82							}
83
84							sub get_squared_norm {
85	150			150	0	726	my ($self, $vec) = @_;
86	150					156	my $squared_norm = 0;
87	150					340	foreach my $key (keys %$vec) {
88	513					877	$squared_norm += $vec->{$key} * $vec->{$key};
89							}
90	150					321	return $squared_norm;
91							}
92
93							sub get_similarity {
94	13			13	0	4814	my ($self, $set1, $set2, $threshold) = @_;
95	13					21	my $score = -1.0;
96	13	100	33			165	if ((ref $set1 eq "HASH") && (ref $set2 eq "HASH") && (%$set1) && (%$set2)) {
			66
			100
97	10	50	33			29	if ((defined $threshold) && ($threshold > 0.0)) {
98	0					0	$score = $self->filt_by_threshold($set1, $set2, $threshold);
99							} else{
100	10					20	($set1, $set2) = $self->_swap_set_descending_order($set1, $set2);
101	10					16	my $s1 = scalar(keys(%$set1));
102	10					13	my $s2 = scalar(keys(%$set2));
103	10	100				29	my $is_estimate = $self->estimate_data_type($set1, $set2) unless (exists $self->{data_type});
104	10	50	66			48	if (($is_estimate) \|\| (exists $self->{data_type})) {
105	10					17	my $s_norm1 = $self->get_squared_norm($set1);
106	10					21	my $s_norm2 = $self->get_squared_norm($set2);
107	10					13	my $cum_score = 0;
108	10					19	foreach my $key1 (keys %$set1) {
109	45	100				102	$cum_score += ($set1->{$key1} * $set2->{$key1}) if (exists $set2->{$key1});
110							}
111	10					26	$score = $cum_score / ($s_norm1 + $s_norm2 - $cum_score);
112							}
113							}
114							}
115	13					36	return $score;
116							}
117
118							sub get_cumulative_weight {
119	128			128	0	149	my ($self, $set) = @_;
120	128					126	my $cum_weight = 0;
121	128					255	foreach my $val (values %$set) {
122	420					598	$cum_weight += $val;
123							}
124	128					252	return $cum_weight;
125							}
126
127							sub make_pair_from_hash {
128	129			129	0	172	my ($self, $set) = @_;
129	129					175	my @pair = ();
130	129					270	foreach my $key (keys %$set) {
131	422					780	my $entry = [$key, $set->{$key}];
132	422					730	push @pair, $entry;
133							}
134	129	100				388	$self->estimate_data_type($set) unless (exists $self->{data_type});
135	129	100				261	if ($self->{data_type} eq "number") {
136	8					24	@pair = sort { $a->[0] <=> $b->[0] } @pair;
	190					256
137							} else {
138	121					279	@pair = sort { $a->[0] cmp $b->[0] } @pair;
	321					523
139							}
140	129					260	return \@pair;
141							}
142
143							sub filt_by_threshold {
144	65			65	0	33823	my ($self, $set1, $set2, $threshold) = @_;
145	65					90	my $score = -1.0;
146	65	100	33			1142	if ((ref $set1 eq "HASH") && (ref $set2 eq "HASH") &&
			33
			33
			66
			66
147							(%$set1) && (%$set2) &&
148							($threshold > 0.0) && ($threshold <= 1.0)) {
149
150	64					772	for(my $r = 0; $r <= 0; $r++) {
151	64					210	($set1, $set2) = $self->_swap_set_descending_order($set1, $set2);
152	64					97	my $s1 = scalar(keys(%$set1));
153	64					83	my $s2 = scalar(keys(%$set2));
154	64					121	my $cum_w1 = $self->get_cumulative_weight($set1);
155	64					126	my $cum_w2 = $self->get_cumulative_weight($set2);
156
157	64	50				176	last unless (($s1 * $threshold) <= $cum_w2); #size filtering
158
159	64	100				154	my $is_estimate = $self->estimate_data_type($set1, $set2) unless (exists $self->{data_type});
160	64	50	66			320	if (($is_estimate) \|\| (exists $self->{data_type})) {
161	64					131	my $s_norm1 = $self->get_squared_norm($set1);
162	64					127	my $s_norm2 = $self->get_squared_norm($set2);
163	64					143	my $datum1 = $self->make_pair_from_hash($set1);
164	64					134	my $datum2 = $self->make_pair_from_hash($set2);
165
166	64					98	my $cum_score = 0;
167
168	64					157	my $min_overlap = int(($threshold / (1 + $threshold)) * ($cum_w1 + $cum_w2));
169	64					100	my $alpha = $cum_w2 - ($min_overlap + 1) + 1;
170	64					59	my $match_num = 0;
171	64					73	my ($att1, $att2) = (0, 0);
172	64					118	my ($w1, $w2) = ($datum1->[$att1]->[1], $datum2->[$att2]->[1]);
173	64					76	my ($c1, $c2) = ($w1, $w2);
174
175	64		100			441	while (($c2 < $w2 + $alpha) && ($att1 < $s1) && ($att2 < $s2)) {
			66
176	161					258	my $judge = -1;
177	161	100				310	if ($self->{data_type} eq "number") {
178	31					56	$judge = ($datum1->[$att1]->[0] <=> $datum2->[$att2]->[0]);
179							} else {
180	130					249	$judge = ($datum1->[$att1]->[0] cmp $datum2->[$att2]->[0]);
181							}
182
183	161	100				299	if ($judge == -1) {
		100
184	42					44	$att1++;
185	42	100				89	if ($att1 < $s1) {
186	41					59	$w1 = $datum1->[$att1]->[1];
187	41					263	$c1 += $w1;
188							}
189							} elsif ($judge == 1) {
190	38					33	$att2++;
191	38	50				76	if ($att2 < $s2) {
192	38					45	$w2 = $datum2->[$att2]->[1];
193	38					181	$c2 += $w2;
194							}
195							} else {
196	81					106	$match_num += $w1 * $w2;
197	81					85	$att1++;
198	81					70	$att2++;
199	81	100	66			374	if (($att1 < $s1) && ($att2 < $s2)) {
200	61					116	($w1, $w2) = ($datum1->[$att1]->[1], $datum2->[$att2]->[1]);
201	61					69	$c1 += $w1;
202	61					297	$c2 += $w2;
203							}
204							}
205							}
206	64					87	my $min = ($cum_w2 - $c2);
207	64	100				129	$min = ($cum_w1 - $c1) if ($min > ($cum_w1 - $c1));
208
209	64	100				132	if (($match_num) + $min < $min_overlap) {
210	15					15	$match_num = 0;
211	15					77	last;
212							}
213
214	49	50				95	last unless ($match_num >= 1);
215
216	49		66			169	while (($att1 < $s1) && ($att2 < $s2)) {
217	43					48	my $judge = -1;
218	43	100				79	if ($self->{data_type} eq "number") {
219	9					15	$judge = ($datum1->[$att1]->[0] <=> $datum2->[$att2]->[0]);
220							} else {
221	34					63	$judge = ($datum1->[$att1]->[0] cmp $datum2->[$att2]->[0]);
222							}
223
224	43	100				86	if ($judge == -1) {
		50
225	7	50				17	last if ($match_num + ($cum_w1 - $c1) < $min_overlap);
226	7					8	$att1++;
227	7	100				18	if ($att1 < $s1) {
228	6					9	$w1 = $datum1->[$att1]->[1];
229	6					28	$c1 += $w1;
230							}
231							} elsif ($judge == 1) {
232	0	0				0	last if ($match_num + ($cum_w2 - $c2) < $min_overlap);
233	0					0	$att2++;
234	0	0				0	if ($att2 < $s2) {
235	0					0	$w2 = $datum2->[$att2]->[1];
236	0					0	$c2 += $w2;
237							}
238							} else {
239	36					39	$match_num += $w1 * $w2;
240	36					27	$att1++;
241	36					35	$att2++;
242	36	100	66			136	if (($att1 < $s1) && ($att2 < $s2)) {
243	9					15	($w1, $w2) = ($datum1->[$att1]->[1], $datum2->[$att2]->[1]);
244	9					11	$c1 += $w1;
245	9					36	$c2 += $w2;
246							}
247							}
248							}
249	49	100				105	last unless ($match_num >= $min_overlap + 1);
250
251	48					274	$score = $match_num / ($s_norm1 + $s_norm2 - $match_num );
252							}
253							}
254							}
255	65					188	return $score;
256							}
257
258							1;
259
260							__END__