line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# -*- mode: cperl; -*- |
2
|
|
|
|
|
|
|
package Set::Jaccard::SimilarityCoefficient; |
3
|
2
|
|
|
2
|
|
236783
|
use warnings; |
|
2
|
|
|
|
|
12
|
|
|
2
|
|
|
|
|
53
|
|
4
|
2
|
|
|
2
|
|
6
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
29
|
|
5
|
2
|
|
|
2
|
|
7
|
use utf8; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
12
|
|
6
|
2
|
|
|
2
|
|
32
|
use autodie; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
11
|
|
7
|
2
|
|
|
|
|
23
|
use Exception::Class qw( |
8
|
|
|
|
|
|
|
BadArgumentException |
9
|
|
|
|
|
|
|
DivideByZeroException |
10
|
2
|
|
|
2
|
|
8583
|
); |
|
2
|
|
|
|
|
10
|
|
11
|
2
|
|
|
2
|
|
1827
|
use ReadonlyX; |
|
2
|
|
|
|
|
2377
|
|
|
2
|
|
|
|
|
88
|
|
12
|
2
|
|
|
2
|
|
801
|
use Set::Scalar; |
|
2
|
|
|
|
|
17011
|
|
|
2
|
|
|
|
|
631
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '1.6.1'; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
## no critic( Subroutines::ProhibitCallsToUnexportedSubs ) |
17
|
|
|
|
|
|
|
# ------ Error messages |
18
|
|
|
|
|
|
|
Readonly::Scalar my $BAD_SET_A => |
19
|
|
|
|
|
|
|
'must have either ArrayRef or Set::Scalar value for set A'; |
20
|
|
|
|
|
|
|
Readonly::Scalar my $BAD_SET_B => |
21
|
|
|
|
|
|
|
'must have either ArrayRef or Set::Scalar value for set B'; |
22
|
|
|
|
|
|
|
Readonly::Scalar my $DIVIDE_BY_ZERO => |
23
|
|
|
|
|
|
|
'Cannot calculate when size(Union(A B)) == 0'; |
24
|
|
|
|
|
|
|
## use critic |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=function |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
Calculate the Jaccard Similarity Coefficient. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=cut |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# calc( $set_a_arg, $set_b_arg )
#
# Calculate the Jaccard Similarity Coefficient of two sets:
#
#     size( intersection(A, B) ) / size( union(A, B) )
#
# Arguments:
#   $set_a_arg - set A, either an array reference or a Set::Scalar object
#                (objects of Set::Scalar subclasses are accepted too).
#   $set_b_arg - set B, same accepted types as set A.
#
# Returns:
#   The size of the intersection divided by the size of the union.
#
# Throws:
#   BadArgumentException  - an argument is undefined, or is neither an
#                           array reference nor a Set::Scalar object
#                           (set A is checked before set B).
#   DivideByZeroException - the union of the two sets has no members,
#                           i.e. both sets are empty.
sub calc {
    my ( $set_a_arg, $set_b_arg ) = @_;

    my $set_a = _as_set( $set_a_arg, $BAD_SET_A );
    my $set_b = _as_set( $set_b_arg, $BAD_SET_B );

    my $intersection = $set_a->intersection($set_b);
    my $union        = $set_a->union($set_b);

    ## no critic( Modules::RequireExplicitInclusion )
    # Guard the division below: an empty union means both sets are empty.
    if ( $union->size <= 0 ) {
        DivideByZeroException->throw($DIVIDE_BY_ZERO);
    }
    ## use critic

    return $intersection->size / $union->size;
}

# _as_set( $arg, $error_message )
#
# Private helper: turn one calc() argument into a Set::Scalar object.
# A Set::Scalar object (or an object of a subclass) is cloned; an array
# reference has its elements loaded into a new Set::Scalar. Anything else,
# including undef, throws BadArgumentException with $error_message.
sub _as_set {
    my ( $arg, $error_message ) = @_;

    # Scalar::Util is a core module; load it here so this block does not
    # depend on another module having loaded it already.
    require Scalar::Util;

    ## no critic( Modules::RequireExplicitInclusion, Subroutines::ProhibitCallsToUnexportedSubs )
    # blessed() + isa() rather than comparing ref() to the class name, so
    # that subclasses of Set::Scalar are recognised as sets as well.
    if ( Scalar::Util::blessed($arg) && $arg->isa('Set::Scalar') ) {
        return $arg->clone();
    }

    if ( ref $arg eq 'ARRAY' ) {
        return Set::Scalar->new( @{$arg} );
    }

    BadArgumentException->throw($error_message);
    ## use critic

    return;    # not reached: throw() does not return
}
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# |
76
|
|
|
|
|
|
|
# This file is part of Set-Jaccard-SimilarityCoefficient |
77
|
|
|
|
|
|
|
# |
78
|
|
|
|
|
|
|
# This software is copyright (c) 2018 by Mark Leighton Fisher. |
79
|
|
|
|
|
|
|
# |
80
|
|
|
|
|
|
|
# This is free software; you can redistribute it and/or modify it under |
81
|
|
|
|
|
|
|
# the same terms as the Perl 5 programming language system itself. |
82
|
|
|
|
|
|
|
# |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
1; |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=encoding utf8 |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head1 NAME |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Set::Jaccard::SimilarityCoefficient - Calculate the Jaccard Similarity Coefficient of 2 sets |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=head1 VERSION |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# VERSION |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 SYNOPSIS |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
$res = Set::Jaccard::SimilarityCoefficient::calc(\@set_a, \@set_b); |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
OR |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
my $a = Set::Scalar->new(@set_a); |
103
|
|
|
|
|
|
|
my $b = Set::Scalar->new(@set_b); |
104
|
|
|
|
|
|
|
$res = Set::Jaccard::SimilarityCoefficient::calc($a, $b); |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head1 DESCRIPTION |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Set::Jaccard::SimilarityCoefficient lets you calculate the Jaccard Similarity |
109
|
|
|
|
|
|
|
Coefficient for either arrayrefs or Set::Scalar objects. |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Briefly, the Jaccard Similarity Coefficient is a simple measure of how similar |
112
|
|
|
|
|
|
|
2 sets are. The calculation is (in pseudo-code): |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=over 4 |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
count(intersection(SET-A, SET-B)) / count(union(SET-A, SET-B)) |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=back |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
There is a Jaccard Similarity Coefficient routine already in CPAN, but it is |
121
|
|
|
|
|
|
|
specialized for use by Text::NSP. I wanted a generic routine that could be |
122
|
|
|
|
|
|
|
used by anyone so Set::Jaccard::SimilarityCoefficient was born. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
calc(A, B) calculates the Jaccard Similarity Coefficient for the arguments |
127
|
|
|
|
|
|
|
A and B. A and B can be either array references or Set::Scalar objects. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head1 DIAGNOSTICS |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
calc() throws BadArgumentException if A or B is undefined, or is neither a |
132
|
|
|
|
|
|
|
reference to an array nor a Set::Scalar object. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
calc() throws DivideByZeroException when the union of the two sets |
135
|
|
|
|
|
|
|
has 0 members. That happens only when both set A and set B are |
136
|
|
|
|
|
|
|
empty (for example, two empty array references), because empty |
137
|
|
|
|
|
|
|
sets are accepted as arguments. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
This module should work wherever Perl works. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Set::Scalar |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head1 INCOMPATIBILITIES |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
None that I know of. |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
There are no bugs that I know of. Given that this is non-trivial code, |
154
|
|
|
|
|
|
|
there will be bugs. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The types of arguments are limited to either array references or |
157
|
|
|
|
|
|
|
Set::Scalar objects. |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 AUTHOR |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Mark Leighton Fisher, |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Set::Jaccard::SimilarityCoefficient is licensed under the same terms |
166
|
|
|
|
|
|
|
as Perl itself. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=cut |