File Coverage

blib/lib/Algorithm/SetSimilarity/Join/WMPJoin.pm
Criterion Covered Total %
statement 101 106 95.2
branch 43 54 79.6
condition 12 15 80.0
subroutine 8 8 100.0
pod 0 5 0.0
total 164 188 87.2


line stmt bran cond sub pod time code
1             package Algorithm::SetSimilarity::Join::WMPJoin;
2              
3 1     1   22451 use 5.008005;
  1         4  
  1         38  
4 1     1   5 use strict;
  1         2  
  1         39  
5 1     1   5 use warnings;
  1         3  
  1         1140  
6              
7             our $VERSION = "0.0.0_03";
8              
9             sub new {
10 6     6 0 16743 my ($class, $param) = @_;
11 6         15 my %hash = ();
12 6 50       22 $hash{"is_sorted"} = $param->{is_sorted} if (exists $param->{is_sorted});
13 6         33 bless \%hash, $class;
14             }
15              
16             sub check_joinable {
17 13     13 0 1827 my ($self, $datum) = @_;
18 13         18 my $is_joinable = 0;
19 13 100       39 $is_joinable = 1 if (ref $datum eq "Algorithm::SetSimilarity::Join::Datum");
20 13         37 return $is_joinable;
21             }
22              
23             sub get_cumulative_weight {
24 31     31 0 45 my ($self, $set) = @_;
25 31         33 my $cum_weight = 0;
26 31         67 for (my $i = 0; $i <= $#$set; $i++) {
27 283         637 $cum_weight += $set->[$i]->[1];
28             }
29 31         65 return $cum_weight;
30             }
31              
32             sub get_squared_norm {
33 31     31 0 42 my ($self, $set) = @_;
34 31         32 my $squared_norm = 0;
35 31         68 for (my $i = 0; $i <= $#$set; $i++) {
36 283         726 $squared_norm += $set->[$i]->[1] * $set->[$i]->[1];
37             }
38 31         61 return $squared_norm;
39             }
40              
41             # only Jaccard coefficient yet.
42             sub join {
43 11     11 0 13679 my ($self, $datum, $threshold) = @_;
44 11         18 my @result = ();
45 11 50       27 if ($self->check_joinable($datum)) {
46 11 50 33     67 $datum->sort() unless ((exists $self->{"is_sorted"}) && ($self->{"is_sorted"}));
47 11         28 my $set_num = $datum->get_num();
48 11         16 my @cum_weight = ();
49 11         16 my @squared_norm = ();
50              
51 11         28 for (my $p = 0; $p < $set_num; $p++) {
52 31         93 my $p_set = $datum->get($p);
53 31         42 my $s1 = $#$p_set + 1;
54 31 100       80 $cum_weight[$p] = $self->get_cumulative_weight($p_set) unless ($cum_weight[$p]);
55 31 100       72 $squared_norm[$p] = $self->get_squared_norm($p_set) unless ($squared_norm[$p]);
56              
57             # $datum is sorted. Therefore $max_att is needless.
58             #my $maxpref = int ($s1 / $threshold);
59 31         101 for (my $c = $p + 1; $c < $set_num; $c++) {
60 39         116 my $c_set = $datum->get($c);
61 39         57 my $s2 = $#$c_set + 1;
62 39 100       101 $cum_weight[$c] = $self->get_cumulative_weight($c_set) unless ($cum_weight[$c]);
63 39 100       106 $squared_norm[$c] = $self->get_squared_norm($c_set) unless ($squared_norm[$c]);
64              
65 39 50       106 next if (($s1 * $threshold) > $cum_weight[$c]); # minsize filtering
66              
67             # $datum is sorted. Therefore $max_att is needless.
68             # my $max_att = $maxpref; # adopt maxpref size
69             # $max_att = $s2 if ($s2 < $max_att);
70              
71 39         88 my $min_overlap = int(($threshold / (1 + $threshold)) * ($cum_weight[$p] + $cum_weight[$c]));
72 39         54 my $alpha = $cum_weight[$c] - ($min_overlap + 1) + 1;
73 39         41 my $match_num = 0;
74 39         44 my ($att1, $att2) = (0, 0);
75 39         67 my ($w1, $w2) = ($p_set->[$att1]->[1], $c_set->[$att2]->[1]);
76 39         44 my ($c1, $c2) = ($w1, $w2);
77              
78 39   100     168 while (($att2 < $alpha) && ($att1 < $s1)) {
79 309         325 my $judge = -1;
80 309 100       609 if ($datum->{data_type} eq "number") {
81 39         64 $judge = ($p_set->[$att1]->[0] <=> $c_set->[$att2]->[0]);
82             }
83             else {
84 270         515 $judge = ($p_set->[$att1]->[0] cmp $c_set->[$att2]->[0]);
85             }
86 309 100       512 if ($judge == -1) {
    100          
87 170         161 $att1++;
88 170 100       333 if ($att1 < $s1) {
89 165         191 $w1 = $p_set->[$att1]->[1];
90 165         182 $c1 += $w1;
91             }
92             } elsif ($judge == 1) {
93 71         76 $att2++;
94 71 50       142 if ($att2 < $s2) {
95 71         86 $w2 = $c_set->[$att2]->[1];
96 71         85 $c2 += $w2;
97             }
98             } else {
99 68         78 $match_num += $w1 * $w2;
100 68         62 $att1++;
101 68         56 $att2++;
102 68 100 66     271 if (($att1 < $s1) && ($att2 < $s2)) {
103 62         114 ($w1, $w2) = ($p_set->[$att1]->[1], $c_set->[$att2]->[1]);
104 62         69 $c1 += $w1;
105 62         74 $c2 += $w2;
106             }
107             }
108              
109 309         402 my $min = ($cum_weight[$c] - $c2);
110 309 100       635 $min = ($cum_weight[$p] - $c1) if ($min > ($cum_weight[$p] - $c1));
111 309 100       1503 if (($match_num) + $min < $min_overlap) {
112 19         40 $match_num = 0;
113 19         58 last;
114             }
115             }
116 39 100       137 next unless ($match_num >= 1);
117              
118 20   100     75 while (($att1 < $s1) && ($att2 < $s2)) {
119 29         33 my $judge = -1;
120 29 100       60 if ($datum->{data_type} eq "number") {
121 5         11 $judge = ($p_set->[$att1]->[0] <=> $c_set->[$att2]->[0]);
122             }
123             else {
124 24         46 $judge = ($p_set->[$att1]->[0] cmp $c_set->[$att2]->[0]);
125             }
126              
127 29 100       61 if ($judge == -1) {
    50          
128 6 50       16 last if ($match_num + ($cum_weight[$p] - $c1) < $min_overlap);
129 6         6 $att1++;
130 6 100       16 if ($att1 < $s1) {
131 4         6 $w1 = $p_set->[$att1]->[1];
132 4         18 $c1 += $w1;
133             }
134             } elsif ($judge == 1) {
135 0 0       0 last if ($match_num + ($cum_weight[$c] - $c2) < $min_overlap);
136 0         0 $att2++;
137 0 0       0 if ($att2 < $s2) {
138 0         0 $w2 = $c_set->[$att2]->[1];
139 0         0 $c2 += $w2;
140             }
141             } else {
142 23         31 $match_num += $w1 * $w2;
143 23         24 $att1++;
144 23         21 $att2++;
145 23 100 100     102 if (($att1 < $s1) && ($att2 < $s2)) {
146 16         31 ($w1, $w2) = ($p_set->[$att1]->[1], $c_set->[$att2]->[1]);
147 16         19 $c1 += $w1;
148 16         101 $c2 += $w2;
149             }
150             }
151             }
152 20 100       52 last unless ($match_num >= $min_overlap + 1);
153 18         33 my $score = $match_num / ($squared_norm[$p] + $squared_norm[$c] - $match_num );
154 18         46 my @pair = ($p, $c, $score);
155 18         81 push @result, \@pair;
156             }
157             }
158             }
159 11         44 return \@result;
160             }
161              
162             1;
163              
164             __END__