File Coverage

blib/lib/Algorithm/SetSimilarity/Join/MPJoin.pm
Criterion Covered Total %
statement 67 69 97.1
branch 26 34 76.4
condition 7 9 77.7
subroutine 6 6 100.0
pod 0 3 0.0
total 106 121 87.6


line stmt bran cond sub pod time code
1             package Algorithm::SetSimilarity::Join::MPJoin;
2              
3 1     1   24618 use 5.008005;
  1         4  
  1         34  
4 1     1   5 use strict;
  1         1  
  1         30  
5 1     1   10 use warnings;
  1         2  
  1         671  
6              
7             our $VERSION = "0.0.0_03";
8              
9             sub new {
10 6     6 0 18505 my ($class, $param) = @_;
11 6         13 my %hash = ();
12 6 50       23 $hash{"is_sorted"} = $param->{is_sorted} if (exists $param->{is_sorted});
13 6         33 bless \%hash, $class;
14             }
15              
16             sub check_joinable {
17 13     13 0 1904 my ($self, $datum) = @_;
18 13         26 my $is_joinable = 0;
19 13 100       46 $is_joinable = 1 if (ref $datum eq "Algorithm::SetSimilarity::Join::Datum");
20 13         36 return $is_joinable;
21             }
22              
23             # only Jaccard coefficient yet.
24             sub join {
25 11     11 0 13506 my ($self, $datum, $threshold) = @_;
26 11         35 my @result = ();
27 11 50       30 if ($self->check_joinable($datum)) {
28 11 50 33     73 $datum->sort() unless ((exists $self->{"is_sorted"}) && ($self->{"is_sorted"}));
29 11         34 my $set_num = $datum->get_num();
30 11         33 for (my $p = 0; $p < $set_num; $p++) {
31 31         99 my $p_set = $datum->get($p);
32 31         45 my $s1 = $#$p_set + 1;
33              
34             # $datum is sorted. Therefore $max_att is needless.
35             #my $maxpref = int ($s1 / $threshold);
36 31         91 for (my $c = $p + 1; $c < $set_num; $c++) {
37 39         111 my $c_set = $datum->get($c);
38 39         60 my $s2 = $#$c_set + 1;
39 39 50       102 next if (($s1 * $threshold) > $s2); # minsize filtering
40              
41             # $datum is sorted. Therefore $max_att is needless.
42             # my $max_att = $maxpref; # adopt maxpref size
43             # $max_att = $s2 if ($s2 < $max_att);
44              
45 39         72 my $min_overlap = int(($threshold / (1 + $threshold)) * ($s1 + $s2));
46 39         54 my $alpha = $s2 - ($min_overlap + 1) + 1;
47 39         39 my $match_num = 0;
48 39         55 my ($att1, $att2) = (0, 0);
49              
50 39   100     148 while (($att2 < $alpha) && ($att1 < $s1)) {
51 327         334 my $judge = -1;
52 327 100       707 if ($datum->{data_type} eq "number") {
53 39         52 $judge = ($p_set->[$att1] <=> $c_set->[$att2]);
54             }
55             else {
56 288         449 $judge = ($p_set->[$att1] cmp $c_set->[$att2]);
57             }
58              
59 327 100       562 if ($judge == -1) {
    100          
60 183         180 $att1++;
61             } elsif ($judge == 1) {
62 76         106 $att2++;
63             } else {
64 68         69 $match_num++;
65 68         55 $att1++;
66 68         66 $att2++;
67             }
68              
69 327         375 my $min = ($s2 - $att2);
70 327 100       629 $min = ($s1 - $att1) if ($min > ($s1 - $att1));
71 327 100       1484 if (($match_num) + $min < $min_overlap) {
72 13         15 $match_num = 0;
73 13         16 last;
74             }
75             }
76 39 100       118 next unless ($match_num >= 1);
77 20   100     67 while (($att1 < $s1) && ($att2 < $s2)) {
78 29         35 my $judge = -1;
79 29 100       58 if ($datum->{data_type} eq "number") {
80 5         9 $judge = ($p_set->[$att1] <=> $c_set->[$att2]);
81             }
82             else {
83 24         44 $judge = ($p_set->[$att1] cmp $c_set->[$att2]);
84             }
85              
86 29 100       66 if ($judge == -1) {
    50          
87 6 50       14 last if ($match_num + ($s1 - $att1) < $min_overlap);
88 6         22 $att1++;
89             } elsif ($judge == 1) {
90 0 0       0 last if ($match_num + ($s2 - $att2) < $min_overlap);
91 0         0 $att2++;
92             } else {
93 23         27 $match_num++;
94 23         19 $att1++;
95 23         82 $att2++;
96             }
97             }
98 20 100       46 last unless ($match_num >= $min_overlap + 1);
99 18         24 my $diff_num = ($s1 - $match_num) + ($s2 - $match_num);
100 18         31 my $score = $match_num / ($match_num + $diff_num);
101 18         41 my @pair = ($p, $c, $score);
102 18         80 push @result, \@pair;
103             }
104             }
105             }
106 11         40 return \@result;
107             }
108              
109             1;
110              
111             __END__