line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::Search::Similarity; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Plucene::Search::Similarity - the score of a query |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 DESCRIPTION |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
The score of a query for a given document. |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 METHODS |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=cut |
14
|
|
|
|
|
|
|
|
15
|
18
|
|
|
18
|
|
1071
|
use strict; |
|
18
|
|
|
|
|
48
|
|
|
18
|
|
|
|
|
633
|
|
16
|
18
|
|
|
18
|
|
99
|
use warnings; |
|
18
|
|
|
|
|
36
|
|
|
18
|
|
|
|
|
558
|
|
17
|
|
|
|
|
|
|
|
18
|
18
|
|
|
18
|
|
117
|
use Carp qw(confess); |
|
18
|
|
|
|
|
31
|
|
|
18
|
|
|
|
|
1058
|
|
19
|
18
|
|
|
18
|
|
114
|
use POSIX qw(ceil); |
|
18
|
|
|
|
|
29
|
|
|
18
|
|
|
|
|
200
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head2 norm |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my $norm = $sim->norm($num_term); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=cut |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Normalisation factor for a field holding $num_terms terms.
#
# Returns 0 when $num_terms is undefined or numerically zero; otherwise
# returns 255 / sqrt($num_terms) rounded up to the next integer.
sub norm {
    my $self      = shift;
    my $num_terms = shift;

    # Guard first: a zero count would make the division below fatal.
    if (!defined($num_terms) || $num_terms == 0) {
        return 0;
    }

    my $scaled = 255 / sqrt($num_terms);
    return ceil($scaled);
}
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 byte_norm |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $byte_norm = $sim->byte_norm($byte); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=cut |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Converts a stored norm character back into a fraction.
#
# Takes the code point of the first character of $byte (via ord) and
# divides it by 255, so chr(0) yields 0 and chr(255) yields 1.
sub byte_norm {
    my $self = shift;
    my ($byte) = @_;

    my $code = ord $byte;
    return $code / 255;
}
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head2 tf |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
Computes a score factor based on a term or phrase's frequency in a document. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=cut |
49
|
|
|
|
|
|
|
|
50
|
6420
|
|
|
6420
|
1
|
8406
|
# Term-frequency score factor: the square root of the given frequency.
sub tf {
    my ($self, $freq) = @_;
    return sqrt($freq);
}
|
6420
|
|
|
|
|
18857
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=head2 idf |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
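Computes a score factor as C<1 + log(max_doc / (1 + doc_freq))>, using the C<doc_freq> and C<max_doc> values reported by the supplied C<$docs> object. |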
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=cut |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Inverse-document-frequency score factor.
#
# Arguments (after the invocant):
#   1. the value handed to $docs->doc_freq() to look up its frequency
#   2. an object providing doc_freq() and max_doc() methods
#
# Returns 1 + log(max_doc / (1 + doc_freq)).  Because Perl's log() is
# fatal for 0, this dies when max_doc() reports 0.
sub idf {
    my $self = shift;
    my ($term, $docs) = @_;

    # Both lookups happen in one list assignment, doc_freq first.
    my ($doc_freq, $max_doc) = ($docs->doc_freq($term), $docs->max_doc);

    my $ratio = $max_doc / ($doc_freq + 1);
    return log($ratio) + 1;
}
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head2 coord |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Computes a score factor based on the fraction of all query terms that |
67
|
|
|
|
|
|
|
a document contains. |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=cut |
70
|
|
|
|
|
|
|
|
71
|
60
|
|
|
60
|
1
|
412
|
# Coordination score factor: the fraction $overlap / $max_overlap.
#
# Arguments (after the invocant):
#   $overlap     - numerator of the fraction
#   $max_overlap - denominator of the fraction
#
# The locals deliberately avoid the names $a and $b: lexical copies of
# those would shadow the package variables that sort() relies on.
# Dies with Perl's "Illegal division by zero" if $max_overlap is 0.
sub coord {
    my ($self, $overlap, $max_overlap) = @_;
    return $overlap / $max_overlap;
}
|
60
|
|
|
|
|
261
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
1; |