| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
2
|
|
|
2
|
|
1045
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
86
|
|
|
2
|
2
|
|
|
2
|
|
12
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
97
|
|
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package KSx::Search::RegexpTermQuery; |
|
5
|
2
|
|
|
2
|
|
22
|
use base qw( KinoSearch::Search::Query ); |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
2196
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.05'; |
|
8
|
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
8421
|
use Hash::Util::FieldHash::Compat 'fieldhashes'; |
|
|
2
|
|
|
|
|
6406
|
|
|
|
2
|
|
|
|
|
16
|
|
|
10
|
|
|
|
|
|
|
fieldhashes \my( %re, %prefix, %field ); |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Constructor for the regexp term query.
#
# Named arguments:
#   regexp - the pattern terms are matched against; either a qr// object or
#            a string holding regexp source.
#   field  - the name of the field whose terms are searched.
# All remaining arguments are handed to the superclass constructor.
#
# The pattern and field are stored in this file's %re and %field fieldhashes.
# When the pattern starts with an anchor followed by literal text, that text
# is stored in %prefix so the compiler can seek straight to it in the lexicon
# and stop as soon as terms no longer begin with it.
#
# Returns the new query object.
sub new {
    my ( $package, %args ) = @_;
    my $re    = delete $args{regexp};
    my $field = delete $args{field};

    my $self = $package->SUPER::new(%args);

    $re{$self}    = $re;
    $field{$self} = $field;

    # Only record a prefix when one can be extracted safely; the compiler
    # treats a missing prefix as "scan the whole lexicon".
    my $prefix = _literal_prefix($re);
    $prefix{$self} = $prefix if defined $prefix;

    return $self;
}

# Return the literal text every string matched by $re must start with, or
# undef (empty list in list context) when no such prefix can be determined
# safely.  This is purely an optimisation, so every doubtful case gives up
# rather than risk a prefix that would exclude matching terms.
sub _literal_prefix {
    my ($re) = @_;
    return unless defined $re;

    my $source = "$re";

    # An alternation can make the leading literal optional ("^foo.*|bar"
    # also matches "bar").  Telling top-level alternation from nested
    # alternation would need a real parser, so be conservative.
    return if index( $source, '|' ) >= 0;

    return unless $source =~
        m<^
          (?: # wrapper produced by stringifying a qr//, in both the old
              # "(?x-ism:" and the 5.14+ "(?^x:" forms; /i is not allowed
              \(\? \^? ([a-hj-z]*) (?:-[a-z]*)? :
          )?
          (\\[GA]|\^)                # anchor
          ([^#\$()*+.?[\]\\^{}|]+)   # literal run (no metachars or comments)
          ([*+?{\#])?                # quantifier or comment right after it
        >x;

    my ( $mod, $anchor, $prefix, $follow ) = ( $1 || '', $2, $3, $4 );

    # Under /m a caret also matches after embedded newlines, so it does not
    # pin the literal to the start of the term.
    return if $anchor eq '^' and $mod =~ /m/;

    # Under /x whitespace in the pattern is not part of the literal.
    $prefix =~ s/\s+//g if $mod =~ /x/;

    # A quantifier applies to the last literal character ("^foo*" matches
    # "fo"), so that character cannot be relied on.  A "#" may start a /x
    # comment that hides a quantifier; dropping one character is always safe.
    chop $prefix if defined $follow;

    return unless length $prefix;
    return $prefix;
}
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
#sub extract_terms { |
|
45
|
|
|
|
|
|
|
# my $self = shift; |
|
46
|
|
|
|
|
|
|
# return @{ $self->{terms} }; |
|
47
|
|
|
|
|
|
|
#} |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Build the compiler for this query.  The query object itself is passed as
# the "parent" argument; every other argument received is forwarded to the
# compiler's constructor unchanged.
sub make_compiler {
    my ( $query, @compiler_args ) = @_;
    return KSx::Search::RegexpTermCompiler->new(
        parent => $query,
        @compiler_args,
    );
}
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
package KSx::Search::RegexpTermCompiler; |
|
57
|
2
|
|
|
2
|
|
828
|
use base qw( KinoSearch::Search::Compiler ); |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
2454
|
|
|
58
|
|
|
|
|
|
|
|
|
59
|
2
|
|
|
2
|
|
345
|
use Hash::Util::FieldHash::Compat 'fieldhashes'; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
19
|
|
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
fieldhashes \my ( %idf, %raw_impact, #%plists, |
|
62
|
|
|
|
|
|
|
%terms, |
|
63
|
|
|
|
|
|
|
%query_norm_factor, % normalized_impact, %tfs ); |
|
64
|
|
|
|
|
|
|
# Constructor for the compiler of a regexp term query.
#
# Reads these named arguments (the full list, with "similarity" added, is
# passed on to the superclass constructor):
#   searchable - object providing get_reader, get_schema and doc_max.
#   parent     - the RegexpTermQuery being compiled.
#
# The constructor walks the lexicon of the parent's field, collects every
# term that matches the parent's regexp, and sums freq * weight per document
# over the postings of those terms.  The results are kept in this file's
# fieldhashes: %terms (matching terms), %tfs (doc number => summed value),
# %idf and %raw_impact.  It finishes by calling perform_query_normalization.
#
# Returns the new compiler object.
sub new {
    my ( $pack, %args ) = @_;

    my $searcher    = $args{searchable};
    my $reader      = $searcher->get_reader;
    my $lex_reader  = $reader->fetch("KinoSearch::Index::LexiconReader");
    my $post_reader = $reader->fetch("KinoSearch::Index::PostingsReader");

    my $parent     = $args{parent};
    my $field_name = $field{$parent};

    # Retrieve the correct Similarity for the Query's field.
    $args{similarity} = $searcher->get_schema->fetch_sim($field_name);

    my $self = $pack->SUPER::new(%args);

    # Get a lexicon and find our place therein
    my ( $re, $prefix ) = ( $re{$parent}, $prefix{$parent} );
    ref($re) eq 'Regexp' or $re = qr/$re/;   # avoid repetitive recompilation
    my $lexcn = $lex_reader->lexicon( field => $field_name );

    my @terms;
    my %hits;    # The keys are the doc nums; the values the tfs.

    # NOTE(review): lexicon() is documented to return undef for a field that
    # is not indexed or has no values; treat that as "no matching terms"
    # instead of dying on an undefined object.
    if ( defined $lexcn ) {
        $lexcn->seek( defined $prefix ? $prefix : '' );

        # iterate through it, stopping at terms that match
        TERM: while (1) {
            my $term = $lexcn->get_term;

            # Nothing at the current position; let the continue block
            # advance the lexicon or end the scan.
            next TERM unless defined $term;

            # sift out unwanted terms
            last TERM if defined $prefix and index( $term, $prefix ) != 0;
            next TERM unless $term =~ $re;

            # for terms that match...
            push @terms, $term;

            # We have to iterate through the documents in each posting list,
            # recording the doc numbers, so we can calc the doc freq later
            # on.  E.g., if there are two documents, one containing 'dog'
            # and 'dot', and the other containing just 'dog', and the re is
            # /^do.*/, then the doc freq has to be 2, since the re matches
            # two docs.  The doc freqs of the individual terms are 1 and 2,
            # so we can't add or average them.
            my $plist = $post_reader->posting_list(
                term  => $term,
                field => $field_name,
            );
            next TERM unless defined $plist;

            while ( my $doc_num = $plist->next ) {

                # For efficiency's sake, we collect the results now, to
                # avoid iterating through postings (the slowest part of
                # searching) more than once, even though this code probably
                # belongs in RegexpTermScorer.
                #
                # The value must be computed for every document: caching it
                # across iterations would give every document in this
                # posting list the first document's frequency.
                my $posting = $plist->get_posting;
                $hits{$doc_num}
                    += $posting->get_freq * $posting->get_weight;
            }
        }
        continue {
            last TERM unless $lexcn->next;
        }
    }

    my $doc_freq = scalar keys %hits;

    # Save the hits and terms for later
    $tfs{$self}   = \%hits;
    $terms{$self} = \@terms;

    # Calculate and store the IDF
    my $max_doc = $searcher->doc_max;
    my $idf = $idf{$self}
        = $max_doc
        ? 1 + log( $max_doc / ( 1 + $doc_freq ) )
        : 1;

    $raw_impact{$self} = $idf * $parent->get_boost;

    # make final preparations
    $self->perform_query_normalization($searcher);

    return $self;
}
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Normalise this compiler's impact (copied from KinoSearch::Search::Weight
# originally).  The sum of squared weights is passed through the
# similarity's query_norm, and the resulting factor is handed to normalize,
# whose result is returned.  The $searcher argument is accepted but not used.
sub perform_query_normalization {
    my ( $self, $searcher ) = @_;

    my $similarity  = $self->get_similarity;
    my $sum_squares = $self->sum_of_squared_weights;   # ( tf_q * idf_t )
    my $norm_factor = $similarity->query_norm($sum_squares);   # / norm_q

    return $self->normalize($norm_factor);             # impact *= factor
}
|
157
|
|
|
|
|
|
|
|
|
158
|
0
|
|
|
0
|
|
|
# Return the boost reported by the object that get_parent returns.
sub get_value {
    my ($self) = @_;
    return $self->get_parent->get_boost;
}
|
159
|
|
|
|
|
|
|
|
|
160
|
0
|
|
|
0
|
|
|
# Return the square of the raw impact stored for this compiler.
sub sum_of_squared_weights {
    my ($self) = @_;
    return $raw_impact{$self} ** 2;
}
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Record the query normalisation factor and derive the normalised impact
# from it (copied from TermQuery).  Returns the normalised impact.
sub normalize {
    my ( $self, $norm ) = @_;

    $query_norm_factor{$self} = $norm;

    # Multiply raw impact by ( tf_q * idf_q / norm_q ).
    # Note: factoring in IDF a second time is correct.  See formula.
    my $impact_times_idf = $raw_impact{$self} * $idf{$self};
    return $normalized_impact{$self} = $impact_times_idf * $norm;
}
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# Build the scorer for this compiler.  All arguments received are forwarded
# to the scorer's constructor, followed by compiler => $self.
sub make_matcher {
    my ( $self, @scorer_args ) = @_;
    return KSx::Search::RegexpTermScorer->new(
        @scorer_args,
        compiler => $self,
    );
}
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Produce highlighting spans for one document (adapted from TermWeight's
# routine).
#
# Named arguments read here:
#   doc_vec - object whose term_vector method is queried for each term.
#   field   - name of the field being highlighted.
#
# Returns nothing when the requested field is not the parent query's field.
# Otherwise returns an array reference holding one KinoSearch::Search::Span
# per occurrence of each matching term, all weighted by get_value.
sub highlight_spans {
    my ( $self, %args ) = @_;
    my ( $doc_vector, $field_name ) = @args{qw( doc_vec field )};

    return if $field{ $self->get_parent } ne $field_name;

    my $matched_terms = $terms{$self};

    require KinoSearch::Search::Span;

    my $span_weight = $self->get_value;
    my @spans;

    TERM: for my $term ( @{$matched_terms} ) {
        my $term_vector = $doc_vector->term_vector(
            field => $field_name,
            term  => $term,
        );
        next TERM unless defined $term_vector;

        my $starts = $term_vector->get_start_offsets->to_arrayref;
        my $ends   = $term_vector->get_end_offsets->to_arrayref;

        # Start and end offsets are paired by position.
        for my $i ( 0 .. $#{$starts} ) {
            my $start = $starts->[$i];
            push @spans, KinoSearch::Search::Span->new(
                offset => $start,
                length => $ends->[$i] - $start,
                weight => $span_weight,
            );
        }
    }

    return \@spans;
}
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
package KSx::Search::RegexpTermScorer; |
|
216
|
2
|
|
|
2
|
|
2406
|
use base 'KinoSearch::Search::Matcher'; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
2345
|
|
|
217
|
|
|
|
|
|
|
|
|
218
|
2
|
|
|
2
|
|
439
|
use Hash::Util::FieldHash::Compat 'fieldhashes'; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
20
|
|
|
219
|
|
|
|
|
|
|
fieldhashes\my( %doc_nums, %pos, %wv, %sim, %compiler ); |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# Constructor for the scorer.
#
# The compiler, reader and need_score arguments are removed before the rest
# is passed to the superclass constructor; only compiler is used here.  The
# scorer keeps, in this file's fieldhashes, the compiler, its similarity and
# value, the numerically sorted doc numbers found by the compiler, and an
# iteration position that starts at -1 (before the first document).
#
# Returns the new scorer object.
sub new {
    my ( $class, %args ) = @_;

    # reader and need_score are discarded; they must not reach SUPER::new.
    my ($compiler) = delete @args{qw( compiler reader need_score )};

    my $self = $class->SUPER::new(%args);

    $sim{$self} = $compiler->get_similarity;

    my $tf_for_doc = $tfs{$compiler};
    my @ascending  = sort { $a <=> $b } keys %$tf_for_doc;
    $doc_nums{$self} = \@ascending;

    $pos{$self}      = -1;
    $wv{$self}       = $compiler->get_value;
    $compiler{$self} = $compiler;

    return $self;
}
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Advance to the next matching document.  Returns its doc number, or 0 when
# the position is already at the last document (the position is then left
# unchanged).
sub next {
    my ($self) = @_;
    my $sorted_docs = $doc_nums{$self};

    if ( $pos{$self} < $#{$sorted_docs} ) {
        return $sorted_docs->[ ++$pos{$self} ];
    }
    return 0;
}
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# Return the doc number at the current iteration position, or 0 when the
# position is outside the list of doc numbers.
#
# The position starts at -1 until next() has been called.  A negative index
# would silently address the list from its end and report the last document,
# so it is rejected explicitly.
sub get_doc_num {
    my $self     = shift;
    my $pos      = $pos{$self};
    my $doc_nums = $doc_nums{$self};

    return 0 if $pos < 0 or $pos >= scalar @$doc_nums;
    return $$doc_nums[$pos];
}
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# Score the document at the current position: the value stored for this
# scorer multiplied by the similarity's tf() of the figure the compiler
# accumulated for that document.
sub score {
    my ($self) = @_;

    my $current_doc = $doc_nums{$self}[ $pos{$self} ];
    my $doc_tf      = $tfs{ $compiler{$self} }{$current_doc};

    return $wv{$self} * $sim{$self}->tf($doc_tf);
}
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
1; |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
__END__ |