line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
2
|
|
|
2
|
|
1045
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
86
|
|
2
|
2
|
|
|
2
|
|
12
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
97
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package KSx::Search::RegexpTermQuery; |
5
|
2
|
|
|
2
|
|
22
|
use base qw( KinoSearch::Search::Query ); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
2196
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.05'; |
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
8421
|
use Hash::Util::FieldHash::Compat 'fieldhashes'; |
|
2
|
|
|
|
|
6406
|
|
|
2
|
|
|
|
|
16
|
|
10
|
|
|
|
|
|
|
fieldhashes \my( %re, %prefix, %field ); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub new { |
13
|
2
|
|
|
2
|
1
|
5746
|
my ($package, %args) = @_; |
14
|
|
|
|
|
|
|
|
15
|
2
|
|
|
|
|
15
|
my $re = delete $args{regexp}; |
16
|
2
|
|
|
|
|
7
|
my $field = delete $args{field}; |
17
|
|
|
|
|
|
|
|
18
|
2
|
|
|
|
|
46
|
my $self = $package->SUPER::new(%args); |
19
|
|
|
|
|
|
|
|
20
|
2
|
|
|
|
|
529
|
$re{$self} = $re; |
21
|
2
|
|
|
|
|
12
|
$field{$self} = $field; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# get the literal prefix of the regexp, if any. |
24
|
2
|
100
|
50
|
|
|
80
|
if($re{$self} =~ |
|
1
|
|
|
|
|
21
|
|
25
|
|
|
|
|
|
|
m<^ |
26
|
|
|
|
|
|
|
(?: # prefix for qr//'s, without allowing /i : |
27
|
|
|
|
|
|
|
\(\? ([a-hj-z]*) (?:-[a-z]*)?: |
28
|
|
|
|
|
|
|
)? |
29
|
|
|
|
|
|
|
(\\[GA]|\^) # anchor |
30
|
|
|
|
|
|
|
([^#\$()*+.?[\]\\^]+) # literal pat (no metachars or comments) |
31
|
|
|
|
|
|
|
>x |
32
|
|
|
|
|
|
|
) {{ |
33
|
1
|
|
|
|
|
2
|
my ($mod,$anchor,$prefix) = ($1||'',$2,$3); |
34
|
1
|
50
|
33
|
|
|
8
|
$anchor eq '^' and $mod =~ /m/ and last; |
35
|
1
|
|
|
|
|
28
|
for($prefix) { |
36
|
1
|
50
|
|
|
|
5
|
$mod =~ /x/ and s/\s+//g; |
37
|
|
|
|
|
|
|
} |
38
|
1
|
|
|
|
|
6
|
$prefix{$self} = $prefix; |
39
|
|
|
|
|
|
|
}} |
40
|
|
|
|
|
|
|
|
41
|
2
|
|
|
|
|
10
|
$self; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
#sub extract_terms { |
45
|
|
|
|
|
|
|
# my $self = shift; |
46
|
|
|
|
|
|
|
# return @{ $self->{terms} }; |
47
|
|
|
|
|
|
|
#} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub make_compiler { |
50
|
2
|
|
|
2
|
1
|
108
|
return KSx::Search::RegexpTermCompiler->new( |
51
|
|
|
|
|
|
|
parent => @_ |
52
|
|
|
|
|
|
|
); |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
package KSx::Search::RegexpTermCompiler; |
57
|
2
|
|
|
2
|
|
828
|
use base qw( KinoSearch::Search::Compiler ); |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
2454
|
|
58
|
|
|
|
|
|
|
|
59
|
2
|
|
|
2
|
|
345
|
use Hash::Util::FieldHash::Compat 'fieldhashes'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
19
|
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
fieldhashes \my ( %idf, %raw_impact, #%plists, |
62
|
|
|
|
|
|
|
%terms, |
63
|
|
|
|
|
|
|
%query_norm_factor, % normalized_impact, %tfs ); |
64
|
|
|
|
|
|
|
sub new { |
65
|
2
|
|
|
2
|
|
10
|
my($pack, %args) = @_; |
66
|
|
|
|
|
|
|
|
67
|
2
|
|
|
|
|
6
|
my $searcher = $args{searchable}; |
68
|
2
|
|
|
|
|
556
|
my $reader = $searcher->get_reader; |
69
|
0
|
|
|
|
|
|
my $lex_reader = $reader->fetch("KinoSearch::Index::LexiconReader"); |
70
|
0
|
|
|
|
|
|
my $post_reader = $reader->fetch("KinoSearch::Index::PostingsReader"); |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Retrieve the correct Similarity for the Query's field. |
73
|
0
|
|
|
|
|
|
my $sim = $args{similarity} = |
74
|
|
|
|
|
|
|
$searcher->get_schema->fetch_sim($field{$args{parent}}); |
75
|
|
|
|
|
|
|
|
76
|
0
|
|
|
|
|
|
my $self = $pack->SUPER::new(%args); |
77
|
|
|
|
|
|
|
|
78
|
0
|
|
|
|
|
|
my $parent = $args{parent}; |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Get a lexicon and find our place therein |
81
|
0
|
|
|
|
|
|
my( $re, $prefix ) = ($re{$parent}, $prefix{$parent}); |
82
|
0
|
0
|
|
|
|
|
ref $re eq 'Regexp' or $re = qr/$re/; # avoid repetitive recompilation |
83
|
0
|
|
|
|
|
|
my $lexcn = $lex_reader->lexicon( field => $field{$parent} ); |
84
|
0
|
0
|
|
|
|
|
$lexcn->seek(defined $prefix ? $prefix : ''); |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# iterate through it, stopping at terms that match |
87
|
0
|
|
|
|
|
|
my @terms; #my @plists; |
88
|
|
|
|
|
|
|
my %hits; # The keys are the doc nums; the values the tfs. |
89
|
|
|
|
|
|
|
|
90
|
0
|
|
|
|
|
|
while () { |
91
|
0
|
|
|
|
|
|
my $term = get_term $lexcn; |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# sift out unwanted terms |
94
|
0
|
0
|
0
|
|
|
|
last if defined $prefix and index( $term, $prefix ) != 0; |
95
|
0
|
0
|
|
|
|
|
next unless $term =~ $re; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# for terms that match... |
98
|
|
|
|
|
|
|
|
99
|
0
|
|
|
|
|
|
push @terms, $term; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# We have to iterate through the documents in each posting list, |
102
|
|
|
|
|
|
|
# recording the doc numbers, so we can calc the doc freq later on. |
103
|
|
|
|
|
|
|
# E.g., if there are two documents, one containing ‘dog’ and ‘dot,’ |
104
|
|
|
|
|
|
|
# and the other containing just ‘dog,’ and the re is /^do.*/, then |
105
|
|
|
|
|
|
|
# the doc freq has to be 2, since the re matches two docs. The doc |
106
|
|
|
|
|
|
|
# freqs of the individual terms are 1 and 2, so we can’t add or |
107
|
|
|
|
|
|
|
# average them. |
108
|
0
|
|
|
|
|
|
my $plist = $post_reader->posting_list( |
109
|
|
|
|
|
|
|
term => $term, |
110
|
|
|
|
|
|
|
field => $field{$parent}, |
111
|
|
|
|
|
|
|
); |
112
|
0
|
|
|
|
|
|
my $posting; my $weight; |
113
|
0
|
|
|
|
|
|
while (my $doc_num = $plist ->next) { |
114
|
|
|
|
|
|
|
# For efficiency’s sake, we’ll collect the results now, to |
115
|
|
|
|
|
|
|
# avoid iterating through postings (the slowest part of search- |
116
|
|
|
|
|
|
|
# ing) more than once, even though this code probably belongs |
117
|
|
|
|
|
|
|
# in RegexpTermScorer |
118
|
0
|
|
0
|
|
|
|
my $posting ||= $plist->get_posting; |
119
|
0
|
|
0
|
|
|
|
$hits{$doc_num} += |
120
|
|
|
|
|
|
|
$weight ||= $posting->get_freq * $posting->get_weight |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
} continue { |
124
|
0
|
0
|
|
|
|
|
last unless $lexcn->next ; |
125
|
|
|
|
|
|
|
} |
126
|
0
|
|
|
|
|
|
my $doc_freq = scalar keys %hits; |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# Save the hits and terms for later |
129
|
|
|
|
|
|
|
# $plists{$self} = \@plists; |
130
|
0
|
|
|
|
|
|
$tfs{$self} = \%hits; |
131
|
0
|
|
|
|
|
|
$terms{$self} = \@terms; |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Calculate and store the IDF |
134
|
0
|
|
|
|
|
|
my $max_doc = $searcher->doc_max; |
135
|
0
|
0
|
|
|
|
|
my $idf = $idf{$self} = $max_doc |
136
|
|
|
|
|
|
|
? 1 + log( $max_doc / ( 1 + $doc_freq ) ) |
137
|
|
|
|
|
|
|
: 1 |
138
|
|
|
|
|
|
|
; |
139
|
|
|
|
|
|
|
|
140
|
0
|
|
|
|
|
|
$raw_impact{$self} = $idf * $parent->get_boost; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# make final preparations |
143
|
0
|
|
|
|
|
|
$self->perform_query_normalization($searcher); |
144
|
|
|
|
|
|
|
|
145
|
0
|
|
|
|
|
|
$self; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub perform_query_normalization { |
149
|
|
|
|
|
|
|
# copied from KinoSearch::Search::Weight originally |
150
|
0
|
|
|
0
|
|
|
my ( $self, $searcher ) = @_; |
151
|
0
|
|
|
|
|
|
my $sim = $self->get_similarity; |
152
|
|
|
|
|
|
|
|
153
|
0
|
|
|
|
|
|
my $factor = $self->sum_of_squared_weights; # factor = ( tf_q * idf_t ) |
154
|
0
|
|
|
|
|
|
$factor = $sim->query_norm($factor); # factor /= norm_q |
155
|
0
|
|
|
|
|
|
$self->normalize($factor); # impact *= factor |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
0
|
|
|
0
|
|
|
sub get_value { shift->get_parent->get_boost } |
159
|
|
|
|
|
|
|
|
160
|
0
|
|
|
0
|
|
|
sub sum_of_squared_weights { $raw_impact{+shift}**2 } |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub normalize { # copied from TermQuery |
163
|
0
|
|
|
0
|
|
|
my ( $self, $query_norm_factor ) = @_; |
164
|
0
|
|
|
|
|
|
$query_norm_factor{$self} = $query_norm_factor; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Multiply raw impact by ( tf_q * idf_q / norm_q ) |
167
|
|
|
|
|
|
|
# |
168
|
|
|
|
|
|
|
# Note: factoring in IDF a second time is correct. See formula. |
169
|
0
|
|
|
|
|
|
$normalized_impact{$self} |
170
|
|
|
|
|
|
|
= $raw_impact{$self} * $idf{$self} * $query_norm_factor; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub make_matcher { |
174
|
0
|
|
|
0
|
|
|
my $self = shift; |
175
|
|
|
|
|
|
|
|
176
|
0
|
|
|
|
|
|
return KSx::Search::RegexpTermScorer->new( |
177
|
|
|
|
|
|
|
# posting_lists => $plists{$self}, |
178
|
|
|
|
|
|
|
@_, |
179
|
|
|
|
|
|
|
compiler => $self, |
180
|
|
|
|
|
|
|
); |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub highlight_spans { # plagiarised form of TermWeight’s routine |
184
|
0
|
|
|
0
|
|
|
my ($self, %args) = @_; |
185
|
0
|
|
|
|
|
|
my $doc_vector = $args{doc_vec}; |
186
|
0
|
|
|
|
|
|
my $field_name = $args{field}; |
187
|
0
|
0
|
|
|
|
|
return if $field{$self->get_parent} ne $field_name; |
188
|
0
|
|
|
|
|
|
my $searcher = $args{searcher}; |
189
|
0
|
|
|
|
|
|
my $terms = $terms{$self}; |
190
|
|
|
|
|
|
|
|
191
|
0
|
|
|
|
|
|
require KinoSearch::Search::Span; |
192
|
|
|
|
|
|
|
|
193
|
0
|
|
|
|
|
|
my @posits; |
194
|
0
|
|
|
|
|
|
my $weight_val = $self->get_value; |
195
|
0
|
|
|
|
|
|
for (@$terms) { |
196
|
0
|
|
|
|
|
|
my $term_vector |
197
|
|
|
|
|
|
|
= $doc_vector->term_vector( field => $field_name, term => $_ ); |
198
|
0
|
0
|
|
|
|
|
next unless defined $term_vector; |
199
|
0
|
|
|
|
|
|
my $starts = $term_vector->get_start_offsets->to_arrayref; |
200
|
0
|
|
|
|
|
|
my $ends = $term_vector->get_end_offsets->to_arrayref; |
201
|
0
|
|
|
|
|
|
while (@$starts) { |
202
|
0
|
|
|
|
|
|
my $start = shift @$starts; |
203
|
0
|
|
|
|
|
|
push @posits, KinoSearch::Search::Span->new( |
204
|
|
|
|
|
|
|
offset => $start, |
205
|
|
|
|
|
|
|
length => shift(@$ends)-$start, |
206
|
|
|
|
|
|
|
weight => $weight_val, |
207
|
|
|
|
|
|
|
); |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
0
|
|
|
|
|
|
return \@posits; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
package KSx::Search::RegexpTermScorer; |
216
|
2
|
|
|
2
|
|
2406
|
use base 'KinoSearch::Search::Matcher'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
2345
|
|
217
|
|
|
|
|
|
|
|
218
|
2
|
|
|
2
|
|
439
|
use Hash::Util::FieldHash::Compat 'fieldhashes'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
20
|
|
219
|
|
|
|
|
|
|
fieldhashes\my( %doc_nums, %pos, %wv, %sim, %compiler ); |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub new { |
222
|
0
|
|
|
0
|
|
|
my ($class, %args) = @_; |
223
|
|
|
|
|
|
|
# my $plists = delete $args{posting_lists}; |
224
|
0
|
|
|
|
|
|
my $compiler = delete $args{compiler}; |
225
|
0
|
|
|
|
|
|
my $reader = delete $args{reader}; |
226
|
0
|
|
|
|
|
|
my $need_score = delete $args{need_score}; |
227
|
0
|
|
|
|
|
|
my $self = $class->SUPER::new(%args); |
228
|
0
|
|
|
|
|
|
$sim{$self} = $compiler->get_similarity; |
229
|
|
|
|
|
|
|
|
230
|
0
|
|
|
|
|
|
my $tfs = $tfs{$compiler}; |
231
|
0
|
|
|
|
|
|
$doc_nums{$self} = [ sort { $a <=> $b } keys %$tfs ]; |
|
0
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
0
|
|
|
|
|
|
$pos{$self} = -1; |
234
|
0
|
|
|
|
|
|
$wv {$self} = $compiler->get_value; |
235
|
0
|
|
|
|
|
|
$compiler{$self} = $compiler; |
236
|
|
|
|
|
|
|
|
237
|
0
|
|
|
|
|
|
$self |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
sub next { |
241
|
0
|
|
|
0
|
|
|
my $self = shift; |
242
|
0
|
|
|
|
|
|
my $doc_nums = $doc_nums{$self}; |
243
|
0
|
0
|
|
|
|
|
return 0 if $pos{$self} >= $#$doc_nums; |
244
|
0
|
|
|
|
|
|
return $$doc_nums[ ++$pos{$self} ]; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
sub get_doc_num { |
248
|
0
|
|
|
0
|
|
|
my $self = shift; |
249
|
0
|
|
|
|
|
|
my $pos = $pos{$self}; |
250
|
0
|
|
|
|
|
|
my $doc_nums = $doc_nums{$self}; |
251
|
0
|
0
|
|
|
|
|
return $pos < scalar @$doc_nums ? $$doc_nums[$pos] : 0; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
sub score { |
255
|
0
|
|
|
0
|
|
|
my $self = shift; |
256
|
0
|
|
|
|
|
|
my $pos = $pos{$self}; |
257
|
0
|
|
|
|
|
|
my $doc_nums = $doc_nums{$self}; |
258
|
0
|
|
|
|
|
|
return $wv{$self} * $sim{$self}->tf( |
259
|
|
|
|
|
|
|
$tfs{$compiler{$self}}{$$doc_nums[$pos]} |
260
|
|
|
|
|
|
|
); |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
1; |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
__END__ |