line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package KinoSearch1::QueryParser::QueryParser; |
2
|
18
|
|
|
18
|
|
30973
|
use strict; |
|
18
|
|
|
|
|
39
|
|
|
18
|
|
|
|
|
623
|
|
3
|
18
|
|
|
18
|
|
96
|
use warnings; |
|
18
|
|
|
|
|
40
|
|
|
18
|
|
|
|
|
464
|
|
4
|
18
|
|
|
18
|
|
866
|
use KinoSearch1::Util::ToolSet; |
|
18
|
|
|
|
|
40
|
|
|
18
|
|
|
|
|
2729
|
|
5
|
18
|
|
|
18
|
|
106
|
use base qw( KinoSearch1::Util::Class ); |
|
18
|
|
|
|
|
49
|
|
|
18
|
|
|
|
|
2841
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Declare this class's instance variables and their defaults at compile
# time by handing a name => default list to init_instance_vars().
BEGIN {
    my @instance_var_defaults = (

        # accepted as constructor arguments
        analyzer       => undef,
        default_boolop => 'OR',
        default_field  => undef,    # kept for backwards compatibility
        fields         => undef,

        # internal state
        bool_groups   => undef,
        phrases       => undef,
        bool_group_re => undef,
        phrase_re     => undef,
        label_inc     => 0,
    );

    __PACKAGE__->init_instance_vars(@instance_var_defaults);
}
22
|
|
|
|
|
|
|
|
23
|
18
|
|
|
18
|
|
16726
|
use KinoSearch1::Analysis::TokenBatch; |
|
18
|
|
|
|
|
54
|
|
|
18
|
|
|
|
|
473
|
|
24
|
18
|
|
|
18
|
|
6950
|
use KinoSearch1::Analysis::Tokenizer; |
|
18
|
|
|
|
|
52
|
|
|
18
|
|
|
|
|
620
|
|
25
|
18
|
|
|
18
|
|
10261
|
use KinoSearch1::Search::BooleanQuery; |
|
18
|
|
|
|
|
60
|
|
|
18
|
|
|
|
|
661
|
|
26
|
18
|
|
|
18
|
|
11126
|
use KinoSearch1::Search::PhraseQuery; |
|
18
|
|
|
|
|
65
|
|
|
18
|
|
|
|
|
592
|
|
27
|
18
|
|
|
18
|
|
116
|
use KinoSearch1::Search::TermQuery; |
|
18
|
|
|
|
|
44
|
|
|
18
|
|
|
|
|
405
|
|
28
|
18
|
|
|
18
|
|
638
|
use KinoSearch1::Index::Term; |
|
18
|
|
|
|
|
36
|
|
|
18
|
|
|
|
|
36926
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Validate constructor arguments and set up per-parser state.
#
# Reads:  default_boolop, fields, default_field, analyzer.
# Writes: bool_groups, phrases, randstring, phrase_re, bool_group_re,
#         fields.
#
# Croaks when default_boolop is not 'AND' or 'OR', when no usable field
# list is available, or when analyzer is not a
# KinoSearch1::Analysis::Analyzer.
sub init_instance {
    my $self = shift;
    $self->{bool_groups} = {};
    $self->{phrases}     = {};

    # the defined() guard avoids an "uninitialized" warning before croaking
    croak("default_boolop must be either 'AND' or 'OR'")
        unless defined $self->{default_boolop}
            and $self->{default_boolop} =~ /^(?:AND|OR)$/;

    # create a random string that presumably won't appear in a search string
    my @chars      = ( 'A' .. 'Z' );
    my $randstring = '';
    $randstring .= $chars[ rand @chars ] for ( 1 .. 16 );
    $self->{randstring} = $randstring;

    # create labels which won't appear in search strings
    $self->{phrase_re}     = qr/^(_phrase$randstring\d+)/;
    $self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/;

    # verify fields param; fall back to the legacy default_field only when
    # one was actually supplied, so that a missing field list is reported
    # instead of silently becoming [undef]
    my $fields = $self->{fields};
    if ( !defined $fields and defined $self->{default_field} ) {
        $fields = [ $self->{default_field} ];
    }

    # reftype() returns undef for non-references, hence the || ''
    croak("Required parameter 'fields' not supplied as arrayref")
        unless defined $fields
            and ( reftype($fields) || '' ) eq 'ARRAY';
    $self->{fields} = $fields;

    # verify analyzer
    croak("Missing required param 'analyzer'")
        unless a_isa_b( $self->{analyzer},
        'KinoSearch1::Analysis::Analyzer' );
}
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# NOTE: the patterns below anchor with \A and \z rather than ^ and $.
# They are compiled with /m, under which ^ and $ match at every line
# boundary; a query string containing a newline could then have an
# operator on a later line applied to the current clause, or have a
# phrase/group cut short at the end of a line.

# regex matching a quoted string
my $quoted_re = qr/
                "            # opening quote
                (            # capture
                [^"]*?       # anything not a quote
                )
                (?:"|\z)     # closed by either a quote or end of string
            /xsm;

# regex matching a parenthetical group
my $paren_re = qr/
                \(           # opening paren
                (            # capture
                [^()]*?      # anything not a paren
                )
                (?:\)|\z)    # closed by paren or end of string
            /xsm;

# regex matching a negating boolean operator
my $neg_re = qr/\A(?:
                NOT\s+       # NOT followed by space
                |-(?=\S)     # minus followed by something not-spacey
             )/xsm;

# regex matching a requiring boolean operator
my $req_re = qr/\A
                \+(?=\S)     # plus followed by something not-spacey
            /xsm;

# regex matching a field indicator
my $field_re = qr/\A
                (            # capture
                [^"(:\s]+    # non-spacey string
                )
                :            # followed by :
            /xsm;
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Parse a query string into a query object.
#
# $qstring_orig   - the query text; undef is treated as an empty string.
# $default_fields - optional arrayref of field names applied to clauses
#                   that carry no explicit "field:" prefix.  Defaults to
#                   $self->{fields}; also used when recursing into a
#                   parenthetical group.
#
# Returns the lone clause's query when exactly one clause was produced
# and it is not negated; otherwise returns a BooleanQuery containing
# every clause (possibly none).
sub parse {
    my ( $self, $qstring_orig, $default_fields ) = @_;
    $qstring_orig = '' unless defined $qstring_orig;
    $default_fields ||= $self->{fields};
    my $default_boolop = $self->{default_boolop};
    my @clauses;

    # substitute contiguous labels for phrases and boolean groups
    my $remaining = $self->_extract_phrases($qstring_orig);
    $remaining = $self->_extract_boolgroups($remaining);

    # consume the string from the front, one clause per iteration; a
    # lexical is used rather than $_ because other methods are called
    # while the text is only partially consumed
    while ( length $remaining ) {

        # fast-forward past whitespace
        next if $remaining =~ s/^\s+//;

        my $occur = $default_boolop eq 'AND' ? 'MUST' : 'SHOULD';

        if ( $remaining =~ s/^AND\s+// ) {
            if (@clauses) {

                # require the previous clause (unless it's negated)
                if ( $clauses[-1]{occur} eq 'SHOULD' ) {
                    $clauses[-1]{occur} = 'MUST';
                }
            }

            # require this clause
            $occur = 'MUST';
        }
        elsif ( $remaining =~ s/^OR\s+// ) {
            if (@clauses) {
                $clauses[-1]{occur} = 'SHOULD';
            }
            $occur = 'SHOULD';
        }

        # detect tokens which cause this clause to be required or negated
        if ( $remaining =~ s/$neg_re// ) {
            $occur = 'MUST_NOT';
        }
        elsif ( $remaining =~ s/$req_re// ) {
            $occur = 'MUST';
        }

        # set the field
        my $fields
            = $remaining =~ s/^$field_re// ? [$1] : $default_fields;

        # if a phrase label is detected...
        if ( $remaining =~ s/$self->{phrase_re}// ) {

            # retrieve the text and analyze it
            my $orig_phrase_text = delete $self->{phrases}{$1};
            my $token_texts      = $self->_analyze($orig_phrase_text);
            if (@$token_texts) {
                my $query = $self->_get_field_query( $fields, $token_texts );
                push @clauses, { query => $query, occur => $occur }
                    if defined $query;
            }
        }

        # if a label indicating a bool group is detected...
        elsif ( $remaining =~ s/$self->{bool_group_re}// ) {

            # parse boolean subqueries recursively
            my $inner_text = delete $self->{bool_groups}{$1};
            my $query = $self->parse( $inner_text, $fields );
            push @clauses, { query => $query, occur => $occur };
        }

        # what's left is probably a term
        elsif ( $remaining =~ s/([^"(\s]+)// ) {
            my $token_texts = $self->_analyze($1);
            @$token_texts = grep { $_ ne '' } @$token_texts;
            if (@$token_texts) {
                my $query = $self->_get_field_query( $fields, $token_texts );

                # _get_field_query returns nothing when no field yields a
                # query; skip the clause rather than store an undef query
                push @clauses, { occur => $occur, query => $query }
                    if defined $query;
            }
        }
    }

    if ( @clauses == 1 and $clauses[0]{occur} ne 'MUST_NOT' ) {

        # if it's just a simple query, return it unwrapped
        return $clauses[0]{query};
    }
    else {

        # otherwise, build a boolean query
        my $bool_query = KinoSearch1::Search::BooleanQuery->new;
        for my $clause (@clauses) {
            $bool_query->add_clause(
                query => $clause->{query},
                occur => $clause->{occur},
            );
        }
        return $bool_query;
    }
}
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Build one query per field for the given token texts and combine them.
#
# $fields      - arrayref of field names.
# $token_texts - arrayref of token strings, passed through unchanged to
#                _gen_single_field_query for each field.
#
# Returns nothing when no field produced a query, the query itself when
# exactly one did, and otherwise a BooleanQuery holding every per-field
# query as a SHOULD clause.
sub _get_field_query {
    my ( $self, $fields, $token_texts ) = @_;

    my @per_field;
    for my $field (@$fields) {
        my @made = $self->_gen_single_field_query( $field, $token_texts );
        push @per_field, grep { defined $_ } @made;
    }

    return unless @per_field;
    return $per_field[0] if @per_field == 1;

    my $any_field_query = KinoSearch1::Search::BooleanQuery->new;
    $any_field_query->add_clause( query => $_, occur => 'SHOULD' )
        for @per_field;
    return $any_field_query;
}
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Create a TermQuery, a PhraseQuery, or nothing.
#
# $field       - name of the field the terms belong to.
# $token_texts - arrayref of token strings.
#
# Returns a TermQuery for a single token, a PhraseQuery (terms added in
# order) for several tokens, and nothing (empty list / undef) for an
# empty token list.
sub _gen_single_field_query {
    my ( $self, $field, $token_texts ) = @_;

    if ( @$token_texts == 1 ) {
        my $term = KinoSearch1::Index::Term->new( $field, $token_texts->[0] );
        return KinoSearch1::Search::TermQuery->new( term => $term );
    }
    elsif ( @$token_texts > 1 ) {
        my $phrase_query = KinoSearch1::Search::PhraseQuery->new;
        for my $token_text (@$token_texts) {
            $phrase_query->add_term(
                KinoSearch1::Index::Term->new( $field, $token_text ),
            );
        }
        return $phrase_query;
    }

    # Without this, falling off the end would return the value of the last
    # evaluated condition -- a defined false value -- which callers testing
    # with defined() would mistake for a query.
    return;
}
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# Run a string through the parser's analyzer and collect the token texts.
#
# $string - the text to tokenize.
#
# The whole string is appended to a fresh TokenBatch as a single entry
# (the 0 and byte-length arguments are presumably its start and end
# offsets -- confirm against TokenBatch), the batch is handed to
# $self->{analyzer}, and the text of every token in the batch returned
# by the analyzer is gathered in iteration order.
#
# Returns an arrayref of token strings.
sub _analyze {
    my ( $self, $string ) = @_;

    my $batch = KinoSearch1::Analysis::TokenBatch->new;
    $batch->append( $string, 0, bytes::length($string) );
    $batch = $self->{analyzer}->analyze($batch);

    my @texts;
    push @texts, $batch->get_text while $batch->next;

    return \@texts;
}
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
# Swap every quoted phrase in the query string for a unique label.
#
# Each match of $quoted_re is replaced, left to right, by a label built
# from "_phrase", the parser's random string and the running label
# counter, followed by one space so the label cannot run into whatever
# text follows.  The phrase's inner text is stored in $self->{phrases}
# under that label for later retrieval.
#
# Returns the rewritten query string.
sub _extract_phrases {
    my ( $self, $qstring ) = @_;

    while ( $qstring =~ $quoted_re ) {
        my $phrase_text = $1;
        my ( $from, $span ) = ( $-[0], $+[0] - $-[0] );

        my $label
            = sprintf( "_phrase$self->{randstring}%d", $self->{label_inc}++ );

        # overwrite exactly the matched span with the label
        substr( $qstring, $from, $span, "$label " );

        $self->{phrases}{$label} = $phrase_text;
    }

    return $qstring;
}
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Swap every parenthetical group in the query string for a unique label.
#
# $paren_re only matches an opening paren whose contents hold no further
# parens, so nested groups are replaced innermost first; an outer group
# becomes matchable once its inner groups have turned into labels.  Each
# label is built from "_boolgroup", the parser's random string and the
# running label counter, and is followed by one space so it cannot run
# into whatever text follows.  The group's inner text is stored in
# $self->{bool_groups} under that label for later retrieval.
#
# Returns the rewritten query string.
sub _extract_boolgroups {
    my ( $self, $qstring ) = @_;

    while ( $qstring =~ $paren_re ) {
        my $group_text = $1;
        my ( $from, $span ) = ( $-[0], $+[0] - $-[0] );

        my $label = sprintf( "_boolgroup$self->{randstring}%d",
            $self->{label_inc}++ );

        # overwrite exactly the matched span with the label
        substr( $qstring, $from, $span, "$label " );

        $self->{bool_groups}{$label} = $group_text;
    }

    return $qstring;
}
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
1; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
__END__ |