| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package KinoSearch1::QueryParser::QueryParser; | 
| 2 | 18 |  |  | 18 |  | 30973 | use strict; | 
|  | 18 |  |  |  |  | 39 |  | 
|  | 18 |  |  |  |  | 623 |  | 
| 3 | 18 |  |  | 18 |  | 96 | use warnings; | 
|  | 18 |  |  |  |  | 40 |  | 
|  | 18 |  |  |  |  | 464 |  | 
| 4 | 18 |  |  | 18 |  | 866 | use KinoSearch1::Util::ToolSet; | 
|  | 18 |  |  |  |  | 40 |  | 
|  | 18 |  |  |  |  | 2729 |  | 
| 5 | 18 |  |  | 18 |  | 106 | use base qw( KinoSearch1::Util::Class ); | 
|  | 18 |  |  |  |  | 49 |  | 
|  | 18 |  |  |  |  | 2841 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | BEGIN { | 
| 8 | 18 |  |  | 18 |  | 196 | __PACKAGE__->init_instance_vars( | 
| 9 |  |  |  |  |  |  | # constructor args / members | 
| 10 |  |  |  |  |  |  | analyzer       => undef, | 
| 11 |  |  |  |  |  |  | default_boolop => 'OR', | 
| 12 |  |  |  |  |  |  | default_field  => undef,    # back compat | 
| 13 |  |  |  |  |  |  | fields         => undef, | 
| 14 |  |  |  |  |  |  | # members | 
| 15 |  |  |  |  |  |  | bool_groups   => undef, | 
| 16 |  |  |  |  |  |  | phrases       => undef, | 
| 17 |  |  |  |  |  |  | bool_group_re => undef, | 
| 18 |  |  |  |  |  |  | phrase_re     => undef, | 
| 19 |  |  |  |  |  |  | label_inc     => 0, | 
| 20 |  |  |  |  |  |  | ); | 
| 21 |  |  |  |  |  |  | } | 
| 22 |  |  |  |  |  |  |  | 
| 23 | 18 |  |  | 18 |  | 16726 | use KinoSearch1::Analysis::TokenBatch; | 
|  | 18 |  |  |  |  | 54 |  | 
|  | 18 |  |  |  |  | 473 |  | 
| 24 | 18 |  |  | 18 |  | 6950 | use KinoSearch1::Analysis::Tokenizer; | 
|  | 18 |  |  |  |  | 52 |  | 
|  | 18 |  |  |  |  | 620 |  | 
| 25 | 18 |  |  | 18 |  | 10261 | use KinoSearch1::Search::BooleanQuery; | 
|  | 18 |  |  |  |  | 60 |  | 
|  | 18 |  |  |  |  | 661 |  | 
| 26 | 18 |  |  | 18 |  | 11126 | use KinoSearch1::Search::PhraseQuery; | 
|  | 18 |  |  |  |  | 65 |  | 
|  | 18 |  |  |  |  | 592 |  | 
| 27 | 18 |  |  | 18 |  | 116 | use KinoSearch1::Search::TermQuery; | 
|  | 18 |  |  |  |  | 44 |  | 
|  | 18 |  |  |  |  | 405 |  | 
| 28 | 18 |  |  | 18 |  | 638 | use KinoSearch1::Index::Term; | 
|  | 18 |  |  |  |  | 36 |  | 
|  | 18 |  |  |  |  | 36926 |  | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | sub init_instance { | 
| 31 | 47 |  |  | 47 | 1 | 253 | my $self = shift; | 
| 32 | 47 |  |  |  |  | 192 | $self->{bool_groups} = {}; | 
| 33 | 47 |  |  |  |  | 111 | $self->{phrases}     = {}; | 
| 34 |  |  |  |  |  |  |  | 
| 35 | 47 | 50 |  |  |  | 478 | croak("default_boolop must be either 'AND' or 'OR'") | 
| 36 |  |  |  |  |  |  | unless $self->{default_boolop} =~ /^(?:AND|OR)$/; | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | # create a random string that presumably won't appear in a search string | 
| 39 | 47 |  |  |  |  | 468 | my @chars      = ( 'A' .. 'Z' ); | 
| 40 | 47 |  |  |  |  | 103 | my $randstring = ''; | 
| 41 | 47 |  |  |  |  | 1538 | $randstring .= $chars[ rand @chars ] for ( 1 .. 16 ); | 
| 42 | 47 |  |  |  |  | 145 | $self->{randstring} = $randstring; | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | # create labels which won't appear in search strings | 
| 45 | 47 |  |  |  |  | 2392 | $self->{phrase_re}     = qr/^(_phrase$randstring\d+)/; | 
| 46 | 47 |  |  |  |  | 797 | $self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/; | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | # verify fields param | 
| 49 | 47 | 100 |  |  |  | 226 | my $fields | 
| 50 |  |  |  |  |  |  | = defined $self->{fields} | 
| 51 |  |  |  |  |  |  | ? $self->{fields} | 
| 52 |  |  |  |  |  |  | : [ $self->{default_field} ]; | 
| 53 | 47 | 50 | 33 |  |  | 488 | croak("Required parameter 'fields' not supplied as arrayref") | 
| 54 |  |  |  |  |  |  | unless ( defined $fields | 
| 55 |  |  |  |  |  |  | and reftype($fields) eq 'ARRAY' ); | 
| 56 | 47 |  |  |  |  | 110 | $self->{fields} = $fields; | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | # verify analyzer | 
| 59 | 47 | 50 |  |  |  | 218 | croak("Missing required param 'analyzer'") | 
| 60 |  |  |  |  |  |  | unless a_isa_b( $self->{analyzer}, | 
| 61 |  |  |  |  |  |  | 'KinoSearch1::Analysis::Analyzer' ); | 
| 62 |  |  |  |  |  |  | } | 
| 63 |  |  |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | # regex matching a quoted string | 
| 65 |  |  |  |  |  |  | my $quoted_re = qr/ | 
| 66 |  |  |  |  |  |  | "            # opening quote | 
| 67 |  |  |  |  |  |  | (            # capture | 
| 68 |  |  |  |  |  |  | [^"]*?   # anything not a quote | 
| 69 |  |  |  |  |  |  | ) | 
| 70 |  |  |  |  |  |  | (?:"|$)      # closed by either a quote or end of string | 
| 71 |  |  |  |  |  |  | /xsm; | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | # regex matching a parenthetical group | 
| 74 |  |  |  |  |  |  | my $paren_re = qr/ | 
| 75 |  |  |  |  |  |  | \(           # opening paren | 
| 76 |  |  |  |  |  |  | (            # capture | 
| 77 |  |  |  |  |  |  | [^()]*?  # anything not a paren | 
| 78 |  |  |  |  |  |  | ) | 
| 79 |  |  |  |  |  |  | (?:\)|$)     # closed by paren or end of string | 
| 80 |  |  |  |  |  |  | /xsm; | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | # regex matching a negating boolean operator | 
| 83 |  |  |  |  |  |  | my $neg_re = qr/^(?: | 
| 84 |  |  |  |  |  |  | NOT\s+         # NOT followed by space | 
| 85 |  |  |  |  |  |  | |-(?=\S)       # minus followed by something not-spacey | 
| 86 |  |  |  |  |  |  | )/xsm; | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | # regex matching a requiring boolean operator | 
| 89 |  |  |  |  |  |  | my $req_re = qr/^ | 
| 90 |  |  |  |  |  |  | \+(?=\S)       # plus followed by something not-spacey | 
| 91 |  |  |  |  |  |  | /xsm; | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | # regex matching a field indicator | 
| 94 |  |  |  |  |  |  | my $field_re = qr/^ | 
| 95 |  |  |  |  |  |  | (              # capture | 
| 96 |  |  |  |  |  |  | [^"(:\s]+  # non-spacey string | 
| 97 |  |  |  |  |  |  | ) | 
| 98 |  |  |  |  |  |  | :              # followed by : | 
| 99 |  |  |  |  |  |  | /xsm; | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | sub parse { | 
| 102 | 348 |  |  | 348 | 1 | 1006 | my ( $self, $qstring_orig, $default_fields ) = @_; | 
| 103 | 348 | 50 |  |  |  | 1159 | $qstring_orig = '' unless defined $qstring_orig; | 
| 104 | 348 |  | 66 |  |  | 2058 | $default_fields ||= $self->{fields}; | 
| 105 | 348 |  |  |  |  | 846 | my $default_boolop = $self->{default_boolop}; | 
| 106 | 348 |  |  |  |  | 504 | my @clauses; | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | # substitute contiguous labels for phrases and boolean groups | 
| 109 | 348 |  |  |  |  | 2181 | my $qstring = $self->_extract_phrases($qstring_orig); | 
| 110 | 348 |  |  |  |  | 1027 | $qstring = $self->_extract_boolgroups($qstring); | 
| 111 |  |  |  |  |  |  |  | 
| 112 | 348 |  |  |  |  | 635 | local $_ = $qstring; | 
| 113 | 348 |  |  |  |  | 1366 | while ( bytes::length $_ ) { | 
| 114 |  |  |  |  |  |  | # fast-forward past whitespace | 
| 115 | 971 | 100 |  |  |  | 9256 | next if s/^\s+//; | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 578 | 100 |  |  |  | 1693 | my $occur = $default_boolop eq 'AND' ? 'MUST' : 'SHOULD'; | 
| 118 |  |  |  |  |  |  |  | 
| 119 | 578 | 100 |  |  |  | 2367 | if (s/^AND\s+//) { | 
|  |  | 100 |  |  |  |  |  | 
| 120 | 41 | 100 |  |  |  | 129 | if (@clauses) { | 
| 121 |  |  |  |  |  |  | # require the previous clause (unless it's negated) | 
| 122 | 31 | 100 |  |  |  | 551 | if ( $clauses[-1]{occur} eq 'SHOULD' ) { | 
| 123 | 18 |  |  |  |  | 51 | $clauses[-1]{occur} = 'MUST'; | 
| 124 |  |  |  |  |  |  | } | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  | # require this clause | 
| 127 | 41 |  |  |  |  | 145 | $occur = 'MUST'; | 
| 128 |  |  |  |  |  |  | } | 
| 129 |  |  |  |  |  |  | elsif (s/^OR\s+//) { | 
| 130 | 42 | 100 |  |  |  | 147 | if (@clauses) { | 
| 131 | 38 |  |  |  |  | 110 | $clauses[-1]{occur} = 'SHOULD'; | 
| 132 |  |  |  |  |  |  | } | 
| 133 | 42 |  |  |  |  | 114 | $occur = 'SHOULD'; | 
| 134 |  |  |  |  |  |  | } | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | # detect tokens which cause this clause to be required or negated | 
| 137 | 578 | 100 |  |  |  | 5115 | if (s/$neg_re//) { | 
|  |  | 100 |  |  |  |  |  | 
| 138 | 55 |  |  |  |  | 156 | $occur = 'MUST_NOT'; | 
| 139 |  |  |  |  |  |  | } | 
| 140 |  |  |  |  |  |  | elsif (s/$req_re//) { | 
| 141 | 41 |  |  |  |  | 94 | $occur = 'MUST'; | 
| 142 |  |  |  |  |  |  | } | 
| 143 |  |  |  |  |  |  |  | 
| 144 |  |  |  |  |  |  | # set the field | 
| 145 | 578 | 100 |  |  |  | 4186 | my $fields = s/^$field_re// ? [$1] : $default_fields; | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | # if a phrase label is detected... | 
| 148 | 578 | 100 |  |  |  | 6853 | if (s/$self->{phrase_re}//) { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 149 | 96 |  |  |  |  | 236 | my $query; | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | # retrieve the text and analyze it | 
| 152 | 96 |  |  |  |  | 381 | my $orig_phrase_text = delete $self->{phrases}{$1}; | 
| 153 | 96 |  |  |  |  | 247 | my $token_texts      = $self->_analyze($orig_phrase_text); | 
| 154 | 96 | 50 |  |  |  | 316 | if (@$token_texts) { | 
| 155 | 96 |  |  |  |  | 302 | my $query = $self->_get_field_query( $fields, $token_texts ); | 
| 156 | 96 | 50 |  |  |  | 873 | push @clauses, { query => $query, occur => $occur } | 
| 157 |  |  |  |  |  |  | if defined $query; | 
| 158 |  |  |  |  |  |  | } | 
| 159 |  |  |  |  |  |  | } | 
| 160 |  |  |  |  |  |  | # if a label indicating a bool group is detected... | 
| 161 |  |  |  |  |  |  | elsif (s/$self->{bool_group_re}//) { | 
| 162 |  |  |  |  |  |  | # parse boolean subqueries recursively | 
| 163 | 83 |  |  |  |  | 363 | my $inner_text = delete $self->{bool_groups}{$1}; | 
| 164 | 83 |  |  |  |  | 294 | my $query = $self->parse( $inner_text, $fields ); | 
| 165 | 83 |  |  |  |  | 473 | push @clauses, { query => $query, occur => $occur }; | 
| 166 |  |  |  |  |  |  | } | 
| 167 |  |  |  |  |  |  | # what's left is probably a term | 
| 168 |  |  |  |  |  |  | elsif (s/([^"(\s]+)//) { | 
| 169 | 399 |  |  |  |  | 1252 | my $token_texts = $self->_analyze($1); | 
| 170 | 399 |  |  |  |  | 1265 | @$token_texts = grep { $_ ne '' } @$token_texts; | 
|  | 399 |  |  |  |  | 1552 |  | 
| 171 | 399 | 100 |  |  |  | 1701 | if (@$token_texts) { | 
| 172 | 357 |  |  |  |  | 1237 | my $query = $self->_get_field_query( $fields, $token_texts ); | 
| 173 | 357 |  |  |  |  | 3028 | push @clauses, { occur => $occur, query => $query }; | 
| 174 |  |  |  |  |  |  | } | 
| 175 |  |  |  |  |  |  | } | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  |  | 
| 178 | 348 | 100 | 100 |  |  | 2852 | if ( @clauses == 1 and $clauses[0]{occur} ne 'MUST_NOT' ) { | 
| 179 |  |  |  |  |  |  | # if it's just a simple query, return it unwrapped | 
| 180 | 147 |  |  |  |  | 917 | return $clauses[0]{query}; | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  | else { | 
| 183 |  |  |  |  |  |  | # otherwise, build a boolean query | 
| 184 | 201 |  |  |  |  | 1467 | my $bool_query = KinoSearch1::Search::BooleanQuery->new; | 
| 185 | 201 |  |  |  |  | 623 | for my $clause (@clauses) { | 
| 186 | 389 |  |  |  |  | 1488 | $bool_query->add_clause( | 
| 187 |  |  |  |  |  |  | query => $clause->{query}, | 
| 188 |  |  |  |  |  |  | occur => $clause->{occur}, | 
| 189 |  |  |  |  |  |  | ); | 
| 190 |  |  |  |  |  |  | } | 
| 191 | 201 |  |  |  |  | 1641 | return $bool_query; | 
| 192 |  |  |  |  |  |  | } | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | # Wrap a TermQuery/PhraseQuery to deal with multiple fields. | 
| 196 |  |  |  |  |  |  | sub _get_field_query { | 
| 197 | 453 |  |  | 453 |  | 861 | my ( $self, $fields, $token_texts ) = @_; | 
| 198 |  |  |  |  |  |  |  | 
| 199 | 534 |  |  |  |  | 1913 | my @queries = grep { defined $_ } | 
|  | 534 |  |  |  |  | 1301 |  | 
| 200 | 453 |  |  |  |  | 1021 | map { $self->_gen_single_field_query( $_, $token_texts ) } @$fields; | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 453 | 50 |  |  |  | 1963 | if ( @queries == 0 ) { | 
|  |  | 100 |  |  |  |  |  | 
| 203 | 0 |  |  |  |  | 0 | return; | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  | elsif ( @queries == 1 ) { | 
| 206 | 408 |  |  |  |  | 1112 | return $queries[0]; | 
| 207 |  |  |  |  |  |  | } | 
| 208 |  |  |  |  |  |  | else { | 
| 209 | 45 |  |  |  |  | 456 | my $wrapper_query = KinoSearch1::Search::BooleanQuery->new; | 
| 210 | 45 |  |  |  |  | 109 | for my $query (@queries) { | 
| 211 | 126 |  |  |  |  | 514 | $wrapper_query->add_clause( | 
| 212 |  |  |  |  |  |  | query => $query, | 
| 213 |  |  |  |  |  |  | occur => 'SHOULD', | 
| 214 |  |  |  |  |  |  | ); | 
| 215 |  |  |  |  |  |  | } | 
| 216 | 45 |  |  |  |  | 148 | return $wrapper_query; | 
| 217 |  |  |  |  |  |  | } | 
| 218 |  |  |  |  |  |  | } | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | # Create a TermQuery, a PhraseQuery, or nothing. | 
| 221 |  |  |  |  |  |  | sub _gen_single_field_query { | 
| 222 | 534 |  |  | 534 |  | 964 | my ( $self, $field, $token_texts ) = @_; | 
| 223 |  |  |  |  |  |  |  | 
| 224 | 534 | 100 |  |  |  | 1598 | if ( @$token_texts == 1 ) { | 
|  |  | 50 |  |  |  |  |  | 
| 225 | 468 |  |  |  |  | 3134 | my $term = KinoSearch1::Index::Term->new( $field, $token_texts->[0] ); | 
| 226 | 468 |  |  |  |  | 6021 | return KinoSearch1::Search::TermQuery->new( term => $term ); | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  | elsif ( @$token_texts > 1 ) { | 
| 229 | 66 |  |  |  |  | 465 | my $phrase_query = KinoSearch1::Search::PhraseQuery->new; | 
| 230 | 66 |  |  |  |  | 198 | for my $token_text (@$token_texts) { | 
| 231 | 140 |  |  |  |  | 531 | $phrase_query->add_term( | 
| 232 |  |  |  |  |  |  | KinoSearch1::Index::Term->new( $field, $token_text ), | 
| 233 |  |  |  |  |  |  | ); | 
| 234 |  |  |  |  |  |  | } | 
| 235 | 66 |  |  |  |  | 226 | return $phrase_query; | 
| 236 |  |  |  |  |  |  | } | 
| 237 |  |  |  |  |  |  | } | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | # break a string into tokens | 
| 240 |  |  |  |  |  |  | sub _analyze { | 
| 241 | 495 |  |  | 495 |  | 1300 | my ( $self, $string ) = @_; | 
| 242 |  |  |  |  |  |  |  | 
| 243 | 495 |  |  |  |  | 4231 | my $token_batch = KinoSearch1::Analysis::TokenBatch->new; | 
| 244 | 495 |  |  |  |  | 2260 | $token_batch->append( $string, 0, bytes::length($string) ); | 
| 245 | 495 |  |  |  |  | 6270 | $token_batch = $self->{analyzer}->analyze($token_batch); | 
| 246 | 495 |  |  |  |  | 11022 | my @token_texts; | 
| 247 | 495 |  |  |  |  | 1984 | while ( $token_batch->next ) { | 
| 248 | 565 |  |  |  |  | 3298 | push @token_texts, $token_batch->get_text; | 
| 249 |  |  |  |  |  |  | } | 
| 250 | 495 |  |  |  |  | 2433 | return \@token_texts; | 
| 251 |  |  |  |  |  |  | } | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | # replace all phrases with labels | 
| 254 |  |  |  |  |  |  | sub _extract_phrases { | 
| 255 | 348 |  |  | 348 |  | 672 | my ( $self, $qstring ) = @_; | 
| 256 |  |  |  |  |  |  |  | 
| 257 | 348 |  |  |  |  | 2465 | while ( $qstring =~ $quoted_re ) { | 
| 258 | 96 |  |  |  |  | 628 | my $label | 
| 259 |  |  |  |  |  |  | = sprintf( "_phrase$self->{randstring}%d", $self->{label_inc}++ ); | 
| 260 | 96 |  |  |  |  | 987 | $qstring =~ s/$quoted_re/$label /;    # extra space for safety | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | # store the phrase text for later retrieval | 
| 263 | 96 |  |  |  |  | 994 | $self->{phrases}{$label} = $1; | 
| 264 |  |  |  |  |  |  | } | 
| 265 |  |  |  |  |  |  |  | 
| 266 | 348 |  |  |  |  | 917 | return $qstring; | 
| 267 |  |  |  |  |  |  | } | 
| 268 |  |  |  |  |  |  |  | 
| 269 |  |  |  |  |  |  | # recursively replace boolean groupings with labels, innermost first | 
| 270 |  |  |  |  |  |  | sub _extract_boolgroups { | 
| 271 | 348 |  |  | 348 |  | 998 | my ( $self, $qstring ) = @_; | 
| 272 |  |  |  |  |  |  |  | 
| 273 | 348 |  |  |  |  | 2317 | while ( $qstring =~ $paren_re ) { | 
| 274 | 83 |  |  |  |  | 526 | my $label = sprintf( "_boolgroup$self->{randstring}%d", | 
| 275 |  |  |  |  |  |  | $self->{label_inc}++ ); | 
| 276 | 83 |  |  |  |  | 899 | $qstring =~ s/$paren_re/$label /;    # extra space for safety | 
| 277 |  |  |  |  |  |  |  | 
| 278 |  |  |  |  |  |  | # store the text for later retrieval | 
| 279 | 83 |  |  |  |  | 670 | $self->{bool_groups}{$label} = $1; | 
| 280 |  |  |  |  |  |  | } | 
| 281 |  |  |  |  |  |  |  | 
| 282 | 348 |  |  |  |  | 810 | return $qstring; | 
| 283 |  |  |  |  |  |  | } | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | 1; | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | __END__ |