File Coverage

blib/lib/Search/QueryParser.pm
Criterion Covered Total %
statement 82 84 97.6
branch 41 52 78.8
condition 41 45 91.1
subroutine 10 10 100.0
pod 4 5 80.0
total 178 196 90.8


line stmt bran cond sub pod time code
1             package Search::QueryParser;
2              
3 1     1   24512 use strict;
  1         3  
  1         56  
4 1     1   5 use warnings;
  1         2  
  1         50  
5 1     1   872 use locale;
  1         259  
  1         5  
6              
7             our $VERSION = "0.94";
8              
9             =head1 NAME
10              
11             Search::QueryParser - parses a query string into a data structure
12             suitable for external search engines
13              
14             =head1 SYNOPSIS
15              
16             my $qp = new Search::QueryParser;
17             my $s = '+mandatoryWord -excludedWord +field:word "exact phrase"';
18             my $query = $qp->parse($s) or die "Error in query : " . $qp->err;
19             $someIndexer->search($query);
20              
21             # query with comparison operators and implicit plus (second arg is true)
22             $query = $qp->parse("txt~'^foo.*' date>='01.01.2001' date<='02.02.2002'", 1);
23              
24             # boolean operators (example below is equivalent to "+a +(b c) -d")
25             $query = $qp->parse("a AND (b OR c) AND NOT d");
26              
27             # subset of rows
28             $query = $qp->parse("Id#123,444,555,666 AND (b OR c)");
29              
30              
31             =head1 DESCRIPTION
32              
33             This module parses a query string into a data structure to be handled
34             by external search engines. For examples of such engines, see
35             L and L.
36              
37             The query string can contain simple terms, "exact phrases", field
38             names and comparison operators, '+/-' prefixes, parentheses, and
39             boolean connectors.
40              
41             The parser can be parameterized by regular expressions for specific
42             notions of "term", "field name" or "operator" ; see the L
43             method. The parser has no support for lemmatization or other term
44             transformations : these should be done externally, before passing the
45             query data structure to the search engine.
46              
47             The data structure resulting from a parsed query is a tree of terms
48             and operators, as described below in the L method. The
49             interpretation of the structure is up to the external search engine
50             that will receive the parsed query ; the present module does not make
51             any assumption about what it means to be "equal" or to "contain" a
52             term.
53              
54              
55             =head1 QUERY STRING
56              
57             The query string is decomposed into "items", where
58             each item has an optional sign prefix,
59             an optional field name and comparison operator,
60             and a mandatory value.
61              
62             =head2 Sign prefix
63              
64             Prefix '+' means that the item is mandatory.
65             Prefix '-' means that the item must be excluded.
66             No prefix means that the item will be searched
67             for, but is not mandatory.
68              
69             As far as the result set is concerned,
70             C<+a +b c> is strictly equivalent to C<+a +b> : the search engine will
71             return documents containing both terms 'a' and 'b', and possibly
72             also term 'c'. However, if the search engine also returns
73             relevance scores, query C<+a +b c> might give a better score
74             to documents containing also term 'c'.
75              
76             See also section L below, which is another
77             way to combine items into a query.
78              
79             =head2 Field name and comparison operator
80              
81             Internally, each query item has a field name and comparison
82             operator; if not written explicitly in the query, these
83             take default values C<''> (empty field name) and
84             C<':'> (colon operator).
85              
86             Operators have a left operand (the field name) and
87             a right operand (the value to be compared with);
88             for example, C means "search documents containing
89             term 'bar' in field 'foo'", whereas C means
90             "search documents where field 'foo' has exact value 'bar'".
91              
92             Here is the list of admitted operators with their intended meaning :
93              
94             =over
95              
96             =item C<:>
97              
98             treat value as a term to be searched within field.
99             This is the default operator.
100              
101             =item C<~> or C<=~>
102              
103             treat value as a regex; match field against the regex.
104              
105             =item C
106              
107             negation of above
108              
109             =item C<==> or C<=>, C=>, C=>, C, C>, C>
110              
111             classical relational operators
112              
113             =item C<#>
114              
115             Inclusion in the set of comma-separated integers supplied
116             on the right-hand side.
117              
118              
119             =back
120              
121             Operators C<:>, C<~>, C<=~>, C and C<#> admit an empty
122             left operand (so the field name will be C<''>).
123             Search engines will usually interpret this as
124             "any field" or "the whole data record".
125              
126             =head2 Value
127              
128             A value (right operand to a comparison operator) can be
129              
130             =over
131              
132             =item *
133              
134             just a term (as recognized by regex C, see L method below)
135              
136             =item *
137              
138             A quoted phrase, i.e. a collection of terms within
139             single or double quotes.
140              
141             Quotes can be used not only for "exact phrases", but also
142             to prevent misinterpretation of some values : for example
143             C<-2> would mean "value '2' with prefix '-'",
144             in other words "exclude term '2'", so if you want to search for
145             value -2, you should write C<"-2"> instead. In the
146             last example of the synopsis, quotes were used to
147             prevent splitting of dates into several search terms.
148              
149             =item *
150              
151             a subquery within parentheses.
152             Field names and operators distribute over parentheses, so for
153             example C is equivalent to
154             C.
155             Nested field names such as C are not allowed.
156             Sign prefixes do not distribute : C<+(foo bar) +bie> is not
157             equivalent to C<+foo +bar +bie>.
158              
159              
160             =back
161              
162              
163             =head2 Boolean connectors
164              
165             Queries can contain boolean connectors 'AND', 'OR', 'NOT'
166             (or their equivalent in some other languages).
167             This is mere syntactic sugar for the '+' and '-' prefixes :
168             C is translated into C<+a +b>;
169             C is translated into C<(a b)>;
170             C is translated into C<-a>.
171             C<+a OR b> does not make sense,
172             but it is translated into C<(a b)>, under the assumption
173             that the user understands "OR" better than a
174             '+' prefix.
175             C<-a OR b> does not make sense either,
176             but has no meaningful approximation, so it is rejected.
177              
178             Combinations of AND/OR clauses must be surrounded by
179             parentheses, i.e. C<(a AND b) OR c> or C are
180             allowed, but C is not.
181              
182              
183             =head1 METHODS
184              
185             =over
186              
187             =cut
188              
189 1         1367 use constant DEFAULT => {
190             rxTerm => qr/[^\s()]+/,
191             rxField => qr/\w+/,
192              
193             rxOp => qr/==|<=|>=|!=|=~|!~|[:=<>~#]/, # longest ops first !
194             rxOpNoField => qr/=~|!~|[~:#]/, # ops that admit an empty left operand
195              
196             rxAnd => qr/AND|ET|UND|E/,
197             rxOr => qr/OR|OU|ODER|O/,
198             rxNot => qr/NOT|PAS|NICHT|NON/,
199              
200             defField => "",
201 1     1   294 };
  1         2  
202              
203             =item new
204              
205             new(rxTerm => qr/.../, rxOp => qr/.../, ...)
206              
207             Creates a new query parser, initialized with (optional) regular
208             expressions :
209              
210             =over
211              
212             =item rxTerm
213              
214             Regular expression for matching a term.
215             Of course it should not match the empty string.
216             Default value is C.
217             A term should not be allowed to include parenthesis, otherwise the parser
218             might get into trouble.
219              
220             =item rxField
221              
222             Regular expression for matching a field name.
223             Default value is C (meaning of C<\w> according to C).
224              
225             =item rxOp
226              
227             Regular expression for matching an operator.
228             Default value is C=|E=|!=|=~|!~|:|=|E|E|~/>.
229             Note that the longest operators come first in the regex, because
230             "alternatives are tried from left to right"
231             (see L) :
232             this is to avoid C=3> being parsed as
233             C '=3'>.
234              
235             =item rxOpNoField
236              
237             Regular expression for a subset of the operators
238             which admit an empty left operand (no field name).
239             Default value is C.
240             Such operators can be meaningful for comparisons
241             with "any field" or with "the whole record" ;
242             the precise interpretation depends on the search engine.
243              
244             =item rxAnd
245              
246             Regular expression for boolean connector AND.
247             Default value is C.
248              
249             =item rxOr
250              
251             Regular expression for boolean connector OR.
252             Default value is C.
253              
254             =item rxNot
255              
256             Regular expression for boolean connector NOT.
257             Default value is C.
258              
259             =item defField
260              
261             If no field is specified in the query, use I.
262             The default is the empty string "".
263              
264             =back
265              
266             =cut
267              
268             sub new {
269 2     2 1 16 my $class = shift;
270 2 50       11 my $args = ref $_[0] eq 'HASH' ? $_[0] : {@_};
271              
272             # create object with default values
273 2         8 my $self = bless {}, $class;
274             $self->{$_} = $args->{$_} || DEFAULT->{$_}
275 2   100     76 foreach qw(rxTerm rxField rxOp rxOpNoField rxAnd rxOr rxNot defField);
276 2         10 return $self;
277             }
278              
279             =item parse
280              
281             $q = $queryParser->parse($queryString, $implicitPlus);
282              
283             Returns a data structure corresponding to the parsed string.
284             The second argument is optional; if true, it adds an implicit
285             '+' in front of each term without prefix, so
286             C is equivalent to C.
287             This is often seen in common WWW search engines
288             as an option "match all words".
289              
290             The return value has following structure :
291              
292             { '+' => [{field=>'f1', op=>':', value=>'v1', quote=>'q1'},
293             {field=>'f2', op=>':', value=>'v2', quote=>'q2'}, ...],
294             '' => [...],
295             '-' => [...]
296             }
297              
298             In other words, it is a hash ref with 3 keys C<'+'>, C<''> and C<'-'>,
299             corresponding to the 3 sign prefixes (mandatory, ordinary or excluded
300             items). Each key holds either a ref to an array of items, or
301             C (no items with this prefix in the query).
302              
303             An I is a hash ref containing
304              
305             =over
306              
307             =item C
308              
309             scalar, field name (may be the empty string)
310              
311             =item C
312              
313             scalar, operator
314              
315             =item C
316              
317             scalar, character that was used for quoting the value ('"', "'" or undef)
318              
319             =item C
320              
321             Either
322              
323             =over
324              
325             =item *
326              
327             a scalar (simple term), or
328              
329             =item *
330              
331             a recursive ref to another query structure. In that case,
332             C is necessarily C<'()'> ; this corresponds
333             to a subquery in parentheses.
334              
335             =back
336              
337             =back
338              
339             In case of a parsing error, C returns C;
340             method L can be called to get an explanatory message.
341              
342             =cut
343              
344              
345              
346 10     10 1 628 sub parse { return (_parse(@_))[0]; } # just return 1st result from _parse
347              
348             sub _parse{ # returns ($parsedQuery, $restOfString)
349 14     14   20 my $self = shift;
350 14         19 my $s = shift;
351 14         15 my $implicitPlus = shift;
352 14         14 my $parentField = shift; # only for recursive calls
353 14         17 my $parentOp = shift; # only for recursive calls
354              
355 14         20 my $q = {};
356 14         16 my $preBool = '';
357 14         16 my $err = undef;
358 14         16 my $s_orig = $s;
359              
360 14         36 $s =~ s/^\s+//; # remove leading spaces
361              
362             LOOP :
363 14         29 while ($s) { # while query string is not empty
364 35         52 for ($s) { # temporary alias to $_ for easier regex application
365 35 100       66 my $sign = $implicitPlus ? "+" : "";
366 35   100     125 my $field = $parentField || $self->{defField};
367 35   100     104 my $op = $parentOp || ":";
368              
369 35 100       82 last LOOP if m/^\)/; # return from recursive call if meeting a ')'
370              
371             # try to parse sign prefix ('+', '-' or 'NOT')
372 32 100       227 if (s/^(\+|-)\s*//) { $sign = $1; }
  7 100       17  
373 1         3 elsif (s/^($self->{rxNot})\b\s*//) { $sign = '-'; }
374              
375             # try to parse field name and operator
376 32 100 100     809 if (s/^"($self->{rxField})"\s*($self->{rxOp})\s*// # "field name" and op
      100        
      100        
377             or
378             s/^'($self->{rxField})'\s*($self->{rxOp})\s*// # 'field name' and op
379             or
380             s/^($self->{rxField})\s*($self->{rxOp})\s*// # field name and op
381             or
382             s/^()($self->{rxOpNoField})\s*//) { # no field, just op
383 13 100       31 $err = "field '$1' inside '$parentField'", last LOOP if $parentField;
384 12         34 ($field, $op) = ($1, $2);
385             }
386              
387             # parse a value (single term or quoted list or parens)
388 31         41 my $subQ = undef;
389              
390 31 100 100     359 if (s/^(")([^"]*?)"\s*// or
    100          
    50          
391             s/^(')([^']*?)'\s*//) { # parse a quoted string.
392 6         16 my ($quote, $val) = ($1, $2);
393 6         35 $subQ = {field=>$field, op=>$op, value=>$val, quote=>$quote};
394             }
395             elsif (s/^\(\s*//) { # parse parentheses
396 4         16 my ($r, $s2) = $self->_parse($s, $implicitPlus, $field, $op);
397 4 100       13 $err = $self->err, last LOOP if not $r;
398 3         5 $s = $s2;
399 3 50       17 $s =~ s/^\)\s*// or $err = "no matching ) ", last LOOP;
400 3         588 $subQ = {field=>'', op=>'()', value=>$r};
401             }
402             elsif (s/^($self->{rxTerm})\s*//) { # parse a single term
403 21         104 $subQ = {field=>$field, op=>$op, value=>$1};
404             }
405              
406             # deal with boolean connectors
407 30         55 my $postBool = '';
408 30 100       263 if (s/^($self->{rxAnd})\b\s*//) { $postBool = 'AND' }
  3 100       6  
409 1         2 elsif (s/^($self->{rxOr})\b\s*//) { $postBool = 'OR' }
410 30 50 100     87 $err = "cannot mix AND/OR in requests; use parentheses", last LOOP
      66        
411             if $preBool and $postBool and $preBool ne $postBool;
412 30   100     92 my $bool = $preBool || $postBool;
413 30         32 $preBool = $postBool; # for next loop
414              
415             # insert subquery in query structure
416 30 50       57 if ($subQ) {
417 30 50 66     83 $sign = '' if $sign eq '+' and $bool eq 'OR';
418 30 100 100     108 $sign = '+' if $sign eq '' and $bool eq 'AND';
419 30 50 66     73 $err = 'operands of "OR" cannot have "-" or "NOT" prefix', last LOOP
420             if $sign eq '-' and $bool eq 'OR';
421 30         35 push @{$q->{$sign}}, $subQ;
  30         192  
422             }
423             else {
424 0 0       0 $err = "unexpected string in query : $_", last LOOP if $_;
425 0 0       0 $err = "missing value after $field $op", last LOOP if $field;
426             }
427             }
428             }
429              
430 14 100 50     72 $err ||= "no positive value in query" unless $q->{'+'} or $q->{''};
      100        
431 14 100       33 $self->{err} = $err ? "[$s_orig] : $err" : "";
432 14 100       27 $q = undef if $err;
433 14         55 return ($q, $s);
434             }
435              
436              
437             =item err
438              
439             $msg = $queryParser->err;
440              
441             Message describing the last parse error
442              
443             =cut
444              
445             sub err {
446 2     2 1 251 my $self = shift;
447 2         13 return $self->{err};
448             }
449              
450              
451             =item unparse
452              
453             $s = $queryParser->unparse($query);
454              
455             Returns a string representation of the C<$query> data structure.
456              
457             =cut
458              
459             sub unparse {
460 12     12 1 398 my $self = shift;
461 12         14 my $q = shift;
462              
463 12         14 my @subQ;
464 12         19 foreach my $prefix ('+', '', '-') {
465 36 100       87 next if not $q->{$prefix};
466 18         19 push @subQ, $prefix . $self->unparse_subQ($_) foreach @{$q->{$prefix}};
  18         59  
467             }
468 12         74 return join " ", @subQ;
469             }
470              
471             sub unparse_subQ {
472 30     30 0 37 my $self = shift;
473 30         29 my $subQ = shift;
474              
475 30 100       92 return "(" . $self->unparse($subQ->{value}) . ")" if $subQ->{op} eq '()';
476 27   100     88 my $quote = $subQ->{quote} || "";
477 27         134 return "$subQ->{field}$subQ->{op}$quote$subQ->{value}$quote";
478             }
479              
480             =back
481              
482             =head1 AUTHOR
483              
484             Laurent Dami, Elaurent.dami AT etat ge chE
485              
486             =head1 COPYRIGHT AND LICENSE
487              
488             Copyright (C) 2005, 2007 by Laurent Dami.
489              
490             This library is free software; you can redistribute it and/or modify
491             it under the same terms as Perl itself.
492              
493             =cut
494              
495             1;
496