File Coverage

blib/lib/App/ElasticSearch/Utilities/QueryString.pm
Criterion Covered Total %
statement 80 83 96.3
branch 24 32 75.0
condition 6 9 66.6
subroutine 12 12 100.0
pod 1 1 100.0
total 123 137 89.7


line stmt bran cond sub pod time code
1             package App::ElasticSearch::Utilities::QueryString;
2             # ABSTRACT: CLI query string fixer
3              
4 3     3   245746 use v5.16;
  3         14  
5 3     3   17 use warnings;
  3         7  
  3         282  
6              
7             our $VERSION = '8.8'; # VERSION
8              
9 3     3   1979 use App::ElasticSearch::Utilities qw(:config);
  3         11  
  3         41  
10 3     3   3232 use App::ElasticSearch::Utilities::Query;
  3         17  
  3         160  
11 3     3   28 use CLI::Helpers qw(:output);
  3         4  
  3         36  
12 3     3   2803 use Module::Pluggable::Object;
  3         24346  
  3         155  
13 3     3   23 use Moo;
  3         6  
  3         26  
14 3     3   1338 use Ref::Util qw(is_arrayref);
  3         8  
  3         216  
15 3     3   18 use Types::Standard qw(ArrayRef Enum HashRef);
  3         5  
  3         30  
16              
17 3     3   9584 use namespace::autoclean;
  3         7  
  3         32  
18              
19              
20             my %JOINING = map { $_ => 1 } qw( AND OR );
21             my %TRAILING = map { $_ => 1 } qw( AND OR NOT );
22              
23              
24             has 'context' => (
25             is => 'rw',
26             isa => Enum[qw(query filter)],
27             lazy => 1,
28             default => sub { 'query' },
29             );
30              
31              
32             has search_path => (
33             is => 'rw',
34             isa => ArrayRef,
35             default => sub {[]},
36             );
37              
38              
39             has default_join => (
40             is => 'rw',
41             isa => Enum[qw(AND OR)],
42             default => sub { 'AND' },
43             );
44              
45              
46             has plugins => (
47             is => 'ro',
48             isa => ArrayRef,
49             builder => '_build_plugins',
50             lazy => 1,
51             );
52              
53              
54             has fields_meta => (
55             is => 'rw',
56             isa => HashRef,
57             default => sub { {} },
58             );
59              
60              
61             sub expand_query_string {
62 14     14 1 106 my $self = shift;
63              
64 14         557 my $query = App::ElasticSearch::Utilities::Query->new(
65             fields_meta => $self->fields_meta,
66             );
67 14         13310 my @processed = ();
68 14         46 TOKEN: foreach my $token (@_) {
69 22         45 foreach my $p (@{ $self->plugins }) {
  22         608  
70 70         2296 my $res = $p->handle_token($token);
71 70 100       280 if( defined $res ) {
72 18 100       47 push @processed, is_arrayref($res) ? @{$res} : $res;
  8         19  
73 18         61 next TOKEN;
74             }
75             }
76 4         21 push @processed, { query_string => $token };
77             }
78              
79 14         81 debug({color=>"magenta"}, "Processed parts");
80 14         195 debug_var({color=>"magenta"},\@processed);
81              
82 14 50       853 my $context = $self->context eq 'query' ? 'must' : 'filter';
83 14         153 my $invert=0;
84 14         30 my @dangling=();
85 14         25 my @qs=();
86 14         35 foreach my $part (@processed) {
87 22 100       87 if( exists $part->{dangles} ) {
    100          
    50          
    0          
88 6         15 push @dangling, $part->{query_string};
89             }
90             elsif( exists $part->{query_string} ) {
91 4         13 push @qs, @dangling, $part->{query_string};
92 4         8 @dangling=(),
93             }
94             elsif( exists $part->{condition} ) {
95 12 50       32 my $target = $invert ? 'must_not' : $context;
96 12         61 $query->add_bool( $target => $part->{condition} );
97 12         54 @dangling=();
98             }
99             elsif( exists $part->{nested} ) {
100 0         0 $query->nested($part->{nested}{query});
101 0         0 $query->nested_path($part->{nested}{path});
102 0         0 @dangling=();
103             }
104             # Carry over the Inversion for instance where we jump out of the QS
105 22   100     88 $invert = exists $part->{invert} && $part->{invert};
106             }
107 14 100       39 if(@qs) {
108 3   33     38 pop @qs while @qs && exists $TRAILING{$qs[-1]};
109 3   66     20 shift @qs while @qs && exists $JOINING{$qs[0]};
110              
111             # Ensure there's a joining token, otherwise use our default
112 3 100       12 if( @qs > 1 ) {
113 2         5 my $prev_query = 0;
114 2         4 my @joined = ();
115 2         6 foreach my $part ( @qs ) {
116 6 100       21 if( $prev_query ) {
117 1 50       101 push @joined, $self->default_join() unless exists $JOINING{$part};
118             }
119 6         16 push @joined, $part;
120             # Here we include AND, NOT, OR
121 6 100       20 $prev_query = exists $TRAILING{$part} ? 0 : 1;
122             }
123 2         12 @qs = @joined;
124             }
125             }
126 14 100       67 $query->add_bool($context => { query_string => { query => join(' ', @qs) } }) if @qs;
127              
128 14         78 return $query;
129             }
130              
131             # Builder Routines for QS Objects
132             sub _build_plugins {
133 1     1   16 my $self = shift;
134 1         7 my $globals = es_globals('plugins');
135             my $finder = Module::Pluggable::Object->new(
136 1         3 search_path => ['App::ElasticSearch::Utilities::QueryString',@{ $self->search_path }],
  1         28  
137             except => [qw(
138             App::ElasticSearch::Utilities::QueryString::AutoEscape
139             App::ElasticSearch::Utilities::QueryString::Plugin
140             )],
141             instantiate => 'new',
142             );
143 1         36 my @plugins;
144 1 50       29 foreach my $p ( sort { $a->priority <=> $b->priority || $a->name cmp $b->name }
  12 50       704  
145             $finder->plugins(
146             fields_meta => $self->fields_meta,
147             options => defined $globals ? $globals : {},
148             )
149             ) {
150 7         180 debug(sprintf "Loaded %s with priority:%d", $p->name, $p->priority);
151 7         446 push @plugins, $p;
152             }
153 1         60 return \@plugins;
154             }
155              
156             # Return true
157             1;
158              
159             __END__
160              
161             =pod
162              
163             =head1 NAME
164              
165             App::ElasticSearch::Utilities::QueryString - CLI query string fixer
166              
167             =head1 VERSION
168              
169             version 8.8
170              
171             =head1 SYNOPSIS
172              
173             This class provides a pluggable architecture to expand query strings on the
174             command-line into complex Elasticsearch queries.
175              
176             =head1 ATTRIBUTES
177              
178             =head2 context
179              
180             Defaults to 'query', but can also be set to 'filter' so the elements will be
181             added to the 'must' or 'filter' parameter.
182              
183             =head2 search_path
184              
185             An array reference of additional namespaces to search for loading the query string
186             processing plugins. Example:
187              
188             $qs->search_path([qw(My::Company::QueryString)]);
189              
190             This will search:
191              
192             App::ElasticSearch::Utilities::QueryString::*
193             My::Company::QueryString::*
194              
195             For query processing plugins.
196              
197             =head2 default_join
198              
199             When fixing up the query string, if two tokens are found next to each other
200             missing a joining token, join using this token. Can be either C<AND> or C<OR>,
201             and defaults to C<AND>.
202              
203             =head2 plugins
204              
205             Array reference of ordered query string processing plugins, lazily assembled.
206              
207             =head2 fields_meta
208              
209             A hash reference with the field data from L<App::ElasticSearch::Utilities::es_index_fields>.
210              
211             =head1 METHODS
212              
213             =head2 expand_query_string(@tokens)
214              
215             This function takes a list of tokens, often from the command line via @ARGV. Uses
216             a plugin infrastructure to allow customization.
217              
218             Returns: L<App::ElasticSearch::Utilities::Query> object
219              
220             =head1 TOKENS
221              
222             The token expansion plugins can return undefined, which is basically a noop on the token.
223             The plugin can return a hash reference, which marks that token as handled and no other plugins
224             receive that token. The hash reference may contain:
225              
226             =over 2
227              
228             =item query_string
229              
230             This is the rewritten bits that will be reassembled into the final query string.
231              
232             =item condition
233              
234             This is usually a hash reference representing the condition going into the bool query. For instance:
235              
236             { terms => { field => [qw(alice bob charlie)] } }
237              
238             Or
239              
240             { prefix => { user_agent => 'Go ' } }
241              
242             These conditions will wind up in the B<must> or B<must_not> section of the B<bool> query depending on the
243             state of the invert flag.
244              
245             =item invert
246              
247             This is used by the bareword "not" to track whether the token invoked a flip from the B<must> to the B<must_not>
248             state. After each token is processed, if it didn't set this flag, the flag is reset.
249              
250             =item dangles
251              
252             This is used for bare words like "not", "or", and "and" to denote that these terms cannot dangle from the
253             beginning or end of the query_string. This allows the final pass of the query_string builder to strip these
254             words to prevent syntax errors.
255              
256             =back
257              
258             =head1 Extended Syntax
259              
260             The search string is pre-analyzed before being sent to ElasticSearch. The following plugins
261             work to manipulate the query string and provide richer, more complete syntax for CLI applications.
262              
263             =head2 App::ElasticSearch::Utilities::QueryString::Barewords
264              
265             The following barewords are transformed:
266              
267             or => OR
268             and => AND
269             not => NOT
270              
271             =head2 App::ElasticSearch::Utilities::QueryString::Text
272              
273             Provides field prefixes to manipulate the text search capabilities.
274              
275             =head3 Terms Query via '='
276              
277             Provide an '=' prefix to a query string parameter to promote that parameter to a C<term> filter.
278              
279             This allows for exact matches of a field without worrying about escaping Lucene special character filters.
280              
281             E.g.:
282              
283             user_agent:"Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1"
284              
285             Is evaluated into a weird query that doesn't do what you want. However:
286              
287             =user_agent:"Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1"
288              
289             Is translated into:
290              
291             { term => { user_agent => "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" } }
292              
293             =head3 Wildcard Query via '*'
294              
295             Provide an '*' prefix to a query string parameter to promote that parameter to a C<wildcard> filter.
296              
297             This uses the wildcard match for text fields to make matching more intuitive.
298              
299             E.g.:
300              
301             *user_agent:"Mozilla*"
302              
303             Is translated into:
304              
305             { wildcard => { user_agent => "Mozilla*" } }
306              
307             =head3 Regexp Query via '/'
308              
309             Provide an '/' prefix to a query string parameter to promote that parameter to a C<regexp> filter.
310              
311             If you want to use regexp matching for finding data, you can use:
312              
313             /message:'\\bden(ial|ied|y)'
314              
315             Is translated into:
316              
317             { regexp => { message => "\\bden(ial|ied|y)" } }
318              
319             =head3 Fuzzy Matching via '~'
320              
321             Provide an '~' prefix to a query string parameter to promote that parameter to a C<fuzzy> filter.
322              
323             ~message:deny
324              
325             Is translated into:
326              
327             { fuzzy => { message => "deny" } }
328              
329             =head3 Phrase Matching via '+'
330              
331             Provide an '+' prefix to a query string parameter to promote that parameter to a C<match_phrase> filter.
332              
333             +message:"login denied"
334              
335             Is translated into:
336              
337             { match_phrase => { message => "login denied" } }
338              
339             =head3 Automatic Match Queries for Text Fields
340              
341             If the field meta data is provided and the field is a C<text> type, the query
342             will automatically be mapped to a C<match> query.
343              
344             # message field is text
345             message:"foo"
346              
347             Is translated into:
348              
349             { match => { message => "foo" } }
350              
351             =head2 App::ElasticSearch::Utilities::QueryString::IP
352              
353             If a field is an IP address that uses CIDR notation, it's expanded to a range query.
354              
355             src_ip:10.0/8 => src_ip:[10.0.0.0 TO 10.255.255.255]
356              
357             =head2 App::ElasticSearch::Utilities::QueryString::Ranges
358              
359             This plugin translates some special comparison operators so you don't need to
360             remember them anymore.
361              
362             Example:
363              
364             price:<100
365              
366             Will translate into a:
367              
368             { range: { price: { lt: 100 } } }
369              
370             And:
371              
372             price:>50,<100
373              
374             Will translate to:
375              
376             { range: { price: { gt: 50, lt: 100 } } }
377              
378             =head3 Supported Operators
379              
380             B<gt> via E<gt>, B<gte> via E<gt>=, B<lt> via E<lt>, B<lte> via E<lt>=
381              
382             =head2 App::ElasticSearch::Utilities::QueryString::Underscored
383              
384             This plugin translates some special underscore surrounded tokens into
385             the Elasticsearch Query DSL.
386              
387             Implemented:
388              
389             =head3 _prefix_
390              
391             Example query string:
392              
393             _prefix_:useragent:'Go '
394              
395             Translates into:
396              
397             { prefix => { useragent => 'Go ' } }
398              
399             =head2 App::ElasticSearch::Utilities::QueryString::FileExpansion
400              
401             If the match ends in .dat, .txt, .csv, or .json then we attempt to read a file with that name and OR the condition:
402              
403             $ cat test.dat
404             50 1.2.3.4
405             40 1.2.3.5
406             30 1.2.3.6
407             20 1.2.3.7
408              
409             Or
410              
411             $ cat test.csv
412             50,1.2.3.4
413             40,1.2.3.5
414             30,1.2.3.6
415             20,1.2.3.7
416              
417             Or
418              
419             $ cat test.txt
420             1.2.3.4
421             1.2.3.5
422             1.2.3.6
423             1.2.3.7
424              
425             Or
426              
427             $ cat test.json
428             { "ip": "1.2.3.4" }
429             { "ip": "1.2.3.5" }
430             { "ip": "1.2.3.6" }
431             { "ip": "1.2.3.7" }
432              
433             We can source that file:
434              
435             src_ip:test.dat => src_ip:(1.2.3.4 1.2.3.5 1.2.3.6 1.2.3.7)
436             src_ip:test.json[ip] => src_ip:(1.2.3.4 1.2.3.5 1.2.3.6 1.2.3.7)
437              
438             This makes it simple to use the --data-file output options and build queries
439             based off previous queries. For .txt and .dat files, the delimiter for columns
440             in the file must be either a tab or a null. For files ending in
441             .csv, Text::CSV_XS is used for accurate parsing of the file format. Files
442             ending in .json are considered to be newline-delimited JSON.
443              
444             You can also specify the column of the data file to use, the default being the last column or (-1). Columns use
445             B<zero-based> indexing. This means the first column is index 0, the second is 1, and so on. The previous example can be rewritten
446             as:
447              
448             src_ip:test.dat[1]
449              
450             or:
451             src_ip:test.dat[-1]
452              
453             For newline delimited JSON files, you need to specify the key path you want to extract from the file. If we have a
454             JSON source file with:
455              
456             { "first": { "second": { "third": [ "bob", "alice" ] } } }
457             { "first": { "second": { "third": "ginger" } } }
458             { "first": { "second": { "nope": "fred" } } }
459              
460             We could search using:
461              
462             actor:test.json[first.second.third]
463              
464             Which would expand to:
465              
466             { "terms": { "actor": [ "alice", "bob", "ginger" ] } }
467              
468             This option will iterate through the whole file and unique the elements of the list. They will then be transformed into
469             an appropriate L<terms query|http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html>.
470              
471             =head3 Wildcards
472              
473             We can also have a group of wildcard or regexp in a file:
474              
475             $ cat wildcards.dat
476             *@gmail.com
477             *@yahoo.com
478              
479             To enable wildcard parsing, prefix the filename with a C<*>.
480              
481             es-search.pl to_address:*wildcards.dat
482              
483             Which expands the query to:
484              
485             {
486             "bool": {
487             "minimum_should_match":1,
488             "should": [
489             {"wildcard":{"to_address":{"value":"*@gmail.com"}}},
490             {"wildcard":{"to_address":{"value":"*@yahoo.com"}}}
491             ]
492             }
493             }
494              
495             No attempt is made to verify or validate the wildcard patterns.
496              
497             =head3 Regular Expressions
498              
499             If you'd like to specify a file full of regexp, you can do that as well:
500              
501             $ cat regexp.dat
502             .*google\.com$
503             .*yahoo\.com$
504              
505             To enable regexp parsing, prefix the filename with a C<~>.
506              
507             es-search.pl to_address:~regexp.dat
508              
509             Which expands the query to:
510              
511             {
512             "bool": {
513             "minimum_should_match":1,
514             "should": [
515             {"regexp":{"to_address":{"value":".*google\\.com$"}}},
516             {"regexp":{"to_address":{"value":".*yahoo\\.com$"}}}
517             ]
518             }
519             }
520              
521             No attempt is made to verify or validate the regexp expressions.
522              
523             =head2 App::ElasticSearch::Utilities::QueryString::Nested
524              
525             Implement the proposed nested query syntax early. Example:
526              
527             nested_path:"field:match AND string"
528              
529             =head1 AUTHOR
530              
531             Brad Lhotsky <brad@divisionbyzero.net>
532              
533             =head1 COPYRIGHT AND LICENSE
534              
535             This software is Copyright (c) 2024 by Brad Lhotsky.
536              
537             This is free software, licensed under:
538              
539             The (three-clause) BSD License
540              
541             =cut