File Coverage

blib/lib/App/ElasticSearch/Utilities/Aggregations.pm

Criterion	Covered	Total	%
statement	86	105	81.9
branch	26	50	52.0
condition	13	25	52.0
subroutine	7	8	87.5
pod	3	3	100.0
total	135	191	70.6

line	stmt	bran	cond	sub	pod	time	code
1							package App::ElasticSearch::Utilities::Aggregations;
2							# ABSTRACT: Code to simplify creating and working with Elasticsearch aggregations
3
4	4			4		1231	use v5.16;
	4					29
5	4			4		33	use warnings;
	4					9
	4					157
6
7	4			4		1432	use Storable qw(dclone);
	4					7015
	4					387
8	4					58	use Sub::Exporter -setup => {
9							exports => [ qw(
10							expand_aggregate_string
11							es_flatten_aggregations es_flatten_aggs
12							is_single_stat
13							)],
14							groups => {
15							default => [qw(
16							expand_aggregate_string
17							es_flatten_aggregations es_flatten_aggs
18							is_single_stat
19							)],
20							},
21	4			4		1481	};
	4					33630
22
23							my %Aggregations;
24
25
26							$Aggregations{terms} = {
27							params => sub { $_[0] && $_[0] =~ /^\d+$/ ? { size => $_[0] } : {} },
28							type => 'bucket',
29							composite => 1,
30							};
31
32
33							$Aggregations{significant_terms} = {
34							params => sub { $_[0] =~ /^\d+$/ ? { size => $_[0] } : {} },
35							type => 'bucket',
36							};
37
38
39							$Aggregations{rare_terms} = {
40							params => sub { $_[0] =~ /^\d+$/ ? { max_doc_count => $_[0] } : {} },
41							type => 'bucket',
42							};
43
44
45							$Aggregations{histogram} = {
46							params => sub {
47							return unless $_[0] > 0;
48							return { interval => $_[0] };
49							},
50							type => 'bucket',
51							composite => 1,
52							};
53
54
55							$Aggregations{date_histogram} = {
56							params => sub { { calendar_interval => $_[0] \|\| '1h' } },
57							type => 'bucket',
58							composite => 1,
59							};
60
61
62							$Aggregations{geohash_grid} = {
63							params => sub { $_[0] =~ /^\d+$/ ? { precision => $_[0] } : {} },
64							type => 'bucket',
65							composite => 1,
66							};
67
68
69							$Aggregations{missing} = { type => 'bucket' };
70
71
72							$Aggregations{avg} = { single_stat => 1, type => 'metric' };
73							$Aggregations{max} = { single_stat => 1, type => 'metric' };
74							$Aggregations{min} = { single_stat => 1, type => 'metric' };
75							$Aggregations{sum} = { single_stat => 1, type => 'metric' };
76
77
78							$Aggregations{cardinality} = { single_stat => 1, type => 'metric' };
79
80
81							$Aggregations{stats} = { type => 'metric' };
82
83
84							$Aggregations{extended_stats} = { type => 'metric' };
85
86
87							$Aggregations{percentiles} = {
88							params => sub {
89							my @pcts = $_[0] ? split /,/, $_[0] : qw(25 50 75 90);
90							return { percents => \@pcts };
91							},
92							};
93
94
95							$Aggregations{geo_centroid} = { type => 'metric' };
96
97
98
99
100							sub is_single_stat {
101	0			0	1	0	my ($agg) = @_;
102	0	0				0	return unless $agg;
103	0	0				0	return unless exists $Aggregations{$agg};
104	0	0				0	return unless exists $Aggregations{$agg}->{single_stat};
105	0					0	return $Aggregations{$agg}->{single_stat};
106							}
107
108
109							sub expand_aggregate_string {
110	4			4	1	3038	my ($token) = @_;
111
112	4					10	my %aggs = ();
113	4					15	foreach my $def ( split /\+/, $token ) {
114	4	100				25	my $alias = $def =~ s/^(\w+)=// ? $1 : undef;
115	4					24	my @parts = split /:/, $def, 3;
116	4	100				14	if( @parts == 1 ) {
117	1		33			10	$alias \|\|= $def;
118	1					5	$aggs{$alias} = { terms => { field => $def, size => 20 } };
119	1					5	next;
120							}
121	3					7	my ($agg, $field);
122	3	50				9	if( exists $Aggregations{$parts[0]} ) {
123	0					0	$agg = shift @parts;
124	0					0	$field = shift @parts;
125							}
126							else {
127	3					6	$agg = 'terms';
128	3					6	$field = shift @parts;
129							}
130	3					7	my $params = {};
131	3					5	my $paramStr = shift @parts;
132
133	3	100	66			24	if( $paramStr && $paramStr =~ /\w+=/ ) {
		50
134							# split on commas using a positive lookahead for a "word="
135	2					8	foreach my $token (split /,(?=\w+=)/, $paramStr) {
136	2					8	my ($k,$v) = split /=/, $token, 2;
137	2	50	33			11	next unless $k and $v;
138	2	50				11	$params->{$k} = $v =~ /,/ ? [ split /,/, $v ] : $v;
139							}
140							}
141							elsif( exists $Aggregations{$agg}->{params} ) {
142							# Process parameters
143	1					4	$params = $Aggregations{$agg}->{params}->($paramStr);
144							}
145	3	50	66			49	$alias \|\|= join ".", $agg eq 'terms' ? ($field) : ($agg, $field);
146	3					6	$aggs{$alias} = { $agg => { field => $field, %{ $params } } };
	3					20
147							}
148	4					14	return \%aggs;
149							}
150
151
152							sub es_flatten_aggregations {
153	7			7	1	2161	my ($result,$field,$parent) = @_;
154
155	7		100			31	$parent \|\|= [];
156	7					13	my @rows = ();
157
158	7					17	my @remove = qw(
159							doc_count_error_upper_bound
160							sum_other_doc_count
161							);
162
163	7					142	my $row = dclone($parent);
164							my $extract = sub {
165	4			4		10	my ($key, $hash) = @_;
166
167	4	50				16	if( $hash->{value_as_string} ) {
		50
		50
168	0					0	push @{ $row }, $key, $hash->{value_as_string};
	0					0
169							}
170							elsif( $hash->{value} ) {
171	0					0	push @{ $row }, $key, $hash->{value};
	0					0
172							}
173							elsif( $hash->{values} ) {
174	0					0	foreach my $k ( sort keys %{ $hash->{values} } ) {
	0					0
175	0					0	push @{ $row }, "$key.$k", $hash->{values}{$k}
176	0	0				0	if $hash->{values}{$k};
177							}
178							}
179							else {
180	4					8	foreach my $k (sort keys %{ $hash }) {
	4					12
181	4	50				14	last if $k eq 'buckets';
182	0					0	push @{ $row }, "$key.$k", $hash->{$k}
183	0	0				0	if defined $hash->{values}{$k};
184							}
185							}
186	7					33	};
187
188	7	100				29	if( $field ) {
189	4					10	delete $result->{$_} for @remove;
190	4	50	33			18	if( $result->{key} and exists $result->{doc_count} ) {
191	4					9	my $k = delete $result->{key};
192	4					7	my $ks = delete $result->{key_as_string};
193	4		66			6	push @{ $row }, $field, $ks \|\| $k;
	4					18
194	4	100				11	push @{ $row }, "$field.raw", $k if $ks;
	1					4
195	4		50			9	push @{ $row }, "$field.hits", delete $result->{doc_count} \|\| 0;
	4					17
196							}
197	4					9	my %buckets = ();
198	4					7	foreach my $k ( sort keys %{ $result } ) {
	4					13
199	1	50				6	if( ref $result->{$k} eq 'HASH' ) {
200	1					3	$extract->($k, $result->{$k});
201
202	1	50				3	if( my $buckets = delete $result->{$k}{buckets} ) {
203	1					3	$buckets{$k} = $buckets;
204							}
205							}
206							}
207	4	100				11	if( keys %buckets ) {
208	1					3	foreach my $k ( sort keys %buckets ) {
209	1	50				3	if( @{ $buckets{$k} } ) {
	1					3
210	1					2	foreach my $bucket ( @{ $buckets{$k} } ) {
	1					4
211	1					3	push @rows, @{ es_flatten_aggregations($bucket, $k, $row) };
	1					8
212							}
213							}
214							else {
215	0					0	push @rows, $row;
216							}
217							}
218							}
219							else {
220	3					9	push @rows, $row;
221							}
222							}
223							else {
224	3					6	foreach my $k ( sort keys %{ $result } ) {
	3					12
225	3					11	delete $result->{$k}{$_} for @remove;
226	3					10	$extract->($k, $result->{$k});
227	3					9	my $buckets = delete $result->{$k}{buckets};
228	3	50	33			10	if( $buckets and @{ $buckets } ) {
	3					11
229	3					5	foreach my $bucket ( @{ $buckets } ) {
	3					7
230	3					4	push @rows, @{ es_flatten_aggregations($bucket,$k,$row) };
	3					12
231							}
232							}
233							else {
234	0					0	push @rows, $row;
235							}
236							}
237							}
238
239	7					58	return \@rows;
240							}
241
242							# Setup Aliases
243							*es_flatten_aggs = \&es_flatten_aggregations;
244
245
246							1;
247
248							__END__
249
250							=pod
251
252							=head1 NAME
253
254							App::ElasticSearch::Utilities::Aggregations - Code to simplify creating and working with Elasticsearch aggregations
255
256							=head1 VERSION
257
258							version 8.7
259
260							=head1 FUNCTIONS
261
262							=head2 is_single_stat()
263
264							Returns true if an aggregation returns a single value.
265
266							=head2 expand_aggregate_string( token )
267
268							Takes a simplified aggregation grammar and expands it into the full aggregation hash.
269
270							Simple Terms:
271
272							field_name
273
274							To
275
276							{
277							field_name => {
278							terms => {
279							field => 'field_name',
280							size => 20,
281							}
282							}
283							}
284
285							Alias expansion:
286
287							alias=field_name
288
289							To
290
291							{
292							alias => {
293							terms => {
294							field => 'field_name',
295							size => 20,
296							}
297							}
298							}
299
300							Parameters:
301
302							alias=field_name:10
303
304							To
305
306							{
307							alias => {
308							terms => {
309							field => 'field_name',
310							size => 10,
311							}
312							}
313							}
314
315							Parameters, k/v:
316
317							alias=field_name:size=13
318
319							To
320
321							{
322							alias => {
323							terms => {
324							field => 'field_name',
325							size => 13,
326							}
327							}
328							}
329
330							=head2 es_flatten_aggregations()
331
332							Takes the B<aggregations> section of the query result and parses it into a flat
333							structure so each row contains all the sub aggregation information.
334
335							It returns an array reference, containing array references. The individual
336							rows of the array are ordered in a depth first fashion. The array does include
337							a key for every value, so the array can be cast to a hash safely.
338
339							=head1 Aggregations
340
341							List of supported aggregations. Other aggregations may work, but these have defined behavior.
342
343							=head2 Bucket Aggregations
344
345							These aggregations will support sub aggregations.
346
347							=over 2
348
349							=item B<terms>
350
351							The default aggregation if none is specified.
352
353							field_name
354							terms:field_name
355
356							Results in
357
358							{
359							"field_name": {
360							"terms": {
361							"field": "field_name"
362							}
363							}
364							}
365
366							Supports a positional parameter: size
367
368							field_name:20
369							terms:field_name:20
370
371							Results in
372
373							{
374							"field_name": {
375							"terms": {
376							"field": "field_name",
377							"size": 20
378							}
379							}
380							}
381
382							=item B<significant_terms>
383
384							Same as C<terms>.
385
386							significant_terms:field_name:10
387
388							Results in:
389
390							{
391							"significant_terms.field_name": {
392							"significant_terms": {
393							"field": "field_name",
394							"size": 10
395							}
396							}
397							}
398
399							=item B<rare_terms>
400
401							Same as C<terms> but the positional parameter is the C<max_doc_count>.
402
403							rare_terms:field_name:10
404
405							Results in:
406
407							{
408							"rare_terms.field_name": {
409							"rare_terms": {
410							"field": "field_name",
411							"max_doc_count": 10
412							}
413							}
414							}
415
416							=item B<histogram>
417
418							Creates a histogram for numeric fields. Positional parameter is the interval.
419
420							histogram:field_name:10
421
422							Results in:
423
424							{
425							"histogram.field_name": {
426							"histogram": {
427							"field": "field_name",
428							"interval": 10
429							}
430							}
431							}
432
433							=item B<date_histogram>
434
435							Creates a histogram for date fields. Positional parameter is the calendar_interval.
436
437							date_histogram:field_name:1h
438
439							Results in:
440
441							{
442							"date_histogram.field_name": {
443							"date_histogram": {
444							"field": "field_name",
445							"calendar_interval": "1h"
446							}
447							}
448							}
449
450							=item B<geohash_grid>
451
452							Creates a geohash grid bucket aggregation. Positional parameter is the precision.
453
454							geohash_grid:field_name:6
455
456							Results in:
457
458							{
459							"geohash_grid.field_name": {
460							"geohash_grid": {
461							"field": "field_name",
462							"precision": 6
463							}
464							}
465							}
466
467							=item B<missing>
468
469							Creates a bucket for documents missing the field. No positional parameters.
470
471							missing:field_name
472
473							Results in:
474
475							{
476							"missing.field_name": {
477							"missing": {
478							"field": "field_name"
479							}
480							}
481							}
482
483							=back
484
485							=head2 Metric Aggregations
486
487							Aggregations that generate metrics from enclosing buckets.
488
489							=over 2
490
491							=item B<avg>, B<max>, B<min>, B<sum>
492
493							Single stat metric aggregations to generate the various single statistics over the enclosing bucket.
494
495							sum:field_name
496
497							Results in
498
499							{
500							"sum.field_name": {
501							"sum": {
502							"field": "field_name"
503							}
504							}
505							}
506
507							=item B<cardinality>
508
509							Computes the unique count of terms in a field.
510
511							cardinality:field_name
512
513							Results in
514
515							{
516							"cardinality.field_name": {
517							"cardinality": {
518							"field": "field_name"
519							}
520							}
521							}
522
523							=item B<stats>
524
525							Runs the stats aggregation that returns min, max, avg, sum, and count.
526
527							stats:field_name
528
529							Results in
530
531							{
532							"stats.field_name": {
533							"stats": {
534							"field": "field_name"
535							}
536							}
537							}
538
539							=item B<extended_stats>
540
541							Runs the extended_stats aggregation that returns the same data as the C<stats> aggregation
542							plus variance, sum of squares, and standard deviation.
543
544							extended_stats:field_name
545
546							Results in
547
548							{
549							"extended_stats.field_name": {
550							"extended_stats": {
551							"field": "field_name"
552							}
553							}
554							}
555
556							=item B<percentiles>
557
558							Computes percentiles for the enclosing bucket. The positional parameter is
559							interpreted as the percents to compute. If omitted, the percentiles computed
560							will be: 25, 50, 75, 90.
561
562							percentiles:field_name:75,95,99
563
564							Results in
565
566							{
567							"percentiles.field_name": {
568							"percentiles": {
569							"field": "field_name",
570							"percents": [ 75, 95, 99 ]
571							}
572							}
573							}
574
575							=item B<geo_centroid>
576
577							Computes center of a group of geo points. No positional parameters supported.
578
579							geo_centroid:field_name
580
581							Results in
582
583							{
584							"geo_centroid.field_name": {
585							"geo_centroid": {
586							"field": "field_name"
587							}
588							}
589							}
590
591							=back
592
593							=for Pod::Coverage es_flatten_aggs
594
595							=head1 AUTHOR
596
597							Brad Lhotsky <brad@divisionbyzero.net>
598
599							=head1 COPYRIGHT AND LICENSE
600
601							This software is Copyright (c) 2023 by Brad Lhotsky.
602
603							This is free software, licensed under:
604
605							The (three-clause) BSD License
606
607							=cut