File Coverage

blib/lib/App/ElasticSearch/Utilities/QueryString/FileExpansion.pm

Criterion	Covered	Total	%
statement	55	80	68.7
branch	5	22	22.7
condition	1	3	33.3
subroutine	13	14	92.8
pod			n/a
total	74	119	62.1

line	stmt	bran	cond	sub	time	code
1						package App::ElasticSearch::Utilities::QueryString::FileExpansion;
2						# ABSTRACT: Build a terms query from unique values in a column of a file
3
4	1			1	859	use strict;
	1				3
	1				29
5	1			1	6	use warnings;
	1				3
	1				42
6
7						our $VERSION = '8.5'; # VERSION
8
9	1			1	6	use CLI::Helpers qw(:output);
	1				2
	1				6
10	1			1	583	use File::Slurp::Tiny qw(read_lines);
	1				1321
	1				56
11	1			1	7	use JSON::MaybeXS;
	1				2
	1				57
12	1			1	6	use Ref::Util qw(is_ref is_arrayref is_hashref);
	1				3
	1				43
13	1			1	934	use Text::CSV_XS;
	1				12567
	1				48
14	1			1	8	use namespace::autoclean;
	1				4
	1				6
15
16	1			1	60	use Moo;
	1				2
	1				6
17						with 'App::ElasticSearch::Utilities::QueryString::Plugin';
18
19	1			1	23	sub _build_priority { 10; }
20
21						my %parsers = (
22						txt => \&_parse_txt,
23						dat => \&_parse_txt,
24						csv => \&_parse_csv,
25						json => \&_parse_json,
26						);
27
28
29						sub handle_token {
30						my($self,$token) = @_;
31
32						my $makeMatcher = sub {
33						my ($matcher,$field,$patterns) = @_;
34						my @tests;
35						foreach my $pattern (@{ $patterns }) {
36						push @tests, { $matcher => { $field => { value => $pattern } } };
37						}
38						return {
39						bool => {
40						should => \@tests,
41						minimum_should_match => 1,
42						}
43						}
44						};
45						my %make = (
46						terms => sub {
47						my ($field, $uniq) = @_;
48						return { terms => { $field => $uniq } };
49						},
50						regexp => sub { $makeMatcher->(regexp => @_) },
51						wildcard => sub { $makeMatcher->(wildcard => @_) },
52						);
53						if( my ($term,$match) = split /\:/, $token, 2 ) {
54						if( defined $match && $match =~ /(.*\.(\w{3,4}))(?:\[([^\]]+)\])?$/) {
55						my($file,$type,$col) = ($1,$2,$3);
56						# Support Wildcards
57						my $matcher = $file =~ s/^\~// ? 'regexp'
58						: $file =~ s/^\*// ? 'wildcard'
59						: 'terms';
60						$col //= -1;
61						$type = lc $type;
62						verbose({level=>2,color=>'magenta'}, sprintf "# %s attempt of %s type, %s[%s] %s",
63						$self->name, $type, $file, $col, -f $file ? 'exists' : 'does not exist'
64						);
65						if( exists $parsers{$type} && -f $file ) {
66						my $uniq = $parsers{$type}->($file,$col);
67						if (defined $uniq && is_hashref($uniq) && scalar(keys %$uniq)) {
68						verbose({color=>'cyan'},
69						sprintf "# FILE:%s[%s] contained %d unique elements.",
70						$file,
71						$col,
72						scalar(keys %$uniq),
73						);
74						my $qs = [ sort keys %{ $uniq } ];
75						return [{condition => $make{$matcher}->($term,$qs) }];
76						}
77						}
78						}
79						}
80						return;
81						}
82
83						sub _parse_csv {
84	1			1	6	my ($file,$col) = @_;
85	1				14	my $csv = Text::CSV_XS->new({binary=>1,empty_is_undef=>1});
86	1	50		1	11	open my $fh, "<:encoding(utf8)", $file or die "Unable to read $file: $!";
	1				3
	1				11
	1				225
87	1				11826	my %uniq = ();
88	1				67	while( my $row = $csv->getline($fh) ) {
89	3				157	my $val;
90	3				5	eval {
91	3				8	$val = $row->[$col];
92						};
93	3	50			8	next unless defined $val;
94	3				68	$uniq{$val} = 1;
95						}
96	1				65	return \%uniq;
97						}
98
99						sub _parse_txt {
100	1			1	4	my ($file,$col) = @_;
101	1				4	my %uniq=();
102	1	50	33		6	my @rows = grep { defined && length && !/^#/ } read_lines($file);
	3				130
103	1				6	debug({color=>'magenta'}, @rows);
104	1	50			13	if(@rows) {
105	1				3	for(@rows) {
106	3				7	chomp;
107						# Split on tabs or nulls
108	3				15	my @cols = split /[\t\0]/;
109	3				8	my $value = $cols[$col];
110	3	50			8	if(defined $value) {
111	3				11	$uniq{$value} = 1;
112						}
113						}
114						}
115	1				5	return \%uniq;
116						}
117
118						sub _parse_json {
119	0			0		my ($file,$field) = @_;
120
121	0	0				die "For new line delimited JSON, please specify the key, ie <field>:$file\[key.path.i.want\]"
122						if $field eq "-1";
123
124	0					my %uniq = ();
125	0					my $line = 0;
126	0					my @path = split /\./, $field; # Supports key.subkey.subsubkey format
127	0					JSON_LINE: foreach my $json ( read_lines($file) ) {
128	0					$line++;
129	0					my $data;
130						eval {
131	0					$data = decode_json($json);
132	0					1;
133	0	0				} or do {
134	0					my $err = $@;
135	0					output({stderr=>1,color=>'yellow'}, sprintf "Invalid JSON in %s, line %d: %s",
136						$file,
137						$line,
138						$err,
139						);
140	0					verbose({stderr=>1,color=>'magenta',indent=>1}, $json);
141	0					next;
142						};
143						# Walk the path
144	0					foreach my $k (@path) {
145	0	0				next JSON_LINE unless exists $data->{$k};
146	0					$data = $data->{$k};
147						}
148						# At this point $data should contain our values
149	0	0				if( is_arrayref($data) ) {
		0
150	0					$uniq{$_} = 1 for grep { !is_ref($_) } @{ $data };
	0
	0
151						}
152						elsif( !is_ref($data) ) {
153	0					$uniq{$data} = 1;
154						}
155						}
156
157	0	0				die "Expected newline-delimited JSON in $file, but it was empty or didn't contain '$field'"
158						unless keys %uniq;
159
160	0					return \%uniq;
161						}
162
163						1;
164
165						__END__
166
167						=pod
168
169						=head1 NAME
170
171						App::ElasticSearch::Utilities::QueryString::FileExpansion - Build a terms query from unique values in a column of a file
172
173						=head1 VERSION
174
175						version 8.5
176
177						=head1 SYNOPSIS
178
179						=head2 App::ElasticSearch::Utilities::QueryString::FileExpansion
180
181						If the match ends in .dat, .txt, .csv, or .json then we attempt to read a file with that name and OR the condition:
182
183						$ cat test.dat
184						50 1.2.3.4
185						40 1.2.3.5
186						30 1.2.3.6
187						20 1.2.3.7
188
189						Or
190
191						$ cat test.csv
192						50,1.2.3.4
193						40,1.2.3.5
194						30,1.2.3.6
195						20,1.2.3.7
196
197						Or
198
199						$ cat test.txt
200						1.2.3.4
201						1.2.3.5
202						1.2.3.6
203						1.2.3.7
204
205						Or
206
207						$ cat test.json
208						{ "ip": "1.2.3.4" }
209						{ "ip": "1.2.3.5" }
210						{ "ip": "1.2.3.6" }
211						{ "ip": "1.2.3.7" }
212
213						We can source that file:
214
215						src_ip:test.dat => src_ip:(1.2.3.4 1.2.3.5 1.2.3.6 1.2.3.7)
216						src_ip:test.json[ip] => src_ip:(1.2.3.4 1.2.3.5 1.2.3.6 1.2.3.7)
217
218						This makes it simple to use the --data-file output options and build queries
219						based off previous queries. For .txt and .dat files, the delimiter for columns
220						in the file must be either a tab or a null byte. For files ending in
221						.csv, Text::CSV_XS is used for accurate parsing of the file format. Files
222						ending in .json are considered to be newline-delimited JSON.
223
224						You can also specify the column of the data file to use, the default being the last column (-1). Column indexes are
225						B<zero-based>: the first column is index 0, the second is 1, and so on. The previous example can be rewritten
226						as:
227
228						src_ip:test.dat[1]
229
230						or:
231						src_ip:test.dat[-1]
232
233						For newline delimited JSON files, you need to specify the key path you want to extract from the file. If we have a
234						JSON source file with:
235
236						{ "first": { "second": { "third": [ "bob", "alice" ] } } }
237						{ "first": { "second": { "third": "ginger" } } }
238						{ "first": { "second": { "nope": "fred" } } }
239
240						We could search using:
241
242						actor:test.json[first.second.third]
243
244						Which would expand to:
245
246						{ "terms": { "actor": [ "alice", "bob", "ginger" ] } }
247
248						This option will iterate through the whole file and collect the unique elements of the list. They will then be transformed into
249						an appropriate L<terms query|http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html>.
250
251						=head3 Wildcards
252
253						We can also keep a group of wildcard or regexp patterns in a file:
254
255						$ cat wildcards.dat
256						*@gmail.com
257						*@yahoo.com
258
259						To enable wildcard parsing, prefix the filename with a C<*>.
260
261						es-search.pl to_address:*wildcards.dat
262
263						Which expands the query to:
264
265						{
266						"bool": {
267						"minimum_should_match":1,
268						"should": [
269						{"wildcard":{"to_address":{"value":"*@gmail.com"}}},
270						{"wildcard":{"to_address":{"value":"*@yahoo.com"}}}
271						]
272						}
273						}
274
275						No attempt is made to verify or validate the wildcard patterns.
276
277						=head3 Regular Expressions
278
279						If you'd like to specify a file full of regular expressions, you can do that as well:
280
281						$ cat regexp.dat
282						.*google\.com$
283						.*yahoo\.com$
284
285						To enable regexp parsing, prefix the filename with a C<~>.
286
287						es-search.pl to_address:~regexp.dat
288
289						Which expands the query to:
290
291						{
292						"bool": {
293						"minimum_should_match":1,
294						"should": [
295						{"regexp":{"to_address":{"value":".*google\\.com$"}}},
296						{"regexp":{"to_address":{"value":".*yahoo\\.com$"}}}
297						]
298						}
299						}
300
301						No attempt is made to verify or validate the regular expressions.
302
303						=for Pod::Coverage handle_token
304
305						=head1 AUTHOR
306
307						Brad Lhotsky <brad@divisionbyzero.net>
308
309						=head1 COPYRIGHT AND LICENSE
310
311						This software is Copyright (c) 2023 by Brad Lhotsky.
312
313						This is free software, licensed under:
314
315						The (three-clause) BSD License
316
317						=cut