File Coverage

blib/lib/Data/Tubes/Plugin/Parser.pm

Criterion	Covered	Total	%
statement	244	273	89.3
branch	82	130	63.0
condition	34	50	68.0
subroutine	26	28	92.8
pod	8	8	100.0
total	394	489	80.5

line	stmt	bran	cond	sub	pod	time	code
1							package Data::Tubes::Plugin::Parser;
2	14			14		1440	use strict;
	14					31
	14					450
3	14			14		71	use warnings;
	14					27
	14					429
4	14			14		69	use English qw< -no_match_vars >;
	14					27
	14					102
5	14			14		5452	use Data::Dumper;
	14					40
	14					1061
6							our $VERSION = '0.737';
7
8	14			14		105	use Log::Log4perl::Tiny qw< :easy :dead_if_first >;
	14					26
	14					138
9
10	14					1277	use Data::Tubes::Util qw<
11							assert_all_different
12							generalized_hashy
13							metadata
14							normalize_args
15							shorter_sub_names
16							test_all_equal
17							trim
18							unzip
19	14			14		5460	>;
	14					32
20	14			14		4341	use Data::Tubes::Plugin::Util qw< identify >;
	14					35
	14					51070
21							my %global_defaults = (
22							input => 'raw',
23							output => 'structured',
24							);
25
26							sub parse_by_format {
27	24			24	1	18484	my %args = normalize_args(@_,
28							[{%global_defaults, name => 'parse by format'}, 'format']);
29	24					123	identify(\%args);
30
31	24					58	my $format = $args{format};
32	24	50				67	LOGDIE "parser of type 'format' needs a definition"
33							unless defined $format;
34
35	24					151	my @items = split m{(\W+)}, $format;
36	24	50				73	return parse_single(key => $items[0]) if @items == 1;
37
38	24					90	my ($keys, $separators) = unzip(\@items);
39
40							# all keys MUST be different, otherwise some fields are just trumping
41							# on each other
42	24	50				45	eval { assert_all_different($keys); }
	24					65
43							or LOGDIE "'format' parser [$format] "
44							. "has duplicate key $EVAL_ERROR->{message}";
45
46	24		100			96	my $value = $args{value} //= ['whatever'];
47	24	100				69	$value = [$value] unless ref $value;
48	24		100			161	my $multiple =
49							(ref($value) ne 'ARRAY')
50							\|\| (scalar(@$value) > 1)
51							\|\| ($value->[0] ne 'whatever');
52
53	24	100	100			111	return parse_by_separators(
54							%args,
55							keys => $keys,
56							separators => $separators
57							) if $multiple \|\| !test_all_equal(@$separators);
58
59							# a simple split will do if all separators are the same
60	13					59	return parse_by_split(
61							%args,
62							keys => $keys,
63							separator => $separators->[0]
64							);
65							} ## end sub parse_by_format
66
67							sub parse_by_regex {
68	4			4	1	5357	my %args =
69							normalize_args(@_,
70							[{%global_defaults, name => 'parse by regex'}, 'regex']);
71	4					30	identify(\%args);
72
73	4					13	my $name = $args{name};
74	4					10	my $regex = $args{regex};
75	4	50				14	LOGDIE "parse_by_regex needs a regex"
76							unless defined $regex;
77
78	4					79	$regex = qr{$regex};
79	4					15	my $input = $args{input};
80	4					11	my $output = $args{output};
81							return sub {
82	4			4		29	my $record = shift;
83	4	50				47	$record->{$input} =~ m{$regex}
84							or die {
85							message => "'$name': invalid record, regex is $regex",
86							input => $input,
87							record => $record,
88							};
89	4					95	my $retval = {%+};
90	4					21	$record->{$output} = $retval;
91	4					13	return $record;
92	4					30	};
93							} ## end sub parse_by_regex
94
95							sub _resolve_separator {
96	63			63		582	my ($separator, $args) = @_;
97	63	50				157	return unless defined $separator;
98	63	50				149	$separator = $separator->($args) if ref($separator) eq 'CODE';
99	63					96	my $ref = ref $separator;
100	63	100				159	return $separator if $ref eq 'Regexp';
101	47	50				96	LOGCROAK "$args->{name}: unknown separator type $ref" if $ref;
102	47					82	$separator = quotemeta $separator;
103	47					524	return qr{(?-i:$separator)};
104							} ## end sub _resolve_separator
105
106							sub _resolve_value {
107	25			25		61	my ($value, $args) = @_;
108	25		100			94	$value //= 'whatever';
109	25	50				64	$value = $value->($args) if ref($value) eq 'CODE';
110	25					45	my $ref = ref $value;
111	25	100	66			116	($value, $ref) = ([$value], 'ARRAY') if (!$ref) \|\| ($ref eq 'Regexp');
112	25	50				73	LOGCROAK "$args->{name}: unknown value type $ref" if $ref ne 'ARRAY';
113
114	25					50	my (%flag_for, @regexps);
115	25					60	for my $part (@$value) {
116	31					55	my $ref = ref $part;
117	31	50				205	if ($ref eq 'Regexp') {
		100
		50
		50
		0
118	0					0	push @regexps, $part;
119							}
120							elsif (
121							$part =~ m{\A(?:
122							(?:single\|double)[-_]quoted
123							\| escaped
124							\| whatever
125							)\z}mxs
126							)
127							{
128	30					75	$part =~ s{-}{_}mxs;
129	30					90	$flag_for{$part} = 1;
130							} ## end elsif ($part =~ m{\A(?: )})
131							elsif ($part eq 'quoted') {
132	0					0	$flag_for{single_quoted} = 1;
133	0					0	$flag_for{double_quoted} = 1;
134							}
135							elsif ($part eq 'specials') {
136	1					3	$flag_for{single_quoted} = 1;
137	1					3	$flag_for{double_quoted} = 1;
138	1					3	$flag_for{escaped} = 1;
139							}
140							elsif ($ref) {
141	0					0	LOGCROAK "$args->{name}: unknown part of type $ref";
142							}
143							else {
144	0					0	LOGCROAK "$args->{name}: unknown part $part";
145							}
146							} ## end for my $part (@$value)
147
148	25					46	my @escape;
149	25	100				75	if ($flag_for{single_quoted}) {
150	7					15	push @escape, q{'};
151	7					15	unshift @regexps, q{(?mxs: '[^']*' )};
152							}
153	25	100				66	if ($flag_for{double_quoted}) {
154	3					5	push @escape, q{"};
155	3					7	unshift @regexps, q{(?mxs: "(?: [^\\"] \| \\\\.)*" )};
156							}
157	25	100				56	if ($flag_for{escaped}) {
158	7					13	push @escape, '\\';
159	7					21	my $escape = quotemeta join '', @escape;
160	7					20	push @regexps, qq{(?mxs-i: (?: [^$escape] \| \\\\.)*?)};
161							}
162	25	100				61	if ($flag_for{whatever}) {
163	16					35	push @regexps, qq{(?mxs:.*?)};
164							}
165
166	25					87	my $regex = '(' . join('\|', @regexps) . ')';
167	25					90	return ($regex, \%flag_for);
168							} ## end sub _resolve_value
169
170							sub _resolve_decode {
171	25			25		47	my $args = shift;
172	25					51	my $name = $args->{name};
173	25					37	my $escape = $args->{escaped};
174	25					42	my $squote = $args->{single_quoted};
175	25					43	my $dquote = $args->{double_quoted};
176	25					37	my $vdecode = $args->{decode};
177	25					45	my $decode = $args->{decode_values};
178	25	50	100			156	if ($vdecode) {
		100	100
179							$decode \|\|= sub {
180	0			0		0	my $values = shift;
181	0					0	for my $value (@$values) {
182	0					0	$value = $vdecode->($value);
183							}
184	0					0	return $values;
185							}
186	0		0			0	} ## end if ($vdecode)
187							elsif ($escape \|\| $squote \|\| $dquote) {
188							$decode \|\|= sub {
189	12			12		19	my $values = shift;
190	12					35	for my $i (0 .. $#$values) {
191	41					72	my $value = $values->[$i];
192	41	50				81	my $len = length $value or next;
193	41					85	my $first = substr $value, 0, 1;
194	41	100	100			176	if ($dquote && $first eq q{"}) {
		100	100
		100
195	5	50	33			17	die {message => "'$name': invalid record, "
196							. "unterminated double quote at field $i (0-based)"
197							}
198							unless $len > 1 && substr($value, -1, 1) eq q{"};
199	5					12	$values->[$i] = substr $value, 1, $len - 2; # unquote
200	5					22	$values->[$i] =~ s{\\(.)}{$1}gmxs; # unescape
201							} ## end if ($dquote && $first ...)
202							elsif ($squote && $first eq q{'}) {
203	11	50	33			50	die {message => "'$name': invalid record, "
204							. "unterminated single quote at field $i (0-based)",
205							}
206							unless $len > 1 && substr($value, -1, 1) eq q{'};
207	11					31	$values->[$i] = substr $value, 1, $len - 2; # unquote
208							} ## end elsif ($squote && $first ...)
209							elsif ($escape) {
210	21					74	$values->[$i] =~ s{\\(.)}{$1}gmxs; # unescape
211							}
212							} ## end for my $i (0 .. $#$values)
213	12					44	return $values;
214							}
215	11		50			85	} ## end elsif ($escape \|\| $squote...)
216	25					56	return $decode;
217							} ## end sub _resolve_decode
218
219							sub parse_by_separators {
220	16			16	1	4045	my %args = normalize_args(@_,
221							[{%global_defaults, name => 'parse by separators'}, 'separators']);
222	16					90	identify(\%args);
223	16					47	my $name = $args{name};
224
225	16					33	my $separators = $args{separators};
226	16	50				46	LOGDIE "parse_by_separators needs separators"
227							unless defined $separators;
228	16					43	$separators = [map { _resolve_separator($_, \%args) } @$separators];
	41					112
229
230	16					39	my $keys = $args{keys};
231	16					40	my ($delta, $n_keys);
232	16	100				43	if (defined $keys) {
233	12					22	$n_keys = scalar @$keys;
234	12					20	$delta = $n_keys - scalar(@$separators);
235	12	50	33			57	LOGDIE "parse_by_separators 0 <= #keys - #separators <= 1"
236							if ($delta < 0) \|\| ($delta > 1);
237							} ## end if (defined $keys)
238							else {
239	4					15	$keys = [0 .. scalar(@$separators)];
240	4					11	$n_keys = 0; # don't bother
241	4					7	$delta = 1;
242							}
243
244	16					57	my ($value_regex, $flag_for) = _resolve_value($args{value}, \%args);
245
246	16					36	my @items;
247	16					83	for my $i (0 .. $#$keys) {
248	57					91	push @items, $value_regex;
249	57	100				146	push @items, $separators->[$i] if $i <= $#$separators;
250							}
251
252							# if not a separator, the last item becomes a catchall
253	16	50				48	$items[-1] = '(.*)' if $delta > 0;
254
255							# ready to generate the regexp. We bind the end to \z anyway because
256							# the last element might be a separator
257	16					52	my $format = join '', '(?:\\A', @items, '\\z)';
258	16					478	my $regex = qr{$format};
259	16					113	DEBUG "$name: regex will be: $regex";
260
261							# this sub will use the regexp above, do checking and return captured
262							# values in a hash with @keys
263	16					293	my $input = $args{input};
264	16					35	my $output = $args{output};
265	16					28	my $trim = $args{trim};
266	16					134	my $decode = _resolve_decode({%args, %$flag_for});
267							return sub {
268	16			16		128	my $record = shift;
269	16	50				214	my @values = $record->{$input} =~ m{$regex}
270							or die {
271							message => 'invalid record',
272							record => $record,
273							regex => $regex
274							};
275	16	100				65	trim(@values) if $trim;
276	16	100				42	if ($decode) {
277	7	50				11	eval { @values = @{$decode->(\@values)}; 1 } or do {
	7					11
	7					14
	7					21
278	0					0	my $e = $@;
279	0	0				0	$e = {message => $e} unless ref $e;
280	0	0				0	$e = {%$e, record => $record} if ref($e) eq 'HASH';
281	0					0	die $e;
282							};
283							} ## end if ($decode)
284
285	16	100				46	if ($n_keys) {
286	12					21	my $n_values = scalar @values;
287	12	50				30	die {
288							message => "'$name': invalid record, expected $n_keys, "
289							. "got $n_values only",
290							values => \@values,
291							record => $record
292							}
293							if $n_values < $n_keys;
294
295	12					27	$record->{$output} = \my %retval;
296	12					61	@retval{@$keys} = @values;
297							} ## end if ($n_keys)
298							else {
299	4					11	$record->{$output} = \@values;
300							}
301	16					49	return $record;
302	16					221	};
303							} ## end sub parse_by_separators
304
305							sub parse_by_split {
306	13			13	1	71	my %args =
307							normalize_args(@_,
308							[{%global_defaults, name => 'parse by split'}, 'separator']);
309	13					65	identify(\%args);
310
311	13					47	my $separator = _resolve_separator($args{separator}, \%args);
312
313	13					33	my $name = $args{name};
314	13					23	my $keys = $args{keys};
315	13	50				36	my $n_keys = defined($keys) ? scalar(@$keys) : 0;
316	13					21	my $input = $args{input};
317	13					21	my $output = $args{output};
318	13		100			103	my $allow_missing = $args{allow_missing} \|\| 0;
319	13					28	my $trim = $args{trim};
320
321							return sub {
322	16			16		64	my $record = shift;
323
324	16					79	my @values = split(/$separator/, $record->{$input}, $n_keys);
325	16	100				44	trim(@values) if $trim;
326
327	16					58	my $n_values = @values;
328	16	100				83	die {
329							message => "'$name': invalid record, expected $n_keys items, "
330							. "got $n_values",
331							input => $input,
332							record => $record,
333							}
334							if $n_values + $allow_missing < $n_keys;
335
336	11					27	$record->{$output} = \my %retval;
337	11					44	@retval{@$keys} = @values;
338	11					39	return $record;
339							}
340	13	50				151	if $n_keys;
341
342							return sub {
343	0			0		0	my $record = shift;
344	0					0	my @retval = split /$separator/, $record->{$input};
345	0	0				0	trim(@retval) if $trim;
346	0					0	$record->{$output} = \@retval;
347	0					0	return $record;
348	0					0	};
349
350							} ## end sub parse_by_split
351
352							sub parse_by_value_separator {
353	9			9	1	6210	my %args = normalize_args(
354							@_,
355							[
356							{%global_defaults, name => 'parse by value and separator'},
357							'separator'
358							]
359							);
360	9					48	identify(\%args);
361	9					25	my $name = $args{name};
362
363	9					36	my $separator = _resolve_separator($args{separator}, \%args);
364	9	50				23	LOGCROAK "$name: argument separator is mandatory"
365							unless defined $separator;
366
367	9					33	my ($value, $flag_for) = _resolve_value($args{value}, \%args);
368	9					67	my $decode = _resolve_decode({%args, %$flag_for});
369
370	9					28	my $keys = $args{keys};
371	9	100				20	my $n_keys = defined($keys) ? scalar(@$keys) : 0;
372	9					16	my $input = $args{input};
373	9					14	my $output = $args{output};
374	9		50			30	my $allow_missing = $args{allow_missing} \|\| 0;
375	9		50			27	my $allow_surplus = $args{allow_surplus} \|\| 0;
376	9					15	my $trim = $args{trim};
377	9					103	my $go_global = $^V lt v5.18.0;
378
379							return sub {
380	10			10		683	my $record = shift;
381
382	10					12	my @values;
383	10	50				21	if ($go_global) {
384	0					0	local our @global_values = ();
385	0					0	my $collector = qr/(?{push @global_values, $^N})/;
	0					0
386	0	0				0	$record->{$input} =~ m/
387							\A (?: $value $separator $collector )*
388							$value \z $collector
389							/gmxs
390							or die {
391							message => 'invalid record',
392							separator => $separator,
393							value => $value,
394							record => $record,
395							};
396	0					0	@values = @global_values;
397							}
398							else {
399	10	50				436	$record->{$input} =~ m/
400	30					165	\A (?: $value $separator (?{push @values, $^N}) )*
401	10					68	$value \z (?{push @values, $^N})
402							/gmxs
403							or die {
404							message => 'invalid record',
405							separator => $separator,
406							value => $value,
407							record => $record,
408							};
409							}
410	10	100				54	trim(@values) if $trim;
411	10	100				27	if ($decode) {
412	5	50				9	eval { @values = @{$decode->(\@values)}; 1 } or do {
	5					11
	5					11
	5					15
413	0					0	my $e = $EVAL_ERROR;
414	0	0				0	$e = {message => $e} unless ref $e;
415	0	0				0	$e = {%$e, record => $record} if ref($e) eq 'HASH';
416	0					0	die $e;
417							};
418							} ## end if ($decode)
419
420	10	100				20	if ($n_keys) {
421	6					10	my $n_values = @values;
422	6	50	33			30	die {
423							message => "'$name': invalid record, expected $n_keys items, "
424							. "got $n_values",
425							input => $input,
426							record => $record,
427							}
428							if ($n_values + $allow_missing < $n_keys)
429							\|\| ($n_values - $allow_surplus > $n_keys);
430	6					36	$record->{$output} = \my %retval;
431	6					30	@retval{@$keys} = @values;
432							} ## end if ($n_keys)
433							else {
434	4					10	$record->{$output} = \@values;
435							}
436	10					30	return $record;
437	9					89	};
438							} ## end sub parse_by_value_separator
439
440							sub parse_ghashy {
441	3			3	1	4096	my %args = normalize_args(@_,
442							{%global_defaults, default_key => '', name => 'parse ghashy'});
443	3					23	identify(\%args);
444
445	3	50				9	my %defaults = %{$args{defaults} \|\| {}};
	3					31
446	3					12	my $input = $args{input};
447	3					10	my $output = $args{output};
448
449							# pre-compile capture thing from generalized_hashy
450	3					22	$args{capture} = generalized_hashy(%args, text => undef)->{capture};
451
452							return sub {
453	3			3		72	my $record = shift;
454	3					13	my $outcome = generalized_hashy(%args, text => $record->{$input});
455							die {
456							input => $input,
457							message => $outcome->{failure},
458							outcome => $outcome,
459							record => $record,
460							}
461	3	50				14	unless exists $outcome->{hash};
462	3					7	$record->{$output} = {%defaults, %{$outcome->{hash}}};
	3					18
463	3					18	return $record;
464	3					27	};
465							} ## end sub parse_ghashy
466
467							sub parse_hashy {
468	4			4	1	1991	my %args = normalize_args(
469							@_,
470							{
471							%global_defaults,
472							chunks_separator => ' ',
473							default_key => '',
474							key_value_separator => '=',
475							name => 'parse hashy',
476							}
477							);
478	4					21	identify(\%args);
479	4	50				7	my %defaults = %{$args{defaults} \|\| {}};
	4					32
480	4					9	my $input = $args{input};
481	4					7	my $output = $args{output};
482							return sub {
483	3			3		446	my $record = shift;
484	3					21	my $parsed = metadata($record->{$input}, %args);
485	3					17	$record->{$output} = {%defaults, %$parsed};
486	3					13	return $record;
487	4					24	};
488							} ## end sub parse_hashy
489
490							sub parse_single {
491	2			2	1	1909	my %args = normalize_args(
492							@_,
493							{
494							key => 'key',
495							%global_defaults,
496							}
497							);
498	2					10	identify(\%args);
499	2					6	my $key = $args{key};
500	2		33			12	my $has_key = defined($key) && length($key);
501	2					3	my $input = $args{input};
502	2					4	my $output = $args{output};
503							return sub {
504	2			2		12	my $record = shift;
505							$record->{$output} =
506	2	50				8	$has_key ? {$key => $record->{$input}} : $record->{$input};
507	2					6	return $record;
508							}
509	2					12	} ## end sub parse_single
510
511							shorter_sub_names(__PACKAGE__, 'parse_');
512
513							1;