File Coverage

blib/lib/Data/Tubes/Plugin/Parser.pm

Criterion	Covered	Total	%
statement	244	273	89.3
branch	82	130	63.0
condition	34	50	68.0
subroutine	26	28	92.8
pod	8	8	100.0
total	394	489	80.5

line	stmt	bran	cond	sub	pod	time	code
1							package Data::Tubes::Plugin::Parser;
2	14			14		1297	use strict;
	14					31
	14					453
3	14			14		73	use warnings;
	14					33
	14					505
4	14			14		76	use English qw< -no_match_vars >;
	14					27
	14					143
5	14			14		5608	use Data::Dumper;
	14					40
	14					1152
6							our $VERSION = '0.738';
7
8	14			14		110	use Log::Log4perl::Tiny qw< :easy :dead_if_first >;
	14					33
	14					142
9
10	14					1370	use Data::Tubes::Util qw<
11							assert_all_different
12							generalized_hashy
13							metadata
14							normalize_args
15							shorter_sub_names
16							test_all_equal
17							trim
18							unzip
19	14			14		5618	>;
	14					32
20	14			14		4247	use Data::Tubes::Plugin::Util qw< identify >;
	14					39
	14					49556
21							my %global_defaults = (
22							input => 'raw',
23							output => 'structured',
24							);
25
26							sub parse_by_format {
27	24			24	1	19009	my %args = normalize_args(@_,
28							[{%global_defaults, name => 'parse by format'}, 'format']);
29	24					156	identify(\%args);
30
31	24					68	my $format = $args{format};
32	24	50				89	LOGDIE "parser of type 'format' needs a definition"
33							unless defined $format;
34
35	24					178	my @items = split m{(\W+)}, $format;
36	24	50				94	return parse_single(key => $items[0]) if @items == 1;
37
38	24					125	my ($keys, $separators) = unzip(\@items);
39
40							# all keys MUST be different, otherwise some fields are just trumping
41							# on each other
42	24	50				54	eval { assert_all_different($keys); }
	24					83
43							or LOGDIE "'format' parser [$format] "
44							. "has duplicate key $EVAL_ERROR->{message}";
45
46	24		100			119	my $value = $args{value} //= ['whatever'];
47	24	100				70	$value = [$value] unless ref $value;
48	24		100			197	my $multiple =
49							(ref($value) ne 'ARRAY')
50							\|\| (scalar(@$value) > 1)
51							\|\| ($value->[0] ne 'whatever');
52
53	24	100	100			148	return parse_by_separators(
54							%args,
55							keys => $keys,
56							separators => $separators
57							) if $multiple \|\| !test_all_equal(@$separators);
58
59							# a simple split will do if all separators are the same
60	13					75	return parse_by_split(
61							%args,
62							keys => $keys,
63							separator => $separators->[0]
64							);
65							} ## end sub parse_by_format
66
67							sub parse_by_regex {
68	4			4	1	5443	my %args =
69							normalize_args(@_,
70							[{%global_defaults, name => 'parse by regex'}, 'regex']);
71	4					29	identify(\%args);
72
73	4					15	my $name = $args{name};
74	4					8	my $regex = $args{regex};
75	4	50				15	LOGDIE "parse_by_regex needs a regex"
76							unless defined $regex;
77
78	4					34	$regex = qr{$regex};
79	4					11	my $input = $args{input};
80	4					10	my $output = $args{output};
81							return sub {
82	4			4		28	my $record = shift;
83	4	50				40	$record->{$input} =~ m{$regex}
84							or die {
85							message => "'$name': invalid record, regex is $regex",
86							input => $input,
87							record => $record,
88							};
89	4					85	my $retval = {%+};
90	4					19	$record->{$output} = $retval;
91	4					12	return $record;
92	4					33	};
93							} ## end sub parse_by_regex
94
95							sub _resolve_separator {
96	63			63		148	my ($separator, $args) = @_;
97	63	50				157	return unless defined $separator;
98	63	50				156	$separator = $separator->($args) if ref($separator) eq 'CODE';
99	63					108	my $ref = ref $separator;
100	63	100				159	return $separator if $ref eq 'Regexp';
101	47	50				98	LOGCROAK "$args->{name}: unknown separator type $ref" if $ref;
102	47					91	$separator = quotemeta $separator;
103	47					552	return qr{(?-i:$separator)};
104							} ## end sub _resolve_separator
105
106							sub _resolve_value {
107	25			25		63	my ($value, $args) = @_;
108	25		100			93	$value //= 'whatever';
109	25	50				72	$value = $value->($args) if ref($value) eq 'CODE';
110	25					43	my $ref = ref $value;
111	25	100	66			116	($value, $ref) = ([$value], 'ARRAY') if (!$ref) \|\| ($ref eq 'Regexp');
112	25	50				80	LOGCROAK "$args->{name}: unknown value type $ref" if $ref ne 'ARRAY';
113
114	25					39	my (%flag_for, @regexps);
115	25					64	for my $part (@$value) {
116	31					61	my $ref = ref $part;
117	31	50				210	if ($ref eq 'Regexp') {
		100
		50
		50
		0
118	0					0	push @regexps, $part;
119							}
120							elsif (
121							$part =~ m{\A(?:
122							(?:single\|double)[-_]quoted
123							\| escaped
124							\| whatever
125							)\z}mxs
126							)
127							{
128	30					77	$part =~ s{-}{_}mxs;
129	30					93	$flag_for{$part} = 1;
130							} ## end elsif ($part =~ m{\A(?: )})
131							elsif ($part eq 'quoted') {
132	0					0	$flag_for{single_quoted} = 1;
133	0					0	$flag_for{double_quoted} = 1;
134							}
135							elsif ($part eq 'specials') {
136	1					5	$flag_for{single_quoted} = 1;
137	1					4	$flag_for{double_quoted} = 1;
138	1					4	$flag_for{escaped} = 1;
139							}
140							elsif ($ref) {
141	0					0	LOGCROAK "$args->{name}: unknown part of type $ref";
142							}
143							else {
144	0					0	LOGCROAK "$args->{name}: unknown part $part";
145							}
146							} ## end for my $part (@$value)
147
148	25					44	my @escape;
149	25	100				74	if ($flag_for{single_quoted}) {
150	7					16	push @escape, q{'};
151	7					17	unshift @regexps, q{(?mxs: '[^']*' )};
152							}
153	25	100				61	if ($flag_for{double_quoted}) {
154	3					8	push @escape, q{"};
155	3					8	unshift @regexps, q{(?mxs: "(?: [^\\"] \| \\\\.)*" )};
156							}
157	25	100				96	if ($flag_for{escaped}) {
158	7					14	push @escape, '\\';
159	7					25	my $escape = quotemeta join '', @escape;
160	7					20	push @regexps, qq{(?mxs-i: (?: [^$escape] \| \\\\.)*?)};
161							}
162	25	100				68	if ($flag_for{whatever}) {
163	16					33	push @regexps, qq{(?mxs:.*?)};
164							}
165
166	25					86	my $regex = '(' . join('\|', @regexps) . ')';
167	25					161	return ($regex, \%flag_for);
168							} ## end sub _resolve_value
169
170							sub _resolve_decode {
171	25			25		41	my $args = shift;
172	25					54	my $name = $args->{name};
173	25					39	my $escape = $args->{escaped};
174	25					42	my $squote = $args->{single_quoted};
175	25					38	my $dquote = $args->{double_quoted};
176	25					38	my $vdecode = $args->{decode};
177	25					44	my $decode = $args->{decode_values};
178	25	50	100			169	if ($vdecode) {
		100	100
179							$decode \|\|= sub {
180	0			0		0	my $values = shift;
181	0					0	for my $value (@$values) {
182	0					0	$value = $vdecode->($value);
183							}
184	0					0	return $values;
185							}
186	0		0			0	} ## end if ($vdecode)
187							elsif ($escape \|\| $squote \|\| $dquote) {
188							$decode \|\|= sub {
189	12			12		24	my $values = shift;
190	12					40	for my $i (0 .. $#$values) {
191	41					86	my $value = $values->[$i];
192	41	50				82	my $len = length $value or next;
193	41					73	my $first = substr $value, 0, 1;
194	41	100	100			194	if ($dquote && $first eq q{"}) {
		100	100
		100
195	5	50	33			28	die {message => "'$name': invalid record, "
196							. "unterminated double quote at field $i (0-based)"
197							}
198							unless $len > 1 && substr($value, -1, 1) eq q{"};
199	5					16	$values->[$i] = substr $value, 1, $len - 2; # unquote
200	5					25	$values->[$i] =~ s{\\(.)}{$1}gmxs; # unescape
201							} ## end if ($dquote && $first ...)
202							elsif ($squote && $first eq q{'}) {
203	11	50	33			59	die {message => "'$name': invalid record, "
204							. "unterminated single quote at field $i (0-based)",
205							}
206							unless $len > 1 && substr($value, -1, 1) eq q{'};
207	11					34	$values->[$i] = substr $value, 1, $len - 2; # unquote
208							} ## end elsif ($squote && $first ...)
209							elsif ($escape) {
210	21					83	$values->[$i] =~ s{\\(.)}{$1}gmxs; # unescape
211							}
212							} ## end for my $i (0 .. $#$values)
213	12					50	return $values;
214							}
215	11		50			96	} ## end elsif ($escape \|\| $squote...)
216	25					60	return $decode;
217							} ## end sub _resolve_decode
218
219							sub parse_by_separators {
220	16			16	1	3214	my %args = normalize_args(@_,
221							[{%global_defaults, name => 'parse by separators'}, 'separators']);
222	16					91	identify(\%args);
223	16					49	my $name = $args{name};
224
225	16					35	my $separators = $args{separators};
226	16	50				55	LOGDIE "parse_by_separators needs separators"
227							unless defined $separators;
228	16					45	$separators = [map { _resolve_separator($_, \%args) } @$separators];
	41					116
229
230	16					46	my $keys = $args{keys};
231	16					31	my ($delta, $n_keys);
232	16	100				50	if (defined $keys) {
233	12					27	$n_keys = scalar @$keys;
234	12					28	$delta = $n_keys - scalar(@$separators);
235	12	50	33			70	LOGDIE "parse_by_separators 0 <= #keys - #separators <= 1"
236							if ($delta < 0) \|\| ($delta > 1);
237							} ## end if (defined $keys)
238							else {
239	4					12	$keys = [0 .. scalar(@$separators)];
240	4					6	$n_keys = 0; # don't bother
241	4					7	$delta = 1;
242							}
243
244	16					66	my ($value_regex, $flag_for) = _resolve_value($args{value}, \%args);
245
246	16					42	my @items;
247	16					69	for my $i (0 .. $#$keys) {
248	57					90	push @items, $value_regex;
249	57	100				148	push @items, $separators->[$i] if $i <= $#$separators;
250							}
251
252							# if not a separator, the last item becomes a catchall
253	16	50				54	$items[-1] = '(.*)' if $delta > 0;
254
255							# ready to generate the regexp. We bind the end to \z anyway because
256							# the last element might be a separator
257	16					53	my $format = join '', '(?:\\A', @items, '\\z)';
258	16					614	my $regex = qr{$format};
259	16					119	DEBUG "$name: regex will be: $regex";
260
261							# this sub will use the regexp above, do checking and return captured
262							# values in a hash with @keys
263	16					287	my $input = $args{input};
264	16					34	my $output = $args{output};
265	16					29	my $trim = $args{trim};
266	16					138	my $decode = _resolve_decode({%args, %$flag_for});
267							return sub {
268	16			16		109	my $record = shift;
269	16	50				237	my @values = $record->{$input} =~ m{$regex}
270							or die {
271							message => 'invalid record',
272							record => $record,
273							regex => $regex
274							};
275	16	100				119	trim(@values) if $trim;
276	16	100				45	if ($decode) {
277	7	50				15	eval { @values = @{$decode->(\@values)}; 1 } or do {
	7					11
	7					22
	7					23
278	0					0	my $e = $@;
279	0	0				0	$e = {message => $e} unless ref $e;
280	0	0				0	$e = {%$e, record => $record} if ref($e) eq 'HASH';
281	0					0	die $e;
282							};
283							} ## end if ($decode)
284
285	16	100				43	if ($n_keys) {
286	12					22	my $n_values = scalar @values;
287	12	50				37	die {
288							message => "'$name': invalid record, expected $n_keys, "
289							. "got $n_values only",
290							values => \@values,
291							record => $record
292							}
293							if $n_values < $n_keys;
294
295	12					34	$record->{$output} = \my %retval;
296	12					63	@retval{@$keys} = @values;
297							} ## end if ($n_keys)
298							else {
299	4					11	$record->{$output} = \@values;
300							}
301	16					57	return $record;
302	16					232	};
303							} ## end sub parse_by_separators
304
305							sub parse_by_split {
306	13			13	1	87	my %args =
307							normalize_args(@_,
308							[{%global_defaults, name => 'parse by split'}, 'separator']);
309	13					75	identify(\%args);
310
311	13					60	my $separator = _resolve_separator($args{separator}, \%args);
312
313	13					36	my $name = $args{name};
314	13					23	my $keys = $args{keys};
315	13	50				40	my $n_keys = defined($keys) ? scalar(@$keys) : 0;
316	13					24	my $input = $args{input};
317	13					24	my $output = $args{output};
318	13		100			59	my $allow_missing = $args{allow_missing} \|\| 0;
319	13					34	my $trim = $args{trim};
320
321							return sub {
322	16			16		62	my $record = shift;
323
324	16					87	my @values = split(/$separator/, $record->{$input}, $n_keys);
325	16	100				59	trim(@values) if $trim;
326
327	16					30	my $n_values = @values;
328	16	100				91	die {
329							message => "'$name': invalid record, expected $n_keys items, "
330							. "got $n_values",
331							input => $input,
332							record => $record,
333							}
334							if $n_values + $allow_missing < $n_keys;
335
336	11					29	$record->{$output} = \my %retval;
337	11					51	@retval{@$keys} = @values;
338	11					37	return $record;
339							}
340	13	50				159	if $n_keys;
341
342							return sub {
343	0			0		0	my $record = shift;
344	0					0	my @retval = split /$separator/, $record->{$input};
345	0	0				0	trim(@retval) if $trim;
346	0					0	$record->{$output} = \@retval;
347	0					0	return $record;
348	0					0	};
349
350							} ## end sub parse_by_split
351
352							sub parse_by_value_separator {
353	9			9	1	5433	my %args = normalize_args(
354							@_,
355							[
356							{%global_defaults, name => 'parse by value and separator'},
357							'separator'
358							]
359							);
360	9					45	identify(\%args);
361	9					22	my $name = $args{name};
362
363	9					27	my $separator = _resolve_separator($args{separator}, \%args);
364	9	50				36	LOGCROAK "$name: argument separator is mandatory"
365							unless defined $separator;
366
367	9					24	my ($value, $flag_for) = _resolve_value($args{value}, \%args);
368	9					60	my $decode = _resolve_decode({%args, %$flag_for});
369
370	9					27	my $keys = $args{keys};
371	9	100				19	my $n_keys = defined($keys) ? scalar(@$keys) : 0;
372	9					15	my $input = $args{input};
373	9					12	my $output = $args{output};
374	9		50			28	my $allow_missing = $args{allow_missing} \|\| 0;
375	9		50			23	my $allow_surplus = $args{allow_surplus} \|\| 0;
376	9					12	my $trim = $args{trim};
377	9					96	my $go_global = $^V lt v5.18.0;
378
379							return sub {
380	10			10		659	my $record = shift;
381
382	10					14	my @values;
383	10	50				22	if ($go_global) {
384	0					0	local our @global_values = ();
385	0					0	my $collector = qr/(?{push @global_values, $^N})/;
	0					0
386	0	0				0	$record->{$input} =~ m/
387							\A (?: $value $separator $collector )*
388							$value \z $collector
389							/gmxs
390							or die {
391							message => 'invalid record',
392							separator => $separator,
393							value => $value,
394							record => $record,
395							};
396	0					0	@values = @global_values;
397							}
398							else {
399	10	50				413	$record->{$input} =~ m/
400	30					163	\A (?: $value $separator (?{push @values, $^N}) )*
401	10					66	$value \z (?{push @values, $^N})
402							/gmxs
403							or die {
404							message => 'invalid record',
405							separator => $separator,
406							value => $value,
407							record => $record,
408							};
409							}
410	10	100				48	trim(@values) if $trim;
411	10	100				26	if ($decode) {
412	5	50				10	eval { @values = @{$decode->(\@values)}; 1 } or do {
	5					7
	5					13
	5					15
413	0					0	my $e = $EVAL_ERROR;
414	0	0				0	$e = {message => $e} unless ref $e;
415	0	0				0	$e = {%$e, record => $record} if ref($e) eq 'HASH';
416	0					0	die $e;
417							};
418							} ## end if ($decode)
419
420	10	100				17	if ($n_keys) {
421	6					10	my $n_values = @values;
422	6	50	33			28	die {
423							message => "'$name': invalid record, expected $n_keys items, "
424							. "got $n_values",
425							input => $input,
426							record => $record,
427							}
428							if ($n_values + $allow_missing < $n_keys)
429							\|\| ($n_values - $allow_surplus > $n_keys);
430	6					13	$record->{$output} = \my %retval;
431	6					31	@retval{@$keys} = @values;
432							} ## end if ($n_keys)
433							else {
434	4					11	$record->{$output} = \@values;
435							}
436	10					31	return $record;
437	9					83	};
438							} ## end sub parse_by_value_separator
439
440							sub parse_ghashy {
441	3			3	1	3234	my %args = normalize_args(@_,
442							{%global_defaults, default_key => '', name => 'parse ghashy'});
443	3					15	identify(\%args);
444
445	3	50				6	my %defaults = %{$args{defaults} \|\| {}};
	3					19
446	3					8	my $input = $args{input};
447	3					5	my $output = $args{output};
448
449							# pre-compile capture thing from generalized_hashy
450	3					13	$args{capture} = generalized_hashy(%args, text => undef)->{capture};
451
452							return sub {
453	3			3		87	my $record = shift;
454	3					11	my $outcome = generalized_hashy(%args, text => $record->{$input});
455							die {
456							input => $input,
457							message => $outcome->{failure},
458							outcome => $outcome,
459							record => $record,
460							}
461	3	50				11	unless exists $outcome->{hash};
462	3					5	$record->{$output} = {%defaults, %{$outcome->{hash}}};
	3					15
463	3					19	return $record;
464	3					20	};
465							} ## end sub parse_ghashy
466
467							sub parse_hashy {
468	4			4	1	2300	my %args = normalize_args(
469							@_,
470							{
471							%global_defaults,
472							chunks_separator => ' ',
473							default_key => '',
474							key_value_separator => '=',
475							name => 'parse hashy',
476							}
477							);
478	4					24	identify(\%args);
479	4	50				10	my %defaults = %{$args{defaults} \|\| {}};
	4					39
480	4					12	my $input = $args{input};
481	4					10	my $output = $args{output};
482							return sub {
483	3			3		558	my $record = shift;
484	3					30	my $parsed = metadata($record->{$input}, %args);
485	3					20	$record->{$output} = {%defaults, %$parsed};
486	3					13	return $record;
487	4					30	};
488							} ## end sub parse_hashy
489
490							sub parse_single {
491	2			2	1	2072	my %args = normalize_args(
492							@_,
493							{
494							key => 'key',
495							%global_defaults,
496							}
497							);
498	2					11	identify(\%args);
499	2					5	my $key = $args{key};
500	2		33			12	my $has_key = defined($key) && length($key);
501	2					3	my $input = $args{input};
502	2					4	my $output = $args{output};
503							return sub {
504	2			2		10	my $record = shift;
505							$record->{$output} =
506	2	50				9	$has_key ? {$key => $record->{$input}} : $record->{$input};
507	2					5	return $record;
508							}
509	2					14	} ## end sub parse_single
510
511							shorter_sub_names(__PACKAGE__, 'parse_');
512
513							1;