File Coverage

blib/lib/Log/Fmt.pm

Criterion	Covered	Total	%
statement	79	85	92.9
branch	33	38	86.8
condition	6	7	85.7
subroutine	14	14	100.0
pod	3	4	75.0
total	135	148	91.2

line	stmt	bran	cond	sub	pod	time	code
1	7			7		98	use v5.20;
	7					26
2	7			7		57	use warnings;
	7					14
	7					544
3							package Log::Fmt 3.013;
4							# ABSTRACT: a little parser and emitter of structured log lines
5
6	7			7		42	use experimental 'postderef'; # Not dangerous. Is accepted without changed.
	7					14
	7					53
7
8	7			7		5839	use Encode ();
	7					137464
	7					373
9	7			7		4051	use Params::Util qw(_ARRAY0 _HASH0 _CODELIKE);
	7					32476
	7					770
10	7			7		89	use Scalar::Util qw(refaddr);
	7					14
	7					361
11	7			7		3917	use String::Flogger ();
	7					98813
	7					15715
12
13							#pod =head1 OVERVIEW
14							#pod
15							#pod This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
16							#pod methods. It converts an arrayref of key/value pairs to a string that a human
17							#pod can scan tolerably well, and which a machine can parse about as well. It can
18							#pod also do that tolerably-okay parsing for you.
19							#pod
20							#pod =head1 SPECIFICATION
21							#pod
22							#pod =head2 The logfmt text format
23							#pod
24							#pod Although quite a few tools exist for managing C<logfmt>, there is no spec-like
25							#pod document for it. Because you may require multiple implementations, a
26							#pod specification can be helpful.
27							#pod
28							#pod Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
29							#pod separated by a single space.
30							#pod
31							#pod event = pair *(WSP pair)
32							#pod pair = key "=" value
33							#pod okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
34							#pod key = 1*(okchr)
35							#pod value = key / quoted
36							#pod
37							#pod quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
38							#pod escaped = escaped-special / escaped-hex
39							#pod escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
40							#pod escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
41							#pod quoted-ok = SP / "="
42							#pod eightbit = %x80-ff
43							#pod
44							#pod When formatting a value, if a value is already a valid C<key> token, use it
45							#pod without further quoting.
46							#pod
47							#pod =head2 Quoting a Unicode string
48							#pod
49							#pod It is preferable to build quoted values from a Unicode string, because it's
50							#pod possible to know whether a given codepoint is a non-ASCII unsafe character,
51							#pod like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
52							#pod encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
53							#pod with a standard terminal can show something like:
54							#pod
55							#pod user.name="Jürgen"
56							#pod
57							#pod To generate a C<quoted> from a Unicode string, for each codepoint:
58							#pod
59							#pod =begin :list
60							#pod
61							#pod * convert C<\> to C<\\>
62							#pod * convert C<"> to C<\">
63							#pod * convert a newline (U+000A) to C<\n>
64							#pod * convert a carriage return (U+000D) to C<\r>
65							#pod * convert a character tabulation (U+0009) to C<\t>
66							#pod * for any control character (by general category) or vertical newline:
67							#pod
68							#pod =begin :list
69							#pod
70							#pod * encode the character into a UTF-8 bytestring
71							#pod * convert each byte in the bytestring into C<\x{...}> form
72							#pod * use that sequence of C<\x{...}> codes in place of the replaced character
73							#pod
74							#pod =end :list
75							#pod
76							#pod =end :list
77							#pod
78							#pod Finally, UTF-8 encode the entire string and wrap it in double qoutes.
79							#pod
80							#pod B<This Perl implementation assumes that all string values to be encoded are
81							#pod character strings!>
82							#pod
83							#pod =head3 Quoting a bytestring
84							#pod
85							#pod Encoding a Unicode string is preferable, but may not be practical. In those
86							#pod cases when you have only a byte sequence, apply these steps.
87							#pod
88							#pod For each byte (using ASCII conventions):
89							#pod
90							#pod =for :list
91							#pod * convert C<\> to C<\\>
92							#pod * convert C<"> to C<\">
93							#pod * convert a newline (C<%0a>) to C<\n>
94							#pod * convert a carriage return (C<%0d>) to C<\r>
95							#pod * convert a character tabulation (C<%x09>) to C<\t>
96							#pod * convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
97							#pod * convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
98							#pod
99							#pod Finally, wrap the string in double quotes.
100							#pod
101							#pod =cut
102
103							#pod =method format_event_string
104							#pod
105							#pod my $octets = Log::Fmt->format_event_string([
106							#pod key1 => $value1,
107							#pod key2 => $value2,
108							#pod ]);
109							#pod
110							#pod Note especially that if any value to encode is a reference I<to a reference>,
111							#pod then String::Flogger is used to encode the referenced value. This means you
112							#pod can embed, in your logfmt, a JSON dump of a structure by passing a reference to
113							#pod the structure, instead of passing the structure itself.
114							#pod
115							#pod String values are assumed to be character strings, and will be UTF-8 encoded as
116							#pod part of the formatting process.
117							#pod
118							#pod =cut
119
120							# okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
121							# key = 1*(okchr)
122							# value = key / quoted
123							my $KEY_RE = qr{[\x21\x23-\x3c\x3e-\x5b\x5d-\x7e]+};
124
125							sub _escape_unprintable {
126	2			2		6	my ($chr) = @_;
127
128							return join q{},
129	2					13	map {; sprintf '\\x{%02x}', ord }
	6					77
130							split //, Encode::encode('utf-8', $chr, Encode::FB_DEFAULT);
131							}
132
133							sub _quote_string {
134	9			9		26	my ($string) = @_;
135
136	9					23	$string =~ s{\\}{\\\\}g;
137	9					27	$string =~ s{"}{\\"}g;
138	9					19	$string =~ s{\x09}{\\t}g;
139	9					17	$string =~ s{\x0A}{\\n}g;
140	9					17	$string =~ s{\x0D}{\\r}g;
141	9					34	$string =~ s{([\pC\v])}{_escape_unprintable($1)}ge;
	2					6
142
143	9					80	$string = Encode::encode('utf-8', $string, Encode::FB_DEFAULT);
144
145	9					594	return qq{"$string"};
146							}
147
148	2			2	0	21	sub string_flogger { 'String::Flogger' }
149
150							sub _pairs_to_kvstr_aref {
151	41			41		102	my ($self, $aref, $seen, $prefix) = @_;
152
153	41		100			145	$seen //= {};
154
155	41					58	my @kvstrs;
156
157	41					99	KEY: for (my $i = 0; $i < @$aref; $i += 2) {
158							# replace non-ident-safe chars with ?
159	86	100				257	my $key = length $aref->[$i] ? "$aref->[$i]" : '~';
160	86					141	$key =~ tr/\x21\x23-\x3C\x3E-\x7E/?/c;
161
162							# If the prefix is "" you can end up with a pair like ".foo=1" which is
163							# weird but probably best. And that means you could end up with
164							# "foo..bar=1" which is also weird, but still probably for the best.
165	86	100				177	$key = "$prefix.$key" if defined $prefix;
166
167	86					154	my $value = $aref->[$i+1];
168
169	86	100				253	if (_CODELIKE $value) {
170	4					14	$value = $value->();
171							}
172
173	86	100	100			601	if (ref $value && ref $value eq 'REF') {
174	2					9	$value = $self->string_flogger->flog([ '%s', $$value ]);
175							}
176
177	86	50				462	if (! defined $value) {
		100
178	0					0	$value = '~missing~';
179							} elsif (ref $value) {
180	22					39	my $refaddr = refaddr $value;
181
182	22	100				104	if ($seen->{ $refaddr }) {
		100
		100
183	1					3	$value = $seen->{ $refaddr };
184							} elsif (_ARRAY0($value)) {
185	7					23	$seen->{ $refaddr } = "&$key";
186
187							push @kvstrs, $self->_pairs_to_kvstr_aref(
188	7					43	[ map {; $_ => $value->[$_] } (0 .. $#$value) ],
	17					61
189							$seen,
190							$key,
191							)->@*;
192
193	7					35	next KEY;
194							} elsif (_HASH0($value)) {
195	6					25	$seen->{ $refaddr } = "&$key";
196
197							push @kvstrs, $self->_pairs_to_kvstr_aref(
198	6					52	[ $value->%{ sort keys %$value } ],
199							$seen,
200							$key,
201							)->@*;
202
203	6					30	next KEY;
204							} else {
205	8					59	$value = "$value"; # Meh.
206							}
207							}
208
209	73	100				777	my $str = "$key="
210							. ($value =~ /\A$KEY_RE\z/
211							? "$value"
212							: _quote_string($value));
213
214	73					309	push @kvstrs, $str;
215							}
216
217	41					153	return \@kvstrs;
218							}
219
220							sub format_event_string {
221	1			1	1	32	my ($self, $aref) = @_;
222
223	1					6	return join q{ }, $self->_pairs_to_kvstr_aref($aref)->@*;
224							}
225
226							#pod =method parse_event_string
227							#pod
228							#pod my $kv_pairs = Log::Fmt->parse_event_string($octets);
229							#pod
230							#pod Given the kind of (byte) string emitted by C<format_event_string>, this method
231							#pod returns a reference to an array of key/value pairs. After being unquoted,
232							#pod value strings will be UTF-8 decoded into character strings.
233							#pod
234							#pod This isn't exactly a round trip. First off, the formatting can change illegal
235							#pod keys by replacing characters with question marks, or replacing empty strings
236							#pod with tildes. Secondly, the formatter will expand some values like arrayrefs
237							#pod and hashrefs into multiple keys, but the parser will not recombined those keys
238							#pod into structures. Also, there might be other asymmetric conversions. That
239							#pod said, the string escaping done by the formatter should correctly reverse.
240							#pod
241							#pod If the input string is badly formed, hunks that don't appear to be value
242							#pod key/value pairs will be presented as values for the key C<junk>.
243							#pod
244							#pod =cut
245
246							sub parse_event_string {
247	9			9	1	9690	my ($self, $octets) = @_;
248
249	9					18	my @result;
250
251	9					30	HUNK: while (length $octets) {
252	26	100				356	if ($octets =~ s/\A($KEY_RE)=($KEY_RE)(?:\s+\|\z)//) {
253	17					72	push @result, $1, $2;
254	17					50	next HUNK;
255							}
256
257	9	50				267	if ($octets =~ s/\A($KEY_RE)="((\\\\\|\\"\|[^"])*?)"(?:\s+\|\z)//) {
258	9					26	my $key = $1;
259	9					22	my $qstring = $2;
260
261	9					54	$qstring =~ s{
262							( \\\\ \| \\["nrt] \| (\\x)\{([[:xdigit:]]{2})\} )
263							}
264							{
265	19	50	50			147	$1 eq "\\\\" ? "\\"
		100
		100
		100
		100
		100
266							: $1 eq "\\\"" ? q{"}
267							: $1 eq "\\n" ? qq{\n}
268							: $1 eq "\\r" ? qq{\r}
269							: $1 eq "\\t" ? qq{\t}
270							: ($2//'') eq "\\x" ? chr(hex("0x$3"))
271							: $1
272							}gex;
273
274	9					70	push @result, $key, Encode::decode('utf-8', $qstring, Encode::FB_DEFAULT);
275	9					435	next HUNK;
276							}
277
278	0	0				0	if ($octets =~ s/\A(\S+)(?:\s+\|\z)//) {
279	0					0	push @result, 'junk', $1;
280	0					0	next HUNK;
281							}
282
283							# I hope this is unreachable. -- rjbs, 2022-11-03
284	0					0	push (@result, 'junk', $octets, aborted => 1);
285	0					0	last HUNK;
286							}
287
288	9					35	return \@result;
289							}
290
291							#pod =method parse_event_string_as_hash
292							#pod
293							#pod my $hashref = Log::Fmt->parse_event_string_as_hash($line);
294							#pod
295							#pod This parses the given line as logfmt, then puts the key/value pairs into a hash
296							#pod and returns a reference to it.
297							#pod
298							#pod Because nothing prevents a single key from appearing more than once, you should
299							#pod use this with the understanding that data could be lost. No guarantee is made
300							#pod of which value will be preserved.
301							#pod
302							#pod =cut
303
304							sub parse_event_string_as_hash {
305	1			1	1	10	my ($self, $octets) = @_;
306
307	1					4	return { $self->parse_event_string($octets)->@* };
308							}
309
310							1;
311
312							__END__
313
314							=pod
315
316							=encoding UTF-8
317
318							=head1 NAME
319
320							Log::Fmt - a little parser and emitter of structured log lines
321
322							=head1 VERSION
323
324							version 3.013
325
326							=head1 OVERVIEW
327
328							This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
329							methods. It converts an arrayref of key/value pairs to a string that a human
330							can scan tolerably well, and which a machine can parse about as well. It can
331							also do that tolerably-okay parsing for you.
332
333							=head1 PERL VERSION
334
335							This library should run on perls released even a long time ago. It should
336							work on any version of perl released in the last five years.
337
338							Although it may work on older versions of perl, no guarantee is made that the
339							minimum required version will not be increased. The version may be increased
340							for any reason, and there is no promise that patches will be accepted to
341							lower the minimum required perl.
342
343							=head1 METHODS
344
345							=head2 format_event_string
346
347							my $octets = Log::Fmt->format_event_string([
348							key1 => $value1,
349							key2 => $value2,
350							]);
351
352							Note especially that if any value to encode is a reference I<to a reference>,
353							then String::Flogger is used to encode the referenced value. This means you
354							can embed, in your logfmt, a JSON dump of a structure by passing a reference to
355							the structure, instead of passing the structure itself.
356
357							String values are assumed to be character strings, and will be UTF-8 encoded as
358							part of the formatting process.
359
360							=head2 parse_event_string
361
362							my $kv_pairs = Log::Fmt->parse_event_string($octets);
363
364							Given the kind of (byte) string emitted by C<format_event_string>, this method
365							returns a reference to an array of key/value pairs. After being unquoted,
366							value strings will be UTF-8 decoded into character strings.
367
368							This isn't exactly a round trip. First off, the formatting can change illegal
369							keys by replacing characters with question marks, or replacing empty strings
370							with tildes. Secondly, the formatter will expand some values like arrayrefs
371							and hashrefs into multiple keys, but the parser will not recombined those keys
372							into structures. Also, there might be other asymmetric conversions. That
373							said, the string escaping done by the formatter should correctly reverse.
374
375							If the input string is badly formed, hunks that don't appear to be value
376							key/value pairs will be presented as values for the key C<junk>.
377
378							=head2 parse_event_string_as_hash
379
380							my $hashref = Log::Fmt->parse_event_string_as_hash($line);
381
382							This parses the given line as logfmt, then puts the key/value pairs into a hash
383							and returns a reference to it.
384
385							Because nothing prevents a single key from appearing more than once, you should
386							use this with the understanding that data could be lost. No guarantee is made
387							of which value will be preserved.
388
389							=head1 SPECIFICATION
390
391							=head2 The logfmt text format
392
393							Although quite a few tools exist for managing C<logfmt>, there is no spec-like
394							document for it. Because you may require multiple implementations, a
395							specification can be helpful.
396
397							Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
398							separated by a single space.
399
400							event = pair *(WSP pair)
401							pair = key "=" value
402							okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
403							key = 1*(okchr)
404							value = key / quoted
405
406							quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
407							escaped = escaped-special / escaped-hex
408							escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
409							escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
410							quoted-ok = SP / "="
411							eightbit = %x80-ff
412
413							When formatting a value, if a value is already a valid C<key> token, use it
414							without further quoting.
415
416							=head2 Quoting a Unicode string
417
418							It is preferable to build quoted values from a Unicode string, because it's
419							possible to know whether a given codepoint is a non-ASCII unsafe character,
420							like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
421							encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
422							with a standard terminal can show something like:
423
424							user.name="Jürgen"
425
426							To generate a C<quoted> from a Unicode string, for each codepoint:
427
428							=over 4
429
430							=item *
431
432							convert C<\> to C<\\>
433
434							=item *
435
436							convert C<"> to C<\">
437
438							=item *
439
440							convert a newline (U+000A) to C<\n>
441
442							=item *
443
444							convert a carriage return (U+000D) to C<\r>
445
446							=item *
447
448							convert a character tabulation (U+0009) to C<\t>
449
450							=item *
451
452							for any control character (by general category) or vertical newline:
453
454							=over 4
455
456							=item *
457
458							encode the character into a UTF-8 bytestring
459
460							=item *
461
462							convert each byte in the bytestring into C<\x{...}> form
463
464							=item *
465
466							use that sequence of C<\x{...}> codes in place of the replaced character
467
468							=back
469
470							=back
471
472							Finally, UTF-8 encode the entire string and wrap it in double qoutes.
473
474							B<This Perl implementation assumes that all string values to be encoded are
475							character strings!>
476
477							=head3 Quoting a bytestring
478
479							Encoding a Unicode string is preferable, but may not be practical. In those
480							cases when you have only a byte sequence, apply these steps.
481
482							For each byte (using ASCII conventions):
483
484							=over 4
485
486							=item *
487
488							convert C<\> to C<\\>
489
490							=item *
491
492							convert C<"> to C<\">
493
494							=item *
495
496							convert a newline (C<%0a>) to C<\n>
497
498							=item *
499
500							convert a carriage return (C<%0d>) to C<\r>
501
502							=item *
503
504							convert a character tabulation (C<%x09>) to C<\t>
505
506							=item *
507
508							convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
509
510							=item *
511
512							convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
513
514							=back
515
516							Finally, wrap the string in double quotes.
517
518							=head1 AUTHOR
519
520							Ricardo SIGNES <cpan@semiotic.systems>
521
522							=head1 COPYRIGHT AND LICENSE
523
524							This software is copyright (c) 2025 by Ricardo SIGNES.
525
526							This is free software; you can redistribute it and/or modify it under
527							the same terms as the Perl 5 programming language system itself.
528
529							=cut