File Coverage

blib/lib/Log/Fmt.pm

Criterion	Covered	Total	%
statement	83	85	97.6
branch	36	38	94.7
condition	6	7	85.7
subroutine	14	14	100.0
pod	3	4	75.0
total	142	148	95.9

line	stmt	bran	cond	sub	pod	time	code
1	7			7		108	use v5.20;
	7					35
2	7			7		38	use warnings;
	7					13
	7					595
3							package Log::Fmt 3.101;
4							# ABSTRACT: a little parser and emitter of structured log lines
5
6	7			7		46	use experimental 'postderef'; # Not dangerous. Is accepted without changed.
	7					11
	7					57
7
8	7			7		5374	use Encode ();
	7					112184
	7					332
9	7			7		3262	use Params::Util qw(_ARRAY0 _HASH0 _CODELIKE);
	7					25263
	7					632
10	7			7		61	use Scalar::Util qw(refaddr);
	7					11
	7					348
11	7			7		3220	use String::Flogger ();
	7					77549
	7					12444
12
13							#pod =head1 OVERVIEW
14							#pod
15							#pod This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
16							#pod methods. It converts an arrayref of key/value pairs to a string that a human
17							#pod can scan tolerably well, and which a machine can parse about as well. It can
18							#pod also do that tolerably-okay parsing for you.
19							#pod
20							#pod =head1 SPECIFICATION
21							#pod
22							#pod =head2 The logfmt text format
23							#pod
24							#pod Although quite a few tools exist for managing C<logfmt>, there is no spec-like
25							#pod document for it. Because you may require multiple implementations, a
26							#pod specification can be helpful.
27							#pod
28							#pod Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
29							#pod separated by a single space.
30							#pod
31							#pod event = pair *(WSP pair)
32							#pod pair = key "=" value
33							#pod okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
34							#pod key = 1*(okchr)
35							#pod value = key / quoted
36							#pod
37							#pod quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
38							#pod escaped = escaped-special / escaped-hex
39							#pod escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
40							#pod escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
41							#pod quoted-ok = SP / "="
42							#pod eightbit = %x80-ff
43							#pod
44							#pod When formatting a value, if a value is already a valid C<key> token, use it
45							#pod without further quoting.
46							#pod
47							#pod =head2 Quoting a Unicode string
48							#pod
49							#pod It is preferable to build quoted values from a Unicode string, because it's
50							#pod possible to know whether a given codepoint is a non-ASCII unsafe character,
51							#pod like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
52							#pod encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
53							#pod with a standard terminal can show something like:
54							#pod
55							#pod user.name="Jürgen"
56							#pod
57							#pod To generate a C<quoted> from a Unicode string, for each codepoint:
58							#pod
59							#pod =begin :list
60							#pod
61							#pod * convert C<\> to C<\\>
62							#pod * convert C<"> to C<\">
63							#pod * convert a newline (U+000A) to C<\n>
64							#pod * convert a carriage return (U+000D) to C<\r>
65							#pod * convert a character tabulation (U+0009) to C<\t>
66							#pod * for any control character (by general category) or vertical newline:
67							#pod
68							#pod =begin :list
69							#pod
70							#pod * encode the character into a UTF-8 bytestring
71							#pod * convert each byte in the bytestring into C<\x{...}> form
72							#pod * use that sequence of C<\x{...}> codes in place of the replaced character
73							#pod
74							#pod =end :list
75							#pod
76							#pod =end :list
77							#pod
78							#pod Finally, UTF-8 encode the entire string and wrap it in double qoutes.
79							#pod
80							#pod B<This Perl implementation assumes that all string values to be encoded are
81							#pod character strings!>
82							#pod
83							#pod =head3 Quoting a bytestring
84							#pod
85							#pod Encoding a Unicode string is preferable, but may not be practical. In those
86							#pod cases when you have only a byte sequence, apply these steps.
87							#pod
88							#pod For each byte (using ASCII conventions):
89							#pod
90							#pod =for :list
91							#pod * convert C<\> to C<\\>
92							#pod * convert C<"> to C<\">
93							#pod * convert a newline (C<%0a>) to C<\n>
94							#pod * convert a carriage return (C<%0d>) to C<\r>
95							#pod * convert a character tabulation (C<%x09>) to C<\t>
96							#pod * convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
97							#pod * convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
98							#pod
99							#pod Finally, wrap the string in double quotes.
100							#pod
101							#pod =head3 Special values
102							#pod
103							#pod There are no reserved or sentinel values for undefined values, null pointers,
104							#pod or the like. Convention is to use a single identifier surrounded by tildes to
105							#pod indicate such a value. For example, given a null pointer for the C<user.name>
106							#pod value, a system might log:
107							#pod
108							#pod user.name=~null~
109							#pod
110							#pod If C<~null~> was a valid username, there would be ambiguity here. The choice
111							#pod of exact sentinels is left to the implementing system.
112							#pod
113							#pod =method format_event_string
114							#pod
115							#pod my $octets = Log::Fmt->format_event_string([
116							#pod key1 => $value1,
117							#pod key2 => $value2,
118							#pod ]);
119							#pod
120							#pod Note especially that if any value to encode is a reference I<to a reference>,
121							#pod then String::Flogger is used to encode the referenced value. This means you
122							#pod can embed, in your logfmt, a JSON dump of a structure by passing a reference to
123							#pod the structure, instead of passing the structure itself.
124							#pod
125							#pod String values are assumed to be character strings, and will be UTF-8 encoded as
126							#pod part of the formatting process.
127							#pod
128							#pod =cut
129
130							# okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
131							# key = 1*(okchr)
132							# value = key / quoted
133							#
134							# Remember to update the tr/// expression in _pairs_to_kvstr_aref if this
135							# changes!
136							my $KEY_RE = qr{[\x21\x23-\x3c\x3e-\x5b\x5d-\x7e]+};
137
138							sub _escape_unprintable {
139	4			4		13	my ($chr) = @_;
140
141							return join q{},
142	4					28	map {; sprintf '\\x{%02x}', ord }
	12					222
143							split //, Encode::encode('utf-8', $chr, Encode::FB_DEFAULT);
144							}
145
146							sub _quote_string {
147	24			24		46	my ($string) = @_;
148
149	24					62	$string =~ s{\\}{\\\\}g;
150	24					59	$string =~ s{"}{\\"}g;
151	24					46	$string =~ s{\x09}{\\t}g;
152	24					49	$string =~ s{\x0A}{\\n}g;
153	24					36	$string =~ s{\x0D}{\\r}g;
154	24					64	$string =~ s{([\pC\v])}{_escape_unprintable($1)}ge;
	4					13
155
156	24					146	$string = Encode::encode('utf-8', $string, Encode::FB_DEFAULT);
157
158	24					945	return qq{"$string"};
159							}
160
161	3			3	0	20	sub string_flogger { 'String::Flogger' }
162
163							sub _pairs_to_kvstr_aref {
164	86			86		152	my ($self, $aref, $seen, $prefix) = @_;
165
166	86		100			280	$seen //= {};
167
168	86					121	my @kvstrs;
169
170	86					192	KEY: for (my $i = 0; $i < @$aref; $i += 2) {
171							# replace non-ident-safe chars with ?
172	148	100				283	my $key = length $aref->[$i] ? "$aref->[$i]" : '~';
173	148					203	$key =~ tr{\x21\x23-\x3c\x3e-\x5b\x5d-\x7e}{?}c;
174
175							# If the prefix is "" you can end up with a pair like ".foo=1" which is
176							# weird but probably best. And that means you could end up with
177							# "foo..bar=1" which is also weird, but still probably for the best.
178	148	100				249	$key = "$prefix.$key" if defined $prefix;
179
180	148					185	my $value = $aref->[$i+1];
181
182	148	100				276	if (_CODELIKE $value) {
183	5					9	$value = $value->();
184							}
185
186	148	100	100			433	if (ref $value && ref $value eq 'REF') {
187	3					10	$value = $self->string_flogger->flog([ '%s', $$value ]);
188							}
189
190	148	100				455	if (! defined $value) {
		100
191	1					2	$value = '~missing~';
192							} elsif (ref $value) {
193	34					40	my $refaddr = refaddr $value;
194
195	34	100				102	if ($seen->{ $refaddr }) {
		100
		100
196	2					6	$value = $seen->{ $refaddr };
197							} elsif (_ARRAY0($value)) {
198	13					30	$seen->{ $refaddr } = "&$key";
199
200							push @kvstrs, $self->_pairs_to_kvstr_aref(
201	13					28	[ map {; $_ => $value->[$_] } (0 .. $#$value) ],
	31					71
202							$seen,
203							$key,
204							)->@*;
205
206	13					36	next KEY;
207							} elsif (_HASH0($value)) {
208	11					23	$seen->{ $refaddr } = "&$key";
209
210							push @kvstrs, $self->_pairs_to_kvstr_aref(
211	11					62	[ $value->%{ sort keys %$value } ],
212							$seen,
213							$key,
214							)->@*;
215
216	11					32	next KEY;
217							} else {
218	8					31	$value = "$value"; # Meh.
219							}
220							}
221
222	124	100				829	my $str = "$key="
223							. ($value =~ /\A$KEY_RE\z/
224							? "$value"
225							: _quote_string($value));
226
227	124					303	push @kvstrs, $str;
228							}
229
230	86					209	return \@kvstrs;
231							}
232
233							sub format_event_string {
234	5			5	1	14	my ($self, $aref) = @_;
235
236	5					16	return join q{ }, $self->_pairs_to_kvstr_aref($aref)->@*;
237							}
238
239							#pod =method parse_event_string
240							#pod
241							#pod my $kv_pairs = Log::Fmt->parse_event_string($octets);
242							#pod
243							#pod Given the kind of (byte) string emitted by C<format_event_string>, this method
244							#pod returns a reference to an array of key/value pairs. After being unquoted,
245							#pod value strings will be UTF-8 decoded into character strings.
246							#pod
247							#pod This isn't exactly a round trip. First off, the formatting can change illegal
248							#pod keys by replacing characters with question marks, or replacing empty strings
249							#pod with tildes. Secondly, the formatter will expand some values like arrayrefs
250							#pod and hashrefs into multiple keys, but the parser will not recombined those keys
251							#pod into structures. Also, there might be other asymmetric conversions. That
252							#pod said, the string escaping done by the formatter should correctly reverse.
253							#pod
254							#pod If the input string is badly formed, hunks that don't appear to be value
255							#pod key/value pairs will be presented as values for the key C<junk>.
256							#pod
257							#pod =cut
258
259							sub parse_event_string {
260	14			14	1	37	my ($self, $octets) = @_;
261
262	14					19	my @result;
263
264	14					34	HUNK: while (length $octets) {
265	33	100				293	if ($octets =~ s/\A($KEY_RE)=($KEY_RE)(?:\s+\|\z)//) {
266	19					60	push @result, $1, $2;
267	19					39	next HUNK;
268							}
269
270	14	100				305	if ($octets =~ s/\A($KEY_RE)="((\\\\\|\\"\|[^"])*?)"(?:\s+\|\z)//) {
271	10					24	my $key = $1;
272	10					16	my $qstring = $2;
273
274	10					46	$qstring =~ s{
275							( \\\\ \| \\["nrt] \| (\\x)\{([[:xdigit:]]{2})\} )
276							}
277							{
278	19	50	50			136	$1 eq "\\\\" ? "\\"
		100
		100
		100
		100
		100
279							: $1 eq "\\\"" ? q{"}
280							: $1 eq "\\n" ? qq{\n}
281							: $1 eq "\\r" ? qq{\r}
282							: $1 eq "\\t" ? qq{\t}
283							: ($2//'') eq "\\x" ? chr(hex("0x$3"))
284							: $1
285							}gex;
286
287	10					66	push @result, $key, Encode::decode('utf-8', $qstring, Encode::FB_DEFAULT);
288	10					400	next HUNK;
289							}
290
291	4	50				17	if ($octets =~ s/\A(\S+)(?:\s+\|\z)//) {
292	4					9	push @result, 'junk', $1;
293	4					8	next HUNK;
294							}
295
296							# I hope this is unreachable. -- rjbs, 2022-11-03
297	0					0	push (@result, 'junk', $octets, aborted => 1);
298	0					0	last HUNK;
299							}
300
301	14					59	return \@result;
302							}
303
304							#pod =method parse_event_string_as_hash
305							#pod
306							#pod my $hashref = Log::Fmt->parse_event_string_as_hash($line);
307							#pod
308							#pod This parses the given line as logfmt, then puts the key/value pairs into a hash
309							#pod and returns a reference to it.
310							#pod
311							#pod Because nothing prevents a single key from appearing more than once, you should
312							#pod use this with the understanding that data could be lost. No guarantee is made
313							#pod of which value will be preserved.
314							#pod
315							#pod =cut
316
317							sub parse_event_string_as_hash {
318	1			1	1	2	my ($self, $octets) = @_;
319
320	1					3	return { $self->parse_event_string($octets)->@* };
321							}
322
323							1;
324
325							__END__
326
327							=pod
328
329							=encoding UTF-8
330
331							=head1 NAME
332
333							Log::Fmt - a little parser and emitter of structured log lines
334
335							=head1 VERSION
336
337							version 3.101
338
339							=head1 OVERVIEW
340
341							This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
342							methods. It converts an arrayref of key/value pairs to a string that a human
343							can scan tolerably well, and which a machine can parse about as well. It can
344							also do that tolerably-okay parsing for you.
345
346							=head1 PERL VERSION
347
348							This library should run on perls released even a long time ago. It should
349							work on any version of perl released in the last five years.
350
351							Although it may work on older versions of perl, no guarantee is made that the
352							minimum required version will not be increased. The version may be increased
353							for any reason, and there is no promise that patches will be accepted to
354							lower the minimum required perl.
355
356							=head1 METHODS
357
358							=head2 format_event_string
359
360							my $octets = Log::Fmt->format_event_string([
361							key1 => $value1,
362							key2 => $value2,
363							]);
364
365							Note especially that if any value to encode is a reference I<to a reference>,
366							then String::Flogger is used to encode the referenced value. This means you
367							can embed, in your logfmt, a JSON dump of a structure by passing a reference to
368							the structure, instead of passing the structure itself.
369
370							String values are assumed to be character strings, and will be UTF-8 encoded as
371							part of the formatting process.
372
373							=head2 parse_event_string
374
375							my $kv_pairs = Log::Fmt->parse_event_string($octets);
376
377							Given the kind of (byte) string emitted by C<format_event_string>, this method
378							returns a reference to an array of key/value pairs. After being unquoted,
379							value strings will be UTF-8 decoded into character strings.
380
381							This isn't exactly a round trip. First off, the formatting can change illegal
382							keys by replacing characters with question marks, or replacing empty strings
383							with tildes. Secondly, the formatter will expand some values like arrayrefs
384							and hashrefs into multiple keys, but the parser will not recombined those keys
385							into structures. Also, there might be other asymmetric conversions. That
386							said, the string escaping done by the formatter should correctly reverse.
387
388							If the input string is badly formed, hunks that don't appear to be value
389							key/value pairs will be presented as values for the key C<junk>.
390
391							=head2 parse_event_string_as_hash
392
393							my $hashref = Log::Fmt->parse_event_string_as_hash($line);
394
395							This parses the given line as logfmt, then puts the key/value pairs into a hash
396							and returns a reference to it.
397
398							Because nothing prevents a single key from appearing more than once, you should
399							use this with the understanding that data could be lost. No guarantee is made
400							of which value will be preserved.
401
402							=head1 SPECIFICATION
403
404							=head2 The logfmt text format
405
406							Although quite a few tools exist for managing C<logfmt>, there is no spec-like
407							document for it. Because you may require multiple implementations, a
408							specification can be helpful.
409
410							Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
411							separated by a single space.
412
413							event = pair *(WSP pair)
414							pair = key "=" value
415							okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
416							key = 1*(okchr)
417							value = key / quoted
418
419							quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
420							escaped = escaped-special / escaped-hex
421							escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
422							escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
423							quoted-ok = SP / "="
424							eightbit = %x80-ff
425
426							When formatting a value, if a value is already a valid C<key> token, use it
427							without further quoting.
428
429							=head2 Quoting a Unicode string
430
431							It is preferable to build quoted values from a Unicode string, because it's
432							possible to know whether a given codepoint is a non-ASCII unsafe character,
433							like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
434							encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
435							with a standard terminal can show something like:
436
437							user.name="Jürgen"
438
439							To generate a C<quoted> from a Unicode string, for each codepoint:
440
441							=over 4
442
443							=item *
444
445							convert C<\> to C<\\>
446
447							=item *
448
449							convert C<"> to C<\">
450
451							=item *
452
453							convert a newline (U+000A) to C<\n>
454
455							=item *
456
457							convert a carriage return (U+000D) to C<\r>
458
459							=item *
460
461							convert a character tabulation (U+0009) to C<\t>
462
463							=item *
464
465							for any control character (by general category) or vertical newline:
466
467							=over 4
468
469							=item *
470
471							encode the character into a UTF-8 bytestring
472
473							=item *
474
475							convert each byte in the bytestring into C<\x{...}> form
476
477							=item *
478
479							use that sequence of C<\x{...}> codes in place of the replaced character
480
481							=back
482
483							=back
484
485							Finally, UTF-8 encode the entire string and wrap it in double qoutes.
486
487							B<This Perl implementation assumes that all string values to be encoded are
488							character strings!>
489
490							=head3 Quoting a bytestring
491
492							Encoding a Unicode string is preferable, but may not be practical. In those
493							cases when you have only a byte sequence, apply these steps.
494
495							For each byte (using ASCII conventions):
496
497							=over 4
498
499							=item *
500
501							convert C<\> to C<\\>
502
503							=item *
504
505							convert C<"> to C<\">
506
507							=item *
508
509							convert a newline (C<%0a>) to C<\n>
510
511							=item *
512
513							convert a carriage return (C<%0d>) to C<\r>
514
515							=item *
516
517							convert a character tabulation (C<%x09>) to C<\t>
518
519							=item *
520
521							convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
522
523							=item *
524
525							convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
526
527							=back
528
529							Finally, wrap the string in double quotes.
530
531							=head3 Special values
532
533							There are no reserved or sentinel values for undefined values, null pointers,
534							or the like. Convention is to use a single identifier surrounded by tildes to
535							indicate such a value. For example, given a null pointer for the C<user.name>
536							value, a system might log:
537
538							user.name=~null~
539
540							If C<~null~> was a valid username, there would be ambiguity here. The choice
541							of exact sentinels is left to the implementing system.
542
543							=head1 AUTHOR
544
545							Ricardo SIGNES <cpan@semiotic.systems>
546
547							=head1 COPYRIGHT AND LICENSE
548
549							This software is copyright (c) 2026 by Ricardo SIGNES.
550
551							This is free software; you can redistribute it and/or modify it under
552							the same terms as the Perl 5 programming language system itself.
553
554							=cut