File Coverage

blib/lib/Log/Fmt.pm
Criterion Covered Total %
statement 83 85 97.6
branch 36 38 94.7
condition 6 7 85.7
subroutine 14 14 100.0
pod 3 4 75.0
total 142 148 95.9


line stmt bran cond sub pod time code
1 7     7   108 use v5.20;
  7         35  
2 7     7   38 use warnings;
  7         13  
  7         595  
3             package Log::Fmt 3.101;
4             # ABSTRACT: a little parser and emitter of structured log lines
5              
6 7     7   46 use experimental 'postderef'; # Not dangerous. Is accepted without changed.
  7         11  
  7         57  
7              
8 7     7   5374 use Encode ();
  7         112184  
  7         332  
9 7     7   3262 use Params::Util qw(_ARRAY0 _HASH0 _CODELIKE);
  7         25263  
  7         632  
10 7     7   61 use Scalar::Util qw(refaddr);
  7         11  
  7         348  
11 7     7   3220 use String::Flogger ();
  7         77549  
  7         12444  
12              
13             #pod =head1 OVERVIEW
14             #pod
15             #pod This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
16             #pod methods. It converts an arrayref of key/value pairs to a string that a human
17             #pod can scan tolerably well, and which a machine can parse about as well. It can
18             #pod also do that tolerably-okay parsing for you.
19             #pod
20             #pod =head1 SPECIFICATION
21             #pod
22             #pod =head2 The logfmt text format
23             #pod
24             #pod Although quite a few tools exist for managing C<logfmt>, there is no spec-like
25             #pod document for it. Because you may require multiple implementations, a
26             #pod specification can be helpful.
27             #pod
28             #pod Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
29             #pod separated by a single space.
30             #pod
31             #pod event = pair *(WSP pair)
32             #pod pair = key "=" value
33             #pod okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
34             #pod key = 1*(okchr)
35             #pod value = key / quoted
36             #pod
37             #pod quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
38             #pod escaped = escaped-special / escaped-hex
39             #pod escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
40             #pod escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
41             #pod quoted-ok = SP / "="
42             #pod eightbit = %x80-ff
43             #pod
44             #pod When formatting a value, if a value is already a valid C<key> token, use it
45             #pod without further quoting.
46             #pod
47             #pod =head2 Quoting a Unicode string
48             #pod
49             #pod It is preferable to build quoted values from a Unicode string, because it's
50             #pod possible to know whether a given codepoint is a non-ASCII unsafe character,
51             #pod like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
52             #pod encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
53             #pod with a standard terminal can show something like:
54             #pod
55             #pod user.name="Jürgen"
56             #pod
57             #pod To generate a C<quoted> from a Unicode string, for each codepoint:
58             #pod
59             #pod =begin :list
60             #pod
61             #pod * convert C<\> to C<\\>
62             #pod * convert C<"> to C<\">
63             #pod * convert a newline (U+000A) to C<\n>
64             #pod * convert a carriage return (U+000D) to C<\r>
65             #pod * convert a character tabulation (U+0009) to C<\t>
66             #pod * for any control character (by general category) or vertical newline:
67             #pod
68             #pod =begin :list
69             #pod
70             #pod * encode the character into a UTF-8 bytestring
71             #pod * convert each byte in the bytestring into C<\x{...}> form
72             #pod * use that sequence of C<\x{...}> codes in place of the replaced character
73             #pod
74             #pod =end :list
75             #pod
76             #pod =end :list
77             #pod
78             #pod Finally, UTF-8 encode the entire string and wrap it in double qoutes.
79             #pod
80             #pod B<This Perl implementation assumes that all string values to be encoded are
81             #pod character strings!>
82             #pod
83             #pod =head3 Quoting a bytestring
84             #pod
85             #pod Encoding a Unicode string is preferable, but may not be practical. In those
86             #pod cases when you have only a byte sequence, apply these steps.
87             #pod
88             #pod For each byte (using ASCII conventions):
89             #pod
90             #pod =for :list
91             #pod * convert C<\> to C<\\>
92             #pod * convert C<"> to C<\">
93             #pod * convert a newline (C<%0a>) to C<\n>
94             #pod * convert a carriage return (C<%0d>) to C<\r>
95             #pod * convert a character tabulation (C<%x09>) to C<\t>
96             #pod * convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
97             #pod * convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
98             #pod
99             #pod Finally, wrap the string in double quotes.
100             #pod
101             #pod =head3 Special values
102             #pod
103             #pod There are no reserved or sentinel values for undefined values, null pointers,
104             #pod or the like. Convention is to use a single identifier surrounded by tildes to
105             #pod indicate such a value. For example, given a null pointer for the C<user.name>
106             #pod value, a system might log:
107             #pod
108             #pod user.name=~null~
109             #pod
110             #pod If C<~null~> was a valid username, there would be ambiguity here. The choice
111             #pod of exact sentinels is left to the implementing system.
112             #pod
113             #pod =method format_event_string
114             #pod
115             #pod my $octets = Log::Fmt->format_event_string([
116             #pod key1 => $value1,
117             #pod key2 => $value2,
118             #pod ]);
119             #pod
120             #pod Note especially that if any value to encode is a reference I<to a reference>,
121             #pod then String::Flogger is used to encode the referenced value. This means you
122             #pod can embed, in your logfmt, a JSON dump of a structure by passing a reference to
123             #pod the structure, instead of passing the structure itself.
124             #pod
125             #pod String values are assumed to be character strings, and will be UTF-8 encoded as
126             #pod part of the formatting process.
127             #pod
128             #pod =cut
129              
130             # okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
131             # key = 1*(okchr)
132             # value = key / quoted
133             #
134             # Remember to update the tr/// expression in _pairs_to_kvstr_aref if this
135             # changes!
136             my $KEY_RE = qr{[\x21\x23-\x3c\x3e-\x5b\x5d-\x7e]+};
137              
138             sub _escape_unprintable {
139 4     4   13 my ($chr) = @_;
140              
141             return join q{},
142 4         28 map {; sprintf '\\x{%02x}', ord }
  12         222  
143             split //, Encode::encode('utf-8', $chr, Encode::FB_DEFAULT);
144             }
145              
146             sub _quote_string {
147 24     24   46 my ($string) = @_;
148              
149 24         62 $string =~ s{\\}{\\\\}g;
150 24         59 $string =~ s{"}{\\"}g;
151 24         46 $string =~ s{\x09}{\\t}g;
152 24         49 $string =~ s{\x0A}{\\n}g;
153 24         36 $string =~ s{\x0D}{\\r}g;
154 24         64 $string =~ s{([\pC\v])}{_escape_unprintable($1)}ge;
  4         13  
155              
156 24         146 $string = Encode::encode('utf-8', $string, Encode::FB_DEFAULT);
157              
158 24         945 return qq{"$string"};
159             }
160              
161 3     3 0 20 sub string_flogger { 'String::Flogger' }
162              
163             sub _pairs_to_kvstr_aref {
164 86     86   152 my ($self, $aref, $seen, $prefix) = @_;
165              
166 86   100     280 $seen //= {};
167              
168 86         121 my @kvstrs;
169              
170 86         192 KEY: for (my $i = 0; $i < @$aref; $i += 2) {
171             # replace non-ident-safe chars with ?
172 148 100       283 my $key = length $aref->[$i] ? "$aref->[$i]" : '~';
173 148         203 $key =~ tr{\x21\x23-\x3c\x3e-\x5b\x5d-\x7e}{?}c;
174              
175             # If the prefix is "" you can end up with a pair like ".foo=1" which is
176             # weird but probably best. And that means you could end up with
177             # "foo..bar=1" which is also weird, but still probably for the best.
178 148 100       249 $key = "$prefix.$key" if defined $prefix;
179              
180 148         185 my $value = $aref->[$i+1];
181              
182 148 100       276 if (_CODELIKE $value) {
183 5         9 $value = $value->();
184             }
185              
186 148 100 100     433 if (ref $value && ref $value eq 'REF') {
187 3         10 $value = $self->string_flogger->flog([ '%s', $$value ]);
188             }
189              
190 148 100       455 if (! defined $value) {
    100          
191 1         2 $value = '~missing~';
192             } elsif (ref $value) {
193 34         40 my $refaddr = refaddr $value;
194              
195 34 100       102 if ($seen->{ $refaddr }) {
    100          
    100          
196 2         6 $value = $seen->{ $refaddr };
197             } elsif (_ARRAY0($value)) {
198 13         30 $seen->{ $refaddr } = "&$key";
199              
200             push @kvstrs, $self->_pairs_to_kvstr_aref(
201 13         28 [ map {; $_ => $value->[$_] } (0 .. $#$value) ],
  31         71  
202             $seen,
203             $key,
204             )->@*;
205              
206 13         36 next KEY;
207             } elsif (_HASH0($value)) {
208 11         23 $seen->{ $refaddr } = "&$key";
209              
210             push @kvstrs, $self->_pairs_to_kvstr_aref(
211 11         62 [ $value->%{ sort keys %$value } ],
212             $seen,
213             $key,
214             )->@*;
215              
216 11         32 next KEY;
217             } else {
218 8         31 $value = "$value"; # Meh.
219             }
220             }
221              
222 124 100       829 my $str = "$key="
223             . ($value =~ /\A$KEY_RE\z/
224             ? "$value"
225             : _quote_string($value));
226              
227 124         303 push @kvstrs, $str;
228             }
229              
230 86         209 return \@kvstrs;
231             }
232              
233             sub format_event_string {
234 5     5 1 14 my ($self, $aref) = @_;
235              
236 5         16 return join q{ }, $self->_pairs_to_kvstr_aref($aref)->@*;
237             }
238              
239             #pod =method parse_event_string
240             #pod
241             #pod my $kv_pairs = Log::Fmt->parse_event_string($octets);
242             #pod
243             #pod Given the kind of (byte) string emitted by C<format_event_string>, this method
244             #pod returns a reference to an array of key/value pairs. After being unquoted,
245             #pod value strings will be UTF-8 decoded into character strings.
246             #pod
247             #pod This isn't exactly a round trip. First off, the formatting can change illegal
248             #pod keys by replacing characters with question marks, or replacing empty strings
249             #pod with tildes. Secondly, the formatter will expand some values like arrayrefs
250             #pod and hashrefs into multiple keys, but the parser will not recombined those keys
251             #pod into structures. Also, there might be other asymmetric conversions. That
252             #pod said, the string escaping done by the formatter should correctly reverse.
253             #pod
254             #pod If the input string is badly formed, hunks that don't appear to be value
255             #pod key/value pairs will be presented as values for the key C<junk>.
256             #pod
257             #pod =cut
258              
259             sub parse_event_string {
260 14     14 1 37 my ($self, $octets) = @_;
261              
262 14         19 my @result;
263              
264 14         34 HUNK: while (length $octets) {
265 33 100       293 if ($octets =~ s/\A($KEY_RE)=($KEY_RE)(?:\s+|\z)//) {
266 19         60 push @result, $1, $2;
267 19         39 next HUNK;
268             }
269              
270 14 100       305 if ($octets =~ s/\A($KEY_RE)="((\\\\|\\"|[^"])*?)"(?:\s+|\z)//) {
271 10         24 my $key = $1;
272 10         16 my $qstring = $2;
273              
274 10         46 $qstring =~ s{
275             ( \\\\ | \\["nrt] | (\\x)\{([[:xdigit:]]{2})\} )
276             }
277             {
278 19 50 50     136 $1 eq "\\\\" ? "\\"
    100          
    100          
    100          
    100          
    100          
279             : $1 eq "\\\"" ? q{"}
280             : $1 eq "\\n" ? qq{\n}
281             : $1 eq "\\r" ? qq{\r}
282             : $1 eq "\\t" ? qq{\t}
283             : ($2//'') eq "\\x" ? chr(hex("0x$3"))
284             : $1
285             }gex;
286              
287 10         66 push @result, $key, Encode::decode('utf-8', $qstring, Encode::FB_DEFAULT);
288 10         400 next HUNK;
289             }
290              
291 4 50       17 if ($octets =~ s/\A(\S+)(?:\s+|\z)//) {
292 4         9 push @result, 'junk', $1;
293 4         8 next HUNK;
294             }
295              
296             # I hope this is unreachable. -- rjbs, 2022-11-03
297 0         0 push (@result, 'junk', $octets, aborted => 1);
298 0         0 last HUNK;
299             }
300              
301 14         59 return \@result;
302             }
303              
304             #pod =method parse_event_string_as_hash
305             #pod
306             #pod my $hashref = Log::Fmt->parse_event_string_as_hash($line);
307             #pod
308             #pod This parses the given line as logfmt, then puts the key/value pairs into a hash
309             #pod and returns a reference to it.
310             #pod
311             #pod Because nothing prevents a single key from appearing more than once, you should
312             #pod use this with the understanding that data could be lost. No guarantee is made
313             #pod of which value will be preserved.
314             #pod
315             #pod =cut
316              
317             sub parse_event_string_as_hash {
318 1     1 1 2 my ($self, $octets) = @_;
319              
320 1         3 return { $self->parse_event_string($octets)->@* };
321             }
322              
323             1;
324              
325             __END__
326              
327             =pod
328              
329             =encoding UTF-8
330              
331             =head1 NAME
332              
333             Log::Fmt - a little parser and emitter of structured log lines
334              
335             =head1 VERSION
336              
337             version 3.101
338              
339             =head1 OVERVIEW
340              
341             This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
342             methods. It converts an arrayref of key/value pairs to a string that a human
343             can scan tolerably well, and which a machine can parse about as well. It can
344             also do that tolerably-okay parsing for you.
345              
346             =head1 PERL VERSION
347              
348             This library should run on perls released even a long time ago. It should
349             work on any version of perl released in the last five years.
350              
351             Although it may work on older versions of perl, no guarantee is made that the
352             minimum required version will not be increased. The version may be increased
353             for any reason, and there is no promise that patches will be accepted to
354             lower the minimum required perl.
355              
356             =head1 METHODS
357              
358             =head2 format_event_string
359              
360             my $octets = Log::Fmt->format_event_string([
361             key1 => $value1,
362             key2 => $value2,
363             ]);
364              
365             Note especially that if any value to encode is a reference I<to a reference>,
366             then String::Flogger is used to encode the referenced value. This means you
367             can embed, in your logfmt, a JSON dump of a structure by passing a reference to
368             the structure, instead of passing the structure itself.
369              
370             String values are assumed to be character strings, and will be UTF-8 encoded as
371             part of the formatting process.
372              
373             =head2 parse_event_string
374              
375             my $kv_pairs = Log::Fmt->parse_event_string($octets);
376              
377             Given the kind of (byte) string emitted by C<format_event_string>, this method
378             returns a reference to an array of key/value pairs. After being unquoted,
379             value strings will be UTF-8 decoded into character strings.
380              
381             This isn't exactly a round trip. First off, the formatting can change illegal
382             keys by replacing characters with question marks, or replacing empty strings
383             with tildes. Secondly, the formatter will expand some values like arrayrefs
384             and hashrefs into multiple keys, but the parser will not recombined those keys
385             into structures. Also, there might be other asymmetric conversions. That
386             said, the string escaping done by the formatter should correctly reverse.
387              
388             If the input string is badly formed, hunks that don't appear to be value
389             key/value pairs will be presented as values for the key C<junk>.
390              
391             =head2 parse_event_string_as_hash
392              
393             my $hashref = Log::Fmt->parse_event_string_as_hash($line);
394              
395             This parses the given line as logfmt, then puts the key/value pairs into a hash
396             and returns a reference to it.
397              
398             Because nothing prevents a single key from appearing more than once, you should
399             use this with the understanding that data could be lost. No guarantee is made
400             of which value will be preserved.
401              
402             =head1 SPECIFICATION
403              
404             =head2 The logfmt text format
405              
406             Although quite a few tools exist for managing C<logfmt>, there is no spec-like
407             document for it. Because you may require multiple implementations, a
408             specification can be helpful.
409              
410             Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
411             separated by a single space.
412              
413             event = pair *(WSP pair)
414             pair = key "=" value
415             okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
416             key = 1*(okchr)
417             value = key / quoted
418              
419             quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
420             escaped = escaped-special / escaped-hex
421             escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
422             escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
423             quoted-ok = SP / "="
424             eightbit = %x80-ff
425              
426             When formatting a value, if a value is already a valid C<key> token, use it
427             without further quoting.
428              
429             =head2 Quoting a Unicode string
430              
431             It is preferable to build quoted values from a Unicode string, because it's
432             possible to know whether a given codepoint is a non-ASCII unsafe character,
433             like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
434             encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
435             with a standard terminal can show something like:
436              
437             user.name="Jürgen"
438              
439             To generate a C<quoted> from a Unicode string, for each codepoint:
440              
441             =over 4
442              
443             =item *
444              
445             convert C<\> to C<\\>
446              
447             =item *
448              
449             convert C<"> to C<\">
450              
451             =item *
452              
453             convert a newline (U+000A) to C<\n>
454              
455             =item *
456              
457             convert a carriage return (U+000D) to C<\r>
458              
459             =item *
460              
461             convert a character tabulation (U+0009) to C<\t>
462              
463             =item *
464              
465             for any control character (by general category) or vertical newline:
466              
467             =over 4
468              
469             =item *
470              
471             encode the character into a UTF-8 bytestring
472              
473             =item *
474              
475             convert each byte in the bytestring into C<\x{...}> form
476              
477             =item *
478              
479             use that sequence of C<\x{...}> codes in place of the replaced character
480              
481             =back
482              
483             =back
484              
485             Finally, UTF-8 encode the entire string and wrap it in double qoutes.
486              
487             B<This Perl implementation assumes that all string values to be encoded are
488             character strings!>
489              
490             =head3 Quoting a bytestring
491              
492             Encoding a Unicode string is preferable, but may not be practical. In those
493             cases when you have only a byte sequence, apply these steps.
494              
495             For each byte (using ASCII conventions):
496              
497             =over 4
498              
499             =item *
500              
501             convert C<\> to C<\\>
502              
503             =item *
504              
505             convert C<"> to C<\">
506              
507             =item *
508              
509             convert a newline (C<%0a>) to C<\n>
510              
511             =item *
512              
513             convert a carriage return (C<%0d>) to C<\r>
514              
515             =item *
516              
517             convert a character tabulation (C<%x09>) to C<\t>
518              
519             =item *
520              
521             convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
522              
523             =item *
524              
525             convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
526              
527             =back
528              
529             Finally, wrap the string in double quotes.
530              
531             =head3 Special values
532              
533             There are no reserved or sentinel values for undefined values, null pointers,
534             or the like. Convention is to use a single identifier surrounded by tildes to
535             indicate such a value. For example, given a null pointer for the C<user.name>
536             value, a system might log:
537              
538             user.name=~null~
539              
540             If C<~null~> was a valid username, there would be ambiguity here. The choice
541             of exact sentinels is left to the implementing system.
542              
543             =head1 AUTHOR
544              
545             Ricardo SIGNES <cpan@semiotic.systems>
546              
547             =head1 COPYRIGHT AND LICENSE
548              
549             This software is copyright (c) 2026 by Ricardo SIGNES.
550              
551             This is free software; you can redistribute it and/or modify it under
552             the same terms as the Perl 5 programming language system itself.
553              
554             =cut