File Coverage

blib/lib/Log/Fmt.pm
Criterion Covered Total %
statement 79 85 92.9
branch 33 38 86.8
condition 6 7 85.7
subroutine 14 14 100.0
pod 3 4 75.0
total 135 148 91.2


line stmt bran cond sub pod time code
1 7     7   98 use v5.20;
  7         26  
2 7     7   57 use warnings;
  7         14  
  7         544  
3             package Log::Fmt 3.013;
4             # ABSTRACT: a little parser and emitter of structured log lines
5              
6 7     7   42 use experimental 'postderef'; # Not dangerous. Is accepted without changed.
  7         14  
  7         53  
7              
8 7     7   5839 use Encode ();
  7         137464  
  7         373  
9 7     7   4051 use Params::Util qw(_ARRAY0 _HASH0 _CODELIKE);
  7         32476  
  7         770  
10 7     7   89 use Scalar::Util qw(refaddr);
  7         14  
  7         361  
11 7     7   3917 use String::Flogger ();
  7         98813  
  7         15715  
12              
13             #pod =head1 OVERVIEW
14             #pod
15             #pod This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
16             #pod methods. It converts an arrayref of key/value pairs to a string that a human
17             #pod can scan tolerably well, and which a machine can parse about as well. It can
18             #pod also do that tolerably-okay parsing for you.
19             #pod
20             #pod =head1 SPECIFICATION
21             #pod
22             #pod =head2 The logfmt text format
23             #pod
24             #pod Although quite a few tools exist for managing C<logfmt>, there is no spec-like
25             #pod document for it. Because you may require multiple implementations, a
26             #pod specification can be helpful.
27             #pod
28             #pod Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
29             #pod separated by a single space.
30             #pod
31             #pod event = pair *(WSP pair)
32             #pod pair = key "=" value
33             #pod okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
34             #pod key = 1*(okchr)
35             #pod value = key / quoted
36             #pod
37             #pod quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
38             #pod escaped = escaped-special / escaped-hex
39             #pod escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
40             #pod escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
41             #pod quoted-ok = SP / "="
42             #pod eightbit = %x80-ff
43             #pod
44             #pod When formatting a value, if a value is already a valid C<key> token, use it
45             #pod without further quoting.
46             #pod
47             #pod =head2 Quoting a Unicode string
48             #pod
49             #pod It is preferable to build quoted values from a Unicode string, because it's
50             #pod possible to know whether a given codepoint is a non-ASCII unsafe character,
51             #pod like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
52             #pod encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
53             #pod with a standard terminal can show something like:
54             #pod
55             #pod user.name="Jürgen"
56             #pod
57             #pod To generate a C<quoted> from a Unicode string, for each codepoint:
58             #pod
59             #pod =begin :list
60             #pod
61             #pod * convert C<\> to C<\\>
62             #pod * convert C<"> to C<\">
63             #pod * convert a newline (U+000A) to C<\n>
64             #pod * convert a carriage return (U+000D) to C<\r>
65             #pod * convert a character tabulation (U+0009) to C<\t>
66             #pod * for any control character (by general category) or vertical newline:
67             #pod
68             #pod =begin :list
69             #pod
70             #pod * encode the character into a UTF-8 bytestring
71             #pod * convert each byte in the bytestring into C<\x{...}> form
72             #pod * use that sequence of C<\x{...}> codes in place of the replaced character
73             #pod
74             #pod =end :list
75             #pod
76             #pod =end :list
77             #pod
78             #pod Finally, UTF-8 encode the entire string and wrap it in double qoutes.
79             #pod
80             #pod B<This Perl implementation assumes that all string values to be encoded are
81             #pod character strings!>
82             #pod
83             #pod =head3 Quoting a bytestring
84             #pod
85             #pod Encoding a Unicode string is preferable, but may not be practical. In those
86             #pod cases when you have only a byte sequence, apply these steps.
87             #pod
88             #pod For each byte (using ASCII conventions):
89             #pod
90             #pod =for :list
91             #pod * convert C<\> to C<\\>
92             #pod * convert C<"> to C<\">
93             #pod * convert a newline (C<%0a>) to C<\n>
94             #pod * convert a carriage return (C<%0d>) to C<\r>
95             #pod * convert a character tabulation (C<%x09>) to C<\t>
96             #pod * convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
97             #pod * convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
98             #pod
99             #pod Finally, wrap the string in double quotes.
100             #pod
101             #pod =cut
102              
103             #pod =method format_event_string
104             #pod
105             #pod my $octets = Log::Fmt->format_event_string([
106             #pod key1 => $value1,
107             #pod key2 => $value2,
108             #pod ]);
109             #pod
110             #pod Note especially that if any value to encode is a reference I<to a reference>,
111             #pod then String::Flogger is used to encode the referenced value. This means you
112             #pod can embed, in your logfmt, a JSON dump of a structure by passing a reference to
113             #pod the structure, instead of passing the structure itself.
114             #pod
115             #pod String values are assumed to be character strings, and will be UTF-8 encoded as
116             #pod part of the formatting process.
117             #pod
118             #pod =cut
119              
120             # okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
121             # key = 1*(okchr)
122             # value = key / quoted
123             my $KEY_RE = qr{[\x21\x23-\x3c\x3e-\x5b\x5d-\x7e]+};
124              
125             sub _escape_unprintable {
126 2     2   6 my ($chr) = @_;
127              
128             return join q{},
129 2         13 map {; sprintf '\\x{%02x}', ord }
  6         77  
130             split //, Encode::encode('utf-8', $chr, Encode::FB_DEFAULT);
131             }
132              
133             sub _quote_string {
134 9     9   26 my ($string) = @_;
135              
136 9         23 $string =~ s{\\}{\\\\}g;
137 9         27 $string =~ s{"}{\\"}g;
138 9         19 $string =~ s{\x09}{\\t}g;
139 9         17 $string =~ s{\x0A}{\\n}g;
140 9         17 $string =~ s{\x0D}{\\r}g;
141 9         34 $string =~ s{([\pC\v])}{_escape_unprintable($1)}ge;
  2         6  
142              
143 9         80 $string = Encode::encode('utf-8', $string, Encode::FB_DEFAULT);
144              
145 9         594 return qq{"$string"};
146             }
147              
148 2     2 0 21 sub string_flogger { 'String::Flogger' }
149              
150             sub _pairs_to_kvstr_aref {
151 41     41   102 my ($self, $aref, $seen, $prefix) = @_;
152              
153 41   100     145 $seen //= {};
154              
155 41         58 my @kvstrs;
156              
157 41         99 KEY: for (my $i = 0; $i < @$aref; $i += 2) {
158             # replace non-ident-safe chars with ?
159 86 100       257 my $key = length $aref->[$i] ? "$aref->[$i]" : '~';
160 86         141 $key =~ tr/\x21\x23-\x3C\x3E-\x7E/?/c;
161              
162             # If the prefix is "" you can end up with a pair like ".foo=1" which is
163             # weird but probably best. And that means you could end up with
164             # "foo..bar=1" which is also weird, but still probably for the best.
165 86 100       177 $key = "$prefix.$key" if defined $prefix;
166              
167 86         154 my $value = $aref->[$i+1];
168              
169 86 100       253 if (_CODELIKE $value) {
170 4         14 $value = $value->();
171             }
172              
173 86 100 100     601 if (ref $value && ref $value eq 'REF') {
174 2         9 $value = $self->string_flogger->flog([ '%s', $$value ]);
175             }
176              
177 86 50       462 if (! defined $value) {
    100          
178 0         0 $value = '~missing~';
179             } elsif (ref $value) {
180 22         39 my $refaddr = refaddr $value;
181              
182 22 100       104 if ($seen->{ $refaddr }) {
    100          
    100          
183 1         3 $value = $seen->{ $refaddr };
184             } elsif (_ARRAY0($value)) {
185 7         23 $seen->{ $refaddr } = "&$key";
186              
187             push @kvstrs, $self->_pairs_to_kvstr_aref(
188 7         43 [ map {; $_ => $value->[$_] } (0 .. $#$value) ],
  17         61  
189             $seen,
190             $key,
191             )->@*;
192              
193 7         35 next KEY;
194             } elsif (_HASH0($value)) {
195 6         25 $seen->{ $refaddr } = "&$key";
196              
197             push @kvstrs, $self->_pairs_to_kvstr_aref(
198 6         52 [ $value->%{ sort keys %$value } ],
199             $seen,
200             $key,
201             )->@*;
202              
203 6         30 next KEY;
204             } else {
205 8         59 $value = "$value"; # Meh.
206             }
207             }
208              
209 73 100       777 my $str = "$key="
210             . ($value =~ /\A$KEY_RE\z/
211             ? "$value"
212             : _quote_string($value));
213              
214 73         309 push @kvstrs, $str;
215             }
216              
217 41         153 return \@kvstrs;
218             }
219              
220             sub format_event_string {
221 1     1 1 32 my ($self, $aref) = @_;
222              
223 1         6 return join q{ }, $self->_pairs_to_kvstr_aref($aref)->@*;
224             }
225              
226             #pod =method parse_event_string
227             #pod
228             #pod my $kv_pairs = Log::Fmt->parse_event_string($octets);
229             #pod
230             #pod Given the kind of (byte) string emitted by C<format_event_string>, this method
231             #pod returns a reference to an array of key/value pairs. After being unquoted,
232             #pod value strings will be UTF-8 decoded into character strings.
233             #pod
234             #pod This isn't exactly a round trip. First off, the formatting can change illegal
235             #pod keys by replacing characters with question marks, or replacing empty strings
236             #pod with tildes. Secondly, the formatter will expand some values like arrayrefs
237             #pod and hashrefs into multiple keys, but the parser will not recombined those keys
238             #pod into structures. Also, there might be other asymmetric conversions. That
239             #pod said, the string escaping done by the formatter should correctly reverse.
240             #pod
241             #pod If the input string is badly formed, hunks that don't appear to be value
242             #pod key/value pairs will be presented as values for the key C<junk>.
243             #pod
244             #pod =cut
245              
246             sub parse_event_string {
247 9     9 1 9690 my ($self, $octets) = @_;
248              
249 9         18 my @result;
250              
251 9         30 HUNK: while (length $octets) {
252 26 100       356 if ($octets =~ s/\A($KEY_RE)=($KEY_RE)(?:\s+|\z)//) {
253 17         72 push @result, $1, $2;
254 17         50 next HUNK;
255             }
256              
257 9 50       267 if ($octets =~ s/\A($KEY_RE)="((\\\\|\\"|[^"])*?)"(?:\s+|\z)//) {
258 9         26 my $key = $1;
259 9         22 my $qstring = $2;
260              
261 9         54 $qstring =~ s{
262             ( \\\\ | \\["nrt] | (\\x)\{([[:xdigit:]]{2})\} )
263             }
264             {
265 19 50 50     147 $1 eq "\\\\" ? "\\"
    100          
    100          
    100          
    100          
    100          
266             : $1 eq "\\\"" ? q{"}
267             : $1 eq "\\n" ? qq{\n}
268             : $1 eq "\\r" ? qq{\r}
269             : $1 eq "\\t" ? qq{\t}
270             : ($2//'') eq "\\x" ? chr(hex("0x$3"))
271             : $1
272             }gex;
273              
274 9         70 push @result, $key, Encode::decode('utf-8', $qstring, Encode::FB_DEFAULT);
275 9         435 next HUNK;
276             }
277              
278 0 0       0 if ($octets =~ s/\A(\S+)(?:\s+|\z)//) {
279 0         0 push @result, 'junk', $1;
280 0         0 next HUNK;
281             }
282              
283             # I hope this is unreachable. -- rjbs, 2022-11-03
284 0         0 push (@result, 'junk', $octets, aborted => 1);
285 0         0 last HUNK;
286             }
287              
288 9         35 return \@result;
289             }
290              
291             #pod =method parse_event_string_as_hash
292             #pod
293             #pod my $hashref = Log::Fmt->parse_event_string_as_hash($line);
294             #pod
295             #pod This parses the given line as logfmt, then puts the key/value pairs into a hash
296             #pod and returns a reference to it.
297             #pod
298             #pod Because nothing prevents a single key from appearing more than once, you should
299             #pod use this with the understanding that data could be lost. No guarantee is made
300             #pod of which value will be preserved.
301             #pod
302             #pod =cut
303              
304             sub parse_event_string_as_hash {
305 1     1 1 10 my ($self, $octets) = @_;
306              
307 1         4 return { $self->parse_event_string($octets)->@* };
308             }
309              
310             1;
311              
312             __END__
313              
314             =pod
315              
316             =encoding UTF-8
317              
318             =head1 NAME
319              
320             Log::Fmt - a little parser and emitter of structured log lines
321              
322             =head1 VERSION
323              
324             version 3.013
325              
326             =head1 OVERVIEW
327              
328             This library primarily exists to service L<Log::Dispatchouli>'s C<log_event>
329             methods. It converts an arrayref of key/value pairs to a string that a human
330             can scan tolerably well, and which a machine can parse about as well. It can
331             also do that tolerably-okay parsing for you.
332              
333             =head1 PERL VERSION
334              
335             This library should run on perls released even a long time ago. It should
336             work on any version of perl released in the last five years.
337              
338             Although it may work on older versions of perl, no guarantee is made that the
339             minimum required version will not be increased. The version may be increased
340             for any reason, and there is no promise that patches will be accepted to
341             lower the minimum required perl.
342              
343             =head1 METHODS
344              
345             =head2 format_event_string
346              
347             my $octets = Log::Fmt->format_event_string([
348             key1 => $value1,
349             key2 => $value2,
350             ]);
351              
352             Note especially that if any value to encode is a reference I<to a reference>,
353             then String::Flogger is used to encode the referenced value. This means you
354             can embed, in your logfmt, a JSON dump of a structure by passing a reference to
355             the structure, instead of passing the structure itself.
356              
357             String values are assumed to be character strings, and will be UTF-8 encoded as
358             part of the formatting process.
359              
360             =head2 parse_event_string
361              
362             my $kv_pairs = Log::Fmt->parse_event_string($octets);
363              
364             Given the kind of (byte) string emitted by C<format_event_string>, this method
365             returns a reference to an array of key/value pairs. After being unquoted,
366             value strings will be UTF-8 decoded into character strings.
367              
368             This isn't exactly a round trip. First off, the formatting can change illegal
369             keys by replacing characters with question marks, or replacing empty strings
370             with tildes. Secondly, the formatter will expand some values like arrayrefs
371             and hashrefs into multiple keys, but the parser will not recombined those keys
372             into structures. Also, there might be other asymmetric conversions. That
373             said, the string escaping done by the formatter should correctly reverse.
374              
375             If the input string is badly formed, hunks that don't appear to be value
376             key/value pairs will be presented as values for the key C<junk>.
377              
378             =head2 parse_event_string_as_hash
379              
380             my $hashref = Log::Fmt->parse_event_string_as_hash($line);
381              
382             This parses the given line as logfmt, then puts the key/value pairs into a hash
383             and returns a reference to it.
384              
385             Because nothing prevents a single key from appearing more than once, you should
386             use this with the understanding that data could be lost. No guarantee is made
387             of which value will be preserved.
388              
389             =head1 SPECIFICATION
390              
391             =head2 The logfmt text format
392              
393             Although quite a few tools exist for managing C<logfmt>, there is no spec-like
394             document for it. Because you may require multiple implementations, a
395             specification can be helpful.
396              
397             Every logfmt event is a sequence of pairs in the form C<key=value>. Pairs are
398             separated by a single space.
399              
400             event = pair *(WSP pair)
401             pair = key "=" value
402             okchr = %x21 / %x23-3c / %x3e-5b / %x5d-7e ; VCHAR minus \ and " and =
403             key = 1*(okchr)
404             value = key / quoted
405              
406             quoted = DQUOTE *( escaped / quoted-ok / okchr / eightbit ) DQUOTE
407             escaped = escaped-special / escaped-hex
408             escaped-special = "\\" / "\n" / "\r" / "\t" / ("\" DQUOTE)
409             escaped-hex = "\x{" 2HEXDIG "}" ; lowercase forms okay also
410             quoted-ok = SP / "="
411             eightbit = %x80-ff
412              
413             When formatting a value, if a value is already a valid C<key> token, use it
414             without further quoting.
415              
416             =head2 Quoting a Unicode string
417              
418             It is preferable to build quoted values from a Unicode string, because it's
419             possible to know whether a given codepoint is a non-ASCII unsafe character,
420             like C<LINE SEPARATOR>. Safe non-ASCII characters can be directly UTF-8
421             encoded, rather than quoted with C<\x{...}>. In that way, viewing logfmt events
422             with a standard terminal can show something like:
423              
424             user.name="Jürgen"
425              
426             To generate a C<quoted> from a Unicode string, for each codepoint:
427              
428             =over 4
429              
430             =item *
431              
432             convert C<\> to C<\\>
433              
434             =item *
435              
436             convert C<"> to C<\">
437              
438             =item *
439              
440             convert a newline (U+000A) to C<\n>
441              
442             =item *
443              
444             convert a carriage return (U+000D) to C<\r>
445              
446             =item *
447              
448             convert a character tabulation (U+0009) to C<\t>
449              
450             =item *
451              
452             for any control character (by general category) or vertical newline:
453              
454             =over 4
455              
456             =item *
457              
458             encode the character into a UTF-8 bytestring
459              
460             =item *
461              
462             convert each byte in the bytestring into C<\x{...}> form
463              
464             =item *
465              
466             use that sequence of C<\x{...}> codes in place of the replaced character
467              
468             =back
469              
470             =back
471              
472             Finally, UTF-8 encode the entire string and wrap it in double qoutes.
473              
474             B<This Perl implementation assumes that all string values to be encoded are
475             character strings!>
476              
477             =head3 Quoting a bytestring
478              
479             Encoding a Unicode string is preferable, but may not be practical. In those
480             cases when you have only a byte sequence, apply these steps.
481              
482             For each byte (using ASCII conventions):
483              
484             =over 4
485              
486             =item *
487              
488             convert C<\> to C<\\>
489              
490             =item *
491              
492             convert C<"> to C<\">
493              
494             =item *
495              
496             convert a newline (C<%0a>) to C<\n>
497              
498             =item *
499              
500             convert a carriage return (C<%0d>) to C<\r>
501              
502             =item *
503              
504             convert a character tabulation (C<%x09>) to C<\t>
505              
506             =item *
507              
508             convert any control character (C<%x00-1f / %x7f>) to the C<\x{...}> form
509              
510             =item *
511              
512             convert any non-ASCII byte (C<%x80-ff>) to the C<\x{...}> form
513              
514             =back
515              
516             Finally, wrap the string in double quotes.
517              
518             =head1 AUTHOR
519              
520             Ricardo SIGNES <cpan@semiotic.systems>
521              
522             =head1 COPYRIGHT AND LICENSE
523              
524             This software is copyright (c) 2025 by Ricardo SIGNES.
525              
526             This is free software; you can redistribute it and/or modify it under
527             the same terms as the Perl 5 programming language system itself.
528              
529             =cut