File Coverage

lib/DMS/Parser.pm
Criterion Covered Total %
statement 939 1771 53.0
branch 374 1008 37.1
condition 149 555 26.8
subroutine 77 131 58.7
pod 5 43 11.6
total 1544 3508 44.0


line stmt bran cond sub pod time code
1             package DMS::Parser;
2             # DMS parser — Perl port of the Rust reference.
3             # Public (SPEC v0.14): DMS::Parser::decode($src) -> hashref/arrayref/scalar/blessed-DT.
4             # The legacy spelling DMS::Parser::parse($src) is retained as a deprecated
5             # alias and emits a one-time Carp::carp warning per process. SPEC §Decode/Encode.
6              
7 3     3   367385 use strict;
  3         7  
  3         111  
8 3     3   32 use warnings;
  3         5  
  3         161  
9 3     3   1557 use utf8;
  3         956  
  3         18  
10 3     3   97 use Carp ();
  3         8  
  3         91869  
11             # Tie::IxHash is loaded lazily (only when non-lite mode actually
12             # constructs a tied table) — saves ~30ms of startup and avoids the
13             # import entirely on the lite/encoder hot path.
14             # Several historical imports (POSIX qw(floor), Math::BigInt, Encode)
15             # have been removed: floor is unused; integer parsing now hand-rolls
16             # i64 on native IVs (no BigInt); UTF-8 work goes through utf8::is_utf8
17             # / utf8::decode core builtins (no Encode). Importing those modules
18             # only at need keeps DMS::Parser load time low — material on small
19             # CLI invocations like the conformance encoder.
20             # Unicode::Normalize is loaded lazily — pure-ASCII source short-circuits
21             # the NFC pass, so most CLI invocations never need the module at all.
22 0     0   0 sub _NFC { require Unicode::Normalize; *_NFC = \&Unicode::Normalize::NFC; goto &_NFC }
  0         0  
  0         0  
23              
24             our $VERSION = '0.5.3';
25              
26             # Capability flag — this port ships lite-mode decode + lite-mode encode_lite.
27             # See SPEC §Parsing modes — full and lite.
28             our $SUPPORTS_LITE_MODE = 1;
29              
30             # Capability flag — this port ships unordered-table parse mode.
31             # See SPEC §Unordered tables.
32             our $SUPPORTS_IGNORE_ORDER = 1;
33              
34             # Datetime sentinel classes -----------------------------------------------
35             # All typed-scalar sentinels are blessed *scalar refs* rather than blessed
36             # hashrefs. That's one allocation instead of three (HV + hv_entry + RV)
37             # per value — noticeable on wide-flat documents where every leaf is a
38             # typed scalar. Public accessor API (->value, ->bstr, ->is_neg) unchanged.
39 0     0   0 package DMS::Parser::LocalDate; sub new { my $v = "$_[1]"; bless \$v, $_[0] } sub value { ${ $_[0] } }
  0     0   0  
  0         0  
  0         0  
40 0     0   0 package DMS::Parser::LocalTime; sub new { my $v = "$_[1]"; bless \$v, $_[0] } sub value { ${ $_[0] } }
  0     0   0  
  0         0  
  0         0  
41 0     0   0 package DMS::Parser::LocalDateTime; sub new { my $v = "$_[1]"; bless \$v, $_[0] } sub value { ${ $_[0] } }
  0     0   0  
  0         0  
  0         0  
42 0     0   0 package DMS::Parser::OffsetDateTime; sub new { my $v = "$_[1]"; bless \$v, $_[0] } sub value { ${ $_[0] } }
  0     0   0  
  0         0  
  0         0  
43 0     0   0 package DMS::Parser::Float; sub new { my $v = 0 + $_[1]; bless \$v, $_[0] } sub value { ${ $_[0] } }
  0     0   0  
  0         0  
  0         0  
44             # DMS::Parser::Integer holds the canonical decimal representation. On 64-bit Perl
45             # it's stored as a native IV so the parser doesn't pay a string-format
46             # cost just to build it. `value()` returns the object itself so callers
47             # can chain `->bstr` / `->is_neg` — the public API matches what was
48             # previously a Math::BigInt instance (`$int->value->bstr` still works).
49             package DMS::Parser::Integer;
50 27     27   124 sub new { my $v = 0 + $_[1]; bless \$v, $_[0] }
  27         108  
51 0     0   0 sub value { $_[0] }
52 30     30   50 sub bstr { "${ $_[0] }" } # force stringification of the IV
  30         116  
53 0     0   0 sub is_neg { ${ $_[0] } < 0 }
  0         0  
54 0 0   0   0 package DMS::Parser::Bool; sub new { my $v = $_[1]?1:0; bless \$v, $_[0] } sub value { ${ $_[0] } }
  0     0   0  
  0         0  
  0         0  
55              
56             # Path-segment marker class used inside attached-comment paths to
57             # distinguish list-index segments (DMS::Parser::Index) from string-keyed table
58             # segments (plain Perl scalars). Mirrors Rust's BreadcrumbSegment::Index.
59             package DMS::Parser::Index;
60 22     22   42 sub new { my $v = 0 + $_[1]; bless \$v, $_[0] }
  22         70  
61 1     1   7016 sub value { ${ $_[0] } }
  1         9  
62              
63             # SPEC §"Unordered tables". Marker class for body tables produced by the
64             # *_unordered parser entry points. Underlying storage is a plain Perl
65             # hashref (no Tie::IxHash, no `\0_keys` sidecar) — iteration order is
66             # arbitrary per Perl's hash randomization. Mirrors Rust's
67             # Value::UnorderedTable. `to_dms` (full mode) refuses to round-trip a
68             # Document containing this variant; `to_dms_lite` accepts it.
69             package DMS::Parser::UnorderedTable;
70             # Construct from a plain hashref. Caller hands ownership of the hash.
71             sub new {
72 0     0   0 my ($class, $h) = @_;
73 0 0       0 $h = {} unless defined $h;
74 0         0 return bless $h, $class;
75             }
76              
77             package DMS::Parser;
78              
79             # UAX #31 §2 default identifier syntax frozen at Unicode 15.1.
80             # Sorted, non-overlapping ranges: XID_Continue \ Default_Ignorable_Code_Point
81             # (per SPEC: every parser must use this same frozen snapshot — relying on
82             # Perl's `\p{XID_Continue}` would track the host's Unicode data and accept
83             # newly-assigned codepoints DMS does not know about). 772 ranges total.
84             our @XID_CONTINUE_RANGES = (
85             [0x00AA, 0x00AA],
86             [0x00B5, 0x00B5],
87             [0x00B7, 0x00B7],
88             [0x00BA, 0x00BA],
89             [0x00C0, 0x00D6],
90             [0x00D8, 0x00F6],
91             [0x00F8, 0x02C1],
92             [0x02C6, 0x02D1],
93             [0x02E0, 0x02E4],
94             [0x02EC, 0x02EC],
95             [0x02EE, 0x02EE],
96             [0x0300, 0x034E],
97             [0x0350, 0x0374],
98             [0x0376, 0x0377],
99             [0x037B, 0x037D],
100             [0x037F, 0x037F],
101             [0x0386, 0x038A],
102             [0x038C, 0x038C],
103             [0x038E, 0x03A1],
104             [0x03A3, 0x03F5],
105             [0x03F7, 0x0481],
106             [0x0483, 0x0487],
107             [0x048A, 0x052F],
108             [0x0531, 0x0556],
109             [0x0559, 0x0559],
110             [0x0560, 0x0588],
111             [0x0591, 0x05BD],
112             [0x05BF, 0x05BF],
113             [0x05C1, 0x05C2],
114             [0x05C4, 0x05C5],
115             [0x05C7, 0x05C7],
116             [0x05D0, 0x05EA],
117             [0x05EF, 0x05F2],
118             [0x0610, 0x061A],
119             [0x0620, 0x0669],
120             [0x066E, 0x06D3],
121             [0x06D5, 0x06DC],
122             [0x06DF, 0x06E8],
123             [0x06EA, 0x06FC],
124             [0x06FF, 0x06FF],
125             [0x0710, 0x074A],
126             [0x074D, 0x07B1],
127             [0x07C0, 0x07F5],
128             [0x07FA, 0x07FA],
129             [0x07FD, 0x07FD],
130             [0x0800, 0x082D],
131             [0x0840, 0x085B],
132             [0x0860, 0x086A],
133             [0x0870, 0x0887],
134             [0x0889, 0x088E],
135             [0x0898, 0x08E1],
136             [0x08E3, 0x0963],
137             [0x0966, 0x096F],
138             [0x0971, 0x0983],
139             [0x0985, 0x098C],
140             [0x098F, 0x0990],
141             [0x0993, 0x09A8],
142             [0x09AA, 0x09B0],
143             [0x09B2, 0x09B2],
144             [0x09B6, 0x09B9],
145             [0x09BC, 0x09C4],
146             [0x09C7, 0x09C8],
147             [0x09CB, 0x09CE],
148             [0x09D7, 0x09D7],
149             [0x09DC, 0x09DD],
150             [0x09DF, 0x09E3],
151             [0x09E6, 0x09F1],
152             [0x09FC, 0x09FC],
153             [0x09FE, 0x09FE],
154             [0x0A01, 0x0A03],
155             [0x0A05, 0x0A0A],
156             [0x0A0F, 0x0A10],
157             [0x0A13, 0x0A28],
158             [0x0A2A, 0x0A30],
159             [0x0A32, 0x0A33],
160             [0x0A35, 0x0A36],
161             [0x0A38, 0x0A39],
162             [0x0A3C, 0x0A3C],
163             [0x0A3E, 0x0A42],
164             [0x0A47, 0x0A48],
165             [0x0A4B, 0x0A4D],
166             [0x0A51, 0x0A51],
167             [0x0A59, 0x0A5C],
168             [0x0A5E, 0x0A5E],
169             [0x0A66, 0x0A75],
170             [0x0A81, 0x0A83],
171             [0x0A85, 0x0A8D],
172             [0x0A8F, 0x0A91],
173             [0x0A93, 0x0AA8],
174             [0x0AAA, 0x0AB0],
175             [0x0AB2, 0x0AB3],
176             [0x0AB5, 0x0AB9],
177             [0x0ABC, 0x0AC5],
178             [0x0AC7, 0x0AC9],
179             [0x0ACB, 0x0ACD],
180             [0x0AD0, 0x0AD0],
181             [0x0AE0, 0x0AE3],
182             [0x0AE6, 0x0AEF],
183             [0x0AF9, 0x0AFF],
184             [0x0B01, 0x0B03],
185             [0x0B05, 0x0B0C],
186             [0x0B0F, 0x0B10],
187             [0x0B13, 0x0B28],
188             [0x0B2A, 0x0B30],
189             [0x0B32, 0x0B33],
190             [0x0B35, 0x0B39],
191             [0x0B3C, 0x0B44],
192             [0x0B47, 0x0B48],
193             [0x0B4B, 0x0B4D],
194             [0x0B55, 0x0B57],
195             [0x0B5C, 0x0B5D],
196             [0x0B5F, 0x0B63],
197             [0x0B66, 0x0B6F],
198             [0x0B71, 0x0B71],
199             [0x0B82, 0x0B83],
200             [0x0B85, 0x0B8A],
201             [0x0B8E, 0x0B90],
202             [0x0B92, 0x0B95],
203             [0x0B99, 0x0B9A],
204             [0x0B9C, 0x0B9C],
205             [0x0B9E, 0x0B9F],
206             [0x0BA3, 0x0BA4],
207             [0x0BA8, 0x0BAA],
208             [0x0BAE, 0x0BB9],
209             [0x0BBE, 0x0BC2],
210             [0x0BC6, 0x0BC8],
211             [0x0BCA, 0x0BCD],
212             [0x0BD0, 0x0BD0],
213             [0x0BD7, 0x0BD7],
214             [0x0BE6, 0x0BEF],
215             [0x0C00, 0x0C0C],
216             [0x0C0E, 0x0C10],
217             [0x0C12, 0x0C28],
218             [0x0C2A, 0x0C39],
219             [0x0C3C, 0x0C44],
220             [0x0C46, 0x0C48],
221             [0x0C4A, 0x0C4D],
222             [0x0C55, 0x0C56],
223             [0x0C58, 0x0C5A],
224             [0x0C5D, 0x0C5D],
225             [0x0C60, 0x0C63],
226             [0x0C66, 0x0C6F],
227             [0x0C80, 0x0C83],
228             [0x0C85, 0x0C8C],
229             [0x0C8E, 0x0C90],
230             [0x0C92, 0x0CA8],
231             [0x0CAA, 0x0CB3],
232             [0x0CB5, 0x0CB9],
233             [0x0CBC, 0x0CC4],
234             [0x0CC6, 0x0CC8],
235             [0x0CCA, 0x0CCD],
236             [0x0CD5, 0x0CD6],
237             [0x0CDD, 0x0CDE],
238             [0x0CE0, 0x0CE3],
239             [0x0CE6, 0x0CEF],
240             [0x0CF1, 0x0CF3],
241             [0x0D00, 0x0D0C],
242             [0x0D0E, 0x0D10],
243             [0x0D12, 0x0D44],
244             [0x0D46, 0x0D48],
245             [0x0D4A, 0x0D4E],
246             [0x0D54, 0x0D57],
247             [0x0D5F, 0x0D63],
248             [0x0D66, 0x0D6F],
249             [0x0D7A, 0x0D7F],
250             [0x0D81, 0x0D83],
251             [0x0D85, 0x0D96],
252             [0x0D9A, 0x0DB1],
253             [0x0DB3, 0x0DBB],
254             [0x0DBD, 0x0DBD],
255             [0x0DC0, 0x0DC6],
256             [0x0DCA, 0x0DCA],
257             [0x0DCF, 0x0DD4],
258             [0x0DD6, 0x0DD6],
259             [0x0DD8, 0x0DDF],
260             [0x0DE6, 0x0DEF],
261             [0x0DF2, 0x0DF3],
262             [0x0E01, 0x0E3A],
263             [0x0E40, 0x0E4E],
264             [0x0E50, 0x0E59],
265             [0x0E81, 0x0E82],
266             [0x0E84, 0x0E84],
267             [0x0E86, 0x0E8A],
268             [0x0E8C, 0x0EA3],
269             [0x0EA5, 0x0EA5],
270             [0x0EA7, 0x0EBD],
271             [0x0EC0, 0x0EC4],
272             [0x0EC6, 0x0EC6],
273             [0x0EC8, 0x0ECE],
274             [0x0ED0, 0x0ED9],
275             [0x0EDC, 0x0EDF],
276             [0x0F00, 0x0F00],
277             [0x0F18, 0x0F19],
278             [0x0F20, 0x0F29],
279             [0x0F35, 0x0F35],
280             [0x0F37, 0x0F37],
281             [0x0F39, 0x0F39],
282             [0x0F3E, 0x0F47],
283             [0x0F49, 0x0F6C],
284             [0x0F71, 0x0F84],
285             [0x0F86, 0x0F97],
286             [0x0F99, 0x0FBC],
287             [0x0FC6, 0x0FC6],
288             [0x1000, 0x1049],
289             [0x1050, 0x109D],
290             [0x10A0, 0x10C5],
291             [0x10C7, 0x10C7],
292             [0x10CD, 0x10CD],
293             [0x10D0, 0x10FA],
294             [0x10FC, 0x115E],
295             [0x1161, 0x1248],
296             [0x124A, 0x124D],
297             [0x1250, 0x1256],
298             [0x1258, 0x1258],
299             [0x125A, 0x125D],
300             [0x1260, 0x1288],
301             [0x128A, 0x128D],
302             [0x1290, 0x12B0],
303             [0x12B2, 0x12B5],
304             [0x12B8, 0x12BE],
305             [0x12C0, 0x12C0],
306             [0x12C2, 0x12C5],
307             [0x12C8, 0x12D6],
308             [0x12D8, 0x1310],
309             [0x1312, 0x1315],
310             [0x1318, 0x135A],
311             [0x135D, 0x135F],
312             [0x1369, 0x1371],
313             [0x1380, 0x138F],
314             [0x13A0, 0x13F5],
315             [0x13F8, 0x13FD],
316             [0x1401, 0x166C],
317             [0x166F, 0x167F],
318             [0x1681, 0x169A],
319             [0x16A0, 0x16EA],
320             [0x16EE, 0x16F8],
321             [0x1700, 0x1715],
322             [0x171F, 0x1734],
323             [0x1740, 0x1753],
324             [0x1760, 0x176C],
325             [0x176E, 0x1770],
326             [0x1772, 0x1773],
327             [0x1780, 0x17B3],
328             [0x17B6, 0x17D3],
329             [0x17D7, 0x17D7],
330             [0x17DC, 0x17DD],
331             [0x17E0, 0x17E9],
332             [0x1810, 0x1819],
333             [0x1820, 0x1878],
334             [0x1880, 0x18AA],
335             [0x18B0, 0x18F5],
336             [0x1900, 0x191E],
337             [0x1920, 0x192B],
338             [0x1930, 0x193B],
339             [0x1946, 0x196D],
340             [0x1970, 0x1974],
341             [0x1980, 0x19AB],
342             [0x19B0, 0x19C9],
343             [0x19D0, 0x19DA],
344             [0x1A00, 0x1A1B],
345             [0x1A20, 0x1A5E],
346             [0x1A60, 0x1A7C],
347             [0x1A7F, 0x1A89],
348             [0x1A90, 0x1A99],
349             [0x1AA7, 0x1AA7],
350             [0x1AB0, 0x1ABD],
351             [0x1ABF, 0x1ACE],
352             [0x1B00, 0x1B4C],
353             [0x1B50, 0x1B59],
354             [0x1B6B, 0x1B73],
355             [0x1B80, 0x1BF3],
356             [0x1C00, 0x1C37],
357             [0x1C40, 0x1C49],
358             [0x1C4D, 0x1C7D],
359             [0x1C80, 0x1C88],
360             [0x1C90, 0x1CBA],
361             [0x1CBD, 0x1CBF],
362             [0x1CD0, 0x1CD2],
363             [0x1CD4, 0x1CFA],
364             [0x1D00, 0x1F15],
365             [0x1F18, 0x1F1D],
366             [0x1F20, 0x1F45],
367             [0x1F48, 0x1F4D],
368             [0x1F50, 0x1F57],
369             [0x1F59, 0x1F59],
370             [0x1F5B, 0x1F5B],
371             [0x1F5D, 0x1F5D],
372             [0x1F5F, 0x1F7D],
373             [0x1F80, 0x1FB4],
374             [0x1FB6, 0x1FBC],
375             [0x1FBE, 0x1FBE],
376             [0x1FC2, 0x1FC4],
377             [0x1FC6, 0x1FCC],
378             [0x1FD0, 0x1FD3],
379             [0x1FD6, 0x1FDB],
380             [0x1FE0, 0x1FEC],
381             [0x1FF2, 0x1FF4],
382             [0x1FF6, 0x1FFC],
383             [0x203F, 0x2040],
384             [0x2054, 0x2054],
385             [0x2071, 0x2071],
386             [0x207F, 0x207F],
387             [0x2090, 0x209C],
388             [0x20D0, 0x20DC],
389             [0x20E1, 0x20E1],
390             [0x20E5, 0x20F0],
391             [0x2102, 0x2102],
392             [0x2107, 0x2107],
393             [0x210A, 0x2113],
394             [0x2115, 0x2115],
395             [0x2118, 0x211D],
396             [0x2124, 0x2124],
397             [0x2126, 0x2126],
398             [0x2128, 0x2128],
399             [0x212A, 0x2139],
400             [0x213C, 0x213F],
401             [0x2145, 0x2149],
402             [0x214E, 0x214E],
403             [0x2160, 0x2188],
404             [0x2C00, 0x2CE4],
405             [0x2CEB, 0x2CF3],
406             [0x2D00, 0x2D25],
407             [0x2D27, 0x2D27],
408             [0x2D2D, 0x2D2D],
409             [0x2D30, 0x2D67],
410             [0x2D6F, 0x2D6F],
411             [0x2D7F, 0x2D96],
412             [0x2DA0, 0x2DA6],
413             [0x2DA8, 0x2DAE],
414             [0x2DB0, 0x2DB6],
415             [0x2DB8, 0x2DBE],
416             [0x2DC0, 0x2DC6],
417             [0x2DC8, 0x2DCE],
418             [0x2DD0, 0x2DD6],
419             [0x2DD8, 0x2DDE],
420             [0x2DE0, 0x2DFF],
421             [0x3005, 0x3007],
422             [0x3021, 0x302F],
423             [0x3031, 0x3035],
424             [0x3038, 0x303C],
425             [0x3041, 0x3096],
426             [0x3099, 0x309A],
427             [0x309D, 0x309F],
428             [0x30A1, 0x30FF],
429             [0x3105, 0x312F],
430             [0x3131, 0x3163],
431             [0x3165, 0x318E],
432             [0x31A0, 0x31BF],
433             [0x31F0, 0x31FF],
434             [0x3400, 0x4DBF],
435             [0x4E00, 0xA48C],
436             [0xA4D0, 0xA4FD],
437             [0xA500, 0xA60C],
438             [0xA610, 0xA62B],
439             [0xA640, 0xA66F],
440             [0xA674, 0xA67D],
441             [0xA67F, 0xA6F1],
442             [0xA717, 0xA71F],
443             [0xA722, 0xA788],
444             [0xA78B, 0xA7CA],
445             [0xA7D0, 0xA7D1],
446             [0xA7D3, 0xA7D3],
447             [0xA7D5, 0xA7D9],
448             [0xA7F2, 0xA827],
449             [0xA82C, 0xA82C],
450             [0xA840, 0xA873],
451             [0xA880, 0xA8C5],
452             [0xA8D0, 0xA8D9],
453             [0xA8E0, 0xA8F7],
454             [0xA8FB, 0xA8FB],
455             [0xA8FD, 0xA92D],
456             [0xA930, 0xA953],
457             [0xA960, 0xA97C],
458             [0xA980, 0xA9C0],
459             [0xA9CF, 0xA9D9],
460             [0xA9E0, 0xA9FE],
461             [0xAA00, 0xAA36],
462             [0xAA40, 0xAA4D],
463             [0xAA50, 0xAA59],
464             [0xAA60, 0xAA76],
465             [0xAA7A, 0xAAC2],
466             [0xAADB, 0xAADD],
467             [0xAAE0, 0xAAEF],
468             [0xAAF2, 0xAAF6],
469             [0xAB01, 0xAB06],
470             [0xAB09, 0xAB0E],
471             [0xAB11, 0xAB16],
472             [0xAB20, 0xAB26],
473             [0xAB28, 0xAB2E],
474             [0xAB30, 0xAB5A],
475             [0xAB5C, 0xAB69],
476             [0xAB70, 0xABEA],
477             [0xABEC, 0xABED],
478             [0xABF0, 0xABF9],
479             [0xAC00, 0xD7A3],
480             [0xD7B0, 0xD7C6],
481             [0xD7CB, 0xD7FB],
482             [0xF900, 0xFA6D],
483             [0xFA70, 0xFAD9],
484             [0xFB00, 0xFB06],
485             [0xFB13, 0xFB17],
486             [0xFB1D, 0xFB28],
487             [0xFB2A, 0xFB36],
488             [0xFB38, 0xFB3C],
489             [0xFB3E, 0xFB3E],
490             [0xFB40, 0xFB41],
491             [0xFB43, 0xFB44],
492             [0xFB46, 0xFBB1],
493             [0xFBD3, 0xFC5D],
494             [0xFC64, 0xFD3D],
495             [0xFD50, 0xFD8F],
496             [0xFD92, 0xFDC7],
497             [0xFDF0, 0xFDF9],
498             [0xFE20, 0xFE2F],
499             [0xFE33, 0xFE34],
500             [0xFE4D, 0xFE4F],
501             [0xFE71, 0xFE71],
502             [0xFE73, 0xFE73],
503             [0xFE77, 0xFE77],
504             [0xFE79, 0xFE79],
505             [0xFE7B, 0xFE7B],
506             [0xFE7D, 0xFE7D],
507             [0xFE7F, 0xFEFC],
508             [0xFF10, 0xFF19],
509             [0xFF21, 0xFF3A],
510             [0xFF3F, 0xFF3F],
511             [0xFF41, 0xFF5A],
512             [0xFF65, 0xFF9F],
513             [0xFFA1, 0xFFBE],
514             [0xFFC2, 0xFFC7],
515             [0xFFCA, 0xFFCF],
516             [0xFFD2, 0xFFD7],
517             [0xFFDA, 0xFFDC],
518             [0x10000, 0x1000B],
519             [0x1000D, 0x10026],
520             [0x10028, 0x1003A],
521             [0x1003C, 0x1003D],
522             [0x1003F, 0x1004D],
523             [0x10050, 0x1005D],
524             [0x10080, 0x100FA],
525             [0x10140, 0x10174],
526             [0x101FD, 0x101FD],
527             [0x10280, 0x1029C],
528             [0x102A0, 0x102D0],
529             [0x102E0, 0x102E0],
530             [0x10300, 0x1031F],
531             [0x1032D, 0x1034A],
532             [0x10350, 0x1037A],
533             [0x10380, 0x1039D],
534             [0x103A0, 0x103C3],
535             [0x103C8, 0x103CF],
536             [0x103D1, 0x103D5],
537             [0x10400, 0x1049D],
538             [0x104A0, 0x104A9],
539             [0x104B0, 0x104D3],
540             [0x104D8, 0x104FB],
541             [0x10500, 0x10527],
542             [0x10530, 0x10563],
543             [0x10570, 0x1057A],
544             [0x1057C, 0x1058A],
545             [0x1058C, 0x10592],
546             [0x10594, 0x10595],
547             [0x10597, 0x105A1],
548             [0x105A3, 0x105B1],
549             [0x105B3, 0x105B9],
550             [0x105BB, 0x105BC],
551             [0x10600, 0x10736],
552             [0x10740, 0x10755],
553             [0x10760, 0x10767],
554             [0x10780, 0x10785],
555             [0x10787, 0x107B0],
556             [0x107B2, 0x107BA],
557             [0x10800, 0x10805],
558             [0x10808, 0x10808],
559             [0x1080A, 0x10835],
560             [0x10837, 0x10838],
561             [0x1083C, 0x1083C],
562             [0x1083F, 0x10855],
563             [0x10860, 0x10876],
564             [0x10880, 0x1089E],
565             [0x108E0, 0x108F2],
566             [0x108F4, 0x108F5],
567             [0x10900, 0x10915],
568             [0x10920, 0x10939],
569             [0x10980, 0x109B7],
570             [0x109BE, 0x109BF],
571             [0x10A00, 0x10A03],
572             [0x10A05, 0x10A06],
573             [0x10A0C, 0x10A13],
574             [0x10A15, 0x10A17],
575             [0x10A19, 0x10A35],
576             [0x10A38, 0x10A3A],
577             [0x10A3F, 0x10A3F],
578             [0x10A60, 0x10A7C],
579             [0x10A80, 0x10A9C],
580             [0x10AC0, 0x10AC7],
581             [0x10AC9, 0x10AE6],
582             [0x10B00, 0x10B35],
583             [0x10B40, 0x10B55],
584             [0x10B60, 0x10B72],
585             [0x10B80, 0x10B91],
586             [0x10C00, 0x10C48],
587             [0x10C80, 0x10CB2],
588             [0x10CC0, 0x10CF2],
589             [0x10D00, 0x10D27],
590             [0x10D30, 0x10D39],
591             [0x10E80, 0x10EA9],
592             [0x10EAB, 0x10EAC],
593             [0x10EB0, 0x10EB1],
594             [0x10EFD, 0x10F1C],
595             [0x10F27, 0x10F27],
596             [0x10F30, 0x10F50],
597             [0x10F70, 0x10F85],
598             [0x10FB0, 0x10FC4],
599             [0x10FE0, 0x10FF6],
600             [0x11000, 0x11046],
601             [0x11066, 0x11075],
602             [0x1107F, 0x110BA],
603             [0x110C2, 0x110C2],
604             [0x110D0, 0x110E8],
605             [0x110F0, 0x110F9],
606             [0x11100, 0x11134],
607             [0x11136, 0x1113F],
608             [0x11144, 0x11147],
609             [0x11150, 0x11173],
610             [0x11176, 0x11176],
611             [0x11180, 0x111C4],
612             [0x111C9, 0x111CC],
613             [0x111CE, 0x111DA],
614             [0x111DC, 0x111DC],
615             [0x11200, 0x11211],
616             [0x11213, 0x11237],
617             [0x1123E, 0x11241],
618             [0x11280, 0x11286],
619             [0x11288, 0x11288],
620             [0x1128A, 0x1128D],
621             [0x1128F, 0x1129D],
622             [0x1129F, 0x112A8],
623             [0x112B0, 0x112EA],
624             [0x112F0, 0x112F9],
625             [0x11300, 0x11303],
626             [0x11305, 0x1130C],
627             [0x1130F, 0x11310],
628             [0x11313, 0x11328],
629             [0x1132A, 0x11330],
630             [0x11332, 0x11333],
631             [0x11335, 0x11339],
632             [0x1133B, 0x11344],
633             [0x11347, 0x11348],
634             [0x1134B, 0x1134D],
635             [0x11350, 0x11350],
636             [0x11357, 0x11357],
637             [0x1135D, 0x11363],
638             [0x11366, 0x1136C],
639             [0x11370, 0x11374],
640             [0x11400, 0x1144A],
641             [0x11450, 0x11459],
642             [0x1145E, 0x11461],
643             [0x11480, 0x114C5],
644             [0x114C7, 0x114C7],
645             [0x114D0, 0x114D9],
646             [0x11580, 0x115B5],
647             [0x115B8, 0x115C0],
648             [0x115D8, 0x115DD],
649             [0x11600, 0x11640],
650             [0x11644, 0x11644],
651             [0x11650, 0x11659],
652             [0x11680, 0x116B8],
653             [0x116C0, 0x116C9],
654             [0x11700, 0x1171A],
655             [0x1171D, 0x1172B],
656             [0x11730, 0x11739],
657             [0x11740, 0x11746],
658             [0x11800, 0x1183A],
659             [0x118A0, 0x118E9],
660             [0x118FF, 0x11906],
661             [0x11909, 0x11909],
662             [0x1190C, 0x11913],
663             [0x11915, 0x11916],
664             [0x11918, 0x11935],
665             [0x11937, 0x11938],
666             [0x1193B, 0x11943],
667             [0x11950, 0x11959],
668             [0x119A0, 0x119A7],
669             [0x119AA, 0x119D7],
670             [0x119DA, 0x119E1],
671             [0x119E3, 0x119E4],
672             [0x11A00, 0x11A3E],
673             [0x11A47, 0x11A47],
674             [0x11A50, 0x11A99],
675             [0x11A9D, 0x11A9D],
676             [0x11AB0, 0x11AF8],
677             [0x11C00, 0x11C08],
678             [0x11C0A, 0x11C36],
679             [0x11C38, 0x11C40],
680             [0x11C50, 0x11C59],
681             [0x11C72, 0x11C8F],
682             [0x11C92, 0x11CA7],
683             [0x11CA9, 0x11CB6],
684             [0x11D00, 0x11D06],
685             [0x11D08, 0x11D09],
686             [0x11D0B, 0x11D36],
687             [0x11D3A, 0x11D3A],
688             [0x11D3C, 0x11D3D],
689             [0x11D3F, 0x11D47],
690             [0x11D50, 0x11D59],
691             [0x11D60, 0x11D65],
692             [0x11D67, 0x11D68],
693             [0x11D6A, 0x11D8E],
694             [0x11D90, 0x11D91],
695             [0x11D93, 0x11D98],
696             [0x11DA0, 0x11DA9],
697             [0x11EE0, 0x11EF6],
698             [0x11F00, 0x11F10],
699             [0x11F12, 0x11F3A],
700             [0x11F3E, 0x11F42],
701             [0x11F50, 0x11F59],
702             [0x11FB0, 0x11FB0],
703             [0x12000, 0x12399],
704             [0x12400, 0x1246E],
705             [0x12480, 0x12543],
706             [0x12F90, 0x12FF0],
707             [0x13000, 0x1342F],
708             [0x13440, 0x13455],
709             [0x14400, 0x14646],
710             [0x16800, 0x16A38],
711             [0x16A40, 0x16A5E],
712             [0x16A60, 0x16A69],
713             [0x16A70, 0x16ABE],
714             [0x16AC0, 0x16AC9],
715             [0x16AD0, 0x16AED],
716             [0x16AF0, 0x16AF4],
717             [0x16B00, 0x16B36],
718             [0x16B40, 0x16B43],
719             [0x16B50, 0x16B59],
720             [0x16B63, 0x16B77],
721             [0x16B7D, 0x16B8F],
722             [0x16E40, 0x16E7F],
723             [0x16F00, 0x16F4A],
724             [0x16F4F, 0x16F87],
725             [0x16F8F, 0x16F9F],
726             [0x16FE0, 0x16FE1],
727             [0x16FE3, 0x16FE4],
728             [0x16FF0, 0x16FF1],
729             [0x17000, 0x187F7],
730             [0x18800, 0x18CD5],
731             [0x18D00, 0x18D08],
732             [0x1AFF0, 0x1AFF3],
733             [0x1AFF5, 0x1AFFB],
734             [0x1AFFD, 0x1AFFE],
735             [0x1B000, 0x1B122],
736             [0x1B132, 0x1B132],
737             [0x1B150, 0x1B152],
738             [0x1B155, 0x1B155],
739             [0x1B164, 0x1B167],
740             [0x1B170, 0x1B2FB],
741             [0x1BC00, 0x1BC6A],
742             [0x1BC70, 0x1BC7C],
743             [0x1BC80, 0x1BC88],
744             [0x1BC90, 0x1BC99],
745             [0x1BC9D, 0x1BC9E],
746             [0x1CF00, 0x1CF2D],
747             [0x1CF30, 0x1CF46],
748             [0x1D165, 0x1D169],
749             [0x1D16D, 0x1D172],
750             [0x1D17B, 0x1D182],
751             [0x1D185, 0x1D18B],
752             [0x1D1AA, 0x1D1AD],
753             [0x1D242, 0x1D244],
754             [0x1D400, 0x1D454],
755             [0x1D456, 0x1D49C],
756             [0x1D49E, 0x1D49F],
757             [0x1D4A2, 0x1D4A2],
758             [0x1D4A5, 0x1D4A6],
759             [0x1D4A9, 0x1D4AC],
760             [0x1D4AE, 0x1D4B9],
761             [0x1D4BB, 0x1D4BB],
762             [0x1D4BD, 0x1D4C3],
763             [0x1D4C5, 0x1D505],
764             [0x1D507, 0x1D50A],
765             [0x1D50D, 0x1D514],
766             [0x1D516, 0x1D51C],
767             [0x1D51E, 0x1D539],
768             [0x1D53B, 0x1D53E],
769             [0x1D540, 0x1D544],
770             [0x1D546, 0x1D546],
771             [0x1D54A, 0x1D550],
772             [0x1D552, 0x1D6A5],
773             [0x1D6A8, 0x1D6C0],
774             [0x1D6C2, 0x1D6DA],
775             [0x1D6DC, 0x1D6FA],
776             [0x1D6FC, 0x1D714],
777             [0x1D716, 0x1D734],
778             [0x1D736, 0x1D74E],
779             [0x1D750, 0x1D76E],
780             [0x1D770, 0x1D788],
781             [0x1D78A, 0x1D7A8],
782             [0x1D7AA, 0x1D7C2],
783             [0x1D7C4, 0x1D7CB],
784             [0x1D7CE, 0x1D7FF],
785             [0x1DA00, 0x1DA36],
786             [0x1DA3B, 0x1DA6C],
787             [0x1DA75, 0x1DA75],
788             [0x1DA84, 0x1DA84],
789             [0x1DA9B, 0x1DA9F],
790             [0x1DAA1, 0x1DAAF],
791             [0x1DF00, 0x1DF1E],
792             [0x1DF25, 0x1DF2A],
793             [0x1E000, 0x1E006],
794             [0x1E008, 0x1E018],
795             [0x1E01B, 0x1E021],
796             [0x1E023, 0x1E024],
797             [0x1E026, 0x1E02A],
798             [0x1E030, 0x1E06D],
799             [0x1E08F, 0x1E08F],
800             [0x1E100, 0x1E12C],
801             [0x1E130, 0x1E13D],
802             [0x1E140, 0x1E149],
803             [0x1E14E, 0x1E14E],
804             [0x1E290, 0x1E2AE],
805             [0x1E2C0, 0x1E2F9],
806             [0x1E4D0, 0x1E4F9],
807             [0x1E7E0, 0x1E7E6],
808             [0x1E7E8, 0x1E7EB],
809             [0x1E7ED, 0x1E7EE],
810             [0x1E7F0, 0x1E7FE],
811             [0x1E800, 0x1E8C4],
812             [0x1E8D0, 0x1E8D6],
813             [0x1E900, 0x1E94B],
814             [0x1E950, 0x1E959],
815             [0x1EE00, 0x1EE03],
816             [0x1EE05, 0x1EE1F],
817             [0x1EE21, 0x1EE22],
818             [0x1EE24, 0x1EE24],
819             [0x1EE27, 0x1EE27],
820             [0x1EE29, 0x1EE32],
821             [0x1EE34, 0x1EE37],
822             [0x1EE39, 0x1EE39],
823             [0x1EE3B, 0x1EE3B],
824             [0x1EE42, 0x1EE42],
825             [0x1EE47, 0x1EE47],
826             [0x1EE49, 0x1EE49],
827             [0x1EE4B, 0x1EE4B],
828             [0x1EE4D, 0x1EE4F],
829             [0x1EE51, 0x1EE52],
830             [0x1EE54, 0x1EE54],
831             [0x1EE57, 0x1EE57],
832             [0x1EE59, 0x1EE59],
833             [0x1EE5B, 0x1EE5B],
834             [0x1EE5D, 0x1EE5D],
835             [0x1EE5F, 0x1EE5F],
836             [0x1EE61, 0x1EE62],
837             [0x1EE64, 0x1EE64],
838             [0x1EE67, 0x1EE6A],
839             [0x1EE6C, 0x1EE72],
840             [0x1EE74, 0x1EE77],
841             [0x1EE79, 0x1EE7C],
842             [0x1EE7E, 0x1EE7E],
843             [0x1EE80, 0x1EE89],
844             [0x1EE8B, 0x1EE9B],
845             [0x1EEA1, 0x1EEA3],
846             [0x1EEA5, 0x1EEA9],
847             [0x1EEAB, 0x1EEBB],
848             [0x1FBF0, 0x1FBF9],
849             [0x20000, 0x2A6DF],
850             [0x2A700, 0x2B739],
851             [0x2B740, 0x2B81D],
852             [0x2B820, 0x2CEA1],
853             [0x2CEB0, 0x2EBE0],
854             [0x2EBF0, 0x2EE5D],
855             [0x2F800, 0x2FA1D],
856             [0x30000, 0x3134A],
857             [0x31350, 0x323AF],);
858              
859             # Binary-search the frozen XID_Continue range table.
860             # Caller is responsible for the ASCII fast path; this is the slow path
861             # for non-ASCII codepoints in identifier-character checks.
862             sub _is_xid_continue {
863 0     0   0 my ($cp) = @_;
864 0         0 my $lo = 0;
865 0         0 my $hi = $#XID_CONTINUE_RANGES;
866 0         0 while ($lo <= $hi) {
867 0         0 my $mid = ($lo + $hi) >> 1;
868 0         0 my $r = $XID_CONTINUE_RANGES[$mid];
869 0 0       0 if ($cp < $r->[0]) { $hi = $mid - 1 }
  0 0       0  
870 0         0 elsif ($cp > $r->[1]) { $lo = $mid + 1 }
871 0         0 else { return 1 }
872             }
873 0         0 return 0;
874             }
875              
876             sub new_table {
877 52     52 0 1900 require Tie::IxHash;
878 52         11827 tie my %h, 'Tie::IxHash';
879 52         1011 return \%h;
880             }
881              
882             # SPEC §"Unordered tables". Plain hashref blessed into DMS::Parser::UnorderedTable.
883             # Used by parse_table_block / parse_flow_table / parse_list_item_value when
884             # the parser was constructed with ignore_order=true. Iteration order over
885             # the resulting table is arbitrary (Perl hash randomization).
886             sub new_unordered_table {
887 0     0 0 0 my %h;
888 0         0 return bless \%h, 'DMS::Parser::UnorderedTable';
889             }
890              
891             # Like new_table, but returns a plain (non-tied) hashref plus an
892             # external "insertion-order keys" arrayref. Used in the lite/encoder
893             # fast path where we don't need the round-trip Document tree — just
894             # enough structure for JSON emission. The encoder special-cases this
895             # shape (HASH refs accompanied by an "$order" arrayref).
896             #
897             # We tag the plain hashref with a hidden key holding the keys arrayref
898             # so it round-trips through `$t->{$k} = $v` access patterns: the
899             # encoder extracts the order list from `$t->{"\0_keys"}` if present and
900             # falls back to `keys %$t` otherwise.
901             our $ORDER_KEY = "\0_keys";
902             sub new_ordered_table {
903 0     0 0 0 return { $ORDER_KEY => [] };
904             }
905              
906             # Per-indent compiled bulk regex cache for parse_table_block's fast
907             # path. Common indent levels (0, 2, 4, 6, ...) compile once globally;
908             # subsequent encounters reuse the qr// from this cache instead of
909             # recompiling per parse_table_block call.
910             our %BULK_RE_CACHE;
911              
912             sub _err {
913 0     0   0 my ($self, $msg) = @_;
914 0         0 return "$self->{line}:" . ($self->{pos} - $self->{line_start} + 1) . ": $msg\n";
915             }
916              
917             sub _err_at {
918 4     4   40 my ($self, $line, $line_start, $pos, $msg) = @_;
919 4         20 my $col = $pos - $line_start + 1;
920 4         104 return "$line:$col: $msg\n";
921             }
922              
923             sub _die_at {
924 0     0   0 my ($self, $line, $line_start, $pos, $msg) = @_;
925 0         0 die "$line:" . ($pos - $line_start + 1) . ": $msg\n";
926             }
927              
928             sub _die {
929 0     0   0 my ($self, $msg) = @_;
930 0         0 die _err($self, $msg);
931             }
932              
933             # SPEC §Decode/Encode (v0.14): canonical entry point. Returns the body
934             # only — meta and comments are dropped. Use decode_document() to keep
935             # them.
936             sub decode {
937 2     2 1 6919 my ($src) = @_;
938 2         7 my $doc = decode_document($src);
939 2         11 return $doc->{body};
940             }
941              
942             # Deprecated alias for decode(). Removed in the next release.
943             # SPEC §Decode/Encode — Migration from the parse/to_dms era.
944             { my $warned;
945             sub parse {
946 1 50   1 0 11 unless ($warned++) {
947 1         207 Carp::carp(
948             'DMS::Parser::parse() is deprecated; use decode() instead. '
949             . 'SPEC v0.14 renamed parse() to decode().');
950             }
951 1         10 goto &decode;
952             }
953             }
954              
955             # SPEC §Parsing modes — full and lite. Lite-mode equivalents return
956             # a Document with empty comments + original_forms.
957             sub decode_lite {
958 0     0 1 0 my ($src) = @_;
959 0         0 return decode_lite_document($src)->{body};
960             }
961              
962             # Deprecated alias for decode_lite(). Removed in the next release.
963             { my $warned;
964             sub parse_lite {
965 0 0   0 0 0 unless ($warned++) {
966 0         0 Carp::carp(
967             'DMS::Parser::parse_lite() is deprecated; use decode_lite() instead. '
968             . 'SPEC v0.14 renamed parse_lite() to decode_lite().');
969             }
970 0         0 goto &decode_lite;
971             }
972             }
973              
974             sub decode_lite_document {
975 0     0 1 0 my ($src) = @_;
976             # Fast path: try the iterative mega-regex tokenizer for the simple
977             # subset (bareword keys, simple scalar values, nested-table headers,
978             # blank lines, single-line `#` / `//` comments, ASCII-only). Returns
979             # undef on anything it can't handle; we fall back to the full
980             # recursive-descent parser below. The fast path avoids the
981             # parse_table_block recursion and the parse_kvpair / _skip_trivia
982             # method-dispatch frames — pure-Perl method calls are the main
983             # cost on flat configs (kube values.yaml-shaped).
984 0         0 my $fast = _parse_lite_document_fast(\$src);
985 0 0       0 return $fast if defined $fast;
986 0         0 return _parse_document_with_mode($src, 1);
987             }
988              
989             # Iterative mega-regex tokenizer + manual stack assembly for the lite
990             # subset. Returns a Document hashref { meta, body, comments,
991             # original_forms } on success, or undef on any pattern the fast path
992             # can't handle (front matter, heredocs, list items, escaped strings,
993             # block comments, complex-key flow forms, non-ASCII strings).
994             # Caller's $src isn't modified; we do a pos() scan via a scalar ref.
995             sub _parse_lite_document_fast {
996 0     0   0 my ($src_ref) = @_;
997 0         0 my $len = length($$src_ref);
998              
999             # Pre-flight rejection: a single regex over the whole src checks
1000             # for any opener of a construct the fast path doesn't handle. If
1001             # any are present, return undef immediately so the slow parser
1002             # gets the (correct) full-spec implementation. This costs one
1003             # linear scan but it's bounded by the C regex engine and faster
1004             # than discovering mid-parse that we can't handle something. The
1005             # negative lookaheads for `(?!##)` exclude `### LABEL` block
1006             # comment openers. We deliberately allow `\u`/`\U` Unicode escapes
1007             # but reject other backslash escapes (would need decoding).
1008 0 0       0 return undef if $$src_ref =~ /"""|'''/; # heredocs
1009 0 0       0 return undef if $$src_ref =~ /^[ \t]*\+/m; # list items / front matter
1010 0 0       0 return undef if $$src_ref =~ /^[ \t]*###/m; # block comment opener
1011             # Note: we don't pre-reject `/*` because real configs sometimes
1012             # have it inside string literals (e.g., `path: "/etc/foo/*.tmpl"`).
1013             # The mega-regex handles strings as a unit so `/*` is consumed
1014             # inside the string capture; the inner loop will only fail on a
1015             # `/*` that appears at line-start as an actual block comment, in
1016             # which case it correctly bails to the slow parser.
1017             # Backslash escapes ARE handled in the string regex below — the
1018             # decode pass after match handles \", \\, \n, \r, \t, \b, \f, \uXXXX,
1019             # \UXXXXXXXX. Anything else with `\` is still rejected.
1020             # Quoted keys, inline tables/lists with content, dates, hex/oct/bin
1021             # numbers — all opt out by being matched as "didn't match the
1022             # mega-regex" inside the loop; the loop returns undef on the
1023             # first non-match.
1024              
1025             # Manual stack of containers, with the indent at which their
1026             # children live. The root has child_indent 0. When a header
1027             # (`key:\n`) arrives, we push the new child table with
1028             # child_indent=undef ("TBD — set by first child"). The pop rule
1029             # is "while top's child_indent > current indent, pop"; this
1030             # closes nested blocks as the indent decreases.
1031 0         0 my $root = { $ORDER_KEY => [] };
1032 0         0 my @stack = ({ c => $root, ci => 0 });
1033              
1034 0         0 pos($$src_ref) = 0;
1035 0         0 LINE: while (pos($$src_ref) < $len) {
1036             # Blank line — most common after kvpairs.
1037 0 0       0 if ($$src_ref =~ /\G[ \t]*\r?\n/gc) { next LINE; }
  0         0  
1038             # Single-line `#` or `//` comment (block forms `###` / `/*`
1039             # already pre-rejected).
1040 0 0       0 if ($$src_ref =~ /\G[ \t]*(?:#|\/\/)[^\n\r]*\r?\n/gc) { next LINE; }
  0         0  
1041              
1042             # Mega-match: kvpair with simple value OR nested-block header.
1043             # Group layout (capturing only — non-capturing alternations
1044             # don't shift indices):
1045             # $1 = indent (spaces)
1046             # $2 = bareword key
1047             # $3 = positive integer
1048             # $4 = negative integer
1049             # $5 = bool ('true'/'false')
1050             # $6 = basic-string content (escapes allowed; decoded after)
1051             # $7 = empty list / empty table literal ('[]' or '{}')
1052             # $8 = decimal float (no exponent)
1053             # $9 = flow-list-of-simple-values content (between [ and ],
1054             # no nested brackets, no embedded newlines)
1055             # $10 = flow-table-of-simple-kvpairs content
1056             # $11 = trailing '\r?\n' for the no-value (header) form
1057 0 0       0 if ($$src_ref =~ /\G([ ]*)([A-Za-z_][A-Za-z0-9_-]*):(?:[ ](?:(0|[1-9][0-9]{0,17})|(-[1-9][0-9]{0,17})|(true|false)|"((?:[\x20\x21\x23-\x5b\x5d-\x7e]|\\(?:["\\nrtbf]|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}))*)"|(\[\]|\{\})|(-?(?:0|[1-9][0-9]*)\.[0-9]+)|\[([^\[\]\n\r]+)\]|\{([^\{\}\n\r]+)\})\r?\n|(\r?\n))/gc) {
1058 0         0 my $ind = length($1);
1059 0         0 my $key = $2;
1060             # Pop closed levels (top's children no longer reachable).
1061 0   0     0 while (@stack > 1 && defined($stack[-1]{ci}) && $stack[-1]{ci} > $ind) {
      0        
1062 0         0 pop @stack;
1063             }
1064             # First child of a header? Its indent fixes the parent's
1065             # child_indent.
1066 0 0       0 if (!defined $stack[-1]{ci}) {
1067             # The new indent must be strictly greater than the
1068             # header's parent's child_indent (i.e., we descended).
1069 0 0 0     0 if (@stack >= 2 && $stack[-2]{ci} >= $ind) { return undef; }
  0         0  
1070 0         0 $stack[-1]{ci} = $ind;
1071             }
1072             # Indent must match the current container's expected level.
1073 0 0       0 return undef if $stack[-1]{ci} != $ind;
1074 0         0 my $container = $stack[-1]{c};
1075 0 0       0 return undef if exists $container->{$key}; # duplicate key
1076              
1077 0 0       0 if (defined $11) {
1078             # Nested-block header: child is a table (the fast path
1079             # doesn't handle child-as-list; pre-flight rejects any
1080             # `+` so this is safe).
1081 0         0 my $child = { $ORDER_KEY => [] };
1082 0         0 push @{$container->{$ORDER_KEY}}, $key;
  0         0  
1083 0         0 $container->{$key} = $child;
1084 0         0 push @stack, { c => $child, ci => undef };
1085             } else {
1086 0         0 my $val;
1087 0 0       0 if (defined $3) { my $iv = 0+$3; $val = bless \$iv, 'DMS::Parser::Integer'; }
  0 0       0  
  0 0       0  
    0          
    0          
    0          
    0          
1088 0         0 elsif (defined $4) { my $iv = 0+$4; $val = bless \$iv, 'DMS::Parser::Integer'; }
  0         0  
1089 0 0       0 elsif (defined $5) { my $bv = $5 eq 'true' ? 1 : 0; $val = bless \$bv, 'DMS::Parser::Bool'; }
  0         0  
1090             elsif (defined $6) {
1091 0         0 $val = $6;
1092             # Decode escapes only if any backslash present (the
1093             # common case is clean ASCII strings — skip the
1094             # substitution then).
1095 0 0       0 if (index($val, '\\') >= 0) {
1096 0         0 $val =~ s{\\(["\\nrtbf])}{
1097 0 0       0 $1 eq 'n' ? "\n"
    0          
    0          
    0          
    0          
1098             : $1 eq 't' ? "\t"
1099             : $1 eq 'r' ? "\r"
1100             : $1 eq 'b' ? "\b"
1101             : $1 eq 'f' ? "\f"
1102             : $1
1103             }ge;
1104 0         0 $val =~ s{\\u([0-9A-Fa-f]{4})}{chr(hex($1))}ge;
  0         0  
1105 0         0 $val =~ s{\\U([0-9A-Fa-f]{8})}{chr(hex($1))}ge;
  0         0  
1106             }
1107             }
1108 0 0       0 elsif (defined $7) { $val = $7 eq '[]' ? [] : { $ORDER_KEY => [] }; }
1109 0         0 elsif (defined $8) { my $fv = 0+$8; $val = bless \$fv, 'DMS::Parser::Float'; }
  0         0  
1110             elsif (defined $9) {
1111             # Flow-list with content (no nested brackets / newlines).
1112             # Try to parse inner content as a comma-separated
1113             # list of simple values. Bail to slow path if any
1114             # value isn't a clean leaf.
1115 0         0 my $inner = $9;
1116 0         0 my @items;
1117             # Split by ',' but allow whitespace.
1118 0         0 for my $raw (split /\s*,\s*/, $inner) {
1119 0 0       0 next if $raw eq ''; # trailing comma → empty trailing field
1120 0 0       0 if ($raw =~ /^(0|[1-9][0-9]{0,17})$/) {
    0          
    0          
    0          
    0          
1121 0         0 my $iv = 0+$raw; push @items, bless \$iv, 'DMS::Parser::Integer';
  0         0  
1122             } elsif ($raw =~ /^-[1-9][0-9]{0,17}$/) {
1123 0         0 my $iv = 0+$raw; push @items, bless \$iv, 'DMS::Parser::Integer';
  0         0  
1124             } elsif ($raw eq 'true') {
1125 0         0 my $bv = 1; push @items, bless \$bv, 'DMS::Parser::Bool';
  0         0  
1126             } elsif ($raw eq 'false') {
1127 0         0 my $bv = 0; push @items, bless \$bv, 'DMS::Parser::Bool';
  0         0  
1128             } elsif ($raw =~ /^"([\x20\x21\x23-\x5b\x5d-\x7e]*)"$/) {
1129 0         0 push @items, $1; # ASCII-clean string, no escapes
1130             } else {
1131 0         0 return undef; # complex flow content — fall back
1132             }
1133             }
1134 0         0 $val = \@items;
1135             }
1136             else {
1137             # $10: flow-table — fall back for now (rare; would
1138             # need split on `,` and inner `key: value` parsing).
1139 0         0 return undef;
1140             }
1141 0         0 push @{$container->{$ORDER_KEY}}, $key;
  0         0  
1142 0         0 $container->{$key} = $val;
1143             }
1144 0         0 next LINE;
1145             }
1146              
1147             # Anything else: bail out, slow parser owns it.
1148 0         0 return undef;
1149             }
1150              
1151             # Successful walk — return Document shape. comments/original_forms
1152             # are empty by definition (we rejected anything that could carry
1153             # them).
1154             return {
1155 0         0 meta => undef,
1156             body => $root,
1157             comments => [],
1158             original_forms => [],
1159             };
1160             }
1161              
1162             # SPEC §"Unordered tables" — opt-in. Body tables are produced as
1163             # DMS::Parser::UnorderedTable (plain Perl hashes, no insertion-order tracking).
1164             # Front matter remains ordered (per spec — meta is small and used by
1165             # tooling that benefits from stable order). Documents from these entry
1166             # points cannot round-trip via `encode` (full mode); use `encode_lite`
1167             # instead. `decode_document_unordered` is full-mode (comments AST +
1168             # original_forms still recorded); `decode_lite_document_unordered` is
1169             # the (unordered, lite) combo — fastest read-only path.
1170             sub decode_document_unordered {
1171 0     0 0 0 my ($src) = @_;
1172 0         0 return _parse_document_with_mode($src, 0, 1);
1173             }
1174             sub decode_lite_document_unordered {
1175 0     0 0 0 my ($src) = @_;
1176 0         0 return _parse_document_with_mode($src, 1, 1);
1177             }
1178              
1179             # Deprecated aliases. Removed in the next release.
1180             { my $warned;
1181             sub parse_document_unordered {
1182 0 0   0 0 0 unless ($warned++) {
1183 0         0 Carp::carp(
1184             'DMS::Parser::parse_document_unordered() is deprecated; '
1185             . 'use decode_document_unordered() instead. '
1186             . 'SPEC v0.14 renamed parse_*() to decode_*().');
1187             }
1188 0         0 goto &decode_document_unordered;
1189             }
1190             }
1191             { my $warned;
1192             sub parse_lite_document_unordered {
1193 0 0   0 0 0 unless ($warned++) {
1194 0         0 Carp::carp(
1195             'DMS::Parser::parse_lite_document_unordered() is deprecated; '
1196             . 'use decode_lite_document_unordered() instead. '
1197             . 'SPEC v0.14 renamed parse_*() to decode_*().');
1198             }
1199 0         0 goto &decode_lite_document_unordered;
1200             }
1201             }
1202              
1203             # Re-emit a parsed Document as DMS source. See SPEC §encode.
1204             sub encode {
1205 21     21 0 1580 my ($doc) = @_;
1206 21         1111 require DMS::Parser::Emitter;
1207 21         88 return DMS::Parser::Emitter::encode($doc);
1208             }
1209              
1210             # Lite-mode emit: canonical DMS source — no comments, decimal integers,
1211             # basic-quoted strings — ignoring any comments / original_forms in $doc.
1212             # `decode(encode_lite($doc))` is data-equivalent to $doc; round-trip of
1213             # comment + literal-form is *not* preserved. SPEC §encode.
1214             sub encode_lite {
1215 2     2 0 981 my ($doc) = @_;
1216 2         14 require DMS::Parser::Emitter;
1217 2         9 return DMS::Parser::Emitter::encode_lite($doc);
1218             }
1219              
1220             # Deprecated aliases for encode/encode_lite. Removed in the next release.
1221             { my $warned;
1222             sub to_dms {
1223 1 50   1 0 9 unless ($warned++) {
1224 1         126 Carp::carp(
1225             'DMS::Parser::to_dms() is deprecated; use encode() instead. '
1226             . 'SPEC v0.14 renamed to_dms() to encode().');
1227             }
1228 1         9 goto &encode;
1229             }
1230             }
1231             { my $warned;
1232             sub to_dms_lite {
1233 1 50   1 0 7 unless ($warned++) {
1234 1         120 Carp::carp(
1235             'DMS::Parser::to_dms_lite() is deprecated; use encode_lite() instead. '
1236             . 'SPEC v0.14 renamed to_dms_lite() to encode_lite().');
1237             }
1238 1         10 goto &encode_lite;
1239             }
1240             }
1241              
1242             sub decode_document {
1243 42     42 1 537792 my ($src) = @_;
1244 42         136 return _parse_document_with_mode($src, 0, 0);
1245             }
1246              
1247             # Deprecated alias for decode_document(). Removed in the next release.
1248             { my $warned;
1249             sub parse_document {
1250 1 50   1 0 8 unless ($warned++) {
1251 1         139 Carp::carp(
1252             'DMS::Parser::parse_document() is deprecated; use decode_document() instead. '
1253             . 'SPEC v0.14 renamed parse_document() to decode_document().');
1254             }
1255 1         10 goto &decode_document;
1256             }
1257             }
1258              
1259             # SPEC §"UTF-8 only": reject any non-strict UTF-8 byte sequence —
1260             # overlongs, lone continuation bytes, 5/6-byte forms, codepoints above
1261             # U+10FFFF, and surrogates encoded as 3-byte UTF-8. Perl's built-in
1262             # `utf8::decode` is too lax (it accepts the legacy "extended UTF-8"
1263             # encoding for code points up to 2^31), so we walk the bytes ourselves
1264             # before any decoding.
1265             sub _validate_strict_utf8 {
1266 53     53   114 my ($s) = @_;
1267             # Fast path: pure-ASCII source has no UTF-8 work to do. A single
1268             # regex hit returns immediately on flat-ASCII data — saves ~150ms.
1269 53 50       241 return if $s !~ /[\x80-\xFF]/;
1270 0         0 my $n = length($s);
1271 0         0 my $i = 0;
1272 0         0 while ($i < $n) {
1273 0         0 my $b0 = ord(substr($s, $i, 1));
1274 0 0       0 if ($b0 < 0x80) { $i++; next; }
  0         0  
  0         0  
1275 0         0 my ($expect, $cp_lo, $cp_hi);
1276 0 0       0 if (($b0 & 0xE0) == 0xC0) {
    0          
    0          
1277 0 0       0 return _utf8_die($s, $i) if $b0 < 0xC2; # overlong
1278 0         0 $expect = 2; $cp_lo = 0x80; $cp_hi = 0x7FF;
  0         0  
  0         0  
1279             } elsif (($b0 & 0xF0) == 0xE0) {
1280 0         0 $expect = 3; $cp_lo = 0x800; $cp_hi = 0xFFFF;
  0         0  
  0         0  
1281             } elsif (($b0 & 0xF8) == 0xF0) {
1282 0 0       0 return _utf8_die($s, $i) if $b0 > 0xF4; # > U+10FFFF
1283 0         0 $expect = 4; $cp_lo = 0x10000; $cp_hi = 0x10FFFF;
  0         0  
  0         0  
1284             } else {
1285 0         0 return _utf8_die($s, $i); # bare cont / 5-6 byte
1286             }
1287 0 0       0 return _utf8_die($s, $i) if $i + $expect > $n;
1288 0         0 my $cp = ($b0 & ((1 << (7 - $expect)) - 1));
1289 0         0 for (my $k = 1; $k < $expect; $k++) {
1290 0         0 my $bk = ord(substr($s, $i + $k, 1));
1291 0 0       0 return _utf8_die($s, $i) if ($bk & 0xC0) != 0x80;
1292 0         0 $cp = ($cp << 6) | ($bk & 0x3F);
1293             }
1294 0 0 0     0 return _utf8_die($s, $i) if $cp < $cp_lo || $cp > $cp_hi;
1295 0 0 0     0 return _utf8_die($s, $i) if $cp >= 0xD800 && $cp <= 0xDFFF;
1296 0         0 $i += $expect;
1297             }
1298             }
1299              
1300             sub _utf8_die {
1301 0     0   0 my ($s, $i) = @_;
1302 0         0 my $prefix = substr($s, 0, $i);
1303 0         0 my $line = 1 + ($prefix =~ tr/\n//);
1304 0         0 my $last_nl = rindex($prefix, "\n");
1305 0         0 my $col = $i - ($last_nl + 1) + 1;
1306 0         0 die "$line:$col: input is not valid UTF-8\n";
1307             }
1308              
1309             # Shared input-normalization for every public decode entry point.
1310             # SPEC §"UTF-8 only, NFC-normalized": reject BOM at offset 0, reject
1311             # malformed UTF-8 bytes (5/6-byte forms, > U+10FFFF, lone continuation
1312             # bytes, overlongs, surrogates), reject U+0000 anywhere, then NFC the
1313             # source. Returns the (possibly NFC'd) Perl-internal-encoded string.
1314             sub _normalize_source {
1315 53     53   112 my ($src) = @_;
1316             # SPEC §"UTF-8 only, NFC-normalized": DMS source is plain UTF-8 with
1317             # no byte-order mark. A leading U+FEFF is not silently consumed —
1318             # reject it explicitly so encoding mistakes surface loudly. (BOMs
1319             # *inside* string/heredoc bodies are fine; this only fires at offset 0.)
1320             # We check for the raw UTF-8 BOM bytes (EF BB BF) before any decoding,
1321             # so the rejection is independent of how the caller passed `$src`.
1322 53 50 66     606 if (!utf8::is_utf8($src) && length($src) >= 3
      66        
1323             && substr($src, 0, 3) eq "\xEF\xBB\xBF") {
1324 0         0 die "1:1: BOM (U+FEFF) at file start is not allowed; DMS source is plain UTF-8\n";
1325             }
1326             # SPEC §"UTF-8 only": reject malformed UTF-8 bytes (codepoints
1327             # > U+10FFFF, lone continuation bytes, overlongs, surrogates encoded
1328             # as bytes). Perl's `utf8::decode` is permissive about 5/6-byte
1329             # forms and codepoints above U+10FFFF, so we additionally walk the
1330             # raw bytes to confirm strict UTF-8 conformance.
1331 53 50       189 if (!utf8::is_utf8($src)) {
1332 53         147 _validate_strict_utf8($src);
1333 53         117 my $copy = $src;
1334 53 50       199 if (!utf8::decode($copy)) {
1335 0         0 die "1:1: input is not valid UTF-8\n";
1336             }
1337 53         110 $src = $copy;
1338             }
1339 53 50 66     251 if (length($src) >= 1 && substr($src, 0, 1) eq "\x{FEFF}") {
1340 0         0 die "1:1: BOM (U+FEFF) at file start is not allowed; DMS source is plain UTF-8\n";
1341             }
1342             # U+0000 is not allowed anywhere in DMS source (see SPEC §Strings).
1343 53 50       169 if ((my $nul = index($src, "\0")) >= 0) {
1344 0         0 my $prefix = substr($src, 0, $nul);
1345 0         0 my $line = 1 + ($prefix =~ tr/\n//);
1346 0         0 my $last_nl = rindex($prefix, "\n");
1347 0         0 my $col = $nul - ($last_nl + 1) + 1;
1348 0         0 die "$line:$col: U+0000 (NUL) is not allowed in DMS source\n";
1349             }
1350             # SPEC §Unicode normalization: NFC the source before tokenization.
1351             # Fast path: pure-ASCII source is already in NFC; skip the (linear,
1352             # codepoint-walking) NFC pass entirely. The check itself is a single
1353             # regex that finds the first non-ASCII byte if any.
1354 53 50       169 $src = _NFC($src) if $src =~ /[^\x00-\x7F]/;
1355 53         113 return $src;
1356             }
1357              
1358             # SPEC §Front-matter-only decode. Decodes the leading `+++ ... +++`
1359             # block (if any) and returns it as a hashref, then stops without
1360             # tokenizing the body. Returns undef when the document has no front
1361             # matter at all (no opening `+++` after trivia). An empty front matter
1362             # (`+++\n+++\n`) returns a defined-but-empty hashref, distinguishable
1363             # from undef.
1364             #
1365             # Operates in lite mode (no comment AST, no original_forms recorded
1366             # inside the FM). Diagnostics inside the `+++ ... +++` block are
1367             # byte-identical to a full decode.
1368             sub decode_front_matter {
1369 11     11 1 214463 my ($src) = @_;
1370 11         29 $src = _normalize_source($src);
1371 11         100 my $self = bless {
1372             src => $src,
1373             len => length($src),
1374             pos => 0,
1375             line => 1,
1376             line_start => 0,
1377             pending_leading => [],
1378             path => [],
1379             comments => [],
1380             original_forms => [],
1381             record_forms => 1,
1382             # SPEC §Front-matter-only decode: "Mode: front-matter-only
1383             # decode runs in lite mode."
1384             lite => 1,
1385             # FM is always ordered (SPEC §"Unordered tables"); ignore_order
1386             # only affects body tables, which we never reach.
1387             ignore_order => 0,
1388             }, __PACKAGE__;
1389 11         41 return $self->parse_front_matter;
1390             }
1391              
1392             sub _parse_document_with_mode {
1393 42     42   98 my ($src, $lite, $ignore_order) = @_;
1394 42 50       132 $ignore_order = 0 unless $ignore_order;
1395 42         108 $src = _normalize_source($src);
1396 42         443 my $self = bless {
1397             src => $src,
1398             len => length($src),
1399             pos => 0,
1400             line => 1,
1401             line_start => 0,
1402             # Comment-AST state. `pending_leading` accumulates full-line
1403             # comments seen by `_skip_trivia`; on the next sibling-entry
1404             # they're flushed as Leading on its breadcrumb, on a blank line
1405             # gap or end-of-block they're flushed as Floating on the current
1406             # path. `path` is the breadcrumb stack for the value currently
1407             # being parsed (strings for table keys; DMS::Parser::Index for list
1408             # indices). `comments` is the accumulator returned to the caller.
1409             pending_leading => [],
1410             path => [],
1411             comments => [],
1412             # Original-literal records for to_dms round-trip. Sparse: only
1413             # nodes whose surface form differs from the emitter's default
1414             # (decimal-no-underscores for ints, basic-quoted for strings)
1415             # get an entry. See SPEC §to_dms.
1416             original_forms => [],
1417             # When false, integer/string lexeme recording is suppressed.
1418             # Used inside key parses and inside heredoc modifier args.
1419             record_forms => 1,
1420             # Lite mode: skip comment-AST + original_forms bookkeeping.
1421             # Same grammar, same errors.
1422             lite => $lite,
1423             # Unordered mode: when true, body tables are produced as
1424             # DMS::Parser::UnorderedTable (plain Perl hashes) instead of Tie::IxHash
1425             # tied tables. Front-matter parsing ignores this flag — meta
1426             # stays ordered. See SPEC §"Unordered tables".
1427             ignore_order => $ignore_order,
1428             }, __PACKAGE__;
1429 42         161 my $meta = $self->parse_front_matter;
1430 42         145 my $body = $self->parse_body;
1431             return {
1432             meta => $meta,
1433             body => $body,
1434             comments => $self->{comments},
1435             original_forms => $self->{original_forms},
1436 42         449 };
1437             }
1438              
1439             # Append an OriginalLiteral record at the current path. Skipped when
1440             # record_forms is false or when in lite mode.
1441             sub _record_form {
1442 41     41   98 my ($self, $lit) = @_;
1443 41 50 33     176 return if $self->{lite} || !$self->{record_forms};
1444 41         88 push @{$self->{original_forms}}, [ [@{$self->{path}}], $lit ];
  41         92  
  41         235  
1445             }
1446              
1447             sub _peek {
1448 899     899   1251 my $self = shift;
1449 899 50       1862 return undef if $self->{pos} >= $self->{len};
1450 899         2126 return substr($self->{src}, $self->{pos}, 1);
1451             }
1452              
1453             sub _peek_at {
1454 4     4   11 my ($self, $off) = @_;
1455 4         9 my $p = $self->{pos} + $off;
1456 4 50       13 return undef if $p >= $self->{len};
1457 4         15 return substr($self->{src}, $p, 1);
1458             }
1459              
1460             sub _starts_with {
1461 92     92   225 my ($self, $s) = @_;
1462 92         363 return substr($self->{src}, $self->{pos}, length($s)) eq $s;
1463             }
1464              
1465             sub _bump {
1466 0     0   0 my $self = shift;
1467 0 0       0 return undef if $self->{pos} >= $self->{len};
1468 0         0 my $c = substr($self->{src}, $self->{pos}, 1);
1469 0         0 $self->{pos}++;
1470 0         0 return $c;
1471             }
1472              
1473 253     253   802 sub _eof { return $_[0]{pos} >= $_[0]{len} }
1474              
1475             sub _advance_line {
1476 140     140   200 my $self = shift;
1477 140         257 $self->{line}++;
1478 140         265 $self->{line_start} = $self->{pos};
1479             }
1480              
1481             sub _is_bare_key_char {
1482 296     296   551 my ($c) = @_;
1483 296 50       608 return 1 if $c eq '-';
1484 296         428 my $o = ord($c);
1485 296 50       601 if ($o < 128) {
1486 296         996 return $c =~ /[A-Za-z0-9_]/;
1487             }
1488             # Frozen Unicode 15.1 XID_Continue snapshot - see @XID_CONTINUE_RANGES.
1489 0         0 return _is_xid_continue($o);
1490             }
1491              
1492             sub _is_label_start {
1493 37     37   74 my ($c) = @_;
1494 37   33     239 return defined($c) && ($c eq '_' || $c =~ /[A-Za-z]/);
1495             }
1496              
1497             sub _is_label_cont {
1498 62     62   114 my ($c) = @_;
1499 62   33     350 return defined($c) && ($c eq '_' || $c =~ /[A-Za-z0-9]/);
1500             }
1501              
1502             sub _looks_like_date_prefix {
1503 0     0   0 my ($s) = @_;
1504 0 0       0 return 0 if length($s) < 10;
1505 0         0 return $s =~ /^\d\d\d\d-\d\d-\d\d/;
1506             }
1507              
1508             sub _looks_like_time_prefix {
1509 0     0   0 my ($s) = @_;
1510 0 0       0 return 0 if length($s) < 8;
1511 0         0 return $s =~ /^\d\d:\d\d:\d\d/;
1512             }
1513              
1514             my %TERMINATORS = map { $_ => 1 } (' ', "\t", "\n", "\r", '#', '/', ',', ']', '}');
1515             sub _is_value_terminator {
1516 0     0   0 my ($c) = @_;
1517 0 0       0 return 1 if !defined($c);
1518 0         0 return exists $TERMINATORS{$c};
1519             }
1520              
1521             # SPEC §Lexical "Reserved decorator sigils": the seventeen characters
1522             # below are reserved as decorator sigils at line-start position. A body
1523             # line whose first non-whitespace character is one of these is a tier-0
1524             # parse error. Underscore is *not* in this set — it has its own role
1525             # for core / built-in decorators (e.g. heredoc modifiers `_trim`).
1526             # The reservation only applies to structural body positions; sigils
1527             # appearing inside strings, comments, or heredoc bodies are ordinary
1528             # content and remain valid.
1529             my %RESERVED_DECORATOR_SIGIL = map { $_ => 1 }
1530             ('!', '@', '$', '%', '^', '&', '*', '|', '~', '`',
1531             '.', ',', '>', '<', '?', ';', '=');
1532              
1533             sub _check_reserved_sigil {
1534 109     109   174 my $self = shift;
1535 109         184 my $p = $self->{pos};
1536 109 50       324 return if $p >= $self->{len};
1537 109         250 my $c = substr($self->{src}, $p, 1);
1538 109 50       336 return unless exists $RESERVED_DECORATOR_SIGIL{$c};
1539 0         0 $self->_die(
1540             "'$c' is a reserved decorator sigil at line-start (tier 0)"
1541             );
1542             }
1543              
1544             sub _skip_inline_ws {
1545 88     88   141 my $self = shift;
1546 88         272 pos($self->{src}) = $self->{pos};
1547 88 100       374 if ($self->{src} =~ /\G[ \t]+/gc) {
1548 6         14 $self->{pos} = pos($self->{src});
1549             }
1550             }
1551              
1552             sub _consume_eol {
1553 123     123   200 my $self = shift;
1554 123         268 my $c = $self->_peek;
1555 123 50 33     417 if (defined($c) && $c eq "\n") {
1556 123         211 $self->{pos}++;
1557 123         310 $self->_advance_line;
1558 123         274 return 1;
1559             }
1560 0 0       0 if ($self->_starts_with("\r\n")) {
1561 0         0 $self->{pos} += 2;
1562 0         0 $self->_advance_line;
1563 0         0 return 1;
1564             }
1565 0         0 return 0;
1566             }
1567              
1568             sub _skip_trivia {
1569 330     330   542 my $self = shift;
1570             # Hot path: the next byte is something other than ws/EOL/comment.
1571             # Skip the whole loop. parse_table_block hits this between every
1572             # 50k keys.
1573 330         586 my $p = $self->{pos};
1574 330 100       778 if ($p < $self->{len}) {
1575 223         521 my $c0 = substr($self->{src}, $p, 1);
1576 223 100 66     2091 return if $c0 ne ' ' && $c0 ne "\t" && $c0 ne "\n" && $c0 ne "\r"
      100        
      66        
      100        
      66        
1577             && $c0 ne '#' && $c0 ne '/';
1578             } else {
1579 107         179 return;
1580             }
1581 53         122 while (1) {
1582 93         174 my $start = $self->{pos};
1583             # Inline _skip_inline_ws + _peek in one regex hit.
1584 93         265 pos($self->{src}) = $start;
1585 93 100       423 if ($self->{src} =~ /\G[ \t]+/gc) { $self->{pos} = pos($self->{src}); }
  15         31  
1586 93         163 $p = $self->{pos};
1587 93 50       268 my $c = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
1588 93 100 66     991 if (defined($c) && ($c eq "\n" || $c eq "\r")) {
    100 66        
    50 66        
    50 33        
      33        
      33        
      33        
1589 4 50 33     19 if ($c eq "\r" && !$self->_starts_with("\r\n")) {
1590 0         0 $self->_die("bare CR is not a valid line terminator");
1591             }
1592             # Blank line: any pending leading comments are now separated
1593             # from a future sibling, so flush them as Floating on the
1594             # current path.
1595 4         17 $self->_flush_pending_as_floating;
1596 4         14 $self->_consume_eol;
1597             } elsif (defined($c) && $c eq '#') {
1598 40 100       128 if ($self->_starts_with("###")) {
1599 2         7 my $raw = $self->_read_hash_block_comment;
1600 2         14 push @{$self->{pending_leading}},
1601 2 50       6 { content => $raw, kind => 'block' } unless $self->{lite};
1602             } else {
1603 38         97 my $raw = $self->_read_line_comment_to_eol;
1604 38         122 $self->_consume_eol;
1605 37         232 push @{$self->{pending_leading}},
1606 38 100       93 { content => $raw, kind => 'line' } unless $self->{lite};
1607             }
1608             } elsif (defined($c) && $c eq '/' && $self->_starts_with("//")) {
1609 0         0 my $raw = $self->_read_line_comment_to_eol;
1610 0         0 $self->_consume_eol;
1611 0         0 push @{$self->{pending_leading}},
1612 0 0       0 { content => $raw, kind => 'line' } unless $self->{lite};
1613             } elsif (defined($c) && $c eq '/' && $self->_starts_with("/*")) {
1614 0         0 my $raw = $self->_read_c_block_comment;
1615 0         0 push @{$self->{pending_leading}},
1616 0 0       0 { content => $raw, kind => 'block' } unless $self->{lite};
1617             } else {
1618 49         94 $self->{pos} = $start;
1619 49         105 return;
1620             }
1621 44 100       123 return if $self->_eof;
1622             }
1623             }
1624              
1625             # Drain pending_leading and attach each as a Floating comment on the
1626             # current path. Called on blank-line gaps and at end-of-block.
1627             sub _flush_pending_as_floating {
1628 99     99   156 my $self = shift;
1629 99 100       168 return if !@{$self->{pending_leading}};
  99         258  
1630 5         11 my @drained = @{$self->{pending_leading}};
  5         14  
1631 5         14 $self->{pending_leading} = [];
1632 5         13 for my $c (@drained) {
1633 5         14 push @{$self->{comments}}, {
1634             comment => $c,
1635             position => 'floating',
1636 5         8 path => [@{$self->{path}}],
  5         27  
1637             };
1638             }
1639             }
1640              
1641             # Drain pending_leading and attach each as a Leading comment on the
1642             # current path. Called by sibling-entry sites (parse_kvpair,
1643             # parse_list_block) right after pushing the new sibling's breadcrumb.
1644             sub _flush_pending_as_leading_on_current {
1645 109     109   220 my $self = shift;
1646 109 100       171 return if !@{$self->{pending_leading}};
  109         345  
1647 20         36 my @drained = @{$self->{pending_leading}};
  20         55  
1648 20         51 $self->{pending_leading} = [];
1649 20         63 for my $c (@drained) {
1650 20         51 push @{$self->{comments}}, {
1651             comment => $c,
1652             position => 'leading',
1653 20         30 path => [@{$self->{path}}],
  20         141  
1654             };
1655             }
1656             }
1657              
1658             # Read a `# ...` or `// ...` line comment (without consuming the EOL)
1659             # and return the raw delimiter+body text.
1660             sub _read_line_comment_to_eol {
1661 55     55   97 my $self = shift;
1662             # Bulk scan to next \n or \r in one regex op rather than char-by-char
1663             # _peek loop. The bench fixture is 56% comments so this fires often.
1664 55         158 pos($self->{src}) = $self->{pos};
1665 55         187 $self->{src} =~ /\G[^\n\r]*/gc;
1666 55         112 my $end = pos($self->{src});
1667 55         90 my $start = $self->{pos};
1668 55         109 $self->{pos} = $end;
1669             # Lite mode discards comments — skip building the substr entirely.
1670 55 100       185 return '' if $self->{lite};
1671 54         169 return substr($self->{src}, $start, $end - $start);
1672             }
1673              
1674             # Read `/* ... */` (nested), returning the raw text including delimiters.
1675             sub _read_c_block_comment {
1676 0     0   0 my $self = shift;
1677 0         0 my ($sl, $sls, $sp) = ($self->{line}, $self->{line_start}, $self->{pos});
1678 0         0 my $start = $self->{pos};
1679 0         0 $self->{pos} += 2;
1680 0         0 my $depth = 1;
1681 0         0 while ($depth > 0) {
1682 0 0       0 if ($self->_eof) {
1683 0         0 die $self->_err_at($sl, $sls, $sp, "unterminated /* block comment");
1684             }
1685 0         0 my $c = $self->_peek;
1686 0 0 0     0 if ($c eq '/' && $self->_starts_with("/*")) { $self->{pos} += 2; $depth++; }
  0 0 0     0  
  0 0 0     0  
    0          
1687 0         0 elsif ($c eq '*' && $self->_starts_with("*/")) { $self->{pos} += 2; $depth--; }
  0         0  
1688 0         0 elsif ($c eq "\n") { $self->{pos}++; $self->_advance_line; }
  0         0  
1689 0         0 elsif ($c eq "\r" && $self->_starts_with("\r\n")) { $self->{pos} += 2; $self->_advance_line; }
  0         0  
1690 0         0 else { $self->{pos}++; }
1691             }
1692 0         0 return substr($self->{src}, $start, $self->{pos} - $start);
1693             }
1694              
1695             # Read `### ... ###` or `###LABEL ... LABEL`. The terminator's EOL is
1696             # consumed but is NOT part of the returned text.
1697             sub _read_hash_block_comment {
1698 2     2   4 my $self = shift;
1699 2         45 my ($sl, $sls, $sp) = ($self->{line}, $self->{line_start}, $self->{pos});
1700 2         6 my $start = $self->{pos};
1701 2         5 $self->{pos} += 3;
1702 2         5 my $ls = $self->{pos};
1703 2         4 while (1) {
1704 2         6 my $c = $self->_peek;
1705 2 50 33     18 last if !defined($c) || !($c eq '_' || $c =~ /[A-Za-z0-9]/);
      33        
1706 0         0 $self->{pos}++;
1707             }
1708 2         8 my $label = substr($self->{src}, $ls, $self->{pos} - $ls);
1709 2 50       7 if (length($label) > 0) {
1710 0         0 my $c0 = substr($label, 0, 1);
1711 0 0 0     0 if ($c0 ne '_' && $c0 !~ /[A-Za-z]/) {
1712 0         0 die $self->_err_at($sl, $sls, $sp, "block comment label must start with a letter or underscore");
1713             }
1714             }
1715 2 50       6 my $terminator = length($label) ? $label : "###";
1716 2         7 $self->_skip_inline_ws;
1717 2 50 33     6 if (!($self->_consume_eol || $self->_eof)) {
1718 0         0 $self->_die("block comment opener must be on its own line");
1719             }
1720 2         5 while (1) {
1721 4 50       10 if ($self->_eof) {
1722 0         0 die $self->_err_at($sl, $sls, $sp, "unterminated ### block comment");
1723             }
1724 4         9 my $lb = $self->{pos};
1725 4         5 while (1) {
1726 18         34 my $c = $self->_peek;
1727 18 100 66     80 last if !defined($c) || $c eq "\n" || $c eq "\r";
      66        
1728 14         21 $self->{pos}++;
1729             }
1730 4         14 my $line_text = substr($self->{src}, $lb, $self->{pos} - $lb);
1731 4         8 my $line_end = $self->{pos};
1732 4         13 $self->_consume_eol;
1733 4         24 my $trimmed = $line_text;
1734 4         20 $trimmed =~ s/^\s+|\s+$//g;
1735 4 100       13 if ($trimmed eq $terminator) {
1736 2         11 return substr($self->{src}, $start, $line_end - $start);
1737             }
1738             }
1739             }
1740              
1741             sub parse_front_matter {
1742 53     53 0 87 my $self = shift;
1743 53         114 my $save_pos = $self->{pos};
1744 53         92 my $save_line = $self->{line};
1745 53         86 my $save_ls = $self->{line_start};
1746 53         76 my $save_pending = scalar @{$self->{pending_leading}};
  53         107  
1747 53         113 my $save_comments = scalar @{$self->{comments}};
  53         124  
1748 53         186 $self->_skip_trivia;
1749 53         146 my $rest = substr($self->{src}, $self->{pos});
1750 53 100       178 if (substr($rest, 0, 3) ne '+++') {
1751 42         81 $self->{pos} = $save_pos; $self->{line} = $save_line; $self->{line_start} = $save_ls;
  42         73  
  42         102  
1752             # Speculative skip_trivia may have captured comments — undo so
1753             # the body parser re-captures them with the right path.
1754 42         68 splice @{$self->{pending_leading}}, $save_pending;
  42         109  
1755 42         107 splice @{$self->{comments}}, $save_comments;
  42         84  
1756 42         126 return undef;
1757             }
1758             # Any trailing content on the opener line is a parse error
1759             # (SPEC §Front matter: "each `+++` must appear on its own line,
1760             # with no trailing content"). Advance past `+++` and let the
1761             # strict EOL check below diagnose.
1762 11         28 my ($ol, $ols, $op) = ($self->{line}, $self->{line_start}, $self->{pos});
1763 11         18 $self->{pos} += 3;
1764 11         28 $self->_skip_inline_ws;
1765 11 50 33     21 if (!($self->_consume_eol || $self->_eof)) {
1766 0         0 $self->_die("front matter opener must be on its own line");
1767             }
1768 11         14 my @inner;
1769 11         15 while (1) {
1770 24 100       41 if ($self->_eof) {
1771 1         3 die $self->_err_at($ol, $ols, $op, "unterminated front matter: missing closing '+++'");
1772             }
1773 23         31 my $lb = $self->{pos};
1774 23         27 while (1) {
1775 240         313 my $c = $self->_peek;
1776 240 100 66     693 last if !defined($c) || $c eq "\n" || $c eq "\r";
      66        
1777 217         277 $self->{pos}++;
1778             }
1779 23         48 my $lt = substr($self->{src}, $lb, $self->{pos} - $lb);
1780 23         32 my $trimmed = $lt; $trimmed =~ s/^\s+|\s+$//g;
  23         90  
1781 23 100       47 if ($trimmed eq '+++') { $self->_consume_eol; last; }
  10         21  
  10         14  
1782 13         24 push @inner, $lt;
1783 13 50       25 push @inner, "\n" if $self->_consume_eol;
1784             }
1785 10         27 my $inner_src = join('', @inner);
1786             my $sub = bless {
1787             src => $inner_src,
1788             len => length($inner_src),
1789             pos => 0,
1790             line => 1,
1791             line_start => 0,
1792             pending_leading => [],
1793             path => [],
1794             comments => [],
1795             original_forms => [],
1796             record_forms => 1,
1797             lite => $self->{lite},
1798             # Front-matter is always ordered, regardless of body's ignore_order.
1799             # See SPEC §"Unordered tables".
1800 10         86 ignore_order => 0,
1801             }, __PACKAGE__;
1802 10         45 my $table = $sub->parse_body_as_table;
1803 10         37 my $lite = $self->{lite};
1804 10         15 my ($meta, $meta_order);
1805 10 100       25 if ($lite) {
1806 7         8 $meta_order = [];
1807 7         10 $meta = { $ORDER_KEY => $meta_order };
1808             } else {
1809 3         9 $meta = new_table();
1810             }
1811 10     3   76 my $fm_err = sub { die $self->_err_at($ol, $ols, $op, $_[0]); };
  3         7  
1812             # Iterate $table in insertion order: lite-mode tables carry their
1813             # order list at $ORDER_KEY, tied tables yield insertion order via
1814             # `keys`.
1815 10         15 my @table_keys;
1816 10 100 66     34 if ($lite && exists $table->{$ORDER_KEY}) {
1817 7         7 @table_keys = @{ $table->{$ORDER_KEY} };
  7         13  
1818             } else {
1819 3         17 @table_keys = grep { $_ ne $ORDER_KEY } keys %$table;
  3         69  
1820             }
1821 10         22 for my $k (@table_keys) {
1822 11         25 my $v = $table->{$k};
1823 11 100       50 if ($k =~ /^_/) {
1824 4 100       6 if ($k eq '_dms_tier') {
1825 3 100       7 unless (ref($v) eq 'DMS::Parser::Integer') {
1826 1         3 $fm_err->("_dms_tier must be a non-negative integer");
1827             }
1828 2         7 my $n = int($$v);
1829 2 50       4 if ($n < 0) {
1830 0         0 $fm_err->("_dms_tier must be non-negative");
1831             }
1832 2 100       4 if ($n >= 1) {
1833 1         3 $fm_err->("_dms_tier: $n is not supported (no tier >= 1 is defined in this version of DMS)");
1834             }
1835             } else {
1836 1         3 $fm_err->("unknown reserved key: $k");
1837             }
1838             } else {
1839 7 100       14 push @$meta_order, $k if $lite;
1840 7         22 $meta->{$k} = $v;
1841             }
1842             }
1843             # Hoist sub-parser comments into ours, prefixing each path with the
1844             # sentinel string "__fm__" so callers can distinguish front-matter
1845             # comments from body comments. Comments attached to reserved
1846             # (consumed) `_dms_*` keys are dropped.
1847 7         58 for my $ac (@{$sub->{comments}}) {
  7         13  
1848 1         3 my $first = $ac->{path}[0];
1849 1   33     11 my $attached_to_reserved = (defined($first) && !ref($first) && substr($first, 0, 1) eq '_');
1850 1 50       4 if ($attached_to_reserved) {
1851             # Reserved key was consumed — re-attach as floating on FM.
1852 0         0 push @{$self->{comments}}, {
1853             comment => $ac->{comment},
1854 0         0 position => 'floating',
1855             path => ['__fm__'],
1856             };
1857 0         0 next;
1858             }
1859 1         2 my @new_path = ('__fm__', @{$ac->{path}});
  1         4  
1860 1         21 push @{$self->{comments}}, {
1861             comment => $ac->{comment},
1862             position => $ac->{position},
1863 1         2 path => \@new_path,
1864             };
1865             }
1866             # Same hoist for original_forms: `__fm__` prefix, drop entries for
1867             # consumed `_dms_*` keys.
1868 7         12 for my $pair (@{$sub->{original_forms}}) {
  7         14  
1869 0         0 my ($spath, $lit) = @$pair;
1870 0         0 my $first = $spath->[0];
1871 0 0 0     0 if (defined($first) && !ref($first) && substr($first, 0, 1) eq '_') {
      0        
1872 0         0 next;
1873             }
1874 0         0 my @new_path = ('__fm__', @$spath);
1875 0         0 push @{$self->{original_forms}}, [ \@new_path, $lit ];
  0         0  
1876             }
1877 7         80 return $meta;
1878             }
1879              
1880             sub parse_body_as_table {
1881 10     10 0 13 my $self = shift;
1882 10         20 $self->_skip_trivia;
1883 10 100       19 if ($self->_eof) {
1884             # FM body is empty or comments-only — flush pending as floating.
1885 1         3 $self->_flush_pending_as_floating;
1886 1 50       6 return $self->{lite} ? { $ORDER_KEY => [] } : new_table();
1887             }
1888 9         20 my $c = $self->_peek;
1889 9 50 33     33 if ($c eq ' ' || $c eq "\t") { $self->_die("unexpected indentation inside front matter"); }
  0         0  
1890             # SPEC §Lexical "Reserved decorator sigils": rejected before the
1891             # generic "front matter block must be a table" diagnostic so the
1892             # error message identifies the actual cause.
1893 9         24 $self->_check_reserved_sigil;
1894 9 50 33     22 if ($c eq '+' && $self->_peek_after_plus_is_space_or_eol) { $self->_die("front matter block cannot have a list root"); }
  0         0  
1895 9 50       21 if (!$self->_line_starts_kvpair) { $self->_die("front matter block must be a table"); }
  0         0  
1896 9         26 my $t = $self->parse_table_block(0);
1897 9         22 $self->_skip_trivia;
1898 9 50       23 $self->_die("trailing content inside front matter") unless $self->_eof;
1899 9         15 return $t;
1900             }
1901              
1902             sub parse_body {
1903 42     42 0 66 my $self = shift;
1904 42         117 $self->_skip_trivia;
1905 42 50       110 if ($self->_eof) {
1906             # Empty / comment-only body: pending comments float on root.
1907 0         0 $self->_flush_pending_as_floating;
1908 0 0       0 if ($self->{ignore_order}) {
1909 0         0 return new_unordered_table();
1910             }
1911 0 0       0 return $self->{lite} ? { $ORDER_KEY => [] } : new_table();
1912             }
1913 42         109 my $c = $self->_peek;
1914 42 50 33     176 if ($c eq ' ' || $c eq "\t") {
1915 0         0 $self->_die("unexpected indentation at document root");
1916             }
1917 42         72 my $result;
1918 42 100 66     162 if ($c eq '+' && $self->_peek_after_plus_is_space_or_eol) {
    50          
1919 1         7 $result = $self->parse_list_block(0);
1920 1         4 $self->_skip_trivia;
1921 1 50       4 $self->_die("trailing content after list root") unless $self->_eof;
1922             } elsif ($self->_line_starts_kvpair) {
1923 41         138 $result = $self->parse_table_block(0);
1924 41         113 $self->_skip_trivia;
1925 41 50       106 $self->_die("trailing content after table root") unless $self->_eof;
1926             } else {
1927 0         0 $result = $self->parse_inline_value_or_heredoc;
1928 0         0 $self->_consume_after_value(1);
1929 0         0 $self->_skip_trivia;
1930 0 0       0 $self->_die("scalar root cannot be followed by more content") unless $self->_eof;
1931             }
1932             # Any trivia comments seen after the body float on root.
1933 42         119 $self->_flush_pending_as_floating;
1934 42         104 return $result;
1935             }
1936              
1937             sub _peek_after_plus_is_space_or_eol {
1938 4     4   10 my $self = shift;
1939 4         44 my $nxt = $self->_peek_at(1);
1940 4 50       13 return 1 if !defined($nxt);
1941 4   0     25 return $nxt eq ' ' || $nxt eq "\t" || $nxt eq "\n" || $nxt eq "\r";
1942             }
1943              
1944             sub _line_starts_kvpair {
1945 59     59   111 my $self = shift;
1946 59         101 my $p = $self->{pos};
1947 59         116 my $s = $self->{src};
1948 59         125 my $n = $self->{len};
1949 59 50 33     381 if ($p < $n && substr($s, $p, 1) eq '"') {
    50 33        
1950 0         0 $p++;
1951 0         0 while ($p < $n) {
1952 0         0 my $ch = substr($s, $p, 1);
1953 0 0 0     0 if ($ch eq '\\') { $p += 2; }
  0 0       0  
    0          
1954 0         0 elsif ($ch eq '"') { $p++; last; }
  0         0  
1955 0         0 elsif ($ch eq "\n" || $ch eq "\r") { return 0; }
1956 0         0 else { $p++; }
1957             }
1958             } elsif ($p < $n && substr($s, $p, 1) eq "'") {
1959 0         0 $p++;
1960 0         0 while ($p < $n) {
1961 0         0 my $ch = substr($s, $p, 1);
1962 0 0 0     0 if ($ch eq "'") { $p++; last; }
  0 0       0  
  0         0  
1963 0         0 elsif ($ch eq "\n" || $ch eq "\r") { return 0; }
1964 0         0 else { $p++; }
1965             }
1966             } else {
1967 59         96 my $any = 0;
1968 59         161 while ($p < $n) {
1969 296         509 my $ch = substr($s, $p, 1);
1970 296 100       563 last if !_is_bare_key_char($ch);
1971 237         379 $p++; $any = 1;
  237         497  
1972             }
1973 59 50       158 return 0 if !$any;
1974             }
1975 59 100 66     329 return 0 if $p >= $n || substr($s, $p, 1) ne ':';
1976 50 50       127 return 1 if $p + 1 >= $n;
1977 50         99 my $nxt = substr($s, $p+1, 1);
1978 50   33     227 return $nxt eq ' ' || $nxt eq "\t" || $nxt eq "\n" || $nxt eq "\r";
1979             }
1980              
1981             sub _measure_line_indent {
1982 17     17   29 my $self = shift;
1983             # Single regex hit instead of a per-char loop. Operate directly on
1984             # $self->{src} (no copy of the 700KB+ source) — pos() set/read is
1985             # safe on a hash element.
1986 17         51 pos($self->{src}) = $self->{line_start};
1987 17 100       106 return $self->{src} =~ /\G( +)/g ? length($1) : 0;
1988             }
1989              
1990             sub parse_table_block {
1991 54     54 0 117 my ($self, $indent) = @_;
1992 54         99 my $lite = $self->{lite};
1993 54         97 my $ignore_order = $self->{ignore_order};
1994             # Lite path uses a plain (non-tied) hash with a sidecar order list
1995             # under "\0_keys" — Tie::IxHash STORE/FETCH/EXISTS dispatch overhead
1996             # is the single biggest fixed cost on flat-table parses. Non-lite
1997             # path keeps the tied hash for full Document-tree compatibility.
1998             # ignore_order path: blessed DMS::Parser::UnorderedTable plain hashref, no
1999             # order list, no tying. SPEC §"Unordered tables".
2000 54         96 my $t;
2001             my $order;
2002 54 50       159 if ($ignore_order) {
    100          
2003 0         0 $t = new_unordered_table();
2004             } elsif ($lite) {
2005 6         5 $order = [];
2006 6         12 $t = { $ORDER_KEY => $order };
2007             } else {
2008 48         115 $t = new_table();
2009             }
2010             # Hot bulk loop (lite mode, indent == 0 only). On flat-table
2011             # benchmarks, every line matches `bareword: int\n` — we can batch
2012             # them with one tight regex and avoid 50 k method-call frames per
2013             # parse. The regex anchors at the current `pos`, captures up to N
2014             # consecutive simple-line kvpairs, and stops on the first non-match
2015             # so the slow path can handle anything else (strings, floats, dates,
2016             # nested values, comments, indented blocks). Re-entered after each
2017             # slow-path iteration so a one-off slow line doesn't disable the
2018             # batch path for the rest of the file.
2019             # Bulk path: lite + (ordered with sidecar) OR lite + ignore_order
2020             # (no sidecar). Both paths use a plain hash; the only difference is
2021             # whether `$order` is updated.
2022 54 100       142 my $bulk = $lite ? 1 : 0;
2023             # File-scoped cache below (%BULK_RE_CACHE) maps indent -> compiled
2024             # regex. Persists across calls so common levels (0, 2, 4, 6) compile
2025             # once globally even across recursive parse_table_block invocations.
2026 54   66     227 my $bulk_re = $BULK_RE_CACHE{$indent} //= do {
2027 5 100       21 my $sp = $indent == 0 ? '' : "[ ]{$indent}";
2028             # Groups:
2029             # $2: positive integer
2030             # $3: negative integer
2031             # $4: bool
2032             # $5: ASCII-only basic-string content (no escapes)
2033             # $6: empty list / empty table literal
2034             # $7: decimal float (matches `-?[0-9]+\.[0-9]+`, no exponent)
2035 5         919 qr/\G$sp([A-Za-z_][A-Za-z0-9_-]*):[ ](?:(0|[1-9][0-9]{0,17})|(-[1-9][0-9]{0,17})|(true|false)|"([\x20-\x21\x23-\x5b\x5d-\x7e]*)"|(\[\]|\{\})|(-?(?:0|[1-9][0-9]*)\.[0-9]+))\r?\n/;
2036             };
2037 54         108 while (1) {
2038 154 100 66     2207 if ($bulk && $self->{pos} == $self->{line_start}) {
2039 6         8 my $src_ref = \$self->{src};
2040 6         14 pos($$src_ref) = $self->{pos};
2041 6         9 my $line = $self->{line};
2042             # One regex matches multiple value forms:
2043             # group $2: positive integer (0 or [1-9][0-9]{0,17}, 18 digits max → safe i64)
2044             # group $3: negative integer (-[1-9][0-9]{0,17})
2045             # group $4: 'true' or 'false'
2046             # group $5: simple ASCII basic-string content (no \, ", \n, \r,
2047             # and ASCII-only so we skip the NFC pass) — empty
2048             # string is common in real configs (kube values.yaml
2049             # has hundreds of `key: ""` lines).
2050             # group $6: '[]' or '{}' (empty list / empty table)
2051             # Excludes leading-zero forms (which DMS rejects) and overflow.
2052             # Accepts both LF and CRLF line endings. Anything else (escaped
2053             # strings, floats, dates, multi-line blocks, comments) falls to
2054             # the slow path on the next iteration.
2055             # Inner bulk loop: consumes any pattern we can match purely
2056             # by regex (kvpair, blank line, single-line `#` or `//`
2057             # comment) without method dispatch. Falls back to the
2058             # outer slow path the moment something complex appears
2059             # (heredoc, escaped string, list item, block comment).
2060 6         5 INNER: while (1) {
2061 14 100       76 if ($$src_ref =~ /$bulk_re/gc) {
2062 8         16 my $k = $1;
2063 8 50       24 if (exists $t->{$k}) {
2064             # Roll back to start of the current line for accurate error pos.
2065 0         0 my $cur = pos($$src_ref);
2066 0   0     0 while ($cur > 0 && substr($$src_ref, $cur - 1, 1) ne "\n") { $cur--; }
  0         0  
2067 0         0 $self->{pos} = $cur;
2068 0         0 $self->{line} = $line;
2069 0         0 $self->{line_start} = $cur;
2070 0         0 $self->_die("duplicate key: $k");
2071             }
2072 8         7 my $val;
2073 8 100       35 if (defined $2) {
    50          
    50          
    50          
    0          
2074 2         4 my $iv = 0 + $2;
2075 2         8 $val = bless \$iv, 'DMS::Parser::Integer';
2076             } elsif (defined $3) {
2077 0         0 my $iv = 0 + $3;
2078 0         0 $val = bless \$iv, 'DMS::Parser::Integer';
2079             } elsif (defined $4) {
2080 0 0       0 my $bv = $4 eq 'true' ? 1 : 0;
2081 0         0 $val = bless \$bv, 'DMS::Parser::Bool';
2082             } elsif (defined $5) {
2083             # ASCII-only basic string, no escapes / no NFC.
2084 6         8 $val = $5;
2085             } elsif (defined $6) {
2086             # '[]' or '{}'.
2087 0 0       0 $val = $6 eq '[]' ? [] : { $ORDER_KEY => [] };
2088             } else {
2089             # $7: decimal float.
2090 0         0 my $fv = 0 + $7;
2091 0         0 $val = bless \$fv, 'DMS::Parser::Float';
2092             }
2093 8 50       18 push @$order, $k if $order;
2094 8         12 $t->{$k} = $val;
2095 8         8 $line++;
2096 8         12 next INNER;
2097             }
2098             # Blank line at any leading whitespace.
2099 6 50       8 if ($$src_ref =~ /\G[ \t]*\r?\n/gc) {
2100 0         0 $line++;
2101 0         0 next INNER;
2102             }
2103             # Single-line `#` comment (NOT `###` labeled block) or `//`.
2104             # Excludes `/*` (C-style block) which spans multiple lines.
2105             # bench_realistic is 56% comments — taking these in the
2106             # bulk loop avoids hundreds of _skip_trivia method calls.
2107 6 50       8 if ($$src_ref =~ /\G[ \t]*(?:#(?!##)|\/\/(?!\*))[^\n\r]*\r?\n/gc) {
2108 0         0 $line++;
2109 0         0 next INNER;
2110             }
2111 6         7 last INNER;
2112             }
2113 6   33     12 $self->{pos} = pos($$src_ref) // $self->{pos};
2114 6         8 $self->{line} = $line;
2115 6         8 $self->{line_start} = $self->{pos};
2116             }
2117 154         491 $self->_skip_trivia;
2118 154 100       460 last if $self->{pos} >= $self->{len};
2119             # Inline _measure_line_indent: hot enough that the call cost
2120             # matters across 50k iterations. For indent==0 (most flat
2121             # tables) we can skip the regex when pos is already at
2122             # line_start with no leading space — by far the common case.
2123 100         217 my $li;
2124 100 100 66     654 if ($indent == 0 && $self->{pos} == $self->{line_start}
      66        
2125             && substr($self->{src}, $self->{pos}, 1) ne ' ') {
2126 95         157 $li = 0;
2127             } else {
2128 5         18 pos($self->{src}) = $self->{line_start};
2129 5 50       36 $li = $self->{src} =~ /\G( +)/g ? length($1) : 0;
2130             }
2131 100 50       271 last if $li < $indent;
2132 100 50       279 if ($li != $indent) {
2133 0         0 die $self->_err_at($self->{line}, $self->{line_start}, $self->{line_start}+$indent,
2134             "inconsistent indent: expected $indent spaces, got $li");
2135             }
2136 100         223 $self->{pos} = $self->{line_start} + $indent;
2137             # SPEC §Lexical "Reserved decorator sigils": reject ! @ $ % ^ & *
2138             # | ~ ` . , > < ? ; = as the first non-whitespace character of a
2139             # body line. We sit at exactly that position now (line_start +
2140             # structural indent), so a single-char check is sufficient.
2141 100         297 $self->_check_reserved_sigil;
2142 100         162 my ($k, $v);
2143 100 50       204 if ($lite) {
2144             # Inlined fast-path of parse_kvpair: skip the eval frame and
2145             # the path push/pop that parse_kvpair adds for non-lite modes.
2146 0         0 $k = $self->parse_key;
2147             $self->_die("expected ':' after key")
2148 0 0       0 if substr($self->{src}, $self->{pos}, 1) ne ':';
2149 0         0 $v = $self->_parse_kvpair_after_key($indent);
2150 0 0       0 $self->_die("duplicate key: $k") if exists $t->{$k};
2151 0 0       0 push @$order, $k if $order;
2152 0         0 $t->{$k} = $v;
2153             } else {
2154 100         295 ($k, $v) = $self->parse_kvpair($indent);
2155 100 50       603 $self->_die("duplicate key: $k") if exists $t->{$k};
2156 100         951 $t->{$k} = $v;
2157             }
2158             }
2159             # Block close: leftover pending comments float on the enclosing
2160             # container (this table itself).
2161 54 100       186 $self->_flush_pending_as_floating unless $lite;
2162 54         127 return $t;
2163             }
2164              
2165             sub parse_list_block {
2166 4     4 0 10 my ($self, $indent) = @_;
2167 4         8 my @items;
2168 4         9 while (1) {
2169 13         39 $self->_skip_trivia;
2170 13 100       32 last if $self->_eof;
2171 10         27 my $li = $self->_measure_line_indent;
2172 10 100       24 last if $li < $indent;
2173 9 50       27 if ($li != $indent) {
2174 0         0 die $self->_err_at($self->{line}, $self->{line_start}, $self->{line_start}+$indent,
2175             "inconsistent indent: expected $indent spaces, got $li");
2176             }
2177 9         20 $self->{pos} = $self->{line_start} + $indent;
2178 9 50       31 if ($self->_peek ne '+') { last; }
  0         0  
2179             # Commit to a new list item: push its index, attach pending
2180             # leading comments to it, then parse the value.
2181 9         20 my $idx = scalar @items;
2182 9         17 push @{$self->{path}}, DMS::Parser::Index->new($idx);
  9         41  
2183 9         40 $self->_flush_pending_as_leading_on_current;
2184 9         14 $self->{pos}++;
2185 9         39 my $c = $self->_peek;
2186 9         18 my $item;
2187 9         17 my $ok = eval {
2188 9 50 33     47 if (defined($c) && ($c eq ' ' || $c eq "\t")) {
    0 33        
      0        
      0        
2189 9         18 $self->{pos}++;
2190 9         35 $self->_skip_inline_ws;
2191 9         30 $self->_capture_inner_block_comments;
2192 9         20 my $c2 = $self->_peek;
2193 9 50 33     101 if (!defined($c2) || $c2 eq "\n" || $c2 eq "\r") {
      33        
2194             # "+ INNER[EOL]" — empty item with inner comments.
2195 0         0 $self->_consume_eol;
2196 0         0 $self->_skip_trivia;
2197 0 0       0 $self->_die("expected indented block after empty '+' marker") if $self->_eof;
2198 0         0 my $inner = $self->_measure_line_indent;
2199 0 0       0 $self->_die("expected indented block after empty '+' marker") if $inner <= $indent;
2200 0         0 $item = $self->parse_block_value($inner);
2201             } else {
2202 9         30 $item = $self->parse_list_item_value($indent);
2203             }
2204             } elsif (!defined($c) || $c eq "\n" || $c eq "\r") {
2205 0         0 $self->_consume_eol;
2206 0         0 $self->_skip_trivia;
2207 0 0       0 $self->_die("expected indented block after empty '+' marker") if $self->_eof;
2208 0         0 my $inner = $self->_measure_line_indent;
2209 0 0       0 $self->_die("expected indented block after empty '+' marker") if $inner <= $indent;
2210 0         0 $item = $self->parse_block_value($inner);
2211             } else {
2212 0         0 $self->_die("expected space after '+'");
2213             }
2214 9         16 1;
2215             };
2216 9         20 my $err = $@;
2217 9         15 pop @{$self->{path}};
  9         36  
2218 9 50       39 if (!$ok) { die $err; }
  0         0  
2219 9         20 push @items, $item;
2220             }
2221             # Block close: leftover pending comments float on the list itself.
2222 4         16 $self->_flush_pending_as_floating;
2223 4         15 return \@items;
2224             }
2225              
2226             sub parse_block_value {
2227 7     7 0 17 my ($self, $indent) = @_;
2228 7         17 $self->{pos} = $self->{line_start} + $indent;
2229 7 100 66     54 if ($self->_peek eq '+' && $self->_peek_after_plus_is_space_or_eol) {
2230 3         9 return $self->parse_list_block($indent);
2231             }
2232 4         50 return $self->parse_table_block($indent);
2233             }
2234              
2235             sub parse_list_item_value {
2236 9     9 0 21 my ($self, $list_indent) = @_;
2237 9 50       23 if ($self->_line_starts_kvpair) {
2238 0         0 my $key_col = $self->{pos} - $self->{line_start};
2239 0         0 my $lite = $self->{lite};
2240 0         0 my $ignore_order = $self->{ignore_order};
2241 0         0 my ($k, $v) = $self->parse_kvpair($key_col);
2242 0         0 my ($t, $order);
2243 0 0       0 if ($ignore_order) {
    0          
2244 0         0 $t = new_unordered_table();
2245 0         0 $t->{$k} = $v;
2246             } elsif ($lite) {
2247 0         0 $order = [$k];
2248 0         0 $t = { $ORDER_KEY => $order, $k => $v };
2249             } else {
2250 0         0 $t = new_table();
2251 0         0 $t->{$k} = $v;
2252             }
2253 0         0 while (1) {
2254 0         0 $self->_skip_trivia;
2255 0 0       0 last if $self->_eof;
2256 0         0 my $li = $self->_measure_line_indent;
2257 0 0       0 last if $li < $key_col;
2258 0 0       0 if ($li != $key_col) {
2259 0         0 die $self->_err_at($self->{line}, $self->{line_start}, $self->{line_start}+$key_col,
2260             "list-item table sibling key must align with first key");
2261             }
2262 0         0 $self->{pos} = $self->{line_start} + $key_col;
2263 0 0       0 $self->_die("'+' marker at sibling-key column is ambiguous") if $self->_peek eq '+';
2264 0 0       0 last if !$self->_line_starts_kvpair;
2265 0         0 my ($k2, $v2) = $self->parse_kvpair($key_col);
2266 0 0       0 $self->_die("duplicate key: $k2") if exists $t->{$k2};
2267 0 0       0 push @$order, $k2 if $order;
2268 0         0 $t->{$k2} = $v2;
2269             }
2270             # End of inline-table-in-list-item: any pending leading comments
2271             # belong to the enclosing list item itself (Floating).
2272 0         0 $self->_flush_pending_as_floating;
2273 0         0 return $t;
2274             }
2275 9         25 my $v = $self->parse_inline_value_or_heredoc;
2276 9         36 $self->_consume_after_value(0);
2277 9         22 return $v;
2278             }
2279              
2280             sub parse_kvpair {
2281 100     100 0 251 my ($self, $parent_indent) = @_;
2282 100         257 my $key = $self->parse_key;
2283 100 50       347 $self->_die("expected ':' after key") if substr($self->{src}, $self->{pos}, 1) ne ':';
2284             # Lite mode: no comment-AST, no original_form recording, no path
2285             # bookkeeping needed. Skip the breadcrumb push/pop + eval frame.
2286 100 50       329 if ($self->{lite}) {
2287 0         0 my $v = $self->_parse_kvpair_after_key($parent_indent);
2288 0         0 return ($key, $v);
2289             }
2290             # We've now committed: this is a kvpair. Push the breadcrumb so
2291             # pending leading comments attach here and so trailing comments
2292             # captured by _consume_after_value get the right path.
2293 100         153 push @{$self->{path}}, $key;
  100         271  
2294 100         293 $self->_flush_pending_as_leading_on_current;
2295 100         175 my $v;
2296 100         178 my $ok = eval { $v = $self->_parse_kvpair_after_key($parent_indent); 1 };
  100         277  
  100         230  
2297 100         192 my $err = $@;
2298 100         144 pop @{$self->{path}};
  100         210  
2299 100 50       277 if (!$ok) { die $err; }
  0         0  
2300 100         338 return ($key, $v);
2301             }
2302              
2303             sub _parse_kvpair_after_key {
2304 100     100   208 my ($self, $parent_indent) = @_;
2305 100         176 $self->{pos}++; # consume ':'
2306             # Inline _peek: per-key dispatch overhead is significant on flat-50k.
2307 100         189 my $p = $self->{pos};
2308 100 50       278 my $c = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
2309 100 100 66     449 if (defined($c) && ($c eq ' ' || $c eq "\t")) {
      33        
2310 93         161 $self->{pos}++;
2311 93         153 $p = $self->{pos};
2312             # Inline _skip_inline_ws fast path — only run the regex if the
2313             # next byte is itself ws (single-space `key: v` is the common case).
2314 93 50       298 my $c2 = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
2315 93 50 33     491 if (defined($c2) && ($c2 eq ' ' || $c2 eq "\t")) {
      33        
2316 0         0 pos($self->{src}) = $p;
2317 0         0 $self->{src} =~ /\G[ \t]+/gc;
2318 0         0 $self->{pos} = pos($self->{src});
2319 0         0 $p = $self->{pos};
2320 0 0       0 $c2 = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
2321             }
2322             # Inline _capture_inner_block_comments fast path: only call out
2323             # if the next byte is '/'.
2324 93 50 33     389 if (defined($c2) && $c2 eq '/') {
2325 0         0 $self->_capture_inner_block_comments;
2326 0         0 $p = $self->{pos};
2327 0 0       0 $c2 = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
2328             }
2329 93 50 33     470 if (!defined($c2) || $c2 eq "\n" || $c2 eq "\r") {
      33        
2330             # "key: INNER[EOL]" — child block with inner comments.
2331 0         0 $self->_consume_eol;
2332 0         0 $self->_skip_trivia;
2333 0 0       0 $self->_die("expected indented child block") if $self->_eof;
2334 0         0 my $child = $self->_measure_line_indent;
2335 0 0       0 $self->_die("expected indented child block") if $child <= $parent_indent;
2336 0         0 return $self->parse_block_value($child);
2337             }
2338 93         265 my $v = $self->parse_inline_value_or_heredoc;
2339 93         348 $self->_consume_after_value(0);
2340 93         242 return $v;
2341             }
2342 7 50 33     37 if (!defined($c) || $c eq "\n" || $c eq "\r") {
      33        
2343 7         25 $self->_consume_eol;
2344 7         21 $self->_skip_trivia;
2345 7 50       20 $self->_die("expected indented child block") if $self->_eof;
2346 7         23 my $child = $self->_measure_line_indent;
2347 7 50       20 $self->_die("expected indented child block") if $child <= $parent_indent;
2348 7         29 return $self->parse_block_value($child);
2349             }
2350 0         0 $self->_die("expected whitespace after ':'");
2351             }
2352              
2353             sub parse_key {
2354 102     102 0 169 my $self = shift;
2355 102         189 my $p = $self->{pos};
2356 102 50       287 my $c = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
2357 102 50 33     409 if (defined($c) && $c eq '"') {
2358 0 0       0 $self->_die("triple-quoted strings are not allowed as keys") if $self->_starts_with('"""');
2359             # Suppress original-form recording: keys are not values and must
2360             # not generate OriginalLiteral entries on the parent path.
2361 0         0 my $saved = $self->{record_forms};
2362 0         0 $self->{record_forms} = 0;
2363 0         0 my $r = eval { $self->parse_basic_string_value };
  0         0  
2364 0         0 my $e = $@;
2365 0         0 $self->{record_forms} = $saved;
2366 0 0       0 die $e if $e;
2367 0         0 return $r;
2368             }
2369 102 50 33     395 if (defined($c) && $c eq "'") {
2370 0 0       0 $self->_die("triple-quoted strings are not allowed as keys") if $self->_starts_with("'''");
2371 0         0 my $saved = $self->{record_forms};
2372 0         0 $self->{record_forms} = 0;
2373 0         0 my $r = eval { $self->parse_literal_string_value };
  0         0  
2374 0         0 my $e = $@;
2375 0         0 $self->{record_forms} = $saved;
2376 0 0       0 die $e if $e;
2377 0         0 return $r;
2378             }
2379 102 50       222 $self->_die("expected key") if !defined($c);
2380 102         267 return $self->parse_bare_key;
2381             }
2382              
2383             sub parse_bare_key {
2384 102     102 0 168 my $self = shift;
2385 102         213 my $start = $self->{pos};
2386             # Fast path: ASCII-only bare key. One regex bite handles the entire
2387             # token — no per-byte loop, no Unicode codepoint test.
2388 102         358 pos($self->{src}) = $start;
2389 102 50       500 if ($self->{src} =~ /\G[A-Za-z0-9_-]+/gc) {
2390 102         242 $self->{pos} = pos($self->{src});
2391             }
2392             # Slow path: extend across non-ASCII XID_Continue codepoints if any.
2393 102         277 while ($self->{pos} < $self->{len}) {
2394 102         277 my $c = substr($self->{src}, $self->{pos}, 1);
2395 102 50       275 last if ord($c) < 128;
2396 0 0       0 last unless _is_bare_key_char($c);
2397 0         0 $self->{pos}++;
2398 0         0 pos($self->{src}) = $self->{pos};
2399 0 0       0 if ($self->{src} =~ /\G[A-Za-z0-9_-]+/gc) {
2400 0         0 $self->{pos} = pos($self->{src});
2401             }
2402             }
2403 102 50       242 $self->_die("expected key") if $self->{pos} == $start;
2404 102         356 return substr($self->{src}, $start, $self->{pos} - $start);
2405             }
2406              
2407             sub _capture_inner_block_comments {
2408 9     9   13 my $self = shift;
2409             # Hot-path early-out: if the next byte isn't '/', there's nothing to do.
2410             # Per-key cost is one substr instead of two method calls.
2411 9         17 my $p = $self->{pos};
2412             return if $p >= $self->{len} || substr($self->{src}, $p, 1) ne '/'
2413 9 50 33     94 || substr($self->{src}, $p, 2) ne '/*';
      33        
2414 0         0 while (1) {
2415 0         0 $p = $self->{pos};
2416 0 0 0     0 last if $p >= $self->{len} || substr($self->{src}, $p, 2) ne '/*';
2417 0         0 my $raw = $self->_read_c_block_comment;
2418 0         0 push @{$self->{comments}}, {
2419             comment => { content => $raw, kind => 'block' },
2420             position => 'inner',
2421 0         0 path => [@{$self->{path}}],
2422 0 0       0 } unless $self->{lite};
2423 0         0 $self->_skip_inline_ws;
2424             }
2425             }
2426              
2427             sub parse_inline_value_or_heredoc {
2428 113     113 0 194 my $self = shift;
2429             # Inner /* ... */ comments are captured by the caller via
2430             # _capture_inner_block_comments before this function runs.
2431 113         203 my $p = $self->{pos};
2432 113 50       292 my $c = $p >= $self->{len} ? undef : substr($self->{src}, $p, 1);
2433             # SPEC §Lexical "Reserved decorator sigils": a value position that
2434             # leads with one of `! @ $ % ^ & * | ~ \` . , > < ? ; =` is rejected
2435             # at tier 0. Covers `key: !tag`, `+ !tag`, scalar-root `!tag`, and
2436             # the same forms inside flow containers (which dispatch per item).
2437 113 50 33     471 if (defined($c) && exists $RESERVED_DECORATOR_SIGIL{$c}) {
2438 0         0 $self->_die(
2439             "'$c' is a reserved decorator sigil at line-start (tier 0)"
2440             );
2441             }
2442 113 50       232 if (defined($c)) {
2443             # Hot path: ASCII digit (the most common leaf). Dispatch
2444             # straight to parse_number_or_datetime without hitting the
2445             # other typed-leaf branches.
2446 113 100 100     429 if ($c ge '0' && $c le '9') {
2447 70         185 return $self->parse_number_or_datetime;
2448             }
2449 43 100       147 if ($c eq '"') {
2450 26 100       81 return $self->parse_heredoc_basic if $self->_starts_with('"""');
2451             # Basic is the emitter's default for strings — no record.
2452 15         57 return $self->parse_basic_string_value;
2453             }
2454 17 100       40 if ($c eq "'") {
2455 9 100       22 return $self->parse_heredoc_literal if $self->_starts_with("'''");
2456 3         12 my $r = $self->parse_literal_string_value;
2457 3         42 $self->_record_form({ string_form => { kind => 'literal' } });
2458 3         10 return $r;
2459             }
2460 8 100       23 return $self->parse_flow_array if $c eq '[';
2461 7 100       33 return $self->parse_flow_table if $c eq '{';
2462 6 50 33     37 return $self->parse_bool_value if $c eq 't' || $c eq 'f';
2463 6 50       15 return $self->parse_inf_value if $c eq 'i';
2464 6 50       15 return $self->parse_nan_value if $c eq 'n';
2465 6 50 66     27 return $self->parse_number_or_datetime if $c eq '+' || $c eq '-';
2466             }
2467 0 0       0 $self->_die("expected value") if !defined($c);
2468 0         0 $self->_die("unexpected character '$c' in value");
2469             }
2470              
2471             sub parse_bool_value {
2472 0     0 0 0 my $self = shift;
2473 0 0 0     0 if ($self->_starts_with("true") && _is_value_terminator($self->_peek_at(4))) {
2474 0         0 $self->{pos} += 4;
2475 0         0 return DMS::Parser::Bool->new(1);
2476             }
2477 0 0 0     0 if ($self->_starts_with("false") && _is_value_terminator($self->_peek_at(5))) {
2478 0         0 $self->{pos} += 5;
2479 0         0 return DMS::Parser::Bool->new(0);
2480             }
2481 0         0 $self->_die("expected value");
2482             }
2483              
2484             sub parse_inf_value {
2485 0     0 0 0 my $self = shift;
2486 0 0 0     0 if ($self->_starts_with("inf") && _is_value_terminator($self->_peek_at(3))) {
2487 0         0 $self->{pos} += 3;
2488 0         0 return DMS::Parser::Float->new(9**9**9);
2489             }
2490 0         0 $self->_die("expected 'inf'");
2491             }
2492              
2493             sub parse_nan_value {
2494 0     0 0 0 my $self = shift;
2495 0 0 0     0 if ($self->_starts_with("nan") && _is_value_terminator($self->_peek_at(3))) {
2496 0         0 $self->{pos} += 3;
2497 0         0 return DMS::Parser::Float->new(-(9**9**9) + (9**9**9));
2498             }
2499 0         0 $self->_die("expected 'nan'");
2500             }
2501              
2502             sub parse_number_or_datetime {
2503 76     76 0 116 my $self = shift;
2504 76         155 my $base = $self->{pos};
2505 76         134 my $len_src = $self->{len};
2506             # Combined hot-path: a plain decimal integer that doesn't look like
2507             # a date or time. One regex bite scans the token; a structural
2508             # check rejects anything _parse_integer would die on. This single
2509             # branch is taken by every leaf in flat-data benchmarks.
2510 76         281 pos($self->{src}) = $base;
2511 76 100       432 if ($self->{src} =~ /\G(0|[1-9][0-9]{0,17})(?![\d_.eExob:-])/g) {
2512 52         155 my $tok = $1;
2513 52         108 $self->{pos} = $base + length($tok);
2514 52         138 my $iv = 0 + $tok;
2515 52         210 return bless \$iv, 'DMS::Parser::Integer';
2516             }
2517             # Inline first-char peek: avoid the function-call + substr in _peek.
2518 24         56 my $first = substr($self->{src}, $base, 1);
2519 24   100     81 my $starts_sign = ($first eq '+' || $first eq '-');
2520 24 100       61 if (!$starts_sign) {
2521             # Combined date/time-prefix sniff, regex-only. Skips _looks_like_*
2522             # which themselves do another substr+regex.
2523 18         40 pos($self->{src}) = $base;
2524 18 50 66     86 if ($len_src - $base >= 10 && $self->{src} =~ /\G\d\d\d\d-\d\d-\d\d/) {
2525 0         0 return $self->parse_datetime_value;
2526             }
2527 18 50 66     74 if ($len_src - $base >= 8 && $self->{src} =~ /\G\d\d:\d\d:\d\d/) {
2528 0         0 return $self->parse_local_time_value;
2529             }
2530             }
2531 24 50 66     77 if ($starts_sign && substr($self->{src}, $base + 1, 3) eq 'inf') {
2532 0         0 my $p4 = $base + 4;
2533 0 0       0 my $next = $p4 >= $len_src ? undef : substr($self->{src}, $p4, 1);
2534 0 0       0 if (_is_value_terminator($next)) {
2535 0         0 my $neg = $first eq '-';
2536 0         0 $self->{pos} += 4;
2537 0 0       0 return DMS::Parser::Float->new($neg ? -(9**9**9) : 9**9**9);
2538             }
2539             }
2540 24         61 my ($len, $is_float) = $self->_scan_number_token;
2541 24         55 my $s = substr($self->{src}, $base, $len);
2542 24 50       88 if ($is_float) {
2543 0         0 my $f;
2544 0         0 eval { $f = _parse_float_val($s); };
  0         0  
2545 0 0       0 $self->_die("invalid float: $s ($@)") if $@;
2546 0         0 $self->{pos} += $len;
2547 0         0 return DMS::Parser::Float->new($f);
2548             }
2549             # Fast path: plain decimal (no sign, no leading-0, fits in 18 digits)
2550             # never dies — skip the eval frame entirely. The validating regex
2551             # rejects anything _parse_integer would error on. Bless inline to
2552             # avoid a method call per leaf.
2553 24 50       91 if ($s =~ /\A(?:0|[1-9][0-9]{0,17})\z/) {
2554 0         0 $self->{pos} += $len;
2555 0         0 my $iv = 0 + $s;
2556 0         0 return bless \$iv, 'DMS::Parser::Integer';
2557             }
2558 24         36 my $n;
2559 24         46 eval { $n = _parse_integer($s); };
  24         77  
2560 24 50       79 if ($@) { my $msg = $@; $msg =~ s/\n.*//s; $self->_die($msg); }
  0         0  
  0         0  
  0         0  
2561 24         51 $self->{pos} += $len;
2562             # Record original lexeme when it differs from the canonical
2563             # "decimal, no underscores, no '+' sign" form the default emitter
2564             # would produce. Hex/oct/bin, underscores, explicit '+' → recorded.
2565             # Skipped entirely in lite mode (record_forms gate inside _record_form).
2566 24 100 66     96 if (!$self->{lite} && $s ne $n) {
2567 21         110 $self->_record_form({ integer_lit => $s });
2568             }
2569 24         87 return DMS::Parser::Integer->new($n);
2570             }
2571              
2572             sub _scan_number_token {
2573 24     24   38 my $self = shift;
2574 24         45 my $base = $self->{pos};
2575 24         59 pos($self->{src}) = $base;
2576             # Fast path: plain decimal integer (no sign, no '.' / 'e' / 'x' /
2577             # 'o' / 'b' prefix continuation). Most leaves in flat data are like
2578             # this — skip the heavy alternation and avoid the post-scan token
2579             # re-classification. The negative lookahead must reject anything
2580             # that would extend the token under the slow grammar (digits, '_',
2581             # '.', 'e'/'E', and the 0x/0o/0b prefix indicators).
2582 24 50       91 if ($self->{src} =~ /\G(\d+)(?![\d_.eExob])/g) {
2583 0         0 return (length($1), 0);
2584             }
2585             # One regex covers both prefixed (0x/0o/0b) and decimal forms. Behavior
2586             # mirrors the per-char loop: the prefix branch accepts hex digits +
2587             # underscores with at most one '.' and one 'p[+-]?' exponent; the
2588             # decimal branch accepts digits + underscores with at most one '.' and
2589             # one 'e[+-]?' exponent.
2590 24         53 pos($self->{src}) = $base;
2591 24         104 $self->{src} =~ m{
2592             \G [+-]?
2593             (?:
2594             0[xob] [0-9a-fA-F_]* (?: \. [0-9a-fA-F_]* )? (?: p [+-]? \d* )?
2595             | [\d_]* (?: \. [\d_]* )? (?: [eE] [+-]? \d* )?
2596             )
2597             }gxc;
2598 24         54 my $end = pos($self->{src});
2599 24 50       58 $end = $base unless defined $end;
2600 24         47 my $len = $end - $base;
2601 24 50       62 return (0, 0) if $len == 0;
2602 24         56 my $tok = substr($self->{src}, $base, $len);
2603             # In prefixed (0x/0o/0b) tokens 'e' is a hex digit, not an exponent —
2604             # only '.' or 'p' make the token a float there.
2605 24 100       95 if ($tok =~ /^[+-]?0[xob]/) {
2606 13 50       58 return ($len, $tok =~ /[.p]/ ? 1 : 0);
2607             }
2608 11 50       49 return ($len, $tok =~ /[.eE]/ ? 1 : 0);
2609             }
2610              
2611             sub _valid_underscores {
2612 0     0   0 my ($s) = @_;
2613 0 0       0 return 1 if length($s) == 0;
2614 0 0 0     0 return 0 if substr($s,0,1) eq '_' || substr($s,-1,1) eq '_';
2615 0         0 my $prev = 0;
2616 0         0 for my $c (split //, $s) {
2617 0 0       0 if ($c eq '_') { return 0 if $prev; $prev = 1; }
  0 0       0  
  0         0  
2618 0         0 else { $prev = 0; }
2619             }
2620 0         0 return 1;
2621             }
2622              
2623             # Returns the canonical decimal string for an DMS integer literal. The hot
2624             # path (decimal, no underscores, fits in i64) avoids Math::BigInt entirely
2625             # because BigInt construction/arithmetic dominates the parser otherwise.
2626             # Errors mirror the old Math::BigInt-based implementation byte-for-byte.
2627             sub _parse_integer {
2628 24     24   46 my ($s) = @_;
2629             # Hot path: a plain decimal token (no sign, no leading zero, no
2630             # underscore, fits in 18 chars i.e. always within i64). Covers the
2631             # vast majority of real-world keys/leaves and skips ~10 substr +
2632             # length checks below.
2633 24 50       74 if ($s =~ /\A(?:0|[1-9][0-9]{0,17})\z/) {
2634 0         0 return $s;
2635             }
2636 24         46 my $sign_str = '';
2637 24         37 my $is_neg = 0;
2638 24         49 my $rest = $s;
2639 24         51 my $first = substr($rest, 0, 1);
2640 24 100       71 if ($first eq '-') { $sign_str = '-'; $is_neg = 1; $rest = substr($rest, 1); }
  3 100       6  
  3         6  
  3         7  
2641 3         6 elsif ($first eq '+') { $rest = substr($rest, 1); }
2642 24 50       61 die "hex prefix must be lowercase '0x'\n" if substr($rest, 0, 2) eq '0X';
2643 24         37 my $radix = 10;
2644 24         38 my $body = $rest;
2645 24 100       86 if (length($rest) >= 2) {
2646 21         41 my $p2 = substr($rest, 0, 2);
2647 21 100       64 if ($p2 eq '0x') { $radix = 16; $body = substr($rest, 2); }
  7 100       38  
  7 100       18  
2648 3         6 elsif ($p2 eq '0o') { $radix = 8; $body = substr($rest, 2); }
  3         7  
2649 3         7 elsif ($p2 eq '0b') { $radix = 2; $body = substr($rest, 2); }
  3         7  
2650             }
2651 24 50       68 die "empty number\n" if length($body) == 0;
2652 24 50 33     111 die "underscore must be between digits\n"
2653             if substr($body, 0, 1) eq '_' || substr($body, -1, 1) eq '_';
2654 24 50 100     108 if ($radix == 10 && length($rest) > 1 && substr($rest, 0, 1) eq '0') {
      66        
2655 0         0 die "leading zeros are not allowed on decimal integers\n";
2656             }
2657 24 50       61 die "underscore must be between digits\n" if index($body, '__') >= 0;
2658 24         43 my $clean = $body;
2659 24 100       67 $clean =~ tr/_//d if index($clean, '_') >= 0;
2660 24 100       82 if ($radix == 10) {
    100          
    100          
2661 11 50       30 die "invalid digit for base 10\n" if $clean =~ /\D/;
2662             } elsif ($radix == 16) {
2663 7 50       29 die "invalid digit for base 16\n" if $clean =~ /[^0-9a-fA-F]/;
2664             } elsif ($radix == 8) {
2665 3 50       12 die "invalid digit for base 8\n" if $clean =~ /[^0-7]/;
2666             } else {
2667 3 50       11 die "invalid digit for base 2\n" if $clean =~ /[^01]/;
2668             }
2669              
2670 24 100       57 if ($radix == 10) {
2671             # Strip leading zeros (keep at least one digit).
2672 11         25 $clean =~ s/^0+(?=\d)//;
2673 11 100       26 my $bound = $is_neg ? '9223372036854775808' : '9223372036854775807';
2674 11 50 33     49 die "integer out of i64 range\n"
      33        
2675             if length($clean) > length($bound)
2676             || (length($clean) == length($bound) && $clean gt $bound);
2677 11 50       26 return '0' if $clean eq '0';
2678 11         42 return "${sign_str}${clean}";
2679             }
2680              
2681             # Non-decimal: cap by max digit count for i64 magnitude.
2682 13 50 66     92 if ($radix == 16 && length($clean) > 16) { die "integer out of i64 range\n"; }
  0 50 66     0  
    50 66        
2683 0         0 elsif ($radix == 8 && length($clean) > 22) { die "integer out of i64 range\n"; }
2684 0         0 elsif ($radix == 2 && length($clean) > 64) { die "integer out of i64 range\n"; }
2685              
2686 13         27 my $val_lc = lc $clean;
2687             # Detect magnitudes in the high half (>= 2^63) that don't fit a signed
2688             # 64-bit positive value. Allowed only as the exact i64 minimum: -2^63.
2689 13         23 my $high_half = 0;
2690 13 100 33     35 if ($radix == 16) { $high_half = length($val_lc) == 16 && substr($val_lc, 0, 1) ge '8'; }
  7 100       22  
2691 3   33     9 elsif ($radix == 8) { $high_half = length($val_lc) == 22 && substr($val_lc, 0, 1) gt '0'; }
2692 3   33     49 else { $high_half = length($val_lc) == 64 && substr($val_lc, 0, 1) eq '1'; }
2693 13 50       31 if ($high_half) {
2694 0 0       0 if ($is_neg) {
2695 0 0 0     0 return '-9223372036854775808'
      0        
      0        
      0        
      0        
2696             if ($radix == 16 && $val_lc eq '8000000000000000')
2697             || ($radix == 8 && $val_lc eq '1000000000000000000000')
2698             || ($radix == 2 && $val_lc eq '1' . ('0' x 63));
2699             }
2700 0         0 die "integer out of i64 range\n";
2701             }
2702              
2703 13         22 my $native;
2704 13 100       40 if ($radix == 16) { $native = hex($val_lc); }
  7 100       15  
2705 3         10 elsif ($radix == 8) { $native = oct("0$val_lc"); }
2706 3         7 else { $native = oct("0b$val_lc"); }
2707 13 50       32 return '0' if $native == 0;
2708 13         59 return "${sign_str}$native";
2709             }
2710              
2711             sub _parse_float_val {
2712 0     0   0 my ($s) = @_;
2713 0         0 my $sign = 1.0;
2714 0         0 my $rest = $s;
2715 0 0       0 if (substr($rest,0,1) eq '-') { $sign = -1.0; $rest = substr($rest, 1); }
  0 0       0  
  0         0  
2716 0         0 elsif (substr($rest,0,1) eq '+') { $rest = substr($rest, 1); }
2717 0 0 0     0 if (substr($rest,0,2) eq '0x' || substr($rest,0,2) eq '0o' || substr($rest,0,2) eq '0b') {
      0        
2718 0         0 return $sign * _parse_nondec_float($rest);
2719             }
2720 0         0 return $sign * _parse_dec_float($rest);
2721             }
2722              
2723             sub _parse_dec_float {
2724 0     0   0 my ($s) = @_;
2725 0         0 my $e_idx = -1;
2726 0         0 for (my $i = 0; $i < length($s); $i++) {
2727 0         0 my $c = substr($s, $i, 1);
2728 0 0 0     0 if ($c eq 'e' || $c eq 'E') { $e_idx = $i; last; }
  0         0  
  0         0  
2729             }
2730 0 0       0 my $m = $e_idx == -1 ? $s : substr($s, 0, $e_idx);
2731 0 0       0 my $e = $e_idx == -1 ? undef : substr($s, $e_idx+1);
2732 0 0       0 die "decimal float requires '.'\n" if index($m, '.') < 0;
2733 0         0 my @parts = split /\./, $m, 2;
2734 0 0 0     0 die "decimal float requires digit on both sides of '.'\n"
      0        
2735             if @parts != 2 || $parts[0] eq '' || $parts[1] eq '';
2736 0 0       0 die "invalid character in mantissa\n" if $parts[0] =~ /[^\d_]/;
2737 0 0       0 die "invalid character in mantissa\n" if $parts[1] =~ /[^\d_]/;
2738 0 0 0     0 die "bad underscore in mantissa\n"
2739             unless _valid_underscores($parts[0]) && _valid_underscores($parts[1]);
2740 0         0 my $full = $parts[0]; $full =~ s/_//g; $full .= '.';
  0         0  
  0         0  
2741 0         0 my $frac = $parts[1]; $frac =~ s/_//g; $full .= $frac;
  0         0  
  0         0  
2742 0 0       0 if (defined $e) {
2743 0         0 my $e_clean = $e; $e_clean =~ s/^[+-]//;
  0         0  
2744 0 0       0 die "underscore not allowed in exponent\n" if $e_clean =~ /_/;
2745 0 0       0 die "invalid character in exponent\n" if $e =~ /[^\d+-]/;
2746 0 0       0 die "empty exponent\n" if $e_clean eq '';
2747 0         0 $full .= 'e' . $e;
2748             }
2749 0         0 return 0 + $full;
2750             }
2751              
2752             sub _parse_nondec_float {
2753 0     0   0 my ($s) = @_;
2754 0         0 my ($radix, $rest);
2755 0 0       0 if (substr($s,0,2) eq '0x') { $radix = 16; $rest = substr($s,2); }
  0 0       0  
  0 0       0  
2756 0         0 elsif (substr($s,0,2) eq '0o') { $radix = 8; $rest = substr($s,2); }
  0         0  
2757 0         0 elsif (substr($s,0,2) eq '0b') { $radix = 2; $rest = substr($s,2); }
  0         0  
2758 0         0 else { die "non-decimal float prefix required\n"; }
2759 0         0 my $p_idx = index($rest, 'p');
2760 0 0       0 die "non-decimal float requires 'p' exponent\n" if $p_idx < 0;
2761 0         0 my $mant = substr($rest, 0, $p_idx);
2762 0         0 my $exp_str = substr($rest, $p_idx+1);
2763 0 0       0 die "empty exponent\n" if $exp_str eq '';
2764 0 0       0 die "underscore not allowed in exponent\n" if $exp_str =~ /_/;
2765 0 0       0 die "invalid exponent character\n" if $exp_str =~ /[^\d+-]/;
2766 0         0 my $exp = int($exp_str);
2767 0         0 my ($ip, $fp);
2768 0 0       0 if (index($mant, '.') >= 0) {
2769 0         0 ($ip, $fp) = split /\./, $mant, 2;
2770 0 0 0     0 die "digit required on both sides of '.'\n" if $ip eq '' || $fp eq '';
2771             } else {
2772 0         0 $ip = $mant; $fp = '';
  0         0  
2773             }
2774 0 0 0     0 die "bad underscore\n" unless _valid_underscores($ip) && _valid_underscores($fp);
2775 0         0 $ip =~ s/_//g; $fp =~ s/_//g;
  0         0  
2776 0         0 my $digit_chars = substr("0123456789abcdef", 0, $radix);
2777 0         0 for my $c (split //, $ip) {
2778 0 0       0 die "invalid digit for base $radix\n" if index($digit_chars, lc $c) < 0;
2779             }
2780 0         0 for my $c (split //, $fp) {
2781 0 0       0 die "invalid digit for base $radix\n" if index($digit_chars, lc $c) < 0;
2782             }
2783 0 0       0 my $int_val = $ip eq '' ? 0 : hex_to_int($ip, $radix);
2784 0         0 my $frac_val = 0.0;
2785 0         0 my $div = $radix * 1.0;
2786 0         0 for my $c (split //, $fp) {
2787 0         0 my $d = index("0123456789abcdef", lc $c);
2788 0         0 $frac_val += $d / $div;
2789 0         0 $div *= $radix;
2790             }
2791 0         0 return ($int_val + $frac_val) * (2 ** $exp);
2792             }
2793              
2794             sub hex_to_int {
2795 0     0 0 0 my ($s, $radix) = @_;
2796 0         0 my $v = 0;
2797 0         0 for my $c (split //, $s) {
2798 0         0 $v = $v * $radix + index("0123456789abcdef", lc $c);
2799             }
2800 0         0 return $v;
2801             }
2802              
2803             sub _days_in_month {
2804 0     0   0 my ($y, $m) = @_;
2805 0 0 0     0 return 31 if $m == 1 || $m == 3 || $m == 5 || $m == 7 || $m == 8 || $m == 10 || $m == 12;
      0        
      0        
      0        
      0        
      0        
2806 0 0 0     0 return 30 if $m == 4 || $m == 6 || $m == 9 || $m == 11;
      0        
      0        
2807 0 0       0 if ($m == 2) {
2808 0 0 0     0 return ((($y % 4 == 0) && ($y % 100 != 0)) || ($y % 400 == 0)) ? 29 : 28;
2809             }
2810 0         0 return 0;
2811             }
2812              
2813             sub _validate_date {
2814 0     0   0 my ($s) = @_;
2815 0 0 0     0 die "invalid date format\n"
      0        
2816             if length($s) != 10 || substr($s,4,1) ne '-' || substr($s,7,1) ne '-';
2817 0         0 for my $i (0,1,2,3,5,6,8,9) {
2818 0 0       0 die "date must be all digits\n" if substr($s,$i,1) !~ /\d/;
2819             }
2820 0         0 my $y = int(substr($s,0,4));
2821 0         0 my $mo = int(substr($s,5,2));
2822 0         0 my $d = int(substr($s,8,2));
2823 0 0 0     0 die "month out of range\n" if $mo < 1 || $mo > 12;
2824 0 0 0     0 die "day out of range\n" if $d < 1 || $d > _days_in_month($y, $mo);
2825             }
2826              
2827             sub _validate_time {
2828 0     0   0 my ($s) = @_;
2829 0 0 0     0 die "invalid time format\n"
      0        
2830             if length($s) != 8 || substr($s,2,1) ne ':' || substr($s,5,1) ne ':';
2831 0         0 for my $i (0,1,3,4,6,7) {
2832 0 0       0 die "time must be all digits\n" if substr($s,$i,1) !~ /\d/;
2833             }
2834 0         0 my $h = int(substr($s,0,2));
2835 0         0 my $m = int(substr($s,3,2));
2836 0         0 my $sec = int(substr($s,6,2));
2837 0 0       0 die "hour out of range\n" if $h > 23;
2838 0 0       0 die "minute out of range\n" if $m > 59;
2839 0 0       0 die "second out of range (leap seconds not supported)\n" if $sec > 59;
2840             }
2841              
2842             sub parse_datetime_value {
2843 0     0 0 0 my $self = shift;
2844 0         0 my $rest = substr($self->{src}, $self->{pos});
2845 0         0 my $date = substr($rest, 0, 10);
2846 0         0 eval { _validate_date($date); };
  0         0  
2847 0 0       0 if ($@) { my $msg = $@; chomp $msg; $self->_die($msg); }
  0         0  
  0         0  
  0         0  
2848 0         0 my $rest2 = substr($rest, 10);
2849 0 0 0     0 if (substr($rest2,0,1) ne 'T' && substr($rest2,0,1) ne ' ') {
2850 0 0       0 if (substr($rest2,0,1) eq 't') {
2851 0         0 $self->_die("date and time separator must be uppercase 'T' (lowercase 't' not permitted)");
2852             }
2853 0         0 my $after = substr($rest2,0,1);
2854 0 0       0 $after = undef if $after eq '';
2855 0 0       0 $self->_die("invalid character after date") unless _is_value_terminator($after);
2856 0         0 $self->{pos} += 10;
2857 0         0 return DMS::Parser::LocalDate->new($date);
2858             }
2859 0 0       0 if (substr($rest2,0,1) eq ' ') {
2860 0         0 (my $after_ws = $rest2) =~ s/\A[ \t]+//;
2861 0 0 0     0 if (length($after_ws) > 0 && substr($after_ws, 0, 1) =~ /\d/) {
2862 0         0 $self->_die("date and time must be separated by 'T' (space not permitted)");
2863             }
2864 0         0 $self->{pos} += 10;
2865 0         0 return DMS::Parser::LocalDate->new($date);
2866             }
2867 0         0 my $after_t = substr($rest2, 1);
2868 0 0       0 $self->_die("expected HH:MM:SS after 'T'") unless _looks_like_time_prefix($after_t);
2869 0         0 my $time_str = substr($after_t, 0, 8);
2870 0         0 eval { _validate_time($time_str); };
  0         0  
2871 0 0       0 if ($@) { my $msg = $@; chomp $msg; $self->_die($msg); }
  0         0  
  0         0  
  0         0  
2872 0         0 my $consumed = 10 + 1 + 8;
2873 0         0 my $after_time = substr($rest, $consumed);
2874 0         0 my $frac_len = 0;
2875 0 0       0 if (substr($after_time,0,1) eq '.') {
2876 0         0 my $k = 1;
2877 0   0     0 while ($k < length($after_time) && substr($after_time,$k,1) =~ /\d/) { $k++; }
  0         0  
2878 0         0 my $digits = $k - 1;
2879 0 0       0 $self->_die("expected fractional digits after '.'") if $digits == 0;
2880 0 0       0 $self->_die("fractional seconds limited to 9 digits (nanosecond precision)") if $digits > 9;
2881 0         0 $frac_len = $k;
2882             }
2883 0         0 $consumed += $frac_len;
2884 0         0 my $after_frac = substr($rest, $consumed);
2885 0 0 0     0 if (substr($after_frac,0,1) eq 'Z' || substr($after_frac,0,1) eq 'z') {
2886 0         0 $consumed += 1;
2887 0         0 my $s = substr($rest, 0, $consumed);
2888 0         0 $self->{pos} += $consumed;
2889 0         0 return DMS::Parser::OffsetDateTime->new($s);
2890             }
2891 0 0 0     0 if (substr($after_frac,0,1) eq '+' || substr($after_frac,0,1) eq '-') {
2892 0 0 0     0 if (length($after_frac) < 6
      0        
      0        
      0        
      0        
2893             || substr($after_frac,1,1) !~ /\d/ || substr($after_frac,2,1) !~ /\d/
2894             || substr($after_frac,3,1) ne ':'
2895             || substr($after_frac,4,1) !~ /\d/ || substr($after_frac,5,1) !~ /\d/) {
2896 0         0 $self->_die("invalid offset; expected ±HH:MM");
2897             }
2898 0         0 my $oh = int(substr($after_frac,1,2));
2899 0         0 my $om = int(substr($after_frac,4,2));
2900 0 0 0     0 $self->_die("offset out of range") if $oh > 23 || $om > 59;
2901 0         0 $consumed += 6;
2902 0         0 my $s = substr($rest, 0, $consumed);
2903 0         0 $self->{pos} += $consumed;
2904 0         0 return DMS::Parser::OffsetDateTime->new($s);
2905             }
2906 0         0 my $after = substr($after_frac,0,1);
2907 0 0       0 $after = undef if $after eq '';
2908 0 0       0 $self->_die("invalid character after datetime") unless _is_value_terminator($after);
2909 0         0 my $s = substr($rest, 0, $consumed);
2910 0         0 $self->{pos} += $consumed;
2911 0         0 return DMS::Parser::LocalDateTime->new($s);
2912             }
2913              
2914             sub parse_local_time_value {
2915 0     0 0 0 my $self = shift;
2916 0         0 my $rest = substr($self->{src}, $self->{pos});
2917 0         0 my $time_str = substr($rest, 0, 8);
2918 0         0 eval { _validate_time($time_str); };
  0         0  
2919 0 0       0 if ($@) { my $msg = $@; chomp $msg; $self->_die($msg); }
  0         0  
  0         0  
  0         0  
2920 0         0 my $consumed = 8;
2921 0         0 my $after = substr($rest, $consumed);
2922 0 0       0 if (substr($after,0,1) eq '.') {
2923 0         0 my $k = 1;
2924 0   0     0 while ($k < length($after) && substr($after,$k,1) =~ /\d/) { $k++; }
  0         0  
2925 0         0 my $digits = $k - 1;
2926 0 0       0 $self->_die("expected fractional digits after '.'") if $digits == 0;
2927 0 0       0 $self->_die("fractional seconds limited to 9 digits") if $digits > 9;
2928 0         0 $consumed += $k;
2929             }
2930 0         0 my $after2 = substr($rest, $consumed);
2931 0         0 my $nxt = substr($after2,0,1);
2932 0 0       0 $nxt = undef if $nxt eq '';
2933 0 0       0 $self->_die("invalid character after time") unless _is_value_terminator($nxt);
2934 0         0 my $s = substr($rest, 0, $consumed);
2935 0         0 $self->{pos} += $consumed;
2936 0         0 return DMS::Parser::LocalTime->new($s);
2937             }
2938              
2939             sub parse_basic_string_value {
2940 15     15 0 29 my $self = shift;
2941 15         51 my ($sl, $sls, $sp) = ($self->{line}, $self->{line_start}, $self->{pos});
2942 15         29 $self->{pos}++;
2943 15         28 my $out = '';
2944 15         32 my $src = $self->{src};
2945 15         29 my $len = $self->{len};
2946 15         51 pos($src) = $self->{pos};
2947 15         59 while (1) {
2948             # Bulk-scan a run of safe characters (anything but ", \, or line-end)
2949             # in one regex op. The previous per-char _peek loop did O(N) Perl-VM
2950             # round-trips; this reduces typical string parsing to a single regex
2951             # plus per-escape handling. parse_basic_string_value was the
2952             # heaviest leaf at 19% of decode time on bench_realistic.
2953 18 50       78 if ($src =~ /\G([^"\\\n\r]*)/gc) {
2954 18 100       92 $out .= $1 if length($1);
2955             }
2956 18         31 my $p = pos($src);
2957 18 50       56 if ($p >= $len) {
2958 0         0 $self->{pos} = $p;
2959 0         0 die $self->_err_at($sl, $sls, $sp, "unterminated string");
2960             }
2961 18         43 my $c = substr($src, $p, 1);
2962 18 100       48 if ($c eq '"') {
2963 15         31 $self->{pos} = $p + 1;
2964             # SPEC §Unicode normalization: re-NFC after escape decoding.
2965 15 50       88 return $out !~ /[^\x00-\x7F]/ ? $out : _NFC($out);
2966             }
2967 3 50 33     16 if ($c eq "\n" || $c eq "\r") {
2968 0         0 $self->{pos} = $p;
2969 0         0 $self->_die("strings cannot span lines");
2970             }
2971             # $c is '\\' — handle the escape, then resume the bulk scan.
2972 3         6 $p++; # past the backslash
2973 3 50       8 if ($p >= $len) {
2974 0         0 $self->{pos} = $p;
2975 0         0 $self->_die("unterminated escape");
2976             }
2977 3         7 my $esc = substr($src, $p, 1);
2978 3         4 $p++;
2979 3 50 0     13 if ($esc eq '"') { $out .= '"'; }
  0 50       0  
    50          
    0          
    0          
    0          
    0          
    0          
2980 0         0 elsif ($esc eq '\\') { $out .= '\\'; }
2981 3         7 elsif ($esc eq 'n') { $out .= "\n"; }
2982 0         0 elsif ($esc eq 't') { $out .= "\t"; }
2983 0         0 elsif ($esc eq 'r') { $out .= "\r"; }
2984 0         0 elsif ($esc eq 'b') { $out .= "\b"; }
2985 0         0 elsif ($esc eq 'f') { $out .= "\f"; }
2986             elsif ($esc eq 'u' || $esc eq 'U') {
2987             # _read_hex_codepoint reads from $self->{pos}; sync it first.
2988 0         0 $self->{pos} = $p;
2989 0 0       0 $out .= $self->_read_hex_codepoint($esc eq 'u' ? 4 : 8);
2990 0         0 $p = $self->{pos};
2991             }
2992             else {
2993 0         0 $self->{pos} = $p;
2994 0         0 $self->_die("invalid escape '\\$esc'");
2995             }
2996 3         10 pos($src) = $p;
2997             }
2998             }
2999              
3000             sub parse_literal_string_value {
3001 3     3 0 5 my $self = shift;
3002 3         10 my ($sl, $sls, $sp) = ($self->{line}, $self->{line_start}, $self->{pos});
3003 3         5 $self->{pos}++;
3004 3         22 my $out = '';
3005 3         4 while (1) {
3006 24 50       51 if ($self->_eof) {
3007 0         0 die $self->_err_at($sl, $sls, $sp, "unterminated string");
3008             }
3009 24         53 my $c = $self->_peek;
3010 24 50 33     147 $self->_die("strings cannot span lines") if $c eq "\n" || $c eq "\r";
3011 24 100       52 if ($c eq "'") { $self->{pos}++; return $out; }
  3         5  
  3         10  
3012 21         39 $self->{pos}++;
3013 21         36 $out .= $c;
3014             }
3015             }
3016              
3017             sub _read_hex_codepoint {
3018 0     0   0 my ($self, $n) = @_;
3019 0         0 my $rest = substr($self->{src}, $self->{pos});
3020 0 0       0 $self->_die("expected $n hex digits in unicode escape") if length($rest) < $n;
3021 0         0 my $hex = substr($rest, 0, $n);
3022 0 0       0 $self->_die("invalid hex in unicode escape: $hex") if $hex !~ /^[0-9a-fA-F]+$/;
3023 0         0 my $v = hex($hex);
3024             # SPEC: U+0000 is forbidden anywhere in DMS source, including via
3025             # escape decoding. ` ` / `\U00000000` must not slip through.
3026 0 0       0 if ($v == 0) {
3027 0         0 $self->_die("\\u0000 escape forbidden");
3028             }
3029 0 0 0     0 if ($v >= 0xD800 && $v <= 0xDFFF) {
3030 0         0 $self->_die(sprintf("surrogate codepoint U+%04X in escape", $v));
3031             }
3032 0 0       0 if ($v > 0x10FFFF) {
3033 0         0 $self->_die("unicode escape is not a scalar value");
3034             }
3035 0         0 $self->{pos} += $n;
3036 0         0 return chr($v);
3037             }
3038              
3039             # Heredocs
3040              
3041             sub parse_heredoc_basic {
3042 11     11 0 19 my $self = shift;
3043 11         22 $self->{pos} += 3;
3044 11         32 my $label = $self->_parse_heredoc_label;
3045 11         33 my $mods = $self->_parse_heredoc_modifiers;
3046 11         33 $self->_skip_inline_ws;
3047 11 50 33     39 if (!($self->_consume_eol || $self->_eof)) {
3048 0         0 $self->_die("heredoc opener must be followed by end of line");
3049             }
3050 11 100       32 my $terminator = length($label) ? $label : '"""';
3051 11         34 my $body = $self->_collect_heredoc_body($terminator);
3052             # SPEC §basic-string escapes: surrogate codepoints (U+D800..U+DFFF)
3053             # and U+0000 are not valid in `\uXXXX` / `\UXXXXXXXX` escapes.
3054             # Basic-heredoc bodies are kept raw, so we validate the rules by
3055             # scanning the body for offending escape sequences.
3056 11         32 _validate_heredoc_basic_escapes($body);
3057 11         25 my $stripped = _strip_indent_and_continuations($body, 1);
3058 11         18 my $out;
3059 11         23 eval { $out = _apply_modifiers($stripped, $mods); };
  11         28  
3060 11 50       31 if ($@) { my $msg = $@; chomp $msg; $self->_die($msg); }
  0         0  
  0         0  
  0         0  
3061             $self->_record_form({
3062             string_form => {
3063             kind => 'heredoc',
3064             flavor => 'basic_triple',
3065             label => (length($label) ? $label : undef),
3066 11 100       121 modifiers => [ map { { name => $_->{name}, args => $_->{args} } } @$mods ],
  3         37  
3067             },
3068             });
3069             # SPEC §Unicode normalization: re-NFC after escape decoding.
3070 11 50       89 return $out !~ /[^\x00-\x7F]/ ? $out : _NFC($out);
3071             }
3072              
3073             # SPEC §basic-string escapes: a `\uXXXX` / `\UXXXXXXXX` escape whose
3074             # decoded value falls in the surrogate range U+D800..U+DFFF is not a
3075             # Unicode scalar and is a parse error. Likewise U+0000 is forbidden.
3076             # Basic-heredoc body lines are collected raw, so we validate the same
3077             # rules by scanning the body for offending escape sequences.
3078             sub _validate_heredoc_basic_escapes {
3079 11     11   23 my ($body) = @_;
3080 11         16 for my $ln (@{$body->{lines}}) {
  11         31  
3081 11         21 my $text = $ln->{text};
3082 11         21 my $len = length($text);
3083 11         18 my $i = 0;
3084 11         28 while ($i < $len) {
3085 83 50       184 if (substr($text, $i, 1) eq '\\') {
3086             # find run of consecutive backslashes
3087 0         0 my $j = $i;
3088 0   0     0 while ($j < $len && substr($text, $j, 1) eq '\\') { $j++; }
  0         0  
3089 0         0 my $run = $j - $i;
3090 0 0 0     0 if ($run % 2 == 1 && $j < $len) {
3091 0         0 my $intro = substr($text, $j, 1);
3092 0 0       0 my $n = ($intro eq 'u') ? 4 : ($intro eq 'U') ? 8 : 0;
    0          
3093 0 0 0     0 if ($n > 0 && $j + 1 + $n <= $len) {
3094 0         0 my $hex = substr($text, $j + 1, $n);
3095 0 0       0 if ($hex =~ /^[0-9a-fA-F]+$/) {
3096 0         0 my $cp = hex($hex);
3097 0         0 my $esc_off = $j - 1;
3098 0 0       0 if ($cp == 0) {
3099             die sprintf("%d:%d: \\u0000 escape forbidden\n",
3100 0         0 $ln->{line}, $esc_off + 1);
3101             }
3102 0 0 0     0 if ($cp >= 0xD800 && $cp <= 0xDFFF) {
3103             die sprintf("%d:%d: surrogate codepoint U+%04X in escape\n",
3104 0         0 $ln->{line}, $esc_off + 1, $cp);
3105             }
3106             }
3107             }
3108             }
3109 0         0 $i = $j;
3110             } else {
3111 83         207 $i++;
3112             }
3113             }
3114             }
3115             }
3116              
3117             sub parse_heredoc_literal {
3118 6     6 0 11 my $self = shift;
3119 6         12 $self->{pos} += 3;
3120 6         53 my $label = $self->_parse_heredoc_label;
3121 6         17 my $mods = $self->_parse_heredoc_modifiers;
3122 6         20 $self->_skip_inline_ws;
3123 6 50 33     15 if (!($self->_consume_eol || $self->_eof)) {
3124 0         0 $self->_die("heredoc opener must be followed by end of line");
3125             }
3126 6 100       16 my $terminator = length($label) ? $label : "'''";
3127 6         15 my $body = $self->_collect_heredoc_body($terminator);
3128 6         16 my $stripped = _strip_indent_and_continuations($body, 0);
3129 6         11 my $out;
3130 6         13 eval { $out = _apply_modifiers($stripped, $mods); };
  6         14  
3131 6 50       17 if ($@) { my $msg = $@; chomp $msg; $self->_die($msg); }
  0         0  
  0         0  
  0         0  
3132             $self->_record_form({
3133             string_form => {
3134             kind => 'heredoc',
3135             flavor => 'literal_triple',
3136             label => (length($label) ? $label : undef),
3137 6 100       87 modifiers => [ map { { name => $_->{name}, args => $_->{args} } } @$mods ],
  0         0  
3138             },
3139             });
3140 6         32 return $out;
3141             }
3142              
3143             sub _parse_heredoc_label {
3144 17     17   57 my $self = shift;
3145 17         42 my $c = $self->_peek;
3146 17 100       41 return '' if !_is_label_start($c);
3147 11         26 my $start = $self->{pos};
3148 11         17 while (1) {
3149 44         110 my $c2 = $self->_peek;
3150 44 100       82 last if !_is_label_cont($c2);
3151 33         60 $self->{pos}++;
3152             }
3153 11         34 return substr($self->{src}, $start, $self->{pos} - $start);
3154             }
3155              
3156             sub _parse_heredoc_modifiers {
3157 17     17   29 my $self = shift;
3158 17         31 my @mods;
3159 17         35 while (1) {
3160 20         38 my $ws_start = $self->{pos};
3161 20         55 $self->_skip_inline_ws;
3162 20         40 my $had_ws = $self->{pos} > $ws_start;
3163 20         41 my $c = $self->_peek;
3164 20 100 66     62 if (defined($c) && _is_label_start($c)) {
3165 3 50       10 $self->_die("modifier must be preceded by whitespace") unless $had_ws;
3166 3         10 push @mods, $self->_parse_one_modifier;
3167             } else {
3168 17         31 $self->{pos} = $ws_start;
3169 17         46 return \@mods;
3170             }
3171             }
3172             }
3173              
3174             sub _parse_one_modifier {
3175 3     3   5 my $self = shift;
3176 3         7 my $ns = $self->{pos};
3177 3         5 while (1) {
3178 18         35 my $c = $self->_peek;
3179 18 100       33 last if !_is_label_cont($c);
3180 15         27 $self->{pos}++;
3181             }
3182 3         9 my $name = substr($self->{src}, $ns, $self->{pos} - $ns);
3183 3 50       8 $self->_die("modifiers require parentheses") if $self->_peek ne '(';
3184 3         7 $self->{pos}++;
3185             # Suppress original-form recording for modifier args: they're
3186             # parse-time values used as call arguments and must not pollute the
3187             # host heredoc node's original_forms slot.
3188 3         7 my $saved = $self->{record_forms};
3189 3         5 $self->{record_forms} = 0;
3190 3         5 my @args;
3191 3         6 my $ok = eval {
3192 3         5 while (1) {
3193 6         16 $self->_skip_inline_ws;
3194 6 50       13 if ($self->_peek eq ')') { $self->{pos}++; last; }
  0         0  
  0         0  
3195 6         35 push @args, $self->parse_inline_value_or_heredoc;
3196 6         18 $self->_skip_inline_ws;
3197 6         14 my $c = $self->_peek;
3198 6 100       18 if ($c eq ',') { $self->{pos}++; }
  3 50       7  
3199 3         5 elsif ($c eq ')') { $self->{pos}++; last; }
  3         7  
3200 0         0 else { $self->_die("expected ',' or ')' in modifier args"); }
3201             }
3202 3         7 1;
3203             };
3204 3         5 my $err = $@;
3205 3         7 $self->{record_forms} = $saved;
3206 3 50       8 die $err if !$ok;
3207 3         45 return { name => $name, args => \@args };
3208             }
3209              
3210             sub _collect_heredoc_body {
3211 17     17   37 my ($self, $terminator) = @_;
3212 17         30 my @lines;
3213 17         49 my ($sl, $sls, $sp) = ($self->{line}, $self->{line_start}, $self->{pos});
3214 17         28 while (1) {
3215 34 50       73 if ($self->_eof) {
3216 0         0 die $self->_err_at($sl, $sls, $sp, "unterminated heredoc");
3217             }
3218 34         65 my $lb = $self->{pos};
3219 34         51 while (1) {
3220 235         439 my $c = $self->_peek;
3221 235 100 66     1083 last if !defined($c) || $c eq "\n" || $c eq "\r";
      66        
3222 201         327 $self->{pos}++;
3223             }
3224 34         77 my $raw = substr($self->{src}, $lb, $self->{pos} - $lb);
3225 34         80 my ($this_line, $this_lstart) = ($self->{line}, $self->{line_start});
3226 34         68 my $trimmed = $raw; $trimmed =~ s/^\s+|\s+$//g;
  34         239  
3227 34 100       111 if ($trimmed eq $terminator) {
3228 17         58 my $strip_depth = 0;
3229 17         61 for my $c (split //, $raw) {
3230 51 100       110 if ($c eq ' ') { $strip_depth++; } else { last; }
  34         77  
  17         33  
3231             }
3232 17         84 return { lines => \@lines, strip_depth => $strip_depth };
3233             }
3234 17         46 $self->_consume_eol;
3235 17         100 push @lines, { text => $raw, line => $this_line, line_start => $this_lstart };
3236             }
3237             }
3238              
3239             sub _strip_indent_and_continuations {
3240 17     17   37 my ($body, $allow_cont) = @_;
3241 17         26 my @out;
3242 17         30 my $first = 1;
3243 17         24 my $pending = 0;
3244 17         36 my @last = (1, 0);
3245 17         30 for my $ln (@{$body->{lines}}) {
  17         39  
3246 17         61 @last = ($ln->{line}, $ln->{line_start});
3247 17         75 my $is_blank = ($ln->{text} =~ /\A[ \t]*\z/);
3248 17         32 my $stripped;
3249 17 50       35 if ($is_blank) {
3250 0         0 $stripped = '';
3251             } else {
3252 17         26 my $leading = 0;
3253 17         60 for my $c (split //, $ln->{text}) {
3254 51 100       97 if ($c eq ' ') { $leading++; } else { last; }
  34         64  
  17         26  
3255             }
3256 17 50       55 if ($leading < $body->{strip_depth}) {
3257             die sprintf("%d:%d: heredoc body line indented %d spaces, less than strip depth %d\n",
3258 0         0 $ln->{line}, $leading + 1, $leading, $body->{strip_depth});
3259             }
3260 17         46 $stripped = substr($ln->{text}, $body->{strip_depth});
3261             }
3262 17         27 my $piece = $stripped;
3263 17         28 my $splice = 0;
3264 17 100       41 if ($allow_cont) {
3265 11         19 my $trimmed_end = $piece; $trimmed_end =~ s/[ \t]+$//;
  11         35  
3266 11         22 my $idx = rindex($trimmed_end, '\\');
3267 11 50 33     31 if ($idx != -1 && $idx == length($trimmed_end) - 1) {
3268 0         0 my $preceding = 0;
3269 0   0     0 for (my $k = $idx - 1; $k >= 0 && substr($trimmed_end,$k,1) eq '\\'; $k--) {
3270 0         0 $preceding++;
3271             }
3272 0 0       0 if ($preceding % 2 == 0) {
3273 0         0 $piece = substr($trimmed_end, 0, $idx);
3274 0         0 $splice = 1;
3275             }
3276             }
3277             }
3278 17 50       59 if ($first) {
    0          
3279 17         37 push @out, $piece;
3280 17         29 $first = 0;
3281             } elsif ($pending) {
3282 0         0 my $trimmed_start = $piece; $trimmed_start =~ s/^[ \t]+//;
  0         0  
3283 0 0       0 push @out, $trimmed_start unless $is_blank;
3284             } else {
3285 0         0 push @out, "\n", $piece;
3286             }
3287 17         38 $pending = $splice;
3288             }
3289 17 50       42 if ($pending) {
3290 0         0 die sprintf("%d:1: trailing line continuation has nothing to splice to\n", $last[0]);
3291             }
3292 17         58 return join('', @out);
3293             }
3294              
3295             sub _fold_paragraphs {
3296 0     0   0 my ($s) = @_;
3297 0         0 my @paras = split /\n\n/, $s, -1;
3298             return join("\n", map {
3299 0         0 join(' ', grep { length($_) } split /\n/, $_, -1);
  0         0  
  0         0  
3300             } @paras);
3301             }
3302              
3303             sub _replace_all_runs {
3304 0     0   0 my ($s, $charset, $replacement) = @_;
3305 0         0 my $out = '';
3306 0         0 my @chars = split //, $s;
3307 0         0 my $i = 0;
3308 0         0 while ($i < @chars) {
3309 0 0       0 if ($charset->{$chars[$i]}) {
3310 0   0     0 while ($i < @chars && $charset->{$chars[$i]}) { $i++; }
  0         0  
3311 0         0 $out .= $replacement;
3312             } else {
3313 0         0 $out .= $chars[$i];
3314 0         0 $i++;
3315             }
3316             }
3317 0         0 return $out;
3318             }
3319              
3320             sub _replace_leading_run {
3321 0     0   0 my ($s, $charset, $replacement) = @_;
3322 0         0 my @chars = split //, $s;
3323 0         0 my $end = 0;
3324 0   0     0 while ($end < @chars && $charset->{$chars[$end]}) { $end++; }
  0         0  
3325 0 0       0 return $s if $end == 0;
3326 0         0 return $replacement . join('', @chars[$end..$#chars]);
3327             }
3328              
3329             sub _replace_trailing_run {
3330 3     3   8 my ($s, $charset, $replacement) = @_;
3331 3         9 my @chars = split //, $s;
3332 3         7 my $start = scalar @chars;
3333 3   33     17 while ($start > 0 && $charset->{$chars[$start-1]}) { $start--; }
  0         0  
3334 3 50       15 return $s if $start == @chars;
3335 0         0 return join('', @chars[0..$start-1]) . $replacement;
3336             }
3337              
3338             sub _per_line_edges {
3339 0     0   0 my ($s, $charset, $replacement) = @_;
3340 0         0 my @lines = split /\n/, $s, -1;
3341 0         0 for my $line (@lines) {
3342 0         0 $line = _replace_leading_run($line, $charset, $replacement);
3343 0         0 $line = _replace_trailing_run($line, $charset, $replacement);
3344             }
3345 0         0 return join("\n", @lines);
3346             }
3347              
3348             sub _apply_trim {
3349 3     3   10 my ($s, $chars, $where, $replacement) = @_;
3350 3 50       8 return $s if length($chars) == 0;
3351 3         12 my %charset = map { $_ => 1 } split //, $chars;
  3         14  
3352 3         8 my $has_star = (index($where, '*') >= 0);
3353 3         7 my $has_pipe = (index($where, '|') >= 0);
3354 3         7 my $has_lt = (index($where, '<') >= 0);
3355 3         5 my $has_gt = (index($where, '>') >= 0);
3356 3 50 33     37 return $s if !($has_star || $has_pipe || $has_lt || $has_gt);
      33        
      33        
3357 3 50       7 return _replace_all_runs($s, \%charset, $replacement) if $has_star;
3358 3         7 my $cur = $s;
3359 3 50       7 $cur = _per_line_edges($cur, \%charset, $replacement) if $has_pipe;
3360 3 50       8 $cur = _replace_leading_run($cur, \%charset, $replacement) if $has_lt;
3361 3 50       13 $cur = _replace_trailing_run($cur, \%charset, $replacement) if $has_gt;
3362 3         12 return $cur;
3363             }
3364              
3365             sub _apply_modifiers {
3366 17     17   39 my ($s, $mods) = @_;
3367 17         57 my $cur = $s;
3368 17         39 for my $m (@$mods) {
3369 3         8 my $name = $m->{name};
3370 3         6 my $args = $m->{args};
3371 3 50       11 if ($name eq '_fold_paragraphs') {
    50          
3372 0 0       0 die "fold_paragraphs() takes no arguments\n" if @$args;
3373 0         0 $cur = _fold_paragraphs($cur);
3374             } elsif ($name eq '_trim') {
3375 3 50 33     29 die qq{trim(chars, where, replacement = "") expects 2 or 3 arguments\n}
3376             if @$args < 2 || @$args > 3;
3377 3         6 my $chars = $args->[0];
3378 3 50       10 die "trim: first argument (chars) must be a string\n" if ref($chars) ne '';
3379 3         5 my $where = $args->[1];
3380 3 50       7 die "trim: second argument (where) must be a string\n" if ref($where) ne '';
3381 3         7 my $replacement = '';
3382 3 50       10 if (@$args == 3) {
3383 0 0       0 die "trim: third argument (replacement) must be a string\n" if ref($args->[2]) ne '';
3384 0         0 $replacement = $args->[2];
3385             }
3386 3         9 $cur = _apply_trim($cur, $chars, $where, $replacement);
3387             } else {
3388 0         0 die "unknown modifier: $name\n";
3389             }
3390             }
3391 17         42 return $cur;
3392             }
3393              
3394             # Flow forms
3395              
3396             sub parse_flow_array {
3397 1     1 0 3 my $self = shift;
3398 1         3 $self->{pos}++;
3399 1         3 my @items;
3400 1         2 while (1) {
3401 3         11 $self->_skip_flow_ws;
3402 3 50       7 if ($self->_peek eq ']') { $self->{pos}++; return \@items; }
  0         0  
  0         0  
3403             # Push the current index onto the path so any OriginalLiteral
3404             # records inside the flow value get the right breadcrumb.
3405 3         6 my $idx = scalar @items;
3406 3         6 push @{$self->{path}}, DMS::Parser::Index->new($idx);
  3         27  
3407 3         5 my $v;
3408 3         6 my $ok = eval { $v = $self->_parse_inline_value_in_flow; 1 };
  3         9  
  3         7  
3409 3         7 my $err = $@;
3410 3         5 pop @{$self->{path}};
  3         6  
3411 3 50       10 if (!$ok) { die $err; }
  0         0  
3412 3         6 push @items, $v;
3413 3         9 $self->_skip_flow_ws;
3414 3         7 my $c = $self->_peek;
3415 3 100       9 if ($c eq ',') { $self->{pos}++; }
  2 50       4  
    0          
3416 1         14 elsif ($c eq ']') { $self->{pos}++; return \@items; }
  1         4  
3417 0         0 elsif (!defined($c)) { $self->_die("unterminated flow array"); }
3418 0         0 else { $self->_die("unexpected '$c' in flow array; expected ',' or ']'"); }
3419             }
3420             }
3421              
3422             sub parse_flow_table {
3423 1     1 0 3 my $self = shift;
3424 1         2 $self->{pos}++;
3425 1         3 my $lite = $self->{lite};
3426 1         2 my $ignore_order = $self->{ignore_order};
3427 1         3 my ($t, $order);
3428 1 50       22 if ($ignore_order) {
    50          
3429 0         0 $t = new_unordered_table();
3430             } elsif ($lite) {
3431 0         0 $order = [];
3432 0         0 $t = { $ORDER_KEY => $order };
3433             } else {
3434 1         4 $t = new_table();
3435             }
3436 1         2 while (1) {
3437 2         8 $self->_skip_flow_ws;
3438 2 50       6 if ($self->_peek eq '}') { $self->{pos}++; return $t; }
  0         0  
  0         0  
3439 2         6 my $key = $self->parse_key;
3440 2 50       5 $self->_die("expected ':' after flow-table key") if $self->_peek ne ':';
3441 2         3 $self->{pos}++;
3442 2         6 my $c = $self->_peek;
3443 2 0 33     12 if (!defined($c) || ($c ne ' ' && $c ne "\t" && $c ne "\n" && $c ne "\r")) {
      33        
      0        
      33        
3444 0         0 $self->_die("expected whitespace after ':'");
3445             }
3446 2         6 $self->_skip_flow_ws;
3447             # Path push so OriginalLiteral records inside the value get the
3448             # key path as their breadcrumb.
3449 2 50       6 push @{$self->{path}}, $key unless $lite;
  2         6  
3450 2         4 my $v;
3451 2 50       5 if ($lite) {
3452 0         0 $v = $self->_parse_inline_value_in_flow;
3453             } else {
3454 2         15 my $ok = eval { $v = $self->_parse_inline_value_in_flow; 1 };
  2         6  
  2         5  
3455 2         4 my $err = $@;
3456 2         4 pop @{$self->{path}};
  2         5  
3457 2 50       8 if (!$ok) { die $err; }
  0         0  
3458             }
3459 2 50       9 $self->_die("duplicate key: $key") if exists $t->{$key};
3460 2 50       14 push @$order, $key if $order;
3461 2         9 $t->{$key} = $v;
3462 2         47 $self->_skip_flow_ws;
3463 2         6 my $c2 = $self->_peek;
3464 2 100       14 if ($c2 eq ',') { $self->{pos}++; }
  1 50       3  
    0          
3465 1         2 elsif ($c2 eq '}') { $self->{pos}++; return $t; }
  1         28  
3466 0         0 elsif (!defined($c2)) { $self->_die("unterminated flow table"); }
3467 0         0 else { $self->_die("unexpected '$c2' in flow table; expected ',' or '}'"); }
3468             }
3469             }
3470              
3471             sub _skip_flow_ws {
3472 12     12   21 my $self = shift;
3473 12         23 while (1) {
3474 12         29 pos($self->{src}) = $self->{pos};
3475 12 100       47 if ($self->{src} =~ /\G[ \t]+/gc) {
3476 5         10 $self->{pos} = pos($self->{src});
3477             }
3478 12 50       33 return if $self->{pos} >= $self->{len};
3479 12         27 my $c = substr($self->{src}, $self->{pos}, 1);
3480 12 50       30 if ($c eq "\n") { $self->{pos}++; $self->_advance_line; next; }
  0         0  
  0         0  
  0         0  
3481 12 50       27 if ($c eq "\r") {
3482 0 0       0 if (substr($self->{src}, $self->{pos}, 2) eq "\r\n") {
3483 0         0 $self->{pos} += 2; $self->_advance_line; next;
  0         0  
  0         0  
3484             }
3485 0         0 return;
3486             }
3487 12 50       27 if ($c eq '#') { $self->_die("comments not allowed inside flow forms"); }
  0         0  
3488 12 50       25 if ($c eq '/') {
3489 0         0 my $n = substr($self->{src}, $self->{pos} + 1, 1);
3490 0 0 0     0 if ($n eq '/' || $n eq '*') {
3491 0         0 $self->_die("comments not allowed inside flow forms");
3492             }
3493             }
3494 12         23 return;
3495             }
3496             }
3497              
3498             sub _parse_inline_value_in_flow {
3499 5     5   8 my $self = shift;
3500 5 50 33     13 if ($self->_peek eq '"' && $self->_starts_with('"""')) {
3501 0         0 $self->_die("heredocs are not allowed inside flow forms");
3502             }
3503 5 50 33     11 if ($self->_peek eq "'" && $self->_starts_with("'''")) {
3504 0         0 $self->_die("heredocs are not allowed inside flow forms");
3505             }
3506 5         15 return $self->parse_inline_value_or_heredoc;
3507             }
3508              
3509             sub _consume_after_value {
3510 102     102   219 my ($self, $allow_eof) = @_;
3511             # Hot-path early-out for the no-comment case (every flat-table leaf):
3512             # check the byte at pos. Common case: directly at \n with no
3513             # trailing WS — return after one substr + branch. _advance_line is
3514             # inlined too so the hot path has zero method calls.
3515 102         154 my $hot_had_ws = 0;
3516 102         205 my $p = $self->{pos};
3517             {
3518 102 50       157 if ($p >= $self->{len}) { return; }
  102         287  
  0         0  
3519 102         258 my $c = substr($self->{src}, $p, 1);
3520 102 100       237 if ($c eq "\n") {
3521 85         147 my $np = $p + 1;
3522 85         151 $self->{pos} = $np;
3523 85         183 $self->{line}++;
3524 85         208 $self->{line_start} = $np;
3525 85         190 return;
3526             }
3527 17 50 33     69 if ($c eq ' ' || $c eq "\t") {
3528             # Have trailing WS: skip it, then re-check.
3529 17         42 pos($self->{src}) = $p;
3530 17         58 $self->{src} =~ /\G[ \t]+/gc;
3531 17         38 $self->{pos} = pos($self->{src});
3532 17         53 $hot_had_ws = 1;
3533 17         34 $p = $self->{pos};
3534 17 50       52 if ($p >= $self->{len}) { return; }
  0         0  
3535 17         38 $c = substr($self->{src}, $p, 1);
3536 17 50       47 if ($c eq "\n") {
3537 0         0 my $np = $p + 1;
3538 0         0 $self->{pos} = $np;
3539 0         0 $self->{line}++;
3540 0         0 $self->{line_start} = $np;
3541 0         0 return;
3542             }
3543             }
3544 17 0 33     64 if ($c ne '#' && $c ne '/' && $c ne "\r") {
      33        
3545 0         0 $self->_die("unexpected character '$c' after value");
3546             }
3547             # else fall through to the comment-handling slow path. Pass
3548             # $hot_had_ws so the first iteration knows ws was already consumed.
3549             }
3550             # Same-line comment(s) after a value attach as `trailing`. Multiple
3551             # block comments may stack; a `#`/`//` line comment, if present,
3552             # consumes to EOL and must come last.
3553 17         31 my $first_iter = 1;
3554 17         24 while (1) {
3555 17         31 my $ws_start = $self->{pos};
3556 17         49 $self->_skip_inline_ws;
3557 17         83 my $had_ws = $self->{pos} > $ws_start;
3558 17 50 33     94 $had_ws = 1 if $first_iter && $hot_had_ws;
3559 17         25 $first_iter = 0;
3560 17         51 my $c = $self->_peek;
3561 17 50 33     83 if (defined($c) && $c eq '#' && !$self->_starts_with("###")) {
    0 33        
    0 0        
      0        
      0        
      0        
3562 17 50       45 $self->_die("expected whitespace before '#' comment") unless $had_ws;
3563 17         50 my $raw = $self->_read_line_comment_to_eol;
3564 17         72 push @{$self->{comments}}, {
3565             comment => { content => $raw, kind => 'line' },
3566             position => 'trailing',
3567 17         73 path => [@{$self->{path}}],
3568 17 50       49 } unless $self->{lite};
3569 17         42 last;
3570             } elsif (defined($c) && $c eq '/' && $self->_starts_with("//")) {
3571 0 0       0 $self->_die("expected whitespace before '//' comment") unless $had_ws;
3572 0         0 my $raw = $self->_read_line_comment_to_eol;
3573 0         0 push @{$self->{comments}}, {
3574             comment => { content => $raw, kind => 'line' },
3575             position => 'trailing',
3576 0         0 path => [@{$self->{path}}],
3577 0 0       0 } unless $self->{lite};
3578 0         0 last;
3579             } elsif (defined($c) && $c eq '/' && $self->_starts_with("/*")) {
3580 0         0 my $raw = $self->_read_c_block_comment;
3581 0         0 push @{$self->{comments}}, {
3582             comment => { content => $raw, kind => 'block' },
3583             position => 'trailing',
3584 0         0 path => [@{$self->{path}}],
3585 0 0       0 } unless $self->{lite};
3586 0         0 next;
3587             } else {
3588 0         0 last;
3589             }
3590             }
3591 17         43 my $c = $self->_peek;
3592 17 50       42 return if !defined($c);
3593 17 50       56 if ($c eq "\n") { $self->{pos}++; $self->_advance_line; return; }
  17         33  
  17         76  
  17         35  
3594 0 0 0       if ($c eq "\r" && $self->_starts_with("\r\n")) { $self->{pos} += 2; $self->_advance_line; return; }
  0            
  0            
  0            
3595 0           $self->_die("unexpected character '$c' after value");
3596             }
3597              
3598             1;
3599              
3600             __END__