File Coverage

blib/lib/TOML/Tiny/Tokenizer.pm

Criterion	Covered	Total	%
statement	130	136	95.5
branch	44	46	95.6
condition	24	29	82.7
subroutine	16	18	88.8
pod	0	12	0.0
total	214	241	88.8

line	stmt	bran	cond	sub	pod	time	code
1							package TOML::Tiny::Tokenizer;
2							# ABSTRACT: tokenizer used by TOML::Tiny
3							$TOML::Tiny::Tokenizer::VERSION = '0.20';
4	287			287		2269	use strict;
	287					647
	287					11473
5	287			287		1450	use warnings;
	287					554
	287					18261
6	287			287		1466	no warnings qw(experimental);
	287					551
	287					12922
7	287			287		164343	use charnames qw(:full);
	287					2848837
	287					2018
8	287			287		7526809	use v5.18;
	287					1262
9
10	287					695858	use TOML::Tiny::Grammar qw(
11							$Comment
12							$CRLF
13							$DateTime
14							$EOL
15							$Escape
16							$Float
17							$Integer
18							$Key
19							$SimpleKey
20							$String
21							$WS
22	287			287		1793	);
	287					643
23
24							sub new {
25	427			427	0	1991	my ($class, %param) = @_;
26
27							my $self = bless{
28							source => $param{source},
29							last_position => length $param{source},
30	427					4092	position => 0,
31							line => 1,
32							last_token => undef,
33							}, $class;
34
35	427					2300	return $self;
36							}
37
38							sub last_token {
39	0			0	0	0	my $self = shift;
40	0					0	return $self->{last_token};
41							}
42
43							sub next_token {
44	6522			6522	0	9650	my $self = shift;
45
46							return unless defined $self->{source}
47	6522	100	66			25655	&& $self->{position} < $self->{last_position};
48
49	6275	100				16768	if (!$self->{last_token}) {
50	425					3793	return $self->{last_token} = {type => 'table', pos => 0, line => 1, value => []};
51							}
52
53							# Update the regex engine's position marker in case some other regex
54							# attempted to match against the source string and reset it.
55	5850					17651	pos($self->{source}) = $self->{position};
56
57	5850					15026	my $token;
58							my $type;
59	5850					0	my $value;
60
61	5850					140326	state $key_set = qr/\G ($Key) $WS* (?= =)/x;
62	5850					125920	state $table = qr/\G \[ $WS* ($Key) $WS* \] $WS* (?:$EOL \| $)/x;
63	5850					121863	state $array_table = qr/\G \[\[ $WS* ($Key) $WS* \]\] $WS* (?:$EOL \| $)/x;
64
65	5850					11545	state $simple = {
66							'[' => 'inline_array',
67							']' => 'inline_array_close',
68							'{' => 'inline_table',
69							'}' => 'inline_table_close',
70							',' => 'comma',
71							'=' => 'assign',
72							'true' => 'bool',
73							'false' => 'bool',
74							};
75
76							# More complex matches with regexps
77	5850		100			22040	while ($self->{position} < $self->{last_position} && !defined($type)) {
78	6048	50				13897	my $prev = $self->{last_token} ? $self->{last_token}{type} : 'EOL';
79	6048		100			25826	my $newline = !!($prev eq 'EOL' \|\| $prev eq 'table' \|\| $prev eq 'array_table');
80
81	6048					14343	for ($self->{source}) {
82	6048					38139	/\G$WS+/gc; # ignore whitespace
83	6048	100				31112	/\G$Comment$/mgc && next; # ignore comments
84
85	5948	100				14902	last if /\G$/gc;
86
87	5847	100				42114	if (/\G$EOL/gc) {
88	1366					2615	++$self->{line};
89	1366					2163	$type = 'EOL';
90	1366					2228	last;
91							}
92
93	4481	100				9763	if ($newline) {
94	1602	100				13130	if (/$table/gc) {
95	244					546	$type = 'table';
96	244					740	$value = $self->tokenize_key($1);
97	244					523	last;
98							}
99
100	1358	100				7996	if (/$array_table/gc) {
101	75					154	$type = 'array_table';
102	75					279	$value = $self->tokenize_key($1);
103	75					228	last;
104							}
105							}
106
107	4162	100				45699	if (/$key_set/gc) {
108	1091					6419	$type = 'key';
109	1091					4739	$value = $1;
110	1091					3307	last;
111							}
112
113	3071	100				9797	if (/\G ( [\[\]{}=,] \| true \| false )/xgc) {
114	1894					4909	$value = $1;
115	1894					4737	$type = $simple->{$value};
116	1894					3208	last;
117							}
118
119	1177	100				108710	if (/\G($String)/gc) {
120	480					969	$type = 'string';
121	480					1111	$value = $1;
122	480					786	last;
123							}
124
125	697	100				167716	if (/\G($DateTime)/gc) {
126	57					140	$type = 'datetime';
127	57					176	$value = $1;
128	57					114	last;
129							}
130
131	640	100				42253	if (/\G($Float)/gc) {
132	102					228	$type = 'float';
133	102					304	$value = $1;
134	102					205	last;
135							}
136
137	538	100				19613	if (/\G($Integer)/gc) {
138	429					912	$type = 'integer';
139	429					1114	$value = $1;
140	429					786	last;
141							}
142
143	109		50			1045	my $substr = substr($self->{source}, $self->{position}, 30) // 'undef';
144	109					1649	die "toml syntax error on line $self->{line}\n\t-->\|$substr\|\n";
145							}
146
147	5939	100				12762	if ($type) {
148	5738					8032	state $tokenizers = {};
149	5738		100			27149	my $tokenize = $tokenizers->{$type} //= $self->can("tokenize_$type") \|\| 0;
			100
150
151							$token = {
152							line => $self->{line},
153							pos => $self->{pos},
154							type => $type,
155							value => $tokenize ? $tokenize->($self, $value) : $value,
156							prev => $self->{last_token},
157	5738	100				25204	};
158
159							# Unset the previous token's 'prev' key to prevent keeping the entire
160							# chain of previously parsed tokens alive for the whole process.
161	5733					14703	undef $self->{last_token}{prev};
162
163	5733					10179	$self->{last_token} = $token;
164							}
165
166	5934					14649	$self->update_position;
167							}
168
169	5736					19185	return $token;
170							}
171
172							sub current_line {
173	0			0	0	0	my $self = shift;
174	0					0	my $rest = substr $self->{source}, $self->{position};
175	0					0	my $stop = index $rest, "\n";
176	0					0	substr $rest, 0, $stop;
177							}
178
179							sub update_position {
180	5934			5934	0	8753	my $self = shift;
181	5934		50			35957	$self->{position} = pos($self->{source}) // 0;
182							}
183
184							sub error {
185	5			5	0	12	my $self = shift;
186	5					10	my $token = shift;
187	5		50			34	my $msg = shift // 'unknown';
188	5	50				21	my $line = $token ? $token->{line} : $self->{line};
189	5					83	die "toml: parse error at line $line: $msg\n";
190							}
191
192							sub tokenize_key {
193	1410			1410	0	2364	my $self = shift;
194	1410					2669	my $toml = shift;
195	1410					52666	my @segs = $toml =~ /($SimpleKey)\.?/g;
196	1410					3169	my @keys;
197
198	1410					2876	for my $seg (@segs) {
199	1692	100				4979	$seg = $self->tokenize_string($seg) if $seg =~ m/^['"]/;
200	1692					4557	push @keys, $seg;
201							}
202
203	1410					7103	return \@keys;
204							}
205
206							sub tokenize_float {
207	102			102	0	236	$_[1] =~ tr/_//d;
208	102					594	$_[1];
209							}
210
211							sub tokenize_integer {
212	429			429	0	1103	$_[1] =~ tr/_+//d;
213	429					2116	$_[1];
214							}
215
216							sub tokenize_string {
217	582			582	0	935	my $self = shift;
218	582					943	my $toml = shift;
219	582		100			3078	my $ml = index($toml, q{'''}) == 0
220							\|\| index($toml, q{"""}) == 0;
221	582					1250	my $lit = index($toml, q{'}) == 0;
222	582					1136	my $str = '';
223
224	582	100				1249	if ($ml) {
225	35					277	$str = substr $toml, 3, length($toml) - 6;
226	35					495	my @newlines = $str =~ /($CRLF)/g;
227	35					141	$self->{line} += scalar @newlines;
228	35					423	$str =~ s/^$WS* $CRLF//x; # trim leading whitespace
229	35					512	$str =~ s/\\$EOL\s*//xgs; # trim newlines from lines ending in backslash
230							} else {
231	547					2307	$str = substr($toml, 1, length($toml) - 2);
232							}
233
234	582	100				1834	if (!$lit) {
235	521					1521	$str = $self->unescape_str($str);
236							}
237
238	577					2789	return $str;
239							}
240
241							sub unescape_chars {
242	136			136	0	402	state $esc = {
243							'\b' => "\x08",
244							'\t' => "\x09",
245							'\n' => "\x0A",
246							'\f' => "\x0C",
247							'\r' => "\x0D",
248							'\"' => "\x22",
249							'\/' => "\x2F",
250							'\\\\' => "\x5C",
251							};
252
253	136	100				379	if (exists $esc->{$_[0]}) {
254	120					563	return $esc->{$_[0]};
255							}
256
257	16					50	my $hex = hex substr($_[0], 2);
258
259	16	100	100			149	if ($hex < 0x10FFFF && charnames::viacode($hex)) {
260	11					916	return chr $hex;
261							}
262
263	5					1627	return;
264							}
265
266							sub unescape_str {
267	521			521	0	7052	state $re = qr/($Escape)/;
268	521		66			2483	$_[1] =~ s\|$re\|unescape_chars($1) // $_[0]->error(undef, "invalid unicode escape: $1")\|xge;
	136					345
269	516					1245	$_[1];
270							}
271
272							1;
273
274							__END__
275
276							=pod
277
278							=encoding UTF-8
279
280							=head1 NAME
281
282							TOML::Tiny::Tokenizer - tokenizer used by TOML::Tiny
283
284							=head1 VERSION
285
286							version 0.20
287
288							=head1 AUTHOR
289
290							Jeff Ober <sysread@fastmail.fm>
291
292							=head1 COPYRIGHT AND LICENSE
293
294							This software is copyright (c) 2025 by Jeff Ober.
295
296							This is free software; you can redistribute it and/or modify it under
297							the same terms as the Perl 5 programming language system itself.
298
299							=cut