File Coverage

blib/lib/TOML/Parser/Tokenizer.pm

Criterion	Covered	Total	%
statement	201	204	98.5
branch	74	82	90.2
condition	17	26	65.3
subroutine	19	19	100.0
pod	0	2	0.0
total	311	333	93.3

line	stmt	bran	cond	sub	pod	time	code
1							package TOML::Parser::Tokenizer;
2	20			20		447	use 5.010000;
	20					76
3	20			20		114	use strict;
	20					46
	20					507
4	20			20		152	use warnings;
	20					45
	20					687
5
6	20			20		115	use Exporter 5.57 'import';
	20					297
	20					984
7
8	20	50		20		135	use constant DEBUG => $ENV{TOML_PARSER_TOKENIZER_DEBUG} ? 1 : 0;
	20					54
	20					3921
9
10							BEGIN {
11	20			20		262	my @TOKENS = map uc, qw/
12							comment
13							table
14							array_of_table
15							key
16							integer
17							float
18							boolean
19							datetime
20							string
21							multi_line_string_begin
22							multi_line_string_end
23							inline_table_begin
24							inline_table_end
25							array_begin
26							array_end
27							/;
28							my %CONSTANTS = map {
29	20					67	("TOKEN_$_" => $_)
	300					1000
30							} @TOKENS;
31
32	20					144	require constant;
33	20					2918	constant->import(\%CONSTANTS);
34
35							# Exporter
36	20					108	our @EXPORT_OK = keys %CONSTANTS;
37	20					54034	our %EXPORT_TAGS = (
38							constant => [keys %CONSTANTS],
39							);
40							};
41
42							sub grammar_regexp {
43							return +{
44	878			878	0	19944	comment => qr{#(.*)},
45							table => {
46							start => qr{\[},
47							key => qr{(?:"(.*?)(?
48							sep => qr{\.},
49							end => qr{\]},
50							},
51							array_of_table => {
52							start => qr{\[\[},
53							key => qr{(?:"(.*?)(?
54							sep => qr{\.},
55							end => qr{\]\]},
56							},
57							key => qr{(?:"(.*?)(?
58							value => {
59							datetime => qr{([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?(?:Z\|[-+][0-9]{2}:[0-9]{2}))},
60							float => qr{([-+]?(?:[0-9_]+(?:\.[0-9_]+)?[eE][-+]?[0-9_]+\|[0-9_]*\.[0-9_]+))},
61							integer => qr{([-+]?[0-9_]+)},
62							boolean => qr{(true\|false)},
63							string => qr{(?:"(.*?)(?
64							mlstring => qr{("""\|''')},
65							inline => {
66							start => qr{\{},
67							sep => qr{\s,\s},
68							end => qr{\}},
69							},
70							array => {
71							start => qr{\[},
72							sep => qr{\s,\s},
73							end => qr{\]},
74							},
75							},
76							};
77							}
78
79							sub tokenize {
80	56			56	0	441	my ($class, $src) = @_;
81
82	56					112	local $_ = $src;
83	56					224	return $class->_tokenize();
84							}
85
86							sub _tokenize {
87	56			56		120	my $class = shift;
88	56					186	my $grammar_regexp = $class->grammar_regexp();
89
90	56					126	my @tokens;
91	56					269	until (/\G\z/mgco) {
92	467	100				3879	if (/\G$grammar_regexp->{comment}/mgc) {
		100
		100
		100
		100
93	72					118	warn "[TOKEN] COMMENT: $1" if DEBUG;
94	72					202	$class->_skip_whitespace();
95	72		100			414	push @tokens => [TOKEN_COMMENT, $1 \|\| ''];
96							}
97							elsif (/\G$grammar_regexp->{array_of_table}->{start}/mgc) {
98	23					82	push @tokens => $class->_tokenize_array_of_table();
99							}
100							elsif (/\G$grammar_regexp->{table}->{start}/mgc) {
101	49					182	push @tokens => $class->_tokenize_table();
102							}
103							elsif (my @t = $class->_tokenize_key_and_value()) {
104	225					1014	push @tokens => @t;
105							}
106							elsif (/\G\s+/mgco) {
107							# pass through
108	89					263	$class->_skip_whitespace();
109							}
110							else {
111	3					39	$class->_syntax_error();
112							}
113							}
114	36					486	return @tokens;
115							}
116
117							sub _tokenize_key_and_value {
118	369			369		718	my $class = shift;
119	369					813	my $grammar_regexp = $class->grammar_regexp();
120
121	369					748	my @tokens;
122	369	100				4122	if (/\G$grammar_regexp->{key}/mgc) {
123	277		66			1771	my $key = $1 \|\| $2 \|\| $3;
124	277					431	warn "[TOKEN] KEY: $key" if DEBUG;
125	277					809	$class->_skip_whitespace();
126	277					765	push @tokens => [TOKEN_KEY, $key];
127	277					754	push @tokens => $class->_tokenize_value();
128	271					2356	return @tokens;
129							}
130
131	92					963	return;
132							}
133
134							sub _tokenize_value {
135	332			332		568	my $class = shift;
136	332					694	my $grammar_regexp = $class->grammar_regexp();
137	332					596	warn "[CALL] _tokenize_value" if DEBUG;
138
139	332	50				8179	if (/\G$grammar_regexp->{comment}/mgc) {
		100
		100
		100
		100
		100
		100
		100
		100
140	0					0	warn "[TOKEN] COMMENT: $1" if DEBUG;
141	0					0	$class->_skip_whitespace();
142	0		0			0	return [TOKEN_COMMENT, $1 \|\| ''];
143							}
144							elsif (/\G$grammar_regexp->{value}->{datetime}/mgc) {
145	2					6	warn "[TOKEN] DATETIME: $1" if DEBUG;
146	2					9	$class->_skip_whitespace();
147	2					22	return [TOKEN_DATETIME, $1];
148							}
149							elsif (/\G$grammar_regexp->{value}->{float}/mgc) {
150	20					32	warn "[TOKEN] FLOAT: $1" if DEBUG;
151	20					45	$class->_skip_whitespace();
152	20					137	return [TOKEN_FLOAT, $1];
153							}
154							elsif (/\G$grammar_regexp->{value}->{integer}/mgc) {
155	78					150	warn "[TOKEN] INTEGER: $1" if DEBUG;
156	78					217	$class->_skip_whitespace();
157	78					780	return [TOKEN_INTEGER, $1];
158							}
159							elsif (/\G$grammar_regexp->{value}->{boolean}/mgc) {
160	4					11	warn "[TOKEN] BOOLEAN: $1" if DEBUG;
161	4					17	$class->_skip_whitespace();
162	4					45	return [TOKEN_BOOLEAN, $1];
163							}
164							elsif (/\G$grammar_regexp->{value}->{mlstring}/mgc) {
165	21					40	warn "[TOKEN] MULTI LINE STRING: $1" if DEBUG;
166							return (
167	21					81	[TOKEN_MULTI_LINE_STRING_BEGIN],
168							$class->_extract_multi_line_string($1),
169							[TOKEN_MULTI_LINE_STRING_END],
170							);
171							}
172							elsif (/\G$grammar_regexp->{value}->{string}/mgc) {
173	155					334	warn "[TOKEN] STRING: $1" if DEBUG;
174	155					466	$class->_skip_whitespace();
175
176	155					407	my $is_raw = defined $2;
177	155	50				1893	return [TOKEN_STRING, defined $1 ? $1 : defined $2 ? $2 : '', $is_raw];
		100
178							}
179							elsif (/\G$grammar_regexp->{value}->{inline}->{start}/mgc) {
180	17					33	warn "[TOKEN] INLINE TABLE" if DEBUG;
181	17					50	$class->_skip_whitespace();
182							return (
183	17					54	[TOKEN_INLINE_TABLE_BEGIN],
184							$class->_tokenize_inline_table(),
185							[TOKEN_INLINE_TABLE_END],
186							);
187							}
188							elsif (/\G$grammar_regexp->{value}->{array}->{start}/mgc) {
189	32					70	warn "[TOKEN] ARRAY" if DEBUG;
190	32					117	$class->_skip_whitespace();
191							return (
192	32					187	[TOKEN_ARRAY_BEGIN],
193							$class->_tokenize_array(),
194							[TOKEN_ARRAY_END],
195							);
196							}
197
198	3					13	$class->_syntax_error();
199							}
200
201							sub _tokenize_table {
202	49			49		103	my $class = shift;
203
204	49					131	my $grammar_regexp = $class->grammar_regexp()->{table};
205	49					309	warn "[CALL] _tokenize_table" if DEBUG;
206
207	49					170	$class->_skip_whitespace();
208
209	49					134	my @expected = ($grammar_regexp->{key});
210
211	49					83	my @keys;
212							LOOP:
213	49					92	while (1) {
214	157					344	for my $rx (@expected) {
215	201	100				4927	if (/\G$rx/smgc) {
216	151	100				682	if ($rx eq $grammar_regexp->{key}) {
		100
		50
217	76		66			565	my $key = $1 \|\| $2 \|\| $3;
218	76					138	warn "[TOKEN] table key: $key" if DEBUG;
219	76					197	push @keys => $key;
220	76					217	@expected = ($grammar_regexp->{sep}, $grammar_regexp->{end});
221							}
222							elsif ($rx eq $grammar_regexp->{sep}) {
223	32					57	warn "[TOKEN] table key separator" if DEBUG;
224	32					82	@expected = ($grammar_regexp->{key});
225							}
226							elsif ($rx eq $grammar_regexp->{end}) {
227	43					79	warn "[TOKEN] table key end" if DEBUG;
228	43					99	@expected = ();
229	43					141	last LOOP;
230							}
231	108					633	$class->_skip_whitespace();
232	108					344	next LOOP;
233							}
234							}
235
236	6					22	$class->_syntax_error();
237							}
238
239	43					83	warn "[TOKEN] TABLE: @{[ join '.', @keys ]}" if DEBUG;
240	43					416	return [TOKEN_TABLE, \@keys];
241							}
242
243							sub _tokenize_array_of_table {
244	23			23		41	my $class = shift;
245
246	23					51	my $grammar_regexp = $class->grammar_regexp()->{array_of_table};
247	23					110	warn "[CALL] _tokenize_array_of_table" if DEBUG;
248
249	23					81	$class->_skip_whitespace();
250
251	23					50	my @expected = ($grammar_regexp->{key});
252
253	23					30	my @keys;
254							LOOP:
255	23					33	while (1) {
256	57					101	for my $rx (@expected) {
257	75	100				1394	if (/\G$rx/smgc) {
258	52	100				178	if ($rx eq $grammar_regexp->{key}) {
		100
		50
259	26		66			147	my $key = $1 \|\| $2 \|\| $3;
260	26					40	warn "[TOKEN] table key: $key" if DEBUG;
261	26					55	push @keys => $key;
262	26					64	@expected = ($grammar_regexp->{sep}, $grammar_regexp->{end});
263							}
264							elsif ($rx eq $grammar_regexp->{sep}) {
265	8					11	warn "[TOKEN] table key separator" if DEBUG;
266	8					14	@expected = ($grammar_regexp->{key});
267							}
268							elsif ($rx eq $grammar_regexp->{end}) {
269	18					25	warn "[TOKEN] table key end" if DEBUG;
270	18					33	@expected = ();
271	18					36	last LOOP;
272							}
273	34					146	$class->_skip_whitespace();
274	34					77	next LOOP;
275							}
276							}
277
278	5					14	$class->_syntax_error();
279							}
280
281	18					23	warn "[TOKEN] ARRAY_OF_TABLE: @{[ join '.', @keys ]}" if DEBUG;
282	18					115	return [TOKEN_ARRAY_OF_TABLE, \@keys];
283							}
284
285							sub _extract_multi_line_string {
286	21			21		78	my ($class, $delimiter) = @_;
287	21					58	my $is_raw = $delimiter eq q{'''};
288	21	100				200	if (/\G(.+?)\Q$delimiter/smgc) {
289	20					33	warn "[TOKEN] MULTI LINE STRING: $1" if DEBUG;
290	20					65	$class->_skip_whitespace();
291	20					219	return [TOKEN_STRING, $1, $is_raw];
292							}
293	1					3	$class->_syntax_error();
294							}
295
296							sub _tokenize_inline_table {
297	17			17		30	my $class = shift;
298
299	17					36	my $common_grammar_regexp = $class->grammar_regexp();
300	17					38	my $grammar_regexp = $common_grammar_regexp->{value}->{inline};
301
302	17					25	warn "[CALL] _tokenize_inline_table" if DEBUG;
303	17	50				149	return if /\G(?:$grammar_regexp->{sep})?$grammar_regexp->{end}/smgc;
304
305	17					31	my $need_sep = 0;
306
307	17					24	my @tokens;
308	17					27	while (1) {
309	94					118	warn "[CONTEXT] _tokenize_inline_table [loop]" if DEBUG;
310
311	94					256	$class->_skip_whitespace();
312	94	100				460	if (/\G$common_grammar_regexp->{comment}/mgc) {
		100
313	2					5	warn "[TOKEN] COMMENT: $1" if DEBUG;
314	2		50			12	push @tokens => [TOKEN_COMMENT, $1 \|\| ''];
315	2					7	next;
316							}
317							elsif (/\G$grammar_regexp->{end}/mgc) {
318	16					33	last;
319							}
320
321	76	100				185	if ($need_sep) {
322	30	100				162	if (/\G$grammar_regexp->{sep}/smgc) {
323	29					51	$need_sep = 0;
324	29					57	next;
325							}
326							}
327							else {
328	46	50				101	if (my @t = $class->_tokenize_key_and_value()) {
329	46					93	push @tokens => @t;
330	46					64	$need_sep = 1;
331	46					110	next;
332							}
333							}
334
335	1					3	$class->_syntax_error();
336							}
337
338	16					227	return @tokens;
339							}
340
341							sub _tokenize_array {
342	32			32		69	my $class = shift;
343
344	32					98	my $common_grammar_regexp = $class->grammar_regexp();
345	32					76	my $grammar_regexp = $common_grammar_regexp->{value}->{array};
346
347	32					49	warn "[CALL] _tokenize_array" if DEBUG;
348	32	100				425	return if /\G(?:$grammar_regexp->{sep})?$grammar_regexp->{end}/smgc;
349
350	30					73	my $need_sep = 0;
351
352	30					52	my @tokens;
353	30					50	while (1) {
354	126					176	warn "[CONTEXT] _tokenize_inline_table [loop]" if DEBUG;
355
356	126					344	$class->_skip_whitespace();
357	126	100				836	if (/\G$common_grammar_regexp->{comment}/mgc) {
		100
358	7					10	warn "[TOKEN] COMMENT: $1" if DEBUG;
359	7		50			33	push @tokens => [TOKEN_COMMENT, $1 \|\| ''];
360	7					14	next;
361							}
362							elsif (/\G$grammar_regexp->{end}/mgc) {
363	29					70	last;
364							}
365
366	90	100				213	if ($need_sep) {
367	35	100				436	if (/\G$grammar_regexp->{sep}/smgc) {
368	34					78	$need_sep = 0;
369	34					76	next;
370							}
371							}
372							else {
373	55	50				185	if (my @t = $class->_tokenize_value()) {
374	55					193	push @tokens => @t;
375	55					84	$need_sep = 1;
376	55					161	next;
377							}
378							}
379
380	1					2	$class->_syntax_error();
381							}
382
383	29					430	return @tokens;
384							}
385
386							sub _skip_whitespace {
387	1200			1200		2006	my $class = shift;
388	1200	100				3771	if (/\G\s+/smgco) {
389							# pass through
390	610					1186	warn "[PASS] WHITESPACE" if DEBUG;
391							}
392							}
393
394	20			20		48	sub _syntax_error { shift->_error('Syntax Error') }
395
396							sub _error {
397	20			20		42	my ($class, $msg) = @_;
398
399	20					36	my $src = $_;
400	20		100			52	my $curr = pos \|\| 0;
401	20					34	my $line = 1;
402	20		50			64	my $start = pos $src \|\| 0;
403	20		66			108	while ($src =~ /$/smgco and pos $src <= $curr) {
404	28					34	$start = pos $src;
405	28					89	$line++;
406							}
407	20					31	my $end = pos $src;
408	20					39	my $len = $curr - $start;
409	20	100				38	$len-- if $len > 0;
410
411	20		100			117	my $trace = join "\n",
412							"${msg}: line:$line",
413							substr($src, $start \|\| 0, $end - $start),
414							(' ' x $len) . '^';
415	20					348	die $trace, "\n";
416							}
417
418							1;
419							__END__