File Coverage

blib/lib/C/Tokenize.pm

Criterion	Covered	Total	%
statement	75	89	84.2
branch	19	22	86.3
condition	3	3	100.0
subroutine	7	8	87.5
pod	3	5	60.0
total	107	127	84.2

line	stmt	bran	cond	sub	pod	time	code
1							package C::Tokenize;
2	6			6		740268	use warnings;
	6					11
	6					358
3	6			6		60	use strict;
	6					11
	6					176
4	6			6		28	use Carp;
	6					11
	6					13663
5							require Exporter;
6							our @ISA = qw(Exporter);
7							our @EXPORT_OK = qw/
8							$char_const_re
9							$comment_re
10							$cpp_re
11							$cvar_re
12							$cxx_comment_re
13							$decimal_re
14							$fargs_re
15							$grammar_re
16							$hex_re
17							$include
18							$include_local
19							$number_re
20							$octal_re
21							$operator_re
22							$reserved_re
23							$single_string_re
24							$string_re
25							$trad_comment_re
26							$word_re
27							@fields
28							decomment
29							function_arg
30							strip_comments
31							tokenize
32							/;
33
34							our %EXPORT_TAGS = (
35							all => \@EXPORT_OK,
36							);
37
38							our $VERSION = '0.19';
39
40							# http://www.open-std.org/JTC1/SC22/WG14/www/docs/n1256.pdf
41							# 6.4.1
42
43							my @reserved_words = sort {length $b <=> length $a}
44							qw/
45							auto
46							break
47							case
48							char
49							const
50							continue
51							default
52							do
53							double
54							else
55							enum
56							extern
57							float
58							for
59							goto
60							if
61							inline
62							int
63							long
64							register
65							restrict
66							return
67							short
68							signed
69							sizeof
70							static
71							struct
72							switch
73							typedef
74							union
75							unsigned
76							void
77							volatile
78							while
79							_Bool
80							_Complex
81							_Imaginary
82							/;
83
84							my $reserved_words = join '\|', @reserved_words;
85							our $reserved_re = qr/\b(?:$reserved_words)\b/;
86
87							our @fields = qw/
88							char_const
89							comment
90							cpp
91							grammar
92							number
93							operator
94							reserved
95							string
96							word
97							/;
98
99							# Regular expression to match a /* */ C comment.
100
101							our $trad_comment_re = qr!
102							/\*
103							(?:
104							# Match "not an asterisk"
105							[^*]
106							\|
107							# Match multiple asterisks followed
108							# by anything except an asterisk or a
109							# slash.
110							\+[^/]
111							)*
112							# Match multiple asterisks followed by a
113							# slash.
114							\*+/
115							!x;
116
117							# Regular expression to match a // C comment (C++-style comment).
118
119							our $cxx_comment_re = qr!//.*\n!;
120
121							# Master comment regex
122
123							our $comment_re = qr/
124							(?:
125							$trad_comment_re
126							\|
127							$cxx_comment_re
128							)
129							/x;
130
131							# Regular expression to match a C preprocessor instruction.
132
133							our $cpp_re = qr/^\h*
134							\#
135							(?:
136							$trad_comment_re
137							\|
138							[^\\\n]
139							\|
140							\\[^\n]
141							\|
142							\\\n
143							)+\n
144							/mx;
145
146							# Regular expression to match a C character constant like 'a' or '\0'.
147							# This allows any \. expression at all.
148
149							our $char_const_re = qr/
150							'
151							(?:
152							.
153							\|
154							\\.
155							)
156							'
157							/x;
158
159							# Regular expression to match one character operators
160
161							our $one_char_op_re = qr/(?:\%\|\&\|\+\|\-\|\=\|\/\|\\|\|\.\|\*\|\:\|>\|<\|\!\|\?\|~\|\^)/;
162
163							# Regular expression to match all operators
164
165							our $operator_re = qr/
166							(?:
167							# # Operators with two characters #
168							\\|\\|\|&&\|<<\|>>\|--\|\+\+\|->\|==
169							\|
170							# Operators with one or two characters
171							# followed by an equals sign.
172							(?:<<\|>>\|\+\|-\|\*\|\/\|%\|&\|\\|\|\^)
173							=
174							\|
175							$one_char_op_re
176							)
177							/x;
178
179							# Re to match a C number
180
181							our $octal_re = qr/0[0-7]+/;
182
183							our $decimal_re = qr/[-+]?([0-9]*\.)?[0-9]+([eE][-+]?[0-9]+)?l?/i;
184
185							our $hex_re = qr/0x[0-9a-f]+l?/i;
186
187							our $number_re = qr/
188							(?:
189							$hex_re
190							\|
191							$decimal_re
192							\|
193							$octal_re
194							)
195							/x;
196
197							# Re to match a C word
198
199							our $word_re = qr/[a-z_](?:[a-z_0-9]*)/i;
200
201							# Re to match C grammar
202
203							our $grammar_re = qr/[(){};,\[\]]/;
204
205							# Regular expression to match a C string.
206
207							our $single_string_re = qr/
208							(?:
209							"
210							(?:[^\\"]+\|\\[^"]\|\\")*
211							"
212							)
213							/x;
214
215
216							# Compound string regular expression.
217
218							our $string_re = qr/$single_string_re(?:\s$single_string_re)/;
219
220							# Master regular expression for tokenizing C text. This uses named
221							# captures.
222
223							our $c_re = qr/
224							(?\s+)?
225							(?:
226							(?$comment_re)
227							\|
228							(?$cpp_re)
229							\|
230							(?$char_const_re)
231							\|
232							(?$operator_re)
233							\|
234							(?$grammar_re)
235							\|
236							(?$number_re)
237							\|
238							(?$reserved_re)
239							\|
240							(?$word_re)
241							\|
242							(?$string_re)
243							)
244							/x;
245
246
247							# Match for '#include "file.h"'. This captures the entire #include
248							# statement in $1 and the file name in $2.
249
250							our $include_local = qr/
251							^
252							(\#
253							\s*
254							include
255							\s*
256							"((?:[^"]\|\\")+)"
257							)
258							(\s\|$comment_re)*
259							$
260							/smx;
261
262							our $include = qr/
263							^
264							(\#
265							\s*
266							include
267							\s*
268							["<]
269							((?:[^">]\|\\")+)
270							[">]
271							)
272							(\s\|$comment_re)*
273							$
274							/smx;
275
276							my $deref = qr!
277							[\&]+\s$word_re
278							!x;
279
280							my $array_re = qr!
281							$word_re
282							\s*
283							\[
284							\s*
285							$word_re
286							\s*
287							\]
288							!x;
289
290							my $member = qr!
291							(?:
292							(?:
293							->
294							\|
295							\.
296							)
297							$word_re
298							\|
299							$array_re
300							)
301							!x;
302
303							# Any C variable which can be used as an lvalue or a function argument.
304
305							our $cvar_re = qr!
306							(?:
307							# Any deferenced value
308							$deref
309							\|
310							# A word or a dereferenced value in brackets
311							(?:
312							$word_re
313							\|
314							$array_re
315							\|
316							$\s*$deref$
317							)
318							# Followed by zero or more struct member
319							$member*
320							)
321							!x;
322
323							# Function arguments
324
325							our $fargs_re = qr!
326							\(
327							(?:
328							\s$cvar_re\s,
329							)*
330							(?:\s$cvar_re\s)?
331							\)
332							!x;
333
334							sub decomment
335							{
336	2			2	1	2732	my ($comment) = @_;
337	2					17	$comment =~ s/^\/\(.)\*\/$/$1/sm;
338	2					7	return $comment;
339							}
340
341							sub tokenize
342							{
343	3			3	1	250465	my ($text) = @_;
344
345							# This array contains array references, each of which is a pair of
346							# start and end points of a line in $text.
347
348							# The tokens the input is broken into.
349
350	3					6	my @tokens;
351
352	3					6	my $line = 1;
353	3					988	while ($text =~ /\G($c_re)/g) {
354	32					80	my $match = $1;
355	32	50				59	if ($match =~ /^\s+$/s) {
356	0					0	die "Bad match.\n";
357							}
358							# Add one to the line number for each newline.
359	32					52	while ($match =~ /\n/g) {
360	11					18	$line++;
361							}
362	32					28	my %element;
363							# Store the whitespace in front of the element.
364	32	100				94	if ($+{leading}) {
365	15					30	$element{leading} = $+{leading};
366							}
367							else {
368	17					23	$element{leading} = '';
369							}
370	32					46	$element{line} = $line;
371	32					26	my $matched;
372	32					35	for my $field (@fields) {
373	185	100				414	if (defined $+{$field}) {
374	32					38	$element{type} = $field;
375	32					65	$element{$field} = $+{$field};
376	32					37	$matched = 1;
377	32					34	last;
378							}
379							}
380	32	50				45	if (! $matched) {
381	0					0	die "Bad regex $line: '$match'\n";
382							}
383
384	32					167	push @tokens, \%element;
385							}
386
387	3					22	return \@tokens;
388							}
389
390							# The return value is an array containing start and end points of the
391							# lines in $text.
392
393							sub get_lines
394							{
395	0			0	0	0	my ($text) = @_;
396	0					0	my @lines;
397	0					0	my $start = 0;
398	0					0	my $end;
399	0					0	my $line = 1;
400	0					0	while ($text =~ /\n/g) {
401	0					0	$end = pos $text;
402	0					0	$lines[$line] = {start => $start, end => $end};
403	0					0	$line++;
404	0					0	$start = $end + 1;
405							}
406	0					0	return @lines;
407							}
408
409							sub function_arg
410							{
411	1			1	0	174991	my ($c) = @_;
412	1					4	my $tokens = tokenize ($c);
413	1					2	my @args;
414							# Number of ('s minus number of )'s.
415	1					2	my $depth = 0;
416	1					1	my $arg = '';
417	1					3	for (@$tokens) {
418	20					19	my $type = $_->{type};
419	20					17	my $value = $_->{$type};
420	20	100	100			38	if ($depth == 1 && $value eq ',') {
421	4					7	$arg =~ s/^\s+//;
422	4					5	push @args, $arg;
423	4					4	$arg = '';
424	4					8	next;
425							}
426	16	100				20	if ($value eq '(') {
427	3					2	$depth++;
428	3	100				6	if ($depth == 1) {
429	1					2	$arg =~ s/^\s+//;
430	1					2	push @args, $arg;
431	1					2	$arg = '';
432	1					1	next;
433							}
434							}
435	15	100				17	if ($value eq ')') {
436	3					3	$depth--;
437							# Push final argument before the last ) of the function's
438							# arguments.
439	3	100				4	if ($depth == 0) {
440	1					2	$arg =~ s/^\s+//;
441	1					2	push @args, $arg;
442	1					1	$arg = '';
443	1					2	next;
444							}
445							}
446	14					18	$arg .= $_->{leading} . $value;
447							}
448	1	50				2	if (! wantarray ()) {
449	0					0	carp "Return value of function_arg is array";
450							}
451	1					12	return @args;
452							}
453
454							# This comes from XS::Check, moved here because it might be useful for
455							# other C projects.
456
457							sub strip_comments
458							{
459	4			4	1	214326	my ($xs) = @_;
460							# Remove trad comments but keep the line numbering. Trad comments
461							# are deleted before C++ comments, see below for explanation.
462	4					284	while ($xs =~ /($single_string_re\|$trad_comment_re\|$cxx_comment_re)/g) {
463	10					36	my $comment = $1;
464							# The /s regex qualifier is necessary because
465							# $single_string_re can match newlines within the
466							# quotes. 2025-08-05
467	10	100				41	if ($comment =~ /^".*"$/s) {
468	5					40	next;
469							}
470							# If the C comment consists of int/* comment */x;, it compiles
471							# OK, but if /* comment */ is completely removed then intx;
472							# doesn't compile, so at minimum substitute one space
473							# character for each comment.
474	5					11	my $subs = ' ';
475	5					32	while ($comment =~ /([\n\r])/g) {
476	4					22	$subs .= $1;
477							}
478	5					156	$xs =~ s/\Q$comment\E/$subs/;
479							}
480	4					19	return $xs;
481							}
482
483							1;