File Coverage

blib/lib/C/Tokenize.pm

Criterion	Covered	Total	%
statement	90	93	96.7
branch	19	22	86.3
condition	3	3	100.0
subroutine	9	9	100.0
pod	3	5	60.0
total	124	132	93.9

line	stmt	bran	cond	sub	pod	time	code
1							package C::Tokenize;
2	5			5		350387	use warnings;
	5					60
	5					183
3	5			5		27	use strict;
	5					10
	5					135
4	5			5		36	use Carp;
	5					9
	5					6054
5							require Exporter;
6							our @ISA = qw(Exporter);
7							our @EXPORT_OK = qw/
8							$char_const_re
9							$comment_re
10							$cpp_re
11							$cvar_re
12							$cxx_comment_re
13							$decimal_re
14							$fargs_re
15							$grammar_re
16							$hex_re
17							$include
18							$include_local
19							$number_re
20							$octal_re
21							$operator_re
22							$reserved_re
23							$single_string_re
24							$string_re
25							$trad_comment_re
26							$word_re
27							@fields
28							decomment
29							function_arg
30							remove_quotes
31							strip_comments
32							tokenize
33							/;
34
35							our %EXPORT_TAGS = (
36							all => \@EXPORT_OK,
37							);
38
39							our $VERSION = '0.18';
40
41							# http://www.open-std.org/JTC1/SC22/WG14/www/docs/n1256.pdf
42							# 6.4.1
43
44							my @reserved_words = sort {length $b <=> length $a}
45							qw/
46							auto
47							break
48							case
49							char
50							const
51							continue
52							default
53							do
54							double
55							else
56							enum
57							extern
58							float
59							for
60							goto
61							if
62							inline
63							int
64							long
65							register
66							restrict
67							return
68							short
69							signed
70							sizeof
71							static
72							struct
73							switch
74							typedef
75							union
76							unsigned
77							void
78							volatile
79							while
80							_Bool
81							_Complex
82							_Imaginary
83							/;
84
85							my $reserved_words = join '\|', @reserved_words;
86							our $reserved_re = qr/\b(?:$reserved_words)\b/;
87
88							our @fields = qw/comment cpp char_const operator grammar
89							number word string reserved/;
90
91							# Regular expression to match a /* */ C comment.
92
93							our $trad_comment_re = qr!
94							/\*
95							(?:
96							# Match "not an asterisk"
97							[^*]
98							\|
99							# Match multiple asterisks followed
100							# by anything except an asterisk or a
101							# slash.
102							\+[^/]
103							)*
104							# Match multiple asterisks followed by a
105							# slash.
106							\*+/
107							!x;
108
109							# Regular expression to match a // C comment (C++-style comment).
110
111							our $cxx_comment_re = qr!//.*\n!;
112
113							# Master comment regex
114
115							our $comment_re = qr/
116							(?:
117							$trad_comment_re
118							\|
119							$cxx_comment_re
120							)
121							/x;
122
123							# Regular expression to match a C preprocessor instruction.
124
125							our $cpp_re = qr/^\h*
126							\#
127							(?:
128							$trad_comment_re
129							\|
130							[^\\\n]
131							\|
132							\\[^\n]
133							\|
134							\\\n
135							)+\n
136							/mx;
137
138							# Regular expression to match a C character constant like 'a' or '\0'.
139							# This allows any \. expression at all.
140
141							our $char_const_re = qr/
142							'
143							(?:
144							.
145							\|
146							\\.
147							)
148							'
149							/x;
150
151							# Regular expression to match one character operators
152
153							our $one_char_op_re = qr/(?:\%\|\&\|\+\|\-\|\=\|\/\|\\|\|\.\|\*\|\:\|>\|<\|\!\|\?\|~\|\^)/;
154
155							# Regular expression to match all operators
156
157							our $operator_re = qr/
158							(?:
159							# Operators with two characters
160							\\|\\|\|&&\|<<\|>>\|--\|\+\+\|->\|==
161							\|
162							# Operators with one or two characters
163							# followed by an equals sign.
164							(?:<<\|>>\|\+\|-\|\*\|\/\|%\|&\|\\|\|\^)
165							=
166							\|
167							$one_char_op_re
168							)
169							/x;
170
171							# Re to match a C number
172
173							our $octal_re = qr/0[0-7]+/;
174
175							our $decimal_re = qr/[-+]?([0-9]*\.)?[0-9]+([eE][-+]?[0-9]+)?l?/i;
176
177							our $hex_re = qr/0x[0-9a-f]+l?/i;
178
179							our $number_re = qr/
180							(?:
181							$hex_re
182							\|
183							$decimal_re
184							\|
185							$octal_re
186							)
187							/x;
188
189							# Re to match a C word
190
191							our $word_re = qr/[a-z_](?:[a-z_0-9]*)/i;
192
193							# Re to match C grammar
194
195							our $grammar_re = qr/[(){};,\[\]]/;
196
197							# Regular expression to match a C string.
198
199							our $single_string_re = qr/
200							(?:
201							"
202							(?:[^\\"]+\|\\[^"]\|\\")*
203							"
204							)
205							/x;
206
207
208							# Compound string regular expression.
209
210							our $string_re = qr/$single_string_re(?:\s$single_string_re)/;
211
212							# Master regular expression for tokenizing C text. This uses named
213							# captures.
214
215							our $c_re = qr/
216							(?\s+)?
217							(?:
218							(?$comment_re)
219							\|
220							(?$cpp_re)
221							\|
222							(?$char_const_re)
223							\|
224							(?$operator_re)
225							\|
226							(?$grammar_re)
227							\|
228							(?$number_re)
229							\|
230							(?$reserved_re)
231							\|
232							(?$word_re)
233							\|
234							(?$string_re)
235							)
236							/x;
237
238
239							# Match for '#include "file.h"'. This captures the entire #include
240							# statement in $1 and the file name in $2.
241
242							our $include_local = qr/
243							^
244							(\#
245							\s*
246							include
247							\s*
248							"((?:[^"]\|\\")+)"
249							)
250							(\s\|$comment_re)*
251							$
252							/smx;
253
254							our $include = qr/
255							^
256							(\#
257							\s*
258							include
259							\s*
260							["<]
261							((?:[^">]\|\\")+)
262							[">]
263							)
264							(\s\|$comment_re)*
265							$
266							/smx;
267
268							my $deref = qr!
269							[\&]+\s$word_re
270							!x;
271
272							my $array_re = qr!
273							$word_re
274							\s*
275							\[
276							\s*
277							$word_re
278							\s*
279							\]
280							!x;
281
282							my $member = qr!
283							(?:
284							(?:
285							->
286							\|
287							\.
288							)
289							$word_re
290							\|
291							$array_re
292							)
293							!x;
294
295							# Any C variable which can be used as an lvalue or a function argument.
296
297							our $cvar_re = qr!
298							(?:
299							# Any deferenced value
300							$deref
301							\|
302							# A word or a dereferenced value in brackets
303							(?:
304							$word_re
305							\|
306							$array_re
307							\|
308							$\s*$deref$
309							)
310							# Followed by zero or more struct member
311							$member*
312							)
313							!x;
314
315							# Function arguments
316
317							our $fargs_re = qr!
318							\(
319							(?:
320							\s$cvar_re\s,
321							)*
322							(?:\s$cvar_re\s)?
323							\)
324							!x;
325
326							sub decomment
327							{
328	2			2	1	1726	my ($comment) = @_;
329	2					13	$comment =~ s/^\/\(.)\*\/$/$1/sm;
330	2					5	return $comment;
331							}
332
333							sub tokenize
334							{
335	3			3	1	10100	my ($text) = @_;
336
337							# This array contains array references, each of which is a pair of
338							# start and end points of a line in $text.
339
340	3					9	my @lines = get_lines ($text);
341
342							# The tokens the input is broken into.
343
344	3					4	my @tokens;
345
346	3					5	my $line = 1;
347	3					803	while ($text =~ /\G($c_re)/g) {
348	32					87	my $match = $1;
349	32	50				71	if ($match =~ /^\s+$/s) {
350	0					0	die "Bad match.\n";
351							}
352							# Add one to the line number for each newline.
353	32					64	while ($match =~ /\n/g) {
354	11					18	$line++;
355							}
356	32					43	my %element;
357							# Store the whitespace in front of the element.
358	5	100		5		2349	if ($+{leading}) {
	5					2138
	5					3359
	32					150
359	15					49	$element{leading} = $+{leading};
360							}
361							else {
362	17					35	$element{leading} = '';
363							}
364	32					55	$element{line} = $line;
365	32					37	my $matched;
366	32					50	for my $field (@fields) {
367	186	100				523	if (defined $+{$field}) {
368	32					52	$element{type} = $field;
369	32					88	$element{$field} = $+{$field};
370	32					49	$matched = 1;
371	32					39	last;
372							}
373							}
374	32	50				57	if (! $matched) {
375	0					0	die "Bad regex $line: '$match'\n";
376							}
377
378	32					243	push @tokens, \%element;
379							}
380
381	3					18	return \@tokens;
382							}
383
384							# The return value is an array containing start and end points of the
385							# lines in $text.
386
387							sub get_lines
388							{
389	3			3	0	8	my ($text) = @_;
390	3					3	my @lines;
391	3					6	my $start = 0;
392	3					3	my $end;
393	3					6	my $line = 1;
394	3					16	while ($text =~ /\n/g) {
395	12					15	$end = pos $text;
396	12					25	$lines[$line] = {start => $start, end => $end};
397	12					14	$line++;
398	12					16	$start = $end + 1;
399							}
400	3					10	return @lines;
401							}
402
403							sub function_arg
404							{
405	1			1	0	10469	my ($c) = @_;
406	1					3	my $tokens = tokenize ($c);
407	1					2	my @args;
408							# Number of ('s minus number of )'s.
409	1					3	my $depth = 0;
410	1					2	my $arg = '';
411	1					3	for (@$tokens) {
412	20					25	my $type = $_->{type};
413	20					25	my $value = $_->{$type};
414	20	100	100			47	if ($depth == 1 && $value eq ',') {
415	4					10	$arg =~ s/^\s+//;
416	4					6	push @args, $arg;
417	4					5	$arg = '';
418	4					6	next;
419							}
420	16	100				28	if ($value eq '(') {
421	3					5	$depth++;
422	3	100				6	if ($depth == 1) {
423	1					3	$arg =~ s/^\s+//;
424	1					2	push @args, $arg;
425	1					3	$arg = '';
426	1					2	next;
427							}
428							}
429	15	100				25	if ($value eq ')') {
430	3					4	$depth--;
431							# Push final argument before the last ) of the function's
432							# arguments.
433	3	100				5	if ($depth == 0) {
434	1					6	$arg =~ s/^\s+//;
435	1					2	push @args, $arg;
436	1					3	$arg = '';
437	1					2	next;
438							}
439							}
440	14					25	$arg .= $_->{leading} . $value;
441							}
442	1	50				3	if (! wantarray ()) {
443	0					0	carp "Return value of function_arg is array";
444							}
445	1					12	return @args;
446							}
447
448							# This comes from XS::Check, moved here because it might be useful for
449							# other C projects.
450
451							sub strip_comments
452							{
453	4			4	1	12513	my ($xs) = @_;
454							# Remove trad comments but keep the line numbering. Trad comments
455							# are deleted before C++ comments, see below for explanation.
456	4					146	while ($xs =~ /($single_string_re\|$trad_comment_re\|$cxx_comment_re)/g) {
457	10					24	my $comment = $1;
458	10	100				31	if ($comment =~ /^".*"$/) {
459	5					26	next;
460							}
461							# If the C comment consists of int/* comment */x;, it compiles
462							# OK, but if /* comment */ is completely removed then intx;
463							# doesn't compile, so at minimum substitute one space
464							# character for each comment.
465	5					6	my $subs = ' ';
466	5					14	while ($comment =~ /([\n\r])/g) {
467	4					14	$subs .= $1;
468							}
469	5					97	$xs =~ s/\Q$comment\E/$subs/;
470							}
471	4					12	return $xs;
472							}
473
474							1;