File Coverage

blib/lib/C/Tokenize.pm

Criterion	Covered	Total	%
statement	98	102	96.0
branch	18	22	81.8
condition	3	3	100.0
subroutine	9	9	100.0
pod	3	5	60.0
total	131	141	92.9

line	stmt	bran	cond	sub	pod	time	code
1							package C::Tokenize;
2	4			4		279261	use warnings;
	4					11
	4					159
3	4			4		25	use strict;
	4					10
	4					98
4	4			4		23	use Carp;
	4					13
	4					6275
5							require Exporter;
6							our @ISA = qw(Exporter);
7							our @EXPORT_OK = qw/
8							$fargs_re
9							$char_const_re
10							$comment_re
11							$cpp_re
12							$cvar_re
13							$cxx_comment_re
14							$decimal_re
15							$grammar_re
16							$hex_re
17							$include
18							$include_local
19							$number_re
20							$octal_re
21							$operator_re
22							$reserved_re
23							$single_string_re
24							$string_re
25							$trad_comment_re
26							$word_re
27							@fields
28							decomment
29							remove_quotes
30							tokenize
31							function_arg
32							strip_comments
33							/;
34
35							our %EXPORT_TAGS = (
36							all => \@EXPORT_OK,
37							);
38
39							our $VERSION = '0.16';
40
41							# http://www.open-std.org/JTC1/SC22/WG14/www/docs/n1256.pdf
42							# 6.4.1
43
44							my @reserved_words = sort {length $b <=> length $a}
45							qw/
46							auto
47							break
48							case
49							char
50							const
51							continue
52							default
53							do
54							double
55							else
56							enum
57							extern
58							float
59							for
60							goto
61							if
62							inline
63							int
64							long
65							register
66							restrict
67							return
68							short
69							signed
70							sizeof
71							static
72							struct
73							switch
74							typedef
75							union
76							unsigned
77							void
78							volatile
79							while
80							_Bool
81							_Complex
82							_Imaginary
83							/;
84
85							my $reserved_words = join '\|', @reserved_words;
86							our $reserved_re = qr/\b(?:$reserved_words)\b/;
87
88							our @fields = qw/comment cpp char_const operator grammar
89							number word string reserved/;
90
91							# Regular expression to match a /* */ C comment.
92
93							our $trad_comment_re = qr!
94							/\*
95							(?:
96							# Match "not an asterisk"
97							[^*]
98							\|
99							# Match multiple asterisks followed
100							# by anything except an asterisk or a
101							# slash.
102							\+[^/]
103							)*
104							# Match multiple asterisks followed by a
105							# slash.
106							\*+/
107							!x;
108
109							# Regular expression to match a // C comment (C++-style comment).
110
111							our $cxx_comment_re = qr!//.*\n!;
112
113							# Master comment regex
114
115							our $comment_re = qr/
116							(?:
117							$trad_comment_re
118							\|
119							$cxx_comment_re
120							)
121							/x;
122
123							# Regular expression to match a C preprocessor instruction.
124
125							our $cpp_re = qr/^\h*
126							\#
127							(?:
128							$trad_comment_re
129							\|
130							[^\\\n]
131							\|
132							\\[^\n]
133							\|
134							\\\n
135							)+\n
136							/mx;
137
138							# Regular expression to match a C character constant like 'a' or '\0'.
139							# This allows any \. expression at all.
140
141							our $char_const_re = qr/
142							'
143							(?:
144							.
145							\|
146							\\.
147							)
148							'
149							/x;
150
151							# Regular expression to match one character operators
152
153							our $one_char_op_re = qr/(?:\%\|\&\|\+\|\-\|\=\|\/\|\\|\|\.\|\*\|\:\|>\|<\|\!\|\?\|~\|\^)/;
154
155							# Regular expression to match all operators
156
157							our $operator_re = qr/
158							(?:
159							# Operators with two characters
160							\\|\\|\|&&\|<<\|>>\|--\|\+\+\|->\|==
161							\|
162							# Operators with one or two characters
163							# followed by an equals sign.
164							(?:<<\|>>\|\+\|-\|\*\|\/\|%\|&\|\\|\|\^)
165							=
166							\|
167							$one_char_op_re
168							)
169							/x;
170
171							# Re to match a C number
172
173							our $octal_re = qr/0[0-7]+/;
174
175							our $decimal_re = qr/[-+]?([0-9]*\.)?[0-9]+([eE][-+]?[0-9]+)?l?/i;
176
177							our $hex_re = qr/0x[0-9a-f]+l?/i;
178
179							our $number_re = qr/
180							(?:
181							$hex_re
182							\|
183							$decimal_re
184							\|
185							$octal_re
186							)
187							/x;
188
189							# Re to match a C word
190
191							our $word_re = qr/[a-z_](?:[a-z_0-9]*)/i;
192
193							# Re to match C grammar
194
195							our $grammar_re = qr/[(){};,\[\]]/;
196
197							# Regular expression to match a C string.
198
199							our $single_string_re = qr/
200							(?:
201							"
202							(?:[^\\"]+\|\\[^"]\|\\")*
203							"
204							)
205							/x;
206
207
208							# Compound string regular expression.
209
210							our $string_re = qr/$single_string_re(?:\s$single_string_re)/;
211
212							# Master regular expression for tokenizing C text. This uses named
213							# captures.
214
215							our $c_re = qr/
216							(?\s+)?
217							(?:
218							(?$comment_re)
219							\|
220							(?$cpp_re)
221							\|
222							(?$char_const_re)
223							\|
224							(?$operator_re)
225							\|
226							(?$grammar_re)
227							\|
228							(?$number_re)
229							\|
230							(?$reserved_re)
231							\|
232							(?$word_re)
233							\|
234							(?$string_re)
235							)
236							/x;
237
238
239							# Match for '#include "file.h"'. This captures the entire #include
240							# statement in $1 and the file name in $2.
241
242							our $include_local = qr/
243							^
244							(\#
245							\s*
246							include
247							\s*
248							"((?:[^"]\|\\"))"
249							)
250							(\s\|$comment_re)*
251							$
252							/smx;
253
254							our $include = qr/
255							^
256							(\#
257							\s*
258							include
259							\s*
260							["<]
261							([a-zA-Z0-9\-]+\.h)
262							[">]
263							)
264							(\s\|$comment_re)*
265							$
266							/smx;
267
268							my $deref = qr!
269							[\&]+\s$word_re
270							!x;
271
272							my $array_re = qr!
273							$word_re
274							\s*
275							\[
276							\s*
277							$word_re
278							\s*
279							\]
280							!x;
281
282							my $member = qr!
283							(?:
284							(?:
285							->
286							\|
287							\.
288							)
289							$word_re
290							\|
291							$array_re
292							)
293							!x;
294
295							# Any C variable which can be used as an lvalue or a function argument.
296
297							our $cvar_re = qr!
298							(?:
299							# Any deferenced value
300							$deref
301							\|
302							# A word or a dereferenced value in brackets
303							(?:
304							$word_re
305							\|
306							$array_re
307							\|
308							$\s*$deref$
309							)
310							# Followed by zero or more struct member
311							$member*
312							)
313							!x;
314
315							# Function arguments
316
317							our $fargs_re = qr!
318							\(
319							(?:
320							\s$cvar_re\s,
321							)*
322							(?:\s$cvar_re\s)?
323							\)
324							!x;
325
326							sub decomment
327							{
328	2			2	1	1803	my ($comment) = @_;
329	2					13	$comment =~ s/^\/\(.)\*\/$/$1/sm;
330	2					5	return $comment;
331							}
332
333							sub tokenize
334							{
335	3			3	1	10768	my ($text) = @_;
336
337							# This array contains array references, each of which is a pair of
338							# start and end points of a line in $text.
339
340	3					13	my @lines = get_lines ($text);
341
342							# The tokens the input is broken into.
343
344	3					7	my @tokens;
345
346	3					5	my $line = 1;
347	3					975	while ($text =~ /\G($c_re)/g) {
348	32					143	my $match = $1;
349	32	50				125	if ($match =~ /^\s+$/s) {
350	0					0	die "Bad match.\n";
351							}
352							# Add one to the line number for each newline.
353	32					135	while ($match =~ /\n/g) {
354	11					26	$line++;
355							}
356	32					58	my %element;
357							# Store the whitespace in front of the element.
358	4	100		4		1645	if ($+{leading}) {
	4					1840
	4					4269
	32					211
359	15					76	$element{leading} = $+{leading};
360							}
361							else {
362	17					59	$element{leading} = '';
363							}
364	32					93	$element{line} = $line;
365	32					50	my $matched;
366	32					71	for my $field (@fields) {
367	186	100				940	if (defined $+{$field}) {
368	32					80	$element{type} = $field;
369	32					246	$element{$field} = $+{$field};
370	32					84	$matched = 1;
371	32					66	last;
372							}
373							}
374	32	50				88	if (! $matched) {
375	0					0	die "Bad regex $line: '$match'\n";
376							}
377
378	32					612	push @tokens, \%element;
379							}
380
381	3					27	return \@tokens;
382							}
383
384							# The return value is an array containing start and end points of the
385							# lines in $text.
386
387							sub get_lines
388							{
389	3			3	0	8	my ($text) = @_;
390	3					7	my @lines;
391	3					9	my $start = 0;
392	3					7	my $end;
393	3					6	my $line = 1;
394	3					21	while ($text =~ /\n/g) {
395	12					15	$end = pos $text;
396	12					21	$lines[$line] = {start => $start, end => $end};
397	12					13	$line++;
398	12					24	$start = $end + 1;
399							}
400	3					10	return @lines;
401							}
402
403							sub function_arg
404							{
405	1			1	0	16600	my ($c) = @_;
406	1					6	my $tokens = tokenize ($c);
407	1					3	my @args;
408							# Number of ('s minus number of )'s.
409	1					3	my $depth = 0;
410	1					4	my $arg = '';
411	1					4	for (@$tokens) {
412	20					97	my $type = $_->{type};
413	20					45	my $value = $_->{$type};
414	20	100	100			75	if ($depth == 1 && $value eq ',') {
415	4					20	$arg =~ s/^\s+//;
416	4					14	push @args, $arg;
417	4					10	$arg = '';
418	4					10	next;
419							}
420	16	100				40	if ($value eq '(') {
421	3					4	$depth++;
422	3	100				10	if ($depth == 1) {
423	1					5	$arg =~ s/^\s+//;
424	1					4	push @args, $arg;
425	1					3	$arg = '';
426	1					3	next;
427							}
428							}
429	15	100				37	if ($value eq ')') {
430	3					5	$depth--;
431							# Push final argument before the last ) of the function's
432							# arguments.
433	3	100				12	if ($depth == 0) {
434	1					7	$arg =~ s/^\s+//;
435	1					6	push @args, $arg;
436	1					4	$arg = '';
437	1					3	next;
438							}
439							}
440	14					41	$arg .= $_->{leading} . $value;
441							}
442	1	50				9	if (! wantarray ()) {
443	0					0	carp "Return value of function_arg is array";
444							}
445	1					21	return @args;
446							}
447
448							# This comes from XS::Check, moved here because it might be useful for
449							# other C projects.
450
451							sub strip_comments
452							{
453	3			3	1	14544	my ($xs) = @_;
454							# Remove all the strings from $xs so that comments within strings
455							# are not matched.
456	3					9	my @strings;
457							my $magic;
458							fail:
459	3					11	for (0..9) {
460	30					103	$magic .= ('a'..'z')[int (rand (26))];
461							}
462	3	50				54	if ($xs =~ /$magic/) {
463	0					0	goto fail;
464							}
465	3					73	while ($xs =~ /($single_string_re)/g) {
466	3					9	my $match = $1;
467	3					30	push @strings, $match;
468	3					69	$xs =~ s/\Q$match\E/$magic$#strings/;
469							}
470							# Remove trad comments but keep the line numbering. Trad comments
471							# are deleted before C++ comments, see below for explanation.
472	3					95	while ($xs =~ /($trad_comment_re)/) {
473	3					15	my $comment = $1;
474							# If the C comment consists of int/* comment */x;, it compiles
475							# OK, but if /* comment */ is completely removed then intx;
476							# doesn't compile, so at minimum substitute one space
477							# character for each comment.
478	3					8	my $subs = ' ';
479	3					14	while ($comment =~ /([\n\r])/g) {
480	2					14	$subs .= $1;
481							}
482	3					86	$xs =~ s/\Q$comment\E/$subs/;
483							}
484							# Remove "//" comments. Must do this only after removing trad
485							# comments, otherwise "/* http://bad /" has its final "/"
486							# wrongly removed.
487	3					19	$xs =~ s/$cxx_comment_re/\n/g;
488							# Restore the strings.
489	3					57	$xs =~ s/$magic([0-9]+)/$strings[$1]/g;
490	3					16	return $xs;
491							}
492
493							1;