File Coverage

blib/lib/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	76	77	98.7
branch	25	28	89.2
condition	8	10	80.0
subroutine	8	8	100.0
pod	1	1	100.0
total	118	124	95.1

line	stmt	bran	cond	sub	pod	time	code
1							package Regexp::Lexer;
2	7			7		6835	use 5.008001;
	7					21
	7					246
3	7			7		29	use strict;
	7					7
	7					185
4	7			7		35	use warnings;
	7					8
	7					154
5	7			7		29	use B;
	7					8
	7					313
6	7			7		28	use Carp qw/croak/;
	7					9
	7					329
7	7			7		2771	use Regexp::Lexer::TokenType;
	7					13
	7					208
8	7			7		3599	use parent qw(Exporter);
	7					2052
	7					36
9
10							our @EXPORT_OK = qw(tokenize);
11
12							our $VERSION = "0.03";
13
14							my %escapedSpecialChar = (
15							t => Regexp::Lexer::TokenType::EscapedTab,
16							n => Regexp::Lexer::TokenType::EscapedNewline,
17							r => Regexp::Lexer::TokenType::EscapedReturn,
18							f => Regexp::Lexer::TokenType::EscapedFormFeed,
19							F => Regexp::Lexer::TokenType::EscapedFoldcase,
20							a => Regexp::Lexer::TokenType::EscapedAlarm,
21							e => Regexp::Lexer::TokenType::EscapedEscape,
22							c => Regexp::Lexer::TokenType::EscapedControlChar,
23							x => Regexp::Lexer::TokenType::EscapedCharHex,
24							o => Regexp::Lexer::TokenType::EscapedCharOct,
25							0 => Regexp::Lexer::TokenType::EscapedCharOct,
26							l => Regexp::Lexer::TokenType::EscapedLowerNext,
27							u => Regexp::Lexer::TokenType::EscapedUpperNext,
28							L => Regexp::Lexer::TokenType::EscapedLowerUntil,
29							U => Regexp::Lexer::TokenType::EscapedUpperUntil,
30							Q => Regexp::Lexer::TokenType::EscapedQuoteMetaUntil,
31							E => Regexp::Lexer::TokenType::EscapedEnd,
32							w => Regexp::Lexer::TokenType::EscapedWordChar,
33							W => Regexp::Lexer::TokenType::EscapedNotWordChar,
34							s => Regexp::Lexer::TokenType::EscapedWhiteSpaceChar,
35							S => Regexp::Lexer::TokenType::EscapedNotWhiteSpaceChar,
36							d => Regexp::Lexer::TokenType::EscapedDigitChar,
37							D => Regexp::Lexer::TokenType::EscapedNotDigitChar,
38							p => Regexp::Lexer::TokenType::EscapedProp,
39							P => Regexp::Lexer::TokenType::EscapedNotProp,
40							X => Regexp::Lexer::TokenType::EscapedUnicodeExtendedChar,
41							C => Regexp::Lexer::TokenType::EscapedCChar,
42							1 => Regexp::Lexer::TokenType::EscapedBackRef,
43							2 => Regexp::Lexer::TokenType::EscapedBackRef,
44							3 => Regexp::Lexer::TokenType::EscapedBackRef,
45							4 => Regexp::Lexer::TokenType::EscapedBackRef,
46							5 => Regexp::Lexer::TokenType::EscapedBackRef,
47							6 => Regexp::Lexer::TokenType::EscapedBackRef,
48							7 => Regexp::Lexer::TokenType::EscapedBackRef,
49							8 => Regexp::Lexer::TokenType::EscapedBackRef,
50							9 => Regexp::Lexer::TokenType::EscapedBackRef,
51							g => Regexp::Lexer::TokenType::EscapedBackRef,
52							k => Regexp::Lexer::TokenType::EscapedBackRef,
53							K => Regexp::Lexer::TokenType::EscapedKeepStuff,
54							v => Regexp::Lexer::TokenType::EscapedVerticalWhitespace,
55							V => Regexp::Lexer::TokenType::EscapedNotVerticalWhitespace,
56							h => Regexp::Lexer::TokenType::EscapedHorizontalWhitespace,
57							H => Regexp::Lexer::TokenType::EscapedNotHorizontalWhitespace,
58							R => Regexp::Lexer::TokenType::EscapedLinebreak,
59							b => Regexp::Lexer::TokenType::EscapedWordBoundary,
60							B => Regexp::Lexer::TokenType::EscapedNotWordBoundary,
61							A => Regexp::Lexer::TokenType::EscapedBeginningOfString,
62							Z => Regexp::Lexer::TokenType::EscapedEndOfStringBeforeNewline,
63							z => Regexp::Lexer::TokenType::EscapedEndOfString,
64							G => Regexp::Lexer::TokenType::EscapedPos,
65							);
66
67							my %specialChar = (
68							'.' => Regexp::Lexer::TokenType::MatchAny,
69							'\|' => Regexp::Lexer::TokenType::Alternation,
70							'(' => Regexp::Lexer::TokenType::LeftParenthesis,
71							')' => Regexp::Lexer::TokenType::RightParenthesis,
72							'[' => Regexp::Lexer::TokenType::LeftBracket,
73							']' => Regexp::Lexer::TokenType::RightBracket,
74							'{' => Regexp::Lexer::TokenType::LeftBrace,
75							'}' => Regexp::Lexer::TokenType::RightBrace,
76							'<' => Regexp::Lexer::TokenType::LeftAngle,
77							'>' => Regexp::Lexer::TokenType::RightAngle,
78							'*' => Regexp::Lexer::TokenType::Asterisk,
79							'+' => Regexp::Lexer::TokenType::Plus,
80							'?' => Regexp::Lexer::TokenType::Question,
81							',' => Regexp::Lexer::TokenType::Comma,
82							'-' => Regexp::Lexer::TokenType::Minus,
83							'$' => Regexp::Lexer::TokenType::ScalarSigil,
84							'@' => Regexp::Lexer::TokenType::ArraySigil,
85							':' => Regexp::Lexer::TokenType::Colon,
86							'#' => Regexp::Lexer::TokenType::Sharp,
87							'^' => Regexp::Lexer::TokenType::Cap,
88							'=' => Regexp::Lexer::TokenType::Equal,
89							'!' => Regexp::Lexer::TokenType::Exclamation,
90							q<'> => Regexp::Lexer::TokenType::SingleQuote,
91							q<"> => Regexp::Lexer::TokenType::DoubleQuote,
92							);
93
94							sub tokenize {
95	7			7	1	35924	my ($re) = @_;
96
97	7	100				42	if (ref $re ne 'Regexp') {
98	1					184	croak "Not regexp quoted argument is given";
99							}
100
101							# B::cstring() is used to escape backslashes
102	6					98	my $re_cluster_string = B::cstring($re);
103
104							# to remove double-quotes and parenthesis on leading and trailing
105	6					26	my $re_str = substr(substr($re_cluster_string, 2), 0, -2);
106
107	6					26	$re_str =~ s/\\"/"/g; # for double quote which is converted by B::cstring
108
109							# extract modifiers
110	6					47	$re_str =~ s/\A[?]([^:]*)://;
111	6					18	my @modifiers;
112	6					40	for my $modifier (split //, $1) {
113	10					28	push @modifiers, $modifier;
114							}
115
116	6					57	my @chars = split //, $re_str;
117
118	6					16	my @tokens;
119	6					12	my $index = 0;
120
121	6					12	my $end_of_line_exists = 0;
122	6	100				27	if ($chars[-1] eq '$') {
123	1					1	pop @chars;
124	1					2	$end_of_line_exists = 1;
125							}
126
127	6	100				34	if ($chars[0] eq '^') {
128	1					5	push @tokens, {
129							char => shift @chars,
130							index => ++$index,
131							type => Regexp::Lexer::TokenType::BeginningOfLine,
132							};
133							}
134
135	6					15	my $backslashes = 0;
136	6					27	my $next_c;
137	6					31	for (my $i = 0; defined(my $c = $chars[$i]); $i++) {
138	111	100				169	if ($c eq '\\') {
139	59	100				112	if ($backslashes <= 1) {
140	47					41	$backslashes++;
141	47					93	next;
142							}
143
144							# now status -> '\\\\\\'
145	12	50				26	if ($backslashes == 2) {
146	12					14	$next_c = $chars[++$i];
147	12	50	33			52	if (!defined $next_c \|\| $next_c ne '\\') {
148	0					0	croak "Invalid syntax regexp is given"; # fail safe
149							}
150
151	12					29	push @tokens, {
152							char => '\\\\',
153							index => ++$index,
154							type => Regexp::Lexer::TokenType::EscapedCharacter,
155							};
156
157	12					13	$backslashes = 0;
158	12					24	next;
159							}
160							}
161
162							# To support NOT META newline character which is in regexp
163	52	100				144	if ($backslashes == 1) {
164	3					7	my $type = Regexp::Lexer::TokenType::Unknown;
165	3	100				8	if ($c eq 'n') {
		50
166	2					3	$type = Regexp::Lexer::TokenType::Newline;
167							}
168							elsif ($c eq 'r') { # XXX maybe unreachable
169	1					2	$type = Regexp::Lexer::TokenType::Return;
170							}
171
172	3					9	push @tokens, {
173							char => '\\' . $c,
174							index => ++$index,
175							type => $type,
176							};
177
178	3					4	$backslashes = 0;
179	3					5	next;
180							}
181
182	49	100				90	if ($backslashes == 2) {
183	10					21	my $type = $escapedSpecialChar{$c};
184
185							# Determine meaning of \N
186	10	100				31	if ($c eq 'N') {
187	4					7	$type = Regexp::Lexer::TokenType::EscapedCharUnicode;
188
189	4					8	$next_c = $chars[$i+1];
190	4	100	100			23	if (!defined $next_c \|\| $next_c ne '{') {
191	2					5	$type = Regexp::Lexer::TokenType::EscapedNotNewline;
192							}
193							}
194
195	10		100			71	push @tokens, {
196							char => '\\' . $c,
197							index => ++$index,
198							type => $type \|\| Regexp::Lexer::TokenType::EscapedCharacter,
199							};
200
201	10					14	$backslashes = 0;
202	10					29	next;
203							}
204
205	39		100			181	push @tokens, {
206							char => $c,
207							index => ++$index,
208							type => $specialChar{$c} \|\| Regexp::Lexer::TokenType::Character,
209							};
210
211	39					98	$backslashes = 0; # for fail safe
212							}
213
214	6	100				26	if ($end_of_line_exists) {
215	1					2	push @tokens, {
216							char => '$',
217							index => ++$index,
218							type => Regexp::Lexer::TokenType::EndOfLine,
219							};
220							}
221
222							return {
223	6					64	tokens => \@tokens,
224							modifiers => \@modifiers,
225							};
226							}
227
228							1;
229							__END__