File Coverage

blib/lib/CQL/Lexer.pm

Criterion	Covered	Total	%
statement	96	100	96.0
branch	31	38	81.5
condition	12	18	66.6
subroutine	15	16	93.7
pod	7	8	87.5
total	161	180	89.4

line	stmt	bran	cond	sub	pod	time	code
1							package CQL::Lexer;
2
3	8			8		3134	use strict;
	8					13
	8					303
4	8			8		43	use warnings;
	8					16
	8					258
5	8			8		46	use Carp qw( croak );
	8					11
	8					573
6	8			8		9421	use String::Tokenizer;
	8					18856
	8					292
7	8			8		4801	use CQL::Token;
	8					25
	8					12467
8
9							=head1 NAME
10
11							CQL::Lexer - a lexical analyzer for CQL
12
13							=head1 SYNOPSIS
14
15							my $lexer = CQL::Lexer->new();
16							$lexer->tokenize( 'foo and bar' );
17							my @tokens = $lexer->getTokens();
18
19							=head1 DESCRIPTION
20
21							CQL::Lexer is lexical analyzer for a string of CQL. Once you've
22							got a CQL::Lexer object you can tokenize a CQL string into CQL::Token
23							objects. Ordinarily you'll never want to do this yourself since
24							CQL::Parser calls CQL::Lexer for you.
25
26							CQL::Lexer uses Stevan Little's lovely String::Tokenizer in the background,
27							and does a bit of analysis afterwards to handle some peculiarities of
28							CQL: double quoted strings, <, <=, etc.
29
30							=head1 METHODS
31
32							=head2 new()
33
34							The constructor.
35
36							=cut
37
38							sub new {
39	65			65	1	8495	my $class = shift;
40	65					1870	my $self = {
41							tokenizer => String::Tokenizer->new(),
42							tokens => [],
43							position => 0,
44							};
45	65		33			1600	return bless $self, ref($class) \|\| $class;
46							}
47
48							=head2 tokenize()
49
50							Pass in a string of CQL to tokenize. This initializes the lexer with
51							data so that you can retrieve tokens.
52
53							=cut
54
55							sub tokenize {
56	78			78	1	582	my ( $self, $string ) = @_;
57
58							## extract the String::Tokenizer object we will use
59	78					173	my $tokenizer = $self->{tokenizer};
60
61							## reset position parsing a new string of tokens
62	78					492	$self->reset();
63
64							## delegate to String::Tokenizer for basic tokenization
65	78					630	debug( "tokenizing: $string" );
66	78					475	$tokenizer->tokenize( $string, '\/<>=()"',
67							String::Tokenizer->RETAIN_WHITESPACE );
68
69							## do a bit of lexical analysis on the results of basic
70	78					20699	debug( "lexical analysis on tokens" );
71	78					178	my @tokens = _analyze( $tokenizer );
72	78					293	$self->{tokens} = \@tokens;
73							}
74
75							=head2 getTokens()
76
77							Returns a list of all the tokens.
78
79							=cut
80
81							sub getTokens {
82	13			13	1	31	my $self = shift;
83	13					19	return @{ $self->{tokens} };
	13					67
84							}
85
86							=head2 token()
87
88							Returns the current token.
89
90							=cut
91
92							sub token {
93	0			0	1	0	my $self = shift;
94	0					0	return $self->{tokens}[ $self->{position} ];
95							}
96
97							=head2 nextToken()
98
99							Returns the next token, or undef if there are more tokens to retrieve
100							from the lexer.
101
102							=cut
103
104							sub nextToken {
105	335			335	1	508	my $self = shift;
106							## if we haven't gone over the end of our token list
107							## return the token at our current position while
108							## incrementing the position.
109	335	100				522	if ( $self->{position} < @{ $self->{tokens} } ) {
	335					869
110	274					879	my $token = $self->{tokens}[ $self->{position}++ ];
111	274					862	return $token;
112							}
113	61					182	return CQL::Token->new( '' );
114							}
115
116							=head2 prevToken()
117
118							Returns the previous token, or undef if there are no tokens prior
119							to the current token.
120
121							=cut
122
123							sub prevToken {
124	5			5	1	12	my $self = shift;
125							## if we're not at the start of our list of tokens
126							## return the one previous to our current position
127							## while decrementing our position.
128	5	100				18	if ( $self->{position} > 0 ) {
129	3					7	my $token = $self->{tokens}[ --$self->{position} ];
130	3					16	return $token;
131							}
132	2					8	return CQL::Token->new( '' );
133							}
134
135							=head2 reset()
136
137							Resets the iterator to start reading tokens from the beginning.
138
139							=cut
140
141							sub reset {
142	79			79	1	163	shift->{position} = 0;
143							}
144
145							## Private sub used by _analyze for collecting a backslash escaped string terminated by "
146							sub _getString {
147	23			23		35	my $iterator = shift;
148	23					36	my $string = '"';
149	23					29	my $escaping = 0;
150							# loop through the tokens untill an unescaped " found
151	23					82	while ($iterator->hasNextToken()) {
152	107					714	my $token = $iterator->nextToken();
153	107					801	$string .= $token;
154	107	100				430	if ($escaping) {
		100
		100
155	5					15	$escaping = 0;
156							} elsif ($token eq '"') {
157	23					92	return $string;
158							} elsif ($token eq "\\") {
159	5					13	$escaping = 1;
160							}
161							}
162	0					0	croak( 'unterminated string ' . $string);
163							}
164
165							## Private sub used by _analyze to process \ outside double quotes.
166							## Because we tokenized on \ any \ outside double quotes (inside is handled by _getString)
167							## might need to be concatenated with a previous and or next CQL_WORD to form one CQL_WORD token
168							sub _concatBackslash {
169	78			78		342	my $tokensRef = shift;
170	78					106	my $i = 0;
171	78					1568	while ($i < @$tokensRef) {
172	332					492	my $token = $$tokensRef[$i];
173	332	100				1144	if ($token->getString() eq "\\") {
174	1					2	my $s = "\\";
175	1					2	my $replace = 0;
176	1	50				4	if ($i > 0) {
177	1					2	my $prevToken = $$tokensRef[$i - 1];
178	1	50	33			3	if (($prevToken->getType() == CQL_WORD) and !$prevToken->{terminated}) {
179							# concatenate and delete the previous CQL_WORD token
180	1					4	$s = $prevToken->getString() . $s;
181	1					7	$i--;
182	1					3	splice @$tokensRef, $i, 1;
183	1					5	$replace = 1;
184							}
185							}
186	1	50	33			215	if (!$token->{terminated} and ($i < $#$tokensRef)) {
187	1					3	my $nextToken = $$tokensRef[$i + 1];
188	1	50				4	if ($nextToken->getType() == CQL_WORD) {
189							# concatenate and delete the next CQL_WORD token
190	1					5	$s .= $nextToken->getString();
191	1					135	splice @$tokensRef, $i + 1, 1;
192	1					4	$replace = 1;
193							}
194							}
195	1	50				4	if ($replace) {
196	1					4	$$tokensRef[$i] = CQL::Token->new($s);
197							}
198							}
199	332					1916	$i++;
200							}
201							}
202
203							sub _analyze {
204	78			78		111	my $tokenizer = shift;
205
206	78					249	my $iterator = $tokenizer->iterator();
207	78					4336	my @tokens;
208	78					237	while ( defined (my $token = $iterator->nextToken()) ) {
209
210							## <=
211	455	100	100			10581	if ( $token eq '<' and $iterator->lookAheadToken() eq '=' ) {
		100	100
		100	100
		100
		100
212	10					119	push( @tokens, CQL::Token->new( '<=' ) );
213	10					31	$iterator->nextToken();
214							}
215
216							## <>
217							elsif ( $token eq '<' and $iterator->lookAheadToken() eq '>' ) {
218	1					27	push( @tokens, CQL::Token->new( '<>') );
219	1					5	$iterator->nextToken();
220							}
221
222							## >=
223							elsif ( $token eq '>' and $iterator->lookAheadToken() eq '=' ) {
224	3					43	push( @tokens, CQL::Token->new( '>=' ) );
225	3					13	$iterator->nextToken();
226							}
227
228							## "quoted strings"
229							elsif ( $token eq '"' ) {
230	23					61	my $cqlToken = CQL::Token->new( _getString($iterator) );
231							## Mark this and the previous token as terminated to prevent concatenation with backslash
232	23					55	$cqlToken->{terminated} = 1;
233	23	100				55	if (@tokens) { $tokens[$#tokens]->{terminated} = 1; }
	16					36
234	23					79	push( @tokens, $cqlToken );
235							}
236
237							## if it's just whitespace we can zap it
238							elsif ( $token =~ /\s+/ ) {
239							## Mark the previous token as terminated to prevent concatenation with backslash
240	122	50				497	if (@tokens) {
241	122					697	$tokens[$#tokens]->{terminated} = 1;
242							}
243							}
244
245							## otherwise it's fine the way it is
246							else {
247	296					1488	push( @tokens, CQL::Token->new($token) );
248							}
249
250							} # while
251
252							## Concatenate \ outside double quotes with a previous and or next CQL_WORD to form one CQL_WORD token
253	78					637	_concatBackslash(\@tokens);
254
255	78					465	return @tokens;
256							}
257
258							sub debug {
259	156	50		156	0	407	return unless $CQL::DEBUG;
260	0						print STDERR 'CQL::Lexer: ', shift, "\n";
261							}
262
263							1;