File Coverage

blib/lib/Language/Basic/Token.pm

Criterion	Covered	Total	%
statement	82	91	90.1
branch	26	28	92.8
condition			n/a
subroutine	25	27	92.5
pod	0	2	0.0
total	133	148	89.8

line	stmt	bran	sub	pod	time	code
1						package Language::Basic::Token;
2
3						# Part of Language::Basic by Amir Karger (See Basic.pm for details)
4
5						=pod
6
7						=head1 NAME
8
9						Language::Basic::Token - Module to handle lexing BASIC statements.
10
11						=head1 SYNOPSIS
12
13						See L for the overview of how the Language::Basic module
14						works. This pod page is more technical.
15
16						# lex a line of basic into a bunch of tokens.
17						my $token_group = new Language::Basic::Token::Group;
18						$token_group->lex('PRINT "YES","NO" : A=A+1');
19
20						# Look at tokens
21						my $tok = $token_group->lookahead && print $tok->text;
22						# Eat expected tokens
23						my $tok = $token_group->eat_if_string(",");
24						my $tok = $token_group->eat_if_class("Keyword");
25
26						=head1 DESCRIPTION
27
28						BASIC tokens are pretty simple. They include Keywords, Identifiers (Variable
29						or Function names), String and Numeric Constants, and a few one- or
30						two-character operators, like ':' and '<='. Tokens aren't very ambiguous, so
31						for example, you don't need to know what type of Statement you're looking at
32						in order to lex a line of BASIC. (The only remotely ambiguous thing is that
33						'=' can be either a Relational Operator or an Assignment statement.)
34
35						The subclasses of LB::Token represent the various sorts of tokens. The
36						Token::Group class isn't really a subclass at all; it's a group of tokens. See
37						L<"Language::Basic::Token::Group"> for more info.
38
39						=cut
40
41	16		16		105	use strict;
	16				33
	16				988
42	16		16		100	use Language::Basic::Common;
	16				33
	16				32480
43
44						# sub-packages
45						{
46						package Language::Basic::Token::Group;
47
48						package Language::Basic::Token::Comment;
49
50						package Language::Basic::Token::Arithmetic_Operator;
51						package Language::Basic::Token::Multiplicative_Operator;
52						package Language::Basic::Token::Relational_Operator;
53						package Language::Basic::Token::Logical_Operator;
54
55						package Language::Basic::Token::Identifier;
56						package Language::Basic::Token::Keyword;
57
58						package Language::Basic::Token::Separator;
59						package Language::Basic::Token::Left_Paren;
60						package Language::Basic::Token::Right_Paren;
61
62						package Language::Basic::Token::Statement_End;
63						}
64
65						# Fields:
66						# leading_whitespace Whitespace before the token
67						# text The text in the Token, upcased unless it's a string,
68						# with leading whitespace removed
69						# original_text Original text (non-upcased) including whitespace
70						#
71
72						# Takes the first token off of text string arg1. Upcases the text in the token
73						# unless it's a string constant, and blesses the Token to a subclass of
74						# Language::Basic::Token.
75						# This sub never gets called except from LB::Token::Group::lex
76
77						sub _new {
78						# TODO error if called from subclass?
79	1051		1051		1106	shift; # get rid of class
80	1051				4793	my $self = {
81						"text" => undef,
82						};
83	1051				1670	my $textref = shift;
84
85						# Figure out what sub-class to make it
86	1051				1409	my $class;
87	1051	100			4221	return undef if ($$textref =~ /^\s*$/); # end of a whole line
88
89						# Test each possible LBT subclass.
90						# Identifier needs to come after all other reserved words since
91						# it allows any letters
92						# Other classes basically don't overlap, so their order doesn't matter
93	892				1918	foreach my $c (qw(Keyword
94						Comment
95						Logical_Operator
96						Identifier
97						String_Constant Numeric_Constant
98						Left_Paren Right_Paren Separator
99						Arithmetic_Operator Multiplicative_Operator Relational_Operator
100						Statement_End)) {
101	4941				7042	$class = "Language::Basic::Token::" . $c;
102	4941				28456	my $regex = $class->regex;
103	4941	100			234597	if ($$textref =~ s/^(\s*)($regex)//) {
104	892				3551	$self->{"original_text"} = $1 . $2;
105	892				2082	$self->{"leading_whitespace"} = $1;
106	892				1348	my $text = $2;
107	892	100			2562	$text = uc($text) unless $c eq "String_Constant";
108	892				2559	$self->{"text"} = $text;
109	892				2799	last;
110						}
111	4049				16762	$class = undef;
112						}
113	892	50			3997	Exit_Error("Don't know how to lex '$$textref'!\n") unless defined $class;
114	892				4852	bless $self, $class;
115						} # end sub Language::Basic::Token::_new
116
117						=pod
118
119						The "text" method returns the text that makes up the token. Note that text
120						is stored in upper case (except for string constants, which are stored
121						exactly as entered).
122
123						=cut
124
125	1373		1373	0	6383	sub text {return shift->{"text"}}
126
127						# sub regex returns a regex which matches at the beginning of a string if the
128						# next token is of this class
129	0		0	0	0	sub regex {my $class=shift;Exit_Error($class."::regex should never be called!")}
	0				0
130
131						##############################################################################
132						{
133						package Language::Basic::Token::Group;
134						# Note: no @ISA, because Token::Group isn't really a Token
135
136						=head2 class Language::Basic::Token::Group
137
138						This important class handles a group of tokens. Text from the BASIC program
139						is lexed and turned into LB::Tokens which are stored in a Token::Group. Any
140						access to these Tokens (including creating them) is through the Token::Group
141						methods. Other classes' parse methods will usually eat their way through
142						the tokens in the Token::Group until it's empty.
143
144						=over 4
145
146						=item new
147
148						This method just creates a new LBT::Group.
149
150						=cut
151
152						sub new {
153	187		187		247	my $class = shift;
154	187				1002	my $self = {
155						"tokens" => [],
156						};
157	187				720	bless $self, $class;
158						} # end sub Language::Basic::Token::Group::new
159
160						=item lex
161
162						This method breaks BASIC text arg1 into LB::Tokens and puts them in
163						Token::Group arg0.
164
165						=cut
166
167						sub lex {
168	159		159		562	my $self = shift;
169	159				231	my $text = shift;
170	159				332	my @tokens = ();
171	159				605	while (defined (my $tok = _new Language::Basic::Token \$text)) {
172	892				3436	push @tokens, $tok;
173						}
174	159				802	$self->{"tokens"} = \@tokens;
175						#print $self->print;
176						}
177
178						=item lookahead
179
180						This method returns the next token in the Token::Group without removing
181						it from the group. That means lookahead can be called many times
182						and keep getting the same token (as long as eat is never called).
183						It returns undef if there are no more Tokens left.
184
185						=cut
186
187						sub lookahead {
188	3552		3552		4517	my $self = shift;
189	3552	100			4759	return undef unless @{$self->{"tokens"}};
	3552				23642
190	2886				5312	my $tok = $self->{"tokens"}->[0];
191	2886				5052	return $tok;
192						} # end sub Language::Basic::Token::Group::lookahead
193
194						=item eat
195
196						This method eats the next Token from the Token::Group and returns it.
197						It returns undef if there are no more Tokens left.
198
199						=cut
200
201						sub eat {
202	972		972		1179	my $self = shift;
203	972	50			4563	return undef unless @{$self->{"tokens"}};
	972				2371
204	972				1122	my $tok = shift @{$self->{"tokens"}};
	972				6009
205	972				1739	return $tok;
206						} # end sub Language::Basic::Token::Group::eat
207
208
209						=item eat_if_string
210
211						This method eats the next token from Group arg0 if it matches string arg1
212						If it ate a token, it returns it. Otherwise (or if there are no tokens left)
213						it returns undef.
214
215						Note that the string to match should be upper case, since all \w tokens
216						are stored as uppercase.
217
218						=cut
219
220						sub eat_if_string {
221	608		608		718	my $self = shift;
222	608				772	my $match = shift;
223
224	608				1049	my $tok = $self->lookahead;
225	608	100			1593	return undef unless defined $tok;
226
227						#print "looking for text '$match' and found ",$tok->text,"\n";
228	571				1351	my $matched= $tok->text eq $match;
229
230	571	100			1408	$self->eat if $matched;
231	571	100			2308	return $matched ? $tok : undef;
232						} # end sub Language::Basic::Token::Group::eat_if_string
233
234						=item eat_if_class
235
236						This method eats the next token from Group arg0 if the token is of class
237						"Language::Basic::Token::" . arg1. (I.e., it's called with "Keyword" to
238						get a Language::Basic::Token::Keyword Token.) If it ate a token, it returns it.
239						Otherwise (or if there are no tokens left) it returns undef.
240
241						=cut
242
243						sub eat_if_class {
244	1982		1982		2402	my $self = shift;
245	1982				2542	my $match = shift;
246	1982				5855	my $tok = $self->lookahead;
247	1982	100			4861	return undef unless defined $tok;
248
249						#print "looking for $match and found ",$tok->text,"\n";
250	1649				9066	my $matched= $tok->isa("Language::Basic::Token::" . $match);
251
252	1649	100			3977	$self->eat if $matched;
253	1649	100			9426	return $matched ? $tok : undef;
254						} # end sub Language::Basic::Token::Group::eat_if_class
255
256						=item slurp
257
258						Eats tokens from Group arg1 and puts them in Group arg0 until it gets
259						to a Token whose text matches string arg2 or it reaches the end of arg1. (The
260						matching Token is left in arg1.)
261
262						=cut
263
264						sub slurp {
265	28		28		50	my ($to, $from, $string) = @_;
266	28				61	while (defined(my $tok = $from->lookahead)) {
267	88	100			167	last if $tok->text eq $string;
268	80				106	push @{$to->{"tokens"}}, $from->eat;
	80				166
269						}
270						} # end sub Language::Basic::Token::Group::slurp
271
272						=item stuff_left
273
274						Returns true if there's stuff left in the Statement we're parsing (i.e. if
275						there are still tokens left in the Token::Group and the next token isn't a
276						colon)
277
278						=cut
279
280						sub stuff_left {
281	368		368		522	my $self = shift;
282	368				691	my $tok = $self->lookahead;
283	368	100			2056	return 0 unless defined $tok;
284	92				1088	return (!$tok->isa("Language::Basic::Token::Statement_End"));
285						} # end sub Language::Basic::Token::stuff_left
286
287
288						=item print
289
290						For debugging purposes. Returns the Tokens in Group arg0 nicely formatted.
291
292						=cut
293
294						sub print {
295	0		0		0	my $self = shift;
296	0				0	my $ret = "";
297	0				0	foreach (@{$self->{"tokens"}}) {
	0				0
298	0				0	($a = ref($_)) =~ s/^Language::Basic::Token/LBT/;
299	0				0	$ret .= "$a '" . $_->{"text"} . "'\n";
300						}
301	0				0	return $ret;
302						} # end sub Language::Basic::Token::Group::print
303
304						=pod
305
306						=back
307
308						=cut
309
310						} # end package Language::Basic::Token::Group
311
312						##############################################################################
313
314						=head2 Other Language::Basic::Token subclasses
315
316						The other subclasses are actually kinds of Tokens, unlike Token::Group.
317						There are no "new" methods for these classes. Creation of Tokens is done
318						by Token::Group::lex. In fact, these classes don't have any public
319						methods. They're mostly there to use "isa" on.
320
321						=over 4
322
323						=item Keyword
324
325						A BASIC keyword (reserved word)
326
327						=cut
328
329						{
330						package Language::Basic::Token::Keyword;
331						@Language::Basic::Token::Keyword::ISA = qw(Language::Basic::Token);
332
333						my @Keywords = qw (
334						DATA DEF DIM END FOR GOSUB GOTO IF INPUT
335						LET NEXT ON PRINT READ RETURN
336						TO STEP THEN ELSE
337						);
338						# Make sure not to accept something like "FORT"
339	892		892		4219	sub regex { "(?i)(" . join("\|", @Keywords) . ")\\b"}
340
341						} # end package Language::Basic::Token::Keyword
342
343						=item Identifier
344
345						An Identifier matches /[A-Z][A-Z0-9]*\$?/. It's a variable or function
346						name.
347
348						=cut
349
350						{
351						package Language::Basic::Token::Identifier;
352						@Language::Basic::Token::Identifier::ISA = qw(Language::Basic::Token);
353
354	649		649		1103	sub regex { '(?i)[A-Z][A-Z0-9]*\\$?'}
355
356						} # end package Language::Basic::Token::Identifier
357
358						=item String_Constant
359
360						Stuff inside double quotes.
361
362						=cut
363
364						{
365						package Language::Basic::Token::String_Constant;
366						@Language::Basic::Token::String_Constant::ISA = qw(Language::Basic::Token);
367
368	544		544		1624	sub regex { '".*?"'}
369
370						} # end package Language::Basic::Token::String_Constant
371
372						=item Numeric_Constant
373
374						A float (or integer, currently)
375
376						=cut
377
378						{
379						package Language::Basic::Token::Numeric_Constant;
380						@Language::Basic::Token::Numeric_Constant::ISA = qw(Language::Basic::Token);
381
382	506		506		896	sub regex { '(\\d*\\.)?\\d+'}
383
384						} # end package Language::Basic::Token::Numeric_Constant
385
386						=item Left_Paren
387
388						A "("
389
390						=cut
391
392						{
393						package Language::Basic::Token::Left_Paren;
394						@Language::Basic::Token::Left_Paren::ISA = qw(Language::Basic::Token);
395
396	285		285		575	sub regex { '\\('}
397
398						} # end package Language::Basic::Token::Left_Paren
399
400						=item Right_Paren
401
402						A ")"
403
404						=cut
405
406						{
407						package Language::Basic::Token::Right_Paren;
408						@Language::Basic::Token::Right_Paren::ISA = qw(Language::Basic::Token);
409
410	249		249		429	sub regex { '\\)'}
411
412						} # end package Language::Basic::Token::Right_Paren
413
414						=item Separator
415
416						Comma or semicolon (separators in arglists, PRINT statements)
417
418						=cut
419
420						{
421						package Language::Basic::Token::Separator;
422						@Language::Basic::Token::Separator::ISA = qw(Language::Basic::Token);
423
424	213		213		361	sub regex { '[,;]'}
425
426						} # end package Language::Basic::Token::Separator
427
428						=item Arithmetic_Operator
429
430						Plus or minus
431
432						=cut
433
434						{
435						package Language::Basic::Token::Arithmetic_Operator;
436						@Language::Basic::Token::Arithmetic_Operator::ISA = qw(Language::Basic::Token);
437
438	100		100		204	sub regex { '[-+]'}
439
440						} # end package Language::Basic::Token::Arithmetic_Operator
441
442						=item Multiplicative_Operator
443
444						Multiply or divide operators ('*' and '/')
445
446						=cut
447
448						{
449						package Language::Basic::Token::Multiplicative_Operator;
450						@Language::Basic::Token::Multiplicative_Operator::ISA = qw(Language::Basic::Token);
451
452	83		83		146	sub regex { '[*/]'}
453
454						} # end package Language::Basic::Token::Multiplicative_Operator
455
456						=item Relational_Operator
457
458						Greater than, less than, equals, and their combinations. Note that
459						equals sign is also used to assign values in BASIC.
460
461						=cut
462
463						{
464						package Language::Basic::Token::Relational_Operator;
465						@Language::Basic::Token::Relational_Operator::ISA = qw(Language::Basic::Token);
466
467						# <> <= < >= > =
468						# Note that Equals can be Rel. Op. or Assignment!
469	76		76		127	sub regex { '<[=>]?\|>=?\|='}
470
471						} # end package Language::Basic::Token::Relational_Operator
472
473						=item Logical_Operator
474
475						AND, OR, NOT
476
477						=cut
478
479						{
480						package Language::Basic::Token::Logical_Operator;
481						@Language::Basic::Token::Logical_Operator::ISA = qw(Language::Basic::Token);
482
483						sub regex {
484	665		665		1470	my @Keywords = qw (AND OR NOT);
485	665				2427	"(?i)(" . join("\|", @Keywords) . ")\\b"
486						}
487
488						} # end package Language::Basic::Token::Logical_Operator
489
490						=item Comment
491
492						REM statement (includes the whole rest of the line, even if there are colons
493						in it)
494
495						=cut
496
497						{
498						package Language::Basic::Token::Comment;
499						@Language::Basic::Token::Comment::ISA = qw(Language::Basic::Token);
500
501	671		671		1125	sub regex { '(?i)REM\\s.*'}
502
503						} # end package Language::Basic::Token::Comment
504
505						=item Statement_End
506
507						End of a statement (i.e., a colon)
508
509						=cut
510
511						{
512						package Language::Basic::Token::Statement_End;
513						@Language::Basic::Token::Statement_End::ISA = qw(Language::Basic::Token);
514
515	8		8		17	sub regex { ':'}
516
517						} # end package Language::Basic::Token::Statement_End
518
519						1; # end package Language::Basic::Token