File Coverage

blib/lib/Language/Basic/Token.pm
Criterion Covered Total %
statement 82 91 90.1
branch 26 28 92.8
condition n/a
subroutine 25 27 92.5
pod 0 2 0.0
total 133 148 89.8


line stmt bran cond sub pod time code
1             package Language::Basic::Token;
2              
3             # Part of Language::Basic by Amir Karger (See Basic.pm for details)
4              
5             =pod
6              
7             =head1 NAME
8              
9             Language::Basic::Token - Module to handle lexing BASIC statements.
10              
11             =head1 SYNOPSIS
12              
13             See L for the overview of how the Language::Basic module
14             works. This pod page is more technical.
15              
16             # lex a line of basic into a bunch of tokens.
17             my $token_group = new Language::Basic::Token::Group;
18             $token_group->lex('PRINT "YES","NO" : A=A+1');
19              
20             # Look at tokens
21             my $tok = $token_group->lookahead && print $tok->text;
22             # Eat expected tokens
23             my $tok = $token_group->eat_if_string(",");
24             my $tok = $token_group->eat_if_class("Keyword");
25              
26             =head1 DESCRIPTION
27              
28             BASIC tokens are pretty simple. They include Keywords, Identifiers (Variable
29             or Function names), String and Numeric Constants, and a few one- or
30             two-character operators, like ':' and '<='. Tokens aren't very ambiguous, so
31             for example, you don't need to know what type of Statement you're looking at
32             in order to lex a line of BASIC. (The only remotely ambiguous thing is that
33             '=' can be either a Relational Operator or an Assignment statement.)
34              
35             The subclasses of LB::Token represent the various sorts of tokens. The
36             Token::Group class isn't really a subclass at all; it's a group of tokens. See
37             L<"Language::Basic::Token::Group"> for more info.
38              
39             =cut
40              
41 16     16   105 use strict;
  16         33  
  16         988  
42 16     16   100 use Language::Basic::Common;
  16         33  
  16         32480  
43              
44             # sub-packages
45             {
46             package Language::Basic::Token::Group;
47              
48             package Language::Basic::Token::Comment;
49              
50             package Language::Basic::Token::Arithmetic_Operator;
51             package Language::Basic::Token::Multiplicative_Operator;
52             package Language::Basic::Token::Relational_Operator;
53             package Language::Basic::Token::Logical_Operator;
54              
55             package Language::Basic::Token::Identifier;
56             package Language::Basic::Token::Keyword;
57              
58             package Language::Basic::Token::Separator;
59             package Language::Basic::Token::Left_Paren;
60             package Language::Basic::Token::Right_Paren;
61              
62             package Language::Basic::Token::Statement_End;
63             }
64              
65             # Fields:
66             # leading_whitespace Whitespace before the token
67             # text The text in the Token, upcased unless it's a string,
68             # with leading whitespace removed
69             # original_text Original text (non-upcased) including whitespace
70             #
71              
72             # Takes the first token off of text string arg1. Upcases the text in the token
73             # unless it's a string constant, and blesses the Token to a subclass of
74             # Language::Basic::Token.
75             # This sub never gets called except from LB::Token::Group::lex
76              
77             sub _new {
78             # TODO error if called from subclass?
79 1051     1051   1106 shift; # get rid of class
80 1051         4793 my $self = {
81             "text" => undef,
82             };
83 1051         1670 my $textref = shift;
84              
85             # Figure out what sub-class to make it
86 1051         1409 my $class;
87 1051 100       4221 return undef if ($$textref =~ /^\s*$/); # end of a whole line
88              
89             # Test each possible LBT subclass.
90             # Identifier needs to come after all other reserved words since
91             # it allows any letters
92             # Other classes basically don't overlap, so their order doesn't matter
93 892         1918 foreach my $c (qw(Keyword
94             Comment
95             Logical_Operator
96             Identifier
97             String_Constant Numeric_Constant
98             Left_Paren Right_Paren Separator
99             Arithmetic_Operator Multiplicative_Operator Relational_Operator
100             Statement_End)) {
101 4941         7042 $class = "Language::Basic::Token::" . $c;
102 4941         28456 my $regex = $class->regex;
103 4941 100       234597 if ($$textref =~ s/^(\s*)($regex)//) {
104 892         3551 $self->{"original_text"} = $1 . $2;
105 892         2082 $self->{"leading_whitespace"} = $1;
106 892         1348 my $text = $2;
107 892 100       2562 $text = uc($text) unless $c eq "String_Constant";
108 892         2559 $self->{"text"} = $text;
109 892         2799 last;
110             }
111 4049         16762 $class = undef;
112             }
113 892 50       3997 Exit_Error("Don't know how to lex '$$textref'!\n") unless defined $class;
114 892         4852 bless $self, $class;
115             } # end sub Language::Basic::Token::_new
116              
117             =pod
118              
119             The "text" method returns the text that makes up the token. Note that text
120             is stored in upper case (except for string constants, which are stored
121             exactly as entered).
122              
123             =cut
124              
125 1373     1373 0 6383 sub text {return shift->{"text"}}
126              
127             # sub regex returns a regex which matches at the beginning of a string if the
128             # next token is of this class
129 0     0 0 0 sub regex {my $class=shift;Exit_Error($class."::regex should never be called!")}
  0         0  
130              
131             ##############################################################################
132             {
133             package Language::Basic::Token::Group;
134             # Note: no @ISA, because Token::Group isn't really a Token
135              
136             =head2 class Language::Basic::Token::Group
137              
138             This important class handles a group of tokens. Text from the BASIC program
139             is lexed and turned into LB::Tokens which are stored in a Token::Group. Any
140             access to these Tokens (including creating them) is through the Token::Group
141             methods. Other classes' parse methods will usually eat their way through
142             the tokens in the Token::Group until it's empty.
143              
144             =over 4
145              
146             =item new
147              
148             This method just creates a new LBT::Group.
149              
150             =cut
151              
152             sub new {
153 187     187   247 my $class = shift;
154 187         1002 my $self = {
155             "tokens" => [],
156             };
157 187         720 bless $self, $class;
158             } # end sub Language::Basic::Token::Group::new
159              
160             =item lex
161              
162             This method breaks BASIC text arg1 into LB::Tokens and puts them in
163             Token::Group arg0.
164              
165             =cut
166              
167             sub lex {
168 159     159   562 my $self = shift;
169 159         231 my $text = shift;
170 159         332 my @tokens = ();
171 159         605 while (defined (my $tok = _new Language::Basic::Token \$text)) {
172 892         3436 push @tokens, $tok;
173             }
174 159         802 $self->{"tokens"} = \@tokens;
175             #print $self->print;
176             }
177              
178             =item lookahead
179              
180             This method returns the next token in the Token::Group without removing
181             it from the group. That means lookahead can be called many times
182             and keep getting the same token (as long as eat is never called).
183             It returns undef if there are no more Tokens left.
184              
185             =cut
186              
187             sub lookahead {
188 3552     3552   4517 my $self = shift;
189 3552 100       4759 return undef unless @{$self->{"tokens"}};
  3552         23642  
190 2886         5312 my $tok = $self->{"tokens"}->[0];
191 2886         5052 return $tok;
192             } # end sub Language::Basic::Token::Group::lookahead
193              
194             =item eat
195              
196             This method eats the next Token from the Token::Group and returns it.
197             It returns undef if there are no more Tokens left.
198              
199             =cut
200              
201             sub eat {
202 972     972   1179 my $self = shift;
203 972 50       4563 return undef unless @{$self->{"tokens"}};
  972         2371  
204 972         1122 my $tok = shift @{$self->{"tokens"}};
  972         6009  
205 972         1739 return $tok;
206             } # end sub Language::Basic::Token::Group::eat
207              
208              
209             =item eat_if_string
210              
211             This method eats the next token from Group arg0 if it matches string arg1
212             If it ate a token, it returns it. Otherwise (or if there are no tokens left)
213             it returns undef.
214              
215             Note that the string to match should be upper case, since all \w tokens
216             are stored as uppercase.
217              
218             =cut
219              
220             sub eat_if_string {
221 608     608   718 my $self = shift;
222 608         772 my $match = shift;
223              
224 608         1049 my $tok = $self->lookahead;
225 608 100       1593 return undef unless defined $tok;
226              
227             #print "looking for text '$match' and found ",$tok->text,"\n";
228 571         1351 my $matched= $tok->text eq $match;
229              
230 571 100       1408 $self->eat if $matched;
231 571 100       2308 return $matched ? $tok : undef;
232             } # end sub Language::Basic::Token::Group::eat_if_string
233              
234             =item eat_if_class
235              
236             This method eats the next token from Group arg0 if the token is of class
237             "Language::Basic::Token::" . arg1. (I.e., it's called with "Keyword" to
238             get a Language::Basic::Token::Keyword Token.) If it ate a token, it returns it.
239             Otherwise (or if there are no tokens left) it returns undef.
240              
241             =cut
242              
243             sub eat_if_class {
244 1982     1982   2402 my $self = shift;
245 1982         2542 my $match = shift;
246 1982         5855 my $tok = $self->lookahead;
247 1982 100       4861 return undef unless defined $tok;
248              
249             #print "looking for $match and found ",$tok->text,"\n";
250 1649         9066 my $matched= $tok->isa("Language::Basic::Token::" . $match);
251              
252 1649 100       3977 $self->eat if $matched;
253 1649 100       9426 return $matched ? $tok : undef;
254             } # end sub Language::Basic::Token::Group::eat_if_class
255              
256             =item slurp
257              
258             Eats tokens from Group arg1 and puts them in Group arg0 until it gets
259             to a Token whose text matches string arg2 or it reaches the end of arg1. (The
260             matching Token is left in arg1.)
261              
262             =cut
263              
264             sub slurp {
265 28     28   50 my ($to, $from, $string) = @_;
266 28         61 while (defined(my $tok = $from->lookahead)) {
267 88 100       167 last if $tok->text eq $string;
268 80         106 push @{$to->{"tokens"}}, $from->eat;
  80         166  
269             }
270             } # end sub Language::Basic::Token::Group::slurp
271              
272             =item stuff_left
273              
274             Returns true if there's stuff left in the Statement we're parsing (i.e. if
275             there are still tokens left in the Token::Group and the next token isn't a
276             colon)
277              
278             =cut
279              
280             sub stuff_left {
281 368     368   522 my $self = shift;
282 368         691 my $tok = $self->lookahead;
283 368 100       2056 return 0 unless defined $tok;
284 92         1088 return (!$tok->isa("Language::Basic::Token::Statement_End"));
285             } # end sub Language::Basic::Token::stuff_left
286              
287              
288             =item print
289              
290             For debugging purposes. Returns the Tokens in Group arg0 nicely formatted.
291              
292             =cut
293              
294             sub print {
295 0     0   0 my $self = shift;
296 0         0 my $ret = "";
297 0         0 foreach (@{$self->{"tokens"}}) {
  0         0  
298 0         0 ($a = ref($_)) =~ s/^Language::Basic::Token/LBT/;
299 0         0 $ret .= "$a '" . $_->{"text"} . "'\n";
300             }
301 0         0 return $ret;
302             } # end sub Language::Basic::Token::Group::print
303              
304             =pod
305              
306             =back
307              
308             =cut
309              
310             } # end package Language::Basic::Token::Group
311              
312             ##############################################################################
313              
314             =head2 Other Language::Basic::Token subclasses
315              
316             The other subclasses are actually kinds of Tokens, unlike Token::Group.
317             There are no "new" methods for these classes. Creation of Tokens is done
318             by Token::Group::lex. In fact, these classes don't have any public
319             methods. They're mostly there to use "isa" on.
320              
321             =over 4
322              
323             =item Keyword
324              
325             A BASIC keyword (reserved word)
326              
327             =cut
328              
329             {
330             package Language::Basic::Token::Keyword;
331             @Language::Basic::Token::Keyword::ISA = qw(Language::Basic::Token);
332              
333             my @Keywords = qw (
334             DATA DEF DIM END FOR GOSUB GOTO IF INPUT
335             LET NEXT ON PRINT READ RETURN
336             TO STEP THEN ELSE
337             );
338             # Make sure not to accept something like "FORT"
339 892     892   4219 sub regex { "(?i)(" . join("|", @Keywords) . ")\\b"}
340              
341             } # end package Language::Basic::Token::Keyword
342              
343             =item Identifier
344              
345             An Identifier matches /[A-Z][A-Z0-9]*\$?/. It's a variable or function
346             name.
347              
348             =cut
349              
350             {
351             package Language::Basic::Token::Identifier;
352             @Language::Basic::Token::Identifier::ISA = qw(Language::Basic::Token);
353              
354 649     649   1103 sub regex { '(?i)[A-Z][A-Z0-9]*\\$?'}
355              
356             } # end package Language::Basic::Token::Identifier
357              
358             =item String_Constant
359              
360             Stuff inside double quotes.
361              
362             =cut
363              
364             {
365             package Language::Basic::Token::String_Constant;
366             @Language::Basic::Token::String_Constant::ISA = qw(Language::Basic::Token);
367              
368 544     544   1624 sub regex { '".*?"'}
369              
370             } # end package Language::Basic::Token::String_Constant
371              
372             =item Numeric_Constant
373              
374             A float (or integer, currently)
375              
376             =cut
377              
378             {
379             package Language::Basic::Token::Numeric_Constant;
380             @Language::Basic::Token::Numeric_Constant::ISA = qw(Language::Basic::Token);
381              
382 506     506   896 sub regex { '(\\d*\\.)?\\d+'}
383              
384             } # end package Language::Basic::Token::Numeric_Constant
385              
386             =item Left_Paren
387              
388             A "("
389              
390             =cut
391              
392             {
393             package Language::Basic::Token::Left_Paren;
394             @Language::Basic::Token::Left_Paren::ISA = qw(Language::Basic::Token);
395              
396 285     285   575 sub regex { '\\('}
397              
398             } # end package Language::Basic::Token::Left_Paren
399              
400             =item Right_Paren
401              
402             A ")"
403              
404             =cut
405              
406             {
407             package Language::Basic::Token::Right_Paren;
408             @Language::Basic::Token::Right_Paren::ISA = qw(Language::Basic::Token);
409              
410 249     249   429 sub regex { '\\)'}
411              
412             } # end package Language::Basic::Token::Right_Paren
413              
414             =item Separator
415              
416             Comma or semicolon (separators in arglists, PRINT statements)
417              
418             =cut
419              
420             {
421             package Language::Basic::Token::Separator;
422             @Language::Basic::Token::Separator::ISA = qw(Language::Basic::Token);
423              
424 213     213   361 sub regex { '[,;]'}
425              
426             } # end package Language::Basic::Token::Separator
427              
428             =item Arithmetic_Operator
429              
430             Plus or minus
431              
432             =cut
433              
434             {
435             package Language::Basic::Token::Arithmetic_Operator;
436             @Language::Basic::Token::Arithmetic_Operator::ISA = qw(Language::Basic::Token);
437              
438 100     100   204 sub regex { '[-+]'}
439              
440             } # end package Language::Basic::Token::Arithmetic_Operator
441              
442             =item Multiplicative_Operator
443              
444             Multiply or divide operators ('*' and '/')
445              
446             =cut
447              
448             {
449             package Language::Basic::Token::Multiplicative_Operator;
450             @Language::Basic::Token::Multiplicative_Operator::ISA = qw(Language::Basic::Token);
451              
452 83     83   146 sub regex { '[*/]'}
453              
454             } # end package Language::Basic::Token::Multiplicative_Operator
455              
456             =item Relational_Operator
457              
458             Greater than, less than, equals, and their combinations. Note that
459             equals sign is also used to assign values in BASIC.
460              
461             =cut
462              
463             {
464             package Language::Basic::Token::Relational_Operator;
465             @Language::Basic::Token::Relational_Operator::ISA = qw(Language::Basic::Token);
466              
467             # <> <= < >= > =
468             # Note that Equals can be Rel. Op. or Assignment!
469 76     76   127 sub regex { '<[=>]?|>=?|='}
470              
471             } # end package Language::Basic::Token::Relational_Operator
472              
473             =item Logical_Operator
474              
475             AND, OR, NOT
476              
477             =cut
478              
479             {
480             package Language::Basic::Token::Logical_Operator;
481             @Language::Basic::Token::Logical_Operator::ISA = qw(Language::Basic::Token);
482              
483             sub regex {
484 665     665   1470 my @Keywords = qw (AND OR NOT);
485 665         2427 "(?i)(" . join("|", @Keywords) . ")\\b"
486             }
487              
488             } # end package Language::Basic::Token::Logical_Operator
489              
490             =item Comment
491              
492             REM statement (includes the whole rest of the line, even if there are colons
493             in it)
494              
495             =cut
496              
497             {
498             package Language::Basic::Token::Comment;
499             @Language::Basic::Token::Comment::ISA = qw(Language::Basic::Token);
500              
501 671     671   1125 sub regex { '(?i)REM\\s.*'}
502              
503             } # end package Language::Basic::Token::Comment
504              
505             =item Statement_End
506              
507             End of a statement (i.e., a colon)
508              
509             =cut
510              
511             {
512             package Language::Basic::Token::Statement_End;
513             @Language::Basic::Token::Statement_End::ISA = qw(Language::Basic::Token);
514              
515 8     8   17 sub regex { ':'}
516              
517             } # end package Language::Basic::Token::Statement_End
518              
519             1; # end package Language::Basic::Token