File Coverage

blib/lib/HOP/Lexer.pm
Criterion Covered Total %
statement 47 47 100.0
branch 14 14 100.0
condition 13 14 92.8
subroutine 11 11 100.0
pod 2 2 100.0
total 87 88 98.8


line stmt bran cond sub pod time code
1             package HOP::Lexer;
2              
3 2     2   108934 use warnings;
  2         6  
  2         153  
4 2     2   12 use strict;
  2         5  
  2         82  
5              
6 2     2   11 use base 'Exporter';
  2         10  
  2         338  
7             our @EXPORT_OK = qw/ make_lexer string_lexer /;
8             our %EXPORT_TAGS = ( all => \@EXPORT_OK );
9              
10 2     2   2040 use HOP::Stream 'node';
  2         9224  
  2         1320  
11              
12             =head1 NAME
13              
14             HOP::Lexer - "Higher Order Perl" Lexer
15              
16             =head1 VERSION
17              
18             Version 0.032
19              
20             =cut
21              
22             our $VERSION = '0.032';
23              
24             =head1 SYNOPSIS
25              
26             use HOP::Lexer 'string_lexer';
27            
28             my @input_tokens = (
29             [ 'VAR', qr/[[:alpha:]]+/ ],
30             [ 'NUM', qr/\d+/ ],
31             [ 'OP', qr/[+=]/ ],
32             [ 'SPACE', qr/\s*/, sub { () } ],
33             );
34            
35             my $text = 'x = 3 + 4';
36             my $lexer = string_lexer( $text, @input_tokens );
37            
38             my @tokens;
39             while ( my $token = $lexer->() ) {
40             push @tokens, $token;
41             }
42              
43             =head1 EXPORT
44              
45             Two functions may be exported, C and C.
46              
47             =head1 FUNCTIONS
48              
49             =head2 make_lexer
50              
51             my $lexer = make_lexer( $input_iterator, @tokens );
52              
53             The C function expects an input data iterator as the first
54             argument and a series of tokens as subsequent arguments. It returns a stream
55             of lexed tokens. The output tokens are two element arrays:
56              
57             [ $label, $matched_text ]
58              
59             The iterator should be a subroutine reference that returns the next value
60             merely by calling the subroutine with no arguments. If you have a single
61             block of text in a scalar that you want lexed, see the C
62             function.
63              
64             The input C<@tokens> array passed into C is expected to be a list
65             of array references with two mandatory items and one optional one:
66              
67             [ $label, qr/$match/, &transform ]
68              
69             =over 4
70              
71             =item * C<$label>
72              
73             The C<$label> is the name used for the first item in an output token.
74              
75             =item * C<$match>
76              
77             The C<$match> is either an exact string or regular expression which matches
78             the text the label is to identify.
79              
80             =item * C<&transform>
81              
82             The C<&transform> subroutine reference is optional. If supplied, this will
83             take the matched text and should return a token matching an output token or
84             an empty list if the token is to be discarded. For example, to discard
85             whitespace (the label is actually irrelevant, but it helps to document the
86             code):
87              
88             [ 'WHITESPACE', /\s+/, sub {()} ]
89              
90             The two arguments supplied to the transformation subroutine are the label and
91             value. Thus, if we wish to force all non-negative integers to have a unary
92             plus, we might do something like this:
93              
94             [
95             'REVERSED INT', # the label
96             /[+-]?\d+/, # integers with an optional unary plus or minus
97             sub {
98             my ($label, $value) = @_;
99             $value = "+$value" unless $value =~ /^[-+]/;
100             [ $label, $value ]
101             }
102             ]
103              
104             =back
105              
106             For example, let's say we want to convert the string "x = 3 + 4" to the
107             following tokens:
108              
109             [ 'VAR', 'x' ]
110             [ 'OP', '=' ]
111             [ 'NUM', 3 ]
112             [ 'OP', '+' ]
113             [ 'NUM', 4 ]
114              
115             One way to do this would be with the following code:
116              
117             my $text = 'x = 3 + 4';
118             my @text = ($text);
119             my $iter = sub { shift @text };
120            
121             my @input_tokens = (
122             [ 'VAR', qr/[[:alpha:]]+/ ],
123             [ 'NUM', qr/\d+/ ],
124             [ 'OP', qr/[+=]/ ],
125             [ 'SPACE', qr/\s*/, sub { () } ],
126             );
127            
128             my $lexer = make_lexer( $iter, @input_tokens );
129            
130             my @tokens;
131             while ( my $token = $lexer->() ) {
132             push @tokens, $token;
133             }
134              
135             C<@tokens> would contain the desired tokens.
136              
137             Note that the order in which the input tokens are passed in might cause input
138             to be lexed in different ways, thus the order is significant (C might
139             slurp up numbers before C can read them).
140              
141             =head2 string_lexer
142              
143             my $lexer = string_lexer( $string, @tokens );
144              
145             This function is identical to C, but takes a string as the first
146             argument. This is merely syntactic sugar for the common case where we have
147             our data in a string but don't want to create an iterator. The following are
148             equivalent.
149              
150             my $lexer = string_lexer( $text, @input_tokens );
151              
152             Versus:
153              
154             my @text = ($text);
155             my $iter = sub { shift @text };
156             my $lexer = make_lexer( $iter, @input_tokens );
157            
158             =cut
159              
160             sub string_lexer {
161 1     1 1 3066 my $text = shift;
162 1         4 my @text = $text;
163 1     2   13 return make_lexer( sub { shift @text }, @_ );
  2         6  
164             }
165              
166             sub make_lexer {
167 2     2 1 449185 my $lexer = shift;
168 2         14 while (@_) {
169 8         12 my $args = shift;
170 8         25 $lexer = _tokens( $lexer, @$args );
171             }
172 2         16 return $lexer;
173             }
174              
175             sub _tokens {
176 8     8   19 my ( $input, $label, $pattern, $maketoken ) = @_;
177 8   100 10   50 $maketoken ||= sub { [ $_[0] => $_[1] ] };
  10         84  
178 8         12 my @tokens;
179 8         14 my $buf = ""; # set to undef when input is exhausted
180 8     42   308 my $split = sub { split /($pattern)/ => $_[0] };
  42         1410  
181              
182             return sub {
183 51   100 51   2993 while ( 0 == @tokens && defined $buf ) {
184 42         83 my $i = $input->();
185 42 100       190 if ( ref $i ) { # input is a token
186 18         36 my ( $sep, $tok ) = $split->($buf);
187 18 100       63 $tok = $maketoken->( $label, $tok ) if defined $tok;
188 18   100     157 push @tokens => grep defined && $_ ne "" => $sep, $tok, $i;
189 18         32 $buf = "";
190 18         25 last;
191             }
192 24 100       56 $buf .= $i if defined $i; # append new input to buffer
193 24         50 my @newtoks = $split->($buf);
194 24   100     138 while ( @newtoks > 2 || @newtoks && !defined $i ) {
      66        
195              
196             # buffer contains complete separator plus combined token
197             # OR we've reached the end of input
198 12         20 push @tokens => shift @newtoks;
199 12 100       49 push @tokens => $maketoken->( $label, shift @newtoks )
200             if @newtoks;
201             }
202              
203             # reassemble remaining contents of buffer
204 24         49 $buf = join "" => @newtoks;
205 24 100       59 undef $buf unless defined $i;
206 24         143 @tokens = grep $_ ne "" => @tokens;
207             }
208 51 100       333 $_[0] = '' unless defined $_[0];
209 51 100       276 return 'peek' eq $_[0] ? $tokens[0] : shift @tokens;
210 8         62 };
211             }
212              
213             =head1 DEBUGGING
214              
215             The following caveats (or pitfalls, if you prefer), should be kept in mind
216             while lexing data.
217              
218             =over 4
219              
220             =item * Unlexed data
221              
222             The tokens returned by the lexer are array references. If any data cannot be
223             lexed, it will be returned as a string, unchanged.
224              
225             =item * Capturing parens
226              
227             Internally, L uses capturing parentheses to extract the data from
228             the provided regular expressions. If you need to group data in regular
229             expressions, use the non-capturing parentheses C<(?:...)>. Otherwise, your
230             code will break.
231              
232             =item * Precedence
233              
234             It's important to note that the order of the described tokens is important.
235             If you have keywords such as "while", "if", "unless", and so on, and any text
236             which matches C is considered a variable, the following fails:
237              
238             my @input_tokens = (
239             [ 'VAR', qr/[[:word:]]+/ ],
240             [ 'KEYWORD', qr/(?:while|if|unless)/ ],
241             );
242              
243             This is because the potential keywords will be matched as C. To deal
244             with this, place the higher precedence tokens first:
245              
246             my @input_tokens = (
247             [ 'KEYWORD', qr/(?:while|if|unless)/ ],
248             [ 'VAR', qr/[[:word:]]+/ ],
249             );
250              
251             =back
252              
253             =head1 AUTHOR
254              
255             Mark Jason Dominus. Maintained by Curtis "Ovid" Poe, C<< >>
256              
257             =head1 BUGS
258              
259             Please report any bugs or feature requests to
260             C, or through the web interface at
261             L.
262             I will be notified, and then you'll automatically be notified of progress on
263             your bug as I make changes.
264              
265             =head1 FURTHER READING
266              
267             See L for a detailed
268             article about using this module, along with a comprehensive example.
269              
270             This has now been included in the distribution as L.
271              
272             =head1 ACKNOWLEDGEMENTS
273              
274             Many thanks to Mark Dominus and Elsevier, Inc. for allowing this work to be
275             republished.
276              
277             =head1 COPYRIGHT & LICENSE
278              
279             Code derived from the book "Higher-Order Perl" by Mark Dominus, published by
280             Morgan Kaufmann Publishers, Copyright 2005 by Elsevier Inc.
281              
282             =head1 ABOUT THE SOFTWARE
283              
284             All Software (code listings) presented in the book can be found on the
285             companion website for the book (http://perl.plover.com/hop/) and is
286             subject to the License agreements below.
287              
288             =head1 ELSEVIER SOFTWARE LICENSE AGREEMENT
289              
290             Please read the following agreement carefully before using this Software. This
291             Software is licensed under the terms contained in this Software license
292             agreement ("agreement"). By using this Software product, you, an individual,
293             or entity including employees, agents and representatives ("you" or "your"),
294             acknowledge that you have read this agreement, that you understand it, and
295             that you agree to be bound by the terms and conditions of this agreement.
296             Elsevier inc. ("Elsevier") expressly does not agree to license this Software
297             product to you unless you assent to this agreement. If you do not agree with
298             any of the following terms, do not use the Software.
299              
300             =head1 LIMITED WARRANTY AND LIMITATION OF LIABILITY
301              
302             YOUR USE OF THIS SOFTWARE IS AT YOUR OWN RISK. NEITHER ELSEVIER NOR ITS
303             LICENSORS REPRESENT OR WARRANT THAT THE SOFTWARE PRODUCT WILL MEET YOUR
304             REQUIREMENTS OR THAT ITS OPERATION WILL BE UNINTERRUPTED OR ERROR-FREE. WE
305             EXCLUDE AND EXPRESSLY DISCLAIM ALL EXPRESS AND IMPLIED WARRANTIES NOT STATED
306             HEREIN, INCLUDING THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
307             PARTICULAR PURPOSE. IN ADDITION, NEITHER ELSEVIER NOR ITS LICENSORS MAKE ANY
308             REPRESENTATIONS OR WARRANTIES, EITHER EXPRESS OR IMPLIED, REGARDING THE
309             PERFORMANCE OF YOUR NETWORK OR COMPUTER SYSTEM WHEN USED IN CONJUNCTION WITH
310             THE SOFTWARE PRODUCT. WE SHALL NOT BE LIABLE FOR ANY DAMAGE OR LOSS OF ANY
311             KIND ARISING OUT OF OR RESULTING FROM YOUR POSSESSION OR USE OF THE SOFTWARE
312             PRODUCT CAUSED BY ERRORS OR OMISSIONS, DATA LOSS OR CORRUPTION, ERRORS OR
313             OMISSIONS IN THE PROPRIETARY MATERIAL, REGARDLESS OF WHETHER SUCH LIABILITY IS
314             BASED IN TORT, CONTRACT OR OTHERWISE AND INCLUDING, BUT NOT LIMITED TO,
315             ACTUAL, SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES. IF THE
316             FOREGOING LIMITATION IS HELD TO BE UNENFORCEABLE, OUR MAXIMUM LIABILITY TO YOU
317             SHALL NOT EXCEED THE AMOUNT OF THE PURCHASE PRICE PAID BY YOU FOR THE SOFTWARE
318             PRODUCT. THE REMEDIES AVAILABLE TO YOU AGAINST US AND THE LICENSORS OF
319             MATERIALS INCLUDED IN THE SOFTWARE PRODUCT ARE EXCLUSIVE.
320              
321             YOU UNDERSTAND THAT ELSEVIER, ITS AFFILIATES, LICENSORS, SUPPLIERS AND AGENTS,
322             MAKE NO WARRANTIES, EXPRESSED OR IMPLIED, WITH RESPECT TO THE SOFTWARE
323             PRODUCT, INCLUDING, WITHOUT LIMITATION THE PROPRIETARY MATERIAL, AND
324             SPECIFICALLY DISCLAIM ANY WARRANTY OF MERCHANTABILITY OR FITNESS FOR A
325             PARTICULAR PURPOSE.
326              
327             IN NO EVENT WILL ELSEVIER, ITS AFFILIATES, LICENSORS, SUPPLIERS OR AGENTS, BE
328             LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, WITHOUT LIMITATION, ANY LOST
329             PROFITS, LOST SAVINGS OR OTHER INCIDENTAL OR CONSEQUENTIAL DAMAGES, ARISING
330             OUT OF YOUR USE OR INABILITY TO USE THE SOFTWARE PRODUCT REGARDLESS OF WHETHER
331             SUCH DAMAGES ARE FORESEEABLE OR WHETHER SUCH DAMAGES ARE DEEMED TO RESULT FROM
332             THE FAILURE OR INADEQUACY OF ANY EXCLUSIVE OR OTHER REMEDY.
333              
334             =head1 SOFTWARE LICENSE AGREEMENT
335              
336             This Software License Agreement is a legal agreement between the Author and
337             any person or legal entity using or accepting any Software governed by this
338             Agreement. The Software is available on the companion website
339             (http://perl.plover.com/hop/) for the Book, Higher-Order Perl, which is
340             published by Morgan Kaufmann Publishers. "The Software" is comprised of all
341             code (fragments and pseudocode) presented in the book.
342              
343             By installing, copying, or otherwise using the Software, you agree to be bound
344             by the terms of this Agreement.
345              
346             The parties agree as follows:
347              
348             =over 4
349              
350             =item 1 Grant of License
351              
352             We grant you a nonexclusive license to use the Software for any purpose,
353             commercial or non-commercial, as long as the following credit is included
354             identifying the original source of the Software: "from Higher-Order Perl by
355             Mark Dominus, published by Morgan Kaufmann Publishers, Copyright 2005 by
356             Elsevier Inc".
357              
358             =item 2 Disclaimer of Warranty.
359              
360             We make no warranties at all. The Software is transferred to you on an "as is"
361             basis. You use the Software at your own peril. You assume all risk of loss for
362             all claims or controversies, now existing or hereafter, arising out of use of
363             the Software. We shall have no liability based on a claim that your use or
364             combination of the Software with products or data not supplied by us infringes
365             any patent, copyright, or proprietary right. All other warranties, expressed
366             or implied, including, without limitation, any warranty of merchantability or
367             fitness for a particular purpose are hereby excluded.
368              
369             =item 3 Limitation of Liability.
370              
371             We will have no liability for special, incidental, or consequential damages
372             even if advised of the possibility of such damages. We will not be liable for
373             any other damages or loss in any way connected with the Software.
374              
375             =back
376              
377             =cut
378              
379             1; # End of HOP::Lexer