File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 96 98 97.9
branch 81 88 92.0
condition 77 90 85.5
subroutine 7 8 87.5
pod 2 2 100.0
total 263 286 91.9


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 67     67   143315 use strict;
  67         190  
  67         2766  
45 67     67   757 use Clone ();
  67         612  
  67         1340  
46 67     67   678 use PPI::Token ();
  67         142  
  67         161814  
47              
48             our $VERSION = '1.28401'; # TRIAL
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 18 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         14 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 52860     52860   101209 my $t = $_[1];
147 52860         120917 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 52860 100       448618 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 6867         28759 $t->_new_token( 'Whitespace', $line );
153 6867         18699 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4341         16643 $t->_new_token( 'Comment', $line );
158 4341         13018 $t->_finalize_token;
159 4341         10549 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 762         3294 $t->_new_token( 'Pod', $line );
164 762 50       4851 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 762         1990 $t->{class} = 'PPI::Token::Pod';
170             }
171 762         2009 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         5 my @perl6;
178 2         4 while ( 1 ) {
179 693         1305 my $line6 = $t->_get_line;
180 693 100       1261 last unless defined $line6;
181 691         1082 push @perl6, $line6;
182             }
183 2         5 push @{ $t->{perl6} }, join '', @perl6;
  2         157  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         60 return 1;
189             }
190              
191 40888         123497 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 396889     396889   646222 my $t = $_[1];
196 396889         924553 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 396889         662723 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 396889 100       1466752 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 276421 100       1321305 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 19674 100       102161 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 8430 100       34527 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215 8430         30612 my $has_sig = $t->_current_token_has_signatures_active;
216 8430 100       21828 return 'Structure' if $has_sig;
217              
218 8394         22413 my @tokens = $t->_previous_significant_tokens(3);
219              
220             # A normal subroutine declaration
221 8394         17245 my $p1 = $tokens[1];
222 8394         16338 my $p2 = $tokens[2];
223 8394 50 100     120706 if (
      100        
      100        
      100        
      33        
      100        
224             $tokens[0]
225             and
226             $tokens[0]->isa('PPI::Token::Word')
227             and
228             $p1
229             and
230             $p1->isa('PPI::Token::Word')
231             and
232             $p1->content eq 'sub'
233             and (
234             not $p2
235             or
236             $p2->isa('PPI::Token::Structure')
237             or (
238             $p2->isa('PPI::Token::Whitespace')
239             and
240             $p2->content eq ''
241             )
242             or (
243             # Lexical subroutine
244             $p2->isa('PPI::Token::Word')
245             and
246             $p2->content =~ /^(?:my|our|state)$/
247             )
248             )
249             ) {
250             # This is a sub prototype
251 314         2157 return 'Prototype';
252             }
253              
254             # A prototyped anonymous subroutine
255 8080         17065 my $p0 = $tokens[0];
256 8080 100 100     59681 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
257             # Maybe it's invoking a method named 'sub'
258             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
259             ) {
260 42         320 return 'Prototype';
261             }
262              
263             # This is a normal open bracket
264 8038         48226 return 'Structure';
265              
266             } elsif ( $char == 60 ) { # $char eq '<'
267             # Finalise any whitespace token...
268 2052 100       11288 $t->_finalize_token if $t->{token};
269              
270             # This is either "less than" or "readline quote-like"
271             # Do some context stuff to guess which.
272 2052         8288 my $prev = $t->_last_significant_token;
273              
274             # The most common group of less-thans are used like
275             # $foo < $bar
276             # 1 < $bar
277             # $#foo < $bar
278 2052 100 100     21110 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
279 1729 50 66     14855 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
280 1729 100 100     14528 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
281 1697 50 66     12476 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
282              
283             # If it is <<... it's a here-doc instead
284 1697         6849 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
285 1697 100       12128 return 'Operator' if $next_char =~ /<[^>]/;
286              
287 873 100       3047 return 'Operator' if not $prev;
288              
289             # The most common group of readlines are used like
290             # while ( <...> )
291             # while <>;
292 792         2813 my $prec = $prev->content;
293 792 100 100     22358 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
294             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
295             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
296             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
297             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
298             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
299              
300 696 100 100     3958 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
301             # Could go either way... do a regex check
302             # $foo->{bar} < 2;
303             # grep { .. } ;
304 17         64 pos $t->{line} = $t->{line_cursor};
305 17 100       105 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
306             # Almost definitely readline
307 3         12 return 'QuoteLike::Readline';
308             }
309             }
310              
311             # Otherwise, we guess operator, which has been the default up
312             # until this more comprehensive section was created.
313 693         3516 return 'Operator';
314              
315             } elsif ( $char == 47 ) { # $char eq '/'
316             # Finalise any whitespace token...
317 1038 100       5336 $t->_finalize_token if $t->{token};
318              
319             # This is either a "divided by" or a "start regex"
320             # Do some context stuff to guess ( ack ) which.
321             # Hopefully the guess will be good enough.
322 1038         4331 my $prev = $t->_last_significant_token;
323              
324             # Or as the very first thing in a file
325 1038 100       4344 return 'Regexp::Match' if not $prev;
326              
327 938         3353 my $prec = $prev->content;
328              
329             # Most times following an operator, we are a regex.
330             # This includes cases such as:
331             # , - As an argument in a list
332             # .. - The second condition in a flip flop
333             # =~ - A bound regex
334             # !~ - Ditto
335 938 100       10880 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
336              
337             # After a symbol
338 499 100       3028 return 'Operator' if $prev->isa('PPI::Token::Symbol');
339 441 100 66     2120 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
340 14         55 return 'Operator';
341             }
342              
343             # After another number
344 427 100       2384 return 'Operator' if $prev->isa('PPI::Token::Number');
345              
346             # After going into scope/brackets
347 391 100 100     2577 if (
      100        
348             $prev->isa('PPI::Token::Structure')
349             and (
350             $prec eq '('
351             or
352             $prec eq '{'
353             or
354             $prec eq ';'
355             )
356             ) {
357 54         268 return 'Regexp::Match';
358             }
359              
360             # Functions and keywords
361 337 100 66     1653 if (
362             $MATCHWORD{$prec}
363             and
364             $prev->isa('PPI::Token::Word')
365             ) {
366 73         371 return 'Regexp::Match';
367             }
368              
369             # What about the char after the slash? There's some things
370             # that would be highly illogical to see if it's an operator.
371 264         969 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
372 264 50 33     1397 if ( defined $next_char and length $next_char ) {
373 264 100       1673 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
374 16         89 return 'Regexp::Match';
375             }
376             }
377              
378             # Otherwise... erm... assume operator?
379             # Add more tests here as potential cases come to light
380 248         1290 return 'Operator';
381              
382             } elsif ( $char == 120 ) { # $char eq 'x'
383             # Could be a word, the x= operator, the x operator
384             # followed by whitespace, or the x operator without any
385             # space between itself and its operand, e.g.: '$a x3',
386             # which is the same as '$a x 3'. _current_x_is_operator
387             # assumes we have a complete 'x' token, but we don't
388             # yet. We may need to split this x character apart from
389             # what follows it.
390 843 100       3719 if ( $t->_current_x_is_operator ) {
391 222         1041 pos $t->{line} = $t->{line_cursor} + 1;
392 222 100       2107 return 'Operator' if $t->{line} =~ m/\G(?:
393             \d # x op with no whitespace e.g. 'x3'
394             |
395             (?!( # negative lookahead
396             => # not on left of fat comma
397             |
398             \w # not a word like "xyzzy"
399             |
400             \s # not x op plus whitespace
401             ))
402             )/gcx;
403             }
404              
405             # Otherwise, commit like a normal bareword, including x
406             # operator followed by whitespace.
407 715         3686 return PPI::Token::Word->__TOKENIZER__commit($t);
408              
409             } elsif ( $char == 45 ) { # $char eq '-'
410             # Look for an obvious operator operand context
411 7305         26841 my $context = $t->_opcontext;
412 7305 100       19115 if ( $context eq 'operator' ) {
413 5334         31909 return 'Operator';
414             } else {
415             # More logic needed
416 1971         9459 return 'Unknown';
417             }
418              
419             } elsif ( $char >= 128 ) { # Outside ASCII
420 6 100       78 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
421 1 50       6 return 'Whitespace' if $c =~ /\s/;
422             }
423              
424              
425             # All the whitespaces are covered, so what to do
426             ### For now, die
427 1         111 PPI::Exception->throw("Encountered unexpected character '$char'");
428             }
429              
430             sub __TOKENIZER__on_line_end {
431 40519 100   40519   276730 $_[1]->_finalize_token if $_[1]->{token};
432             }
433              
434             1;
435              
436             =pod
437              
438             =head1 SUPPORT
439              
440             See the L in the main module.
441              
442             =head1 AUTHOR
443              
444             Adam Kennedy Eadamk@cpan.orgE
445              
446             =head1 COPYRIGHT
447              
448             Copyright 2001 - 2011 Adam Kennedy.
449              
450             This program is free software; you can redistribute
451             it and/or modify it under the same terms as Perl itself.
452              
453             The full text of the license can be found in the
454             LICENSE file included with this module.
455              
456             =cut