File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 95 97 97.9
branch 81 88 92.0
condition 77 90 85.5
subroutine 7 8 87.5
pod 2 2 100.0
total 262 285 91.9


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 68     68   86581 use strict;
  68         125  
  68         1900  
45 68     68   590 use Clone ();
  68         519  
  68         863  
46 68     68   598 use PPI::Token ();
  68         98  
  68         114620  
47              
48             our $VERSION = '1.287';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 18 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         12 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 56869     56869   64528 my $t = $_[1];
147 56869         81759 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 56869 100       276967 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 7000         16818 $t->_new_token( 'Whitespace', $line );
153 7000         14149 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4328         10618 $t->_new_token( 'Comment', $line );
158 4328         8614 $t->_finalize_token;
159 4328         7051 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 761         2156 $t->_new_token( 'Pod', $line );
164 761 50       2775 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 761         1509 $t->{class} = 'PPI::Token::Pod';
170             }
171 761         1536 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         2 my @perl6;
178 2         3 while ( 1 ) {
179 693         798 my $line6 = $t->_get_line;
180 693 100       782 last unless defined $line6;
181 691         692 push @perl6, $line6;
182             }
183 2         3 push @{ $t->{perl6} }, join '', @perl6;
  2         73  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         34 return 1;
189             }
190              
191 44778         77134 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 609302     609302   609723 my $t = $_[1];
196 609302         832051 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 609302         623096 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 609302 100       1185263 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 469983 100       1210052 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 27179 100       69836 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 12236 100       30186 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215 12236 100       26400 return 'Structure' if $t->_current_token_has_signatures_active;
216              
217 8418         16581 my @tokens = $t->_previous_significant_tokens(3);
218              
219             # A normal subroutine declaration
220 8418         10827 my $p1 = $tokens[1];
221 8418         10920 my $p2 = $tokens[2];
222 8418 50 100     71804 if (
      100        
      100        
      100        
      33        
      100        
223             $tokens[0]
224             and
225             $tokens[0]->isa('PPI::Token::Word')
226             and
227             $p1
228             and
229             $p1->isa('PPI::Token::Word')
230             and
231             $p1->content eq 'sub'
232             and (
233             not $p2
234             or
235             $p2->isa('PPI::Token::Structure')
236             or (
237             $p2->isa('PPI::Token::Whitespace')
238             and
239             $p2->content eq ''
240             )
241             or (
242             # Lexical subroutine
243             $p2->isa('PPI::Token::Word')
244             and
245             $p2->content =~ /^(?:my|our|state)$/
246             )
247             )
248             ) {
249             # This is a sub prototype
250 314         1203 return 'Prototype';
251             }
252              
253             # A prototyped anonymous subroutine
254 8104         11962 my $p0 = $tokens[0];
255 8104 100 100     36595 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
256             # Maybe it's invoking a method named 'sub'
257             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
258             ) {
259 42         172 return 'Prototype';
260             }
261              
262             # This is a normal open bracket
263 8062         28099 return 'Structure';
264              
265             } elsif ( $char == 60 ) { # $char eq '<'
266             # Finalise any whitespace token...
267 2060 100       6349 $t->_finalize_token if $t->{token};
268              
269             # This is either "less than" or "readline quote-like"
270             # Do some context stuff to guess which.
271 2060         4238 my $prev = $t->_last_significant_token;
272              
273             # The most common group of less-thans are used like
274             # $foo < $bar
275             # 1 < $bar
276             # $#foo < $bar
277 2060 100 100     11585 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
278 1746 50 66     7801 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
279 1746 100 100     6613 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
280 1711 50 66     6717 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
281              
282             # If it is <<... it's a here-doc instead
283 1711         3495 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
284 1711 100       6790 return 'Operator' if $next_char =~ /<[^>]/;
285              
286 886 100       1885 return 'Operator' if not $prev;
287              
288             # The most common group of readlines are used like
289             # while ( <...> )
290             # while <>;
291 799         1761 my $prec = $prev->content;
292 799 100 100     11802 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
293             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
294             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
295             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
297             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
298              
299 693 100 100     2559 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
300             # Could go either way... do a regex check
301             # $foo->{bar} < 2;
302             # grep { .. } ;
303 20         75 pos $t->{line} = $t->{line_cursor};
304 20 100       113 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
305             # Almost definitely readline
306 3         9 return 'QuoteLike::Readline';
307             }
308             }
309              
310             # Otherwise, we guess operator, which has been the default up
311             # until this more comprehensive section was created.
312 690         2414 return 'Operator';
313              
314             } elsif ( $char == 47 ) { # $char eq '/'
315             # Finalise any whitespace token...
316 1073 100       3413 $t->_finalize_token if $t->{token};
317              
318             # This is either a "divided by" or a "start regex"
319             # Do some context stuff to guess ( ack ) which.
320             # Hopefully the guess will be good enough.
321 1073         2653 my $prev = $t->_last_significant_token;
322              
323             # Or as the very first thing in a file
324 1073 100       3084 return 'Regexp::Match' if not $prev;
325              
326 976         2528 my $prec = $prev->content;
327              
328             # Most times following an operator, we are a regex.
329             # This includes cases such as:
330             # , - As an argument in a list
331             # .. - The second condition in a flip flop
332             # =~ - A bound regex
333             # !~ - Ditto
334 976 100       4806 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
335              
336             # After a symbol
337 509 100       2198 return 'Operator' if $prev->isa('PPI::Token::Symbol');
338 442 100 66     1344 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
339 14         35 return 'Operator';
340             }
341              
342             # After another number
343 428 100       1558 return 'Operator' if $prev->isa('PPI::Token::Number');
344              
345             # After going into scope/brackets
346 395 100 100     1947 if (
      100        
347             $prev->isa('PPI::Token::Structure')
348             and (
349             $prec eq '('
350             or
351             $prec eq '{'
352             or
353             $prec eq ';'
354             )
355             ) {
356 58         154 return 'Regexp::Match';
357             }
358              
359             # Functions and keywords
360 337 100 66     1142 if (
361             $MATCHWORD{$prec}
362             and
363             $prev->isa('PPI::Token::Word')
364             ) {
365 73         222 return 'Regexp::Match';
366             }
367              
368             # What about the char after the slash? There's some things
369             # that would be highly illogical to see if it's an operator.
370 264         741 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
371 264 50 33     873 if ( defined $next_char and length $next_char ) {
372 264 100       1069 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
373 14         61 return 'Regexp::Match';
374             }
375             }
376              
377             # Otherwise... erm... assume operator?
378             # Add more tests here as potential cases come to light
379 250         837 return 'Operator';
380              
381             } elsif ( $char == 120 ) { # $char eq 'x'
382             # Could be a word, the x= operator, the x operator
383             # followed by whitespace, or the x operator without any
384             # space between itself and its operand, e.g.: '$a x3',
385             # which is the same as '$a x 3'. _current_x_is_operator
386             # assumes we have a complete 'x' token, but we don't
387             # yet. We may need to split this x character apart from
388             # what follows it.
389 855 100       2091 if ( $t->_current_x_is_operator ) {
390 227         694 pos $t->{line} = $t->{line_cursor} + 1;
391 227 100       1298 return 'Operator' if $t->{line} =~ m/\G(?:
392             \d # x op with no whitespace e.g. 'x3'
393             |
394             (?!( # negative lookahead
395             => # not on left of fat comma
396             |
397             \w # not a word like "xyzzy"
398             |
399             \s # not x op plus whitespace
400             ))
401             )/gcx;
402             }
403              
404             # Otherwise, commit like a normal bareword, including x
405             # operator followed by whitespace.
406 727         2277 return PPI::Token::Word->__TOKENIZER__commit($t);
407              
408             } elsif ( $char == 45 ) { # $char eq '-'
409             # Look for an obvious operator operand context
410 10949         21854 my $context = $t->_opcontext;
411 10949 100       17939 if ( $context eq 'operator' ) {
412 8975         30466 return 'Operator';
413             } else {
414             # More logic needed
415 1974         6039 return 'Unknown';
416             }
417              
418             } elsif ( $char >= 128 ) { # Outside ASCII
419 6 100       75 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
420 1 50       6 return 'Whitespace' if $c =~ /\s/;
421             }
422              
423              
424             # All the whitespaces are covered, so what to do
425             ### For now, die
426 1         13 PPI::Exception->throw("Encountered unexpected character '$char'");
427             }
428              
429             sub __TOKENIZER__on_line_end {
430 44406 100   44406   159075 $_[1]->_finalize_token if $_[1]->{token};
431             }
432              
433             1;
434              
435             =pod
436              
437             =head1 SUPPORT
438              
439             See the L in the main module.
440              
441             =head1 AUTHOR
442              
443             Adam Kennedy Eadamk@cpan.orgE
444              
445             =head1 COPYRIGHT
446              
447             Copyright 2001 - 2011 Adam Kennedy.
448              
449             This program is free software; you can redistribute
450             it and/or modify it under the same terms as Perl itself.
451              
452             The full text of the license can be found in the
453             LICENSE file included with this module.
454              
455             =cut