File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 95 97 97.9
branch 82 88 93.1
condition 78 90 86.6
subroutine 7 8 87.5
pod 2 2 100.0
total 264 285 92.6


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 69     69   82622 use strict;
  69         120  
  69         1924  
45 69     69   564 use Clone ();
  69         520  
  69         807  
46 69     69   527 use PPI::Token ();
  69         117  
  69         112198  
47              
48             our $VERSION = '1.290';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 13 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         9 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 57047     57047   63468 my $t = $_[1];
147 57047         79963 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 57047 100       262752 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 7029         16272 $t->_new_token( 'Whitespace', $line );
153 7029         10999 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4353         9249 $t->_new_token( 'Comment', $line );
158 4353         8134 $t->_finalize_token;
159 4353         6834 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 762         2253 $t->_new_token( 'Pod', $line );
164 762 50       2637 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 762         1423 $t->{class} = 'PPI::Token::Pod';
170             }
171 762         1377 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         4 my @perl6;
178 2         5 while ( 1 ) {
179 693         1429 my $line6 = $t->_get_line;
180 693 100       1401 last unless defined $line6;
181 691         1364 push @perl6, $line6;
182             }
183 2         4 push @{ $t->{perl6} }, join '', @perl6;
  2         128  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         59 return 1;
189             }
190              
191 44901         74745 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 609250     609250   600821 my $t = $_[1];
196 609250         800687 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 609250         614220 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 609250 100       1180589 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 470069 100       1214163 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 27134 100       69806 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 12214 100       26345 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215 12214 100       26807 return 'Structure' if $t->_current_token_has_signatures_active;
216              
217 8396         14558 my @tokens = $t->_previous_significant_tokens(3);
218              
219             # A normal subroutine declaration
220 8396         11815 my $p1 = $tokens[1];
221 8396         9829 my $p2 = $tokens[2];
222 8396 50 100     65636 if (
      100        
      100        
      100        
      33        
      100        
223             $tokens[0]
224             and
225             $tokens[0]->isa('PPI::Token::Word')
226             and
227             $p1
228             and
229             $p1->isa('PPI::Token::Word')
230             and
231             $p1->content eq 'sub'
232             and (
233             not $p2
234             or
235             $p2->isa('PPI::Token::Structure')
236             or (
237             $p2->isa('PPI::Token::Whitespace')
238             and
239             $p2->content eq ''
240             )
241             or (
242             # Lexical subroutine
243             $p2->isa('PPI::Token::Word')
244             and
245             $p2->content =~ /^(?:my|our|state)$/
246             )
247             )
248             ) {
249             # This is a sub prototype
250 314         1112 return 'Prototype';
251             }
252              
253             # A prototyped anonymous subroutine
254 8082         10194 my $p0 = $tokens[0];
255 8082 100 100     32992 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
256             # Maybe it's invoking a method named 'sub'
257             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
258             ) {
259 42         180 return 'Prototype';
260             }
261              
262             # This is a normal open bracket
263 8040         26807 return 'Structure';
264              
265             } elsif ( $char == 60 ) { # $char eq '<'
266             # Finalise any whitespace token...
267 2070 100       6012 $t->_finalize_token if $t->{token};
268              
269             # This is either "less than" or "readline quote-like"
270             # Do some context stuff to guess which.
271 2070         4477 my $prev = $t->_last_significant_token;
272              
273             # The most common group of less-thans are used like
274             # $foo < $bar
275             # 1 < $bar
276             # $#foo < $bar
277 2070 100 100     11232 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
278 1756 50 66     7443 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
279 1756 100 100     6950 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
280 1723 100 100     6713 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
281              
282             # If it is <<... it's a here-doc instead
283 1722         3600 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
284 1722 100       5989 return 'Operator' if $next_char =~ /<[^>]/;
285              
286 897 100       2102 return 'Operator' if not $prev;
287              
288             # The most common group of readlines are used like
289             # while ( <...> )
290             # while <>;
291 808         1723 my $prec = $prev->content;
292 808 100 100     12184 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
293             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
294             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
295             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
297             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
298              
299 703 100 100     2517 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
300             # Could go either way... do a regex check
301             # $foo->{bar} < 2;
302             # grep { .. } ;
303 11         35 pos $t->{line} = $t->{line_cursor};
304 11 100       77 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
305             # Almost definitely readline
306 3         11 return 'QuoteLike::Readline';
307             }
308             }
309              
310             # Otherwise, we guess operator, which has been the default up
311             # until this more comprehensive section was created.
312 700         2048 return 'Operator';
313              
314             } elsif ( $char == 47 ) { # $char eq '/'
315             # Finalise any whitespace token...
316 1038 100       3396 $t->_finalize_token if $t->{token};
317              
318             # This is either a "divided by" or a "start regex"
319             # Do some context stuff to guess ( ack ) which.
320             # Hopefully the guess will be good enough.
321 1038         2421 my $prev = $t->_last_significant_token;
322              
323             # Or as the very first thing in a file
324 1038 100       2903 return 'Regexp::Match' if not $prev;
325              
326 941         2369 my $prec = $prev->content;
327              
328             # Most times following an operator, we are a regex.
329             # This includes cases such as:
330             # , - As an argument in a list
331             # .. - The second condition in a flip flop
332             # =~ - A bound regex
333             # !~ - Ditto
334 941 100       4304 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
335              
336             # After a symbol
337 510 100       2011 return 'Operator' if $prev->isa('PPI::Token::Symbol');
338 440 100 66     1656 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
339 9         28 return 'Operator';
340             }
341              
342             # After another number
343 431 100       1458 return 'Operator' if $prev->isa('PPI::Token::Number');
344              
345             # After going into scope/brackets
346 389 100 100     1804 if (
      100        
347             $prev->isa('PPI::Token::Structure')
348             and (
349             $prec eq '('
350             or
351             $prec eq '{'
352             or
353             $prec eq ';'
354             )
355             ) {
356 53         173 return 'Regexp::Match';
357             }
358              
359             # Functions and keywords
360 336 100 66     1143 if (
361             $MATCHWORD{$prec}
362             and
363             $prev->isa('PPI::Token::Word')
364             ) {
365 73         209 return 'Regexp::Match';
366             }
367              
368             # What about the char after the slash? There's some things
369             # that would be highly illogical to see if it's an operator.
370 263         640 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
371 263 50 33     967 if ( defined $next_char and length $next_char ) {
372 263 100       928 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
373 6         22 return 'Regexp::Match';
374             }
375             }
376              
377             # Otherwise... erm... assume operator?
378             # Add more tests here as potential cases come to light
379 257         897 return 'Operator';
380              
381             } elsif ( $char == 120 ) { # $char eq 'x'
382             # Could be a word, the x= operator, the x operator
383             # followed by whitespace, or the x operator without any
384             # space between itself and its operand, e.g.: '$a x3',
385             # which is the same as '$a x 3'. _current_x_is_operator
386             # assumes we have a complete 'x' token, but we don't
387             # yet. We may need to split this x character apart from
388             # what follows it.
389 863 100       2147 if ( $t->_current_x_is_operator ) {
390 201         628 pos $t->{line} = $t->{line_cursor} + 1;
391 201 100       1166 return 'Operator' if $t->{line} =~ m/\G(?:
392             \d # x op with no whitespace e.g. 'x3'
393             |
394             (?!( # negative lookahead
395             => # not on left of fat comma
396             |
397             \w # not a word like "xyzzy"
398             |
399             \s # not x op plus whitespace
400             ))
401             )/gcx;
402             }
403              
404             # Otherwise, commit like a normal bareword, including x
405             # operator followed by whitespace.
406 750         2401 return PPI::Token::Word->__TOKENIZER__commit($t);
407              
408             } elsif ( $char == 45 ) { # $char eq '-'
409             # Look for an obvious operator operand context
410 10943         22608 my $context = $t->_opcontext;
411 10943 100       18607 if ( $context eq 'operator' ) {
412 9009         29251 return 'Operator';
413             } else {
414             # More logic needed
415 1934         5648 return 'Unknown';
416             }
417              
418             } elsif ( $char >= 128 ) { # Outside ASCII
419 6 100       35 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
420 1 50       5 return 'Whitespace' if $c =~ /\s/;
421             }
422              
423              
424             # All the whitespaces are covered, so what to do
425             ### For now, die
426 1         10 PPI::Exception->throw("Encountered unexpected character '$char'");
427             }
428              
429             sub __TOKENIZER__on_line_end {
430 44517 100   44517   155455 $_[1]->_finalize_token if $_[1]->{token};
431             }
432              
433             1;
434              
435             =pod
436              
437             =head1 SUPPORT
438              
439             See the L in the main module.
440              
441             =head1 AUTHOR
442              
443             Adam Kennedy Eadamk@cpan.orgE
444              
445             =head1 COPYRIGHT
446              
447             Copyright 2001 - 2011 Adam Kennedy.
448              
449             This program is free software; you can redistribute
450             it and/or modify it under the same terms as Perl itself.
451              
452             The full text of the license can be found in the
453             LICENSE file included with this module.
454              
455             =cut