File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 95 97 97.9
branch 82 88 93.1
condition 78 90 86.6
subroutine 7 8 87.5
pod 2 2 100.0
total 264 285 92.6


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L<PPI::Document> object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C<PPI::Token::Whitespace> objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B<have> to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L<PPI::Node> and L<PPI::Element> methods to do things.
29              
30             If you want the nth child element, you should be using C<schild> rather
31             than C<child>, and likewise C<snext_sibling>, C<sprevious_sibling>, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B<not> need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L<PPI::Token> and L<PPI::Element> classes.
41              
42             =cut
43              
44 67     67   92991 use strict;
  67         97  
  67         1981  
45 67     67   621 use Clone ();
  67         546  
  67         958  
46 67     67   618 use PPI::Token ();
  67         128  
  67         110398  
47              
48             our $VERSION = '1.284';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L<PPI> sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C<null> method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C<tidy> is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B<doesn't> include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L<PPI::Token::Pod> object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 12 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         8 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 52803     52803   64054 my $t = $_[1];
147 52803         79595 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 52803 100       294416 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 6855         17957 $t->_new_token( 'Whitespace', $line );
153 6855         12170 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4346         11518 $t->_new_token( 'Comment', $line );
158 4346         10456 $t->_finalize_token;
159 4346         7464 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 766         2561 $t->_new_token( 'Pod', $line );
164 766 50       3275 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 766         1569 $t->{class} = 'PPI::Token::Pod';
170             }
171 766         1617 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         4 my @perl6;
178 2         2 while ( 1 ) {
179 693         743 my $line6 = $t->_get_line;
180 693 100       747 last unless defined $line6;
181 691         693 push @perl6, $line6;
182             }
183 2         3 push @{ $t->{perl6} }, join '', @perl6;
  2         95  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         34 return 1;
189             }
190              
191 40834         72645 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 395556     395556   417050 my $t = $_[1];
196 395556         562011 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 395556         420811 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 395556 100       906982 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 275716 100       819943 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 19681 100       65801 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 8448 100       26457 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215 8448         25255 my ( $has_sig, @tokens ) = $t->_current_token_has_signatures_active;
216 8448 100       22291 return 'Structure' if $has_sig;
217              
218             # A normal subroutine declaration
219 8412         13528 my $p1 = $tokens[1];
220 8412         12479 my $p2 = $tokens[2];
221 8412 50 100     89462 if (
      100        
      100        
      100        
      33        
      100        
222             $tokens[0]
223             and
224             $tokens[0]->isa('PPI::Token::Word')
225             and
226             $p1
227             and
228             $p1->isa('PPI::Token::Word')
229             and
230             $p1->content eq 'sub'
231             and (
232             not $p2
233             or
234             $p2->isa('PPI::Token::Structure')
235             or (
236             $p2->isa('PPI::Token::Whitespace')
237             and
238             $p2->content eq ''
239             )
240             or (
241             # Lexical subroutine
242             $p2->isa('PPI::Token::Word')
243             and
244             $p2->content =~ /^(?:my|our|state)$/
245             )
246             )
247             ) {
248             # This is a sub prototype
249 314         1302 return 'Prototype';
250             }
251              
252             # A prototyped anonymous subroutine
253 8098         13440 my $p0 = $tokens[0];
254 8098 100 100     41109 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
255             # Maybe it's invoking a method named 'sub'
256             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257             ) {
258 42         182 return 'Prototype';
259             }
260              
261             # This is a normal open bracket
262 8056         41538 return 'Structure';
263              
264             } elsif ( $char == 60 ) { # $char eq '<'
265             # Finalise any whitespace token...
266 2094 100       6923 $t->_finalize_token if $t->{token};
267              
268             # This is either "less than" or "readline quote-like"
269             # Do some context stuff to guess which.
270 2094         4767 my $prev = $t->_last_significant_token;
271              
272             # The most common group of less-thans are used like
273             # $foo < $bar
274             # 1 < $bar
275             # $#foo < $bar
276 2094 100 100     11753 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277 1775 50 66     9658 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278 1775 100 100     7605 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279 1735 100 100     7521 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280              
281             # If it is <<... it's a here-doc instead
282 1734         4223 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283 1734 100       6975 return 'Operator' if $next_char =~ /<[^>]/;
284              
285 910 100       2137 return 'Operator' if not $prev;
286              
287             # The most common group of readlines are used like
288             # while ( <...> )
289             # while <>;
290 816         2020 my $prec = $prev->content;
291 816 100 100     14121 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
292             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297              
298 716 100 100     2786 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299             # Could go either way... do a regex check
300             # $foo->{bar} < 2;
301             # grep { .. } <foo>;
302 15         73 pos $t->{line} = $t->{line_cursor};
303 15 100       100 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304             # Almost definitely readline
305 3         12 return 'QuoteLike::Readline';
306             }
307             }
308              
309             # Otherwise, we guess operator, which has been the default up
310             # until this more comprehensive section was created.
311 713         2376 return 'Operator';
312              
313             } elsif ( $char == 47 ) { # $char eq '/'
314             # Finalise any whitespace token...
315 1038 100       3304 $t->_finalize_token if $t->{token};
316              
317             # This is either a "divided by" or a "start regex"
318             # Do some context stuff to guess ( ack ) which.
319             # Hopefully the guess will be good enough.
320 1038         2750 my $prev = $t->_last_significant_token;
321              
322             # Or as the very first thing in a file
323 1038 100       3167 return 'Regexp::Match' if not $prev;
324              
325 946         2313 my $prec = $prev->content;
326              
327             # Most times following an operator, we are a regex.
328             # This includes cases such as:
329             # , - As an argument in a list
330             # .. - The second condition in a flip flop
331             # =~ - A bound regex
332             # !~ - Ditto
333 946 100       4699 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334              
335             # After a symbol
336 500 100       2101 return 'Operator' if $prev->isa('PPI::Token::Symbol');
337 435 100 66     1380 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338 11         32 return 'Operator';
339             }
340              
341             # After another number
342 424 100       1591 return 'Operator' if $prev->isa('PPI::Token::Number');
343              
344             # After going into scope/brackets
345 382 100 100     2046 if (
      100        
346             $prev->isa('PPI::Token::Structure')
347             and (
348             $prec eq '('
349             or
350             $prec eq '{'
351             or
352             $prec eq ';'
353             )
354             ) {
355 57         181 return 'Regexp::Match';
356             }
357              
358             # Functions and keywords
359 325 100 66     1210 if (
360             $MATCHWORD{$prec}
361             and
362             $prev->isa('PPI::Token::Word')
363             ) {
364 73         274 return 'Regexp::Match';
365             }
366              
367             # What about the char after the slash? There's some things
368             # that would be highly illogical to see if it's an operator.
369 252         655 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370 252 50 33     1030 if ( defined $next_char and length $next_char ) {
371 252 100       979 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
372 14         49 return 'Regexp::Match';
373             }
374             }
375              
376             # Otherwise... erm... assume operator?
377             # Add more tests here as potential cases come to light
378 238         737 return 'Operator';
379              
380             } elsif ( $char == 120 ) { # $char eq 'x'
381             # Could be a word, the x= operator, the x operator
382             # followed by whitespace, or the x operator without any
383             # space between itself and its operand, e.g.: '$a x3',
384             # which is the same as '$a x 3'. _current_x_is_operator
385             # assumes we have a complete 'x' token, but we don't
386             # yet. We may need to split this x character apart from
387             # what follows it.
388 840 100       2287 if ( $t->_current_x_is_operator ) {
389 201         636 pos $t->{line} = $t->{line_cursor} + 1;
390 201 100       1244 return 'Operator' if $t->{line} =~ m/\G(?:
391             \d # x op with no whitespace e.g. 'x3'
392             |
393             (?!( # negative lookahead
394             => # not on left of fat comma
395             |
396             \w # not a word like "xyzzy"
397             |
398             \s # not x op plus whitespace
399             ))
400             )/gcx;
401             }
402              
403             # Otherwise, commit like a normal bareword, including x
404             # operator followed by whitespace.
405 721         2457 return PPI::Token::Word->__TOKENIZER__commit($t);
406              
407             } elsif ( $char == 45 ) { # $char eq '-'
408             # Look for an obvious operator operand context
409 7255         20972 my $context = $t->_opcontext;
410 7255 100       19034 if ( $context eq 'operator' ) {
411 5288         19876 return 'Operator';
412             } else {
413             # More logic needed
414 1967         6341 return 'Unknown';
415             }
416              
417             } elsif ( $char >= 128 ) { # Outside ASCII
418 6 100       40 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419 1 50       3 return 'Whitespace' if $c =~ /\s/;
420             }
421              
422              
423             # All the whitespaces are covered, so what to do
424             ### For now, die
425 1         9 PPI::Exception->throw("Encountered unexpected character '$char'");
426             }
427              
428             sub __TOKENIZER__on_line_end {
429 40470 100   40470   160598 $_[1]->_finalize_token if $_[1]->{token};
430             }
431              
432             1;
433              
434             =pod
435              
436             =head1 SUPPORT
437              
438             See the L<support section|PPI/SUPPORT> in the main module.
439              
440             =head1 AUTHOR
441              
442             Adam Kennedy E<lt>adamk@cpan.orgE<gt>
443              
444             =head1 COPYRIGHT
445              
446             Copyright 2001 - 2011 Adam Kennedy.
447              
448             This program is free software; you can redistribute
449             it and/or modify it under the same terms as Perl itself.
450              
451             The full text of the license can be found in the
452             LICENSE file included with this module.
453              
454             =cut