File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 95 97 97.9
branch 81 88 92.0
condition 77 90 85.5
subroutine 7 8 87.5
pod 2 2 100.0
total 262 285 91.9


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 69     69   78145 use strict;
  69         169  
  69         2015  
45 69     69   547 use Clone ();
  69         522  
  69         815  
46 69     69   513 use PPI::Token ();
  69         95  
  69         116612  
47              
48             our $VERSION = '1.291';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 13 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         8 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 57115     57115   64731 my $t = $_[1];
147 57115         84091 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 57115 100       272419 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 7030         17080 $t->_new_token( 'Whitespace', $line );
153 7030         10626 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4351         9649 $t->_new_token( 'Comment', $line );
158 4351         8232 $t->_finalize_token;
159 4351         6802 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 764         1997 $t->_new_token( 'Pod', $line );
164 764 50       2424 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 764         1371 $t->{class} = 'PPI::Token::Pod';
170             }
171 764         1312 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         3 my @perl6;
178 2         2 while ( 1 ) {
179 693         750 my $line6 = $t->_get_line;
180 693 100       786 last unless defined $line6;
181 691         720 push @perl6, $line6;
182             }
183 2         3 push @{ $t->{perl6} }, join '', @perl6;
  2         108  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         35 return 1;
189             }
190              
191 44968         80435 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 610556     610556   612505 my $t = $_[1];
196 610556         812934 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 610556         625528 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 610556 100       1195446 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 470723 100       1223484 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 27253 100       69489 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 12244 100       34365 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215 12244 100       33766 return 'Structure' if $t->_current_token_has_signatures_active;
216              
217 8426         14647 my @tokens = $t->_previous_significant_tokens(3);
218              
219             # A normal subroutine declaration
220 8426         10320 my $p1 = $tokens[1];
221 8426         9837 my $p2 = $tokens[2];
222 8426 50 100     68763 if (
      100        
      100        
      100        
      33        
      100        
223             $tokens[0]
224             and
225             $tokens[0]->isa('PPI::Token::Word')
226             and
227             $p1
228             and
229             $p1->isa('PPI::Token::Word')
230             and
231             $p1->content eq 'sub'
232             and (
233             not $p2
234             or
235             $p2->isa('PPI::Token::Structure')
236             or (
237             $p2->isa('PPI::Token::Whitespace')
238             and
239             $p2->content eq ''
240             )
241             or (
242             # Lexical subroutine
243             $p2->isa('PPI::Token::Word')
244             and
245             $p2->content =~ /^(?:my|our|state)$/
246             )
247             )
248             ) {
249             # This is a sub prototype
250 314         1013 return 'Prototype';
251             }
252              
253             # A prototyped anonymous subroutine
254 8112         10419 my $p0 = $tokens[0];
255 8112 100 100     31649 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
256             # Maybe it's invoking a method named 'sub'
257             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
258             ) {
259 42         186 return 'Prototype';
260             }
261              
262             # This is a normal open bracket
263 8070         25849 return 'Structure';
264              
265             } elsif ( $char == 60 ) { # $char eq '<'
266             # Finalise any whitespace token...
267 2082 100       5802 $t->_finalize_token if $t->{token};
268              
269             # This is either "less than" or "readline quote-like"
270             # Do some context stuff to guess which.
271 2082         4124 my $prev = $t->_last_significant_token;
272              
273             # The most common group of less-thans are used like
274             # $foo < $bar
275             # 1 < $bar
276             # $#foo < $bar
277 2082 100 100     10505 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
278 1770 50 66     7463 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
279 1770 100 100     6742 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
280 1739 50 66     6946 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
281              
282             # If it is <<... it's a here-doc instead
283 1739         3387 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
284 1739 100       5974 return 'Operator' if $next_char =~ /<[^>]/;
285              
286 910 100       1915 return 'Operator' if not $prev;
287              
288             # The most common group of readlines are used like
289             # while ( <...> )
290             # while <>;
291 825         1649 my $prec = $prev->content;
292 825 100 100     11774 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
293             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
294             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
295             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
297             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
298              
299 713 100 100     2218 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
300             # Could go either way... do a regex check
301             # $foo->{bar} < 2;
302             # grep { .. } ;
303 13         37 pos $t->{line} = $t->{line_cursor};
304 13 100       93 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
305             # Almost definitely readline
306 3         12 return 'QuoteLike::Readline';
307             }
308             }
309              
310             # Otherwise, we guess operator, which has been the default up
311             # until this more comprehensive section was created.
312 710         2119 return 'Operator';
313              
314             } elsif ( $char == 47 ) { # $char eq '/'
315             # Finalise any whitespace token...
316 1045 100       3011 $t->_finalize_token if $t->{token};
317              
318             # This is either a "divided by" or a "start regex"
319             # Do some context stuff to guess ( ack ) which.
320             # Hopefully the guess will be good enough.
321 1045         2421 my $prev = $t->_last_significant_token;
322              
323             # Or as the very first thing in a file
324 1045 100       3174 return 'Regexp::Match' if not $prev;
325              
326 948         2123 my $prec = $prev->content;
327              
328             # Most times following an operator, we are a regex.
329             # This includes cases such as:
330             # , - As an argument in a list
331             # .. - The second condition in a flip flop
332             # =~ - A bound regex
333             # !~ - Ditto
334 948 100       3989 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
335              
336             # After a symbol
337 493 100       1888 return 'Operator' if $prev->isa('PPI::Token::Symbol');
338 425 100 66     1244 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
339 16         48 return 'Operator';
340             }
341              
342             # After another number
343 409 100       1250 return 'Operator' if $prev->isa('PPI::Token::Number');
344              
345             # After going into scope/brackets
346 366 100 100     1535 if (
      100        
347             $prev->isa('PPI::Token::Structure')
348             and (
349             $prec eq '('
350             or
351             $prec eq '{'
352             or
353             $prec eq ';'
354             )
355             ) {
356 44         115 return 'Regexp::Match';
357             }
358              
359             # Functions and keywords
360 322 100 66     984 if (
361             $MATCHWORD{$prec}
362             and
363             $prev->isa('PPI::Token::Word')
364             ) {
365 73         212 return 'Regexp::Match';
366             }
367              
368             # What about the char after the slash? There's some things
369             # that would be highly illogical to see if it's an operator.
370 249         610 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
371 249 50 33     792 if ( defined $next_char and length $next_char ) {
372 249 100       902 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
373 16         57 return 'Regexp::Match';
374             }
375             }
376              
377             # Otherwise... erm... assume operator?
378             # Add more tests here as potential cases come to light
379 233         700 return 'Operator';
380              
381             } elsif ( $char == 120 ) { # $char eq 'x'
382             # Could be a word, the x= operator, the x operator
383             # followed by whitespace, or the x operator without any
384             # space between itself and its operand, e.g.: '$a x3',
385             # which is the same as '$a x 3'. _current_x_is_operator
386             # assumes we have a complete 'x' token, but we don't
387             # yet. We may need to split this x character apart from
388             # what follows it.
389 846 100       2102 if ( $t->_current_x_is_operator ) {
390 218         627 pos $t->{line} = $t->{line_cursor} + 1;
391 218 100       1188 return 'Operator' if $t->{line} =~ m/\G(?:
392             \d # x op with no whitespace e.g. 'x3'
393             |
394             (?!( # negative lookahead
395             => # not on left of fat comma
396             |
397             \w # not a word like "xyzzy"
398             |
399             \s # not x op plus whitespace
400             ))
401             )/gcx;
402             }
403              
404             # Otherwise, commit like a normal bareword, including x
405             # operator followed by whitespace.
406 723         2203 return PPI::Token::Word->__TOKENIZER__commit($t);
407              
408             } elsif ( $char == 45 ) { # $char eq '-'
409             # Look for an obvious operator operand context
410 11030         22468 my $context = $t->_opcontext;
411 11030 100       19115 if ( $context eq 'operator' ) {
412 9008         30127 return 'Operator';
413             } else {
414             # More logic needed
415 2022         5522 return 'Unknown';
416             }
417              
418             } elsif ( $char >= 128 ) { # Outside ASCII
419 6 100       36 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
420 1 50       4 return 'Whitespace' if $c =~ /\s/;
421             }
422              
423              
424             # All the whitespaces are covered, so what to do
425             ### For now, die
426 1         10 PPI::Exception->throw("Encountered unexpected character '$char'");
427             }
428              
429             sub __TOKENIZER__on_line_end {
430 44588 100   44588   151713 $_[1]->_finalize_token if $_[1]->{token};
431             }
432              
433             1;
434              
435             =pod
436              
437             =head1 SUPPORT
438              
439             See the L in the main module.
440              
441             =head1 AUTHOR
442              
443             Adam Kennedy Eadamk@cpan.orgE
444              
445             =head1 COPYRIGHT
446              
447             Copyright 2001 - 2011 Adam Kennedy.
448              
449             This program is free software; you can redistribute
450             it and/or modify it under the same terms as Perl itself.
451              
452             The full text of the license can be found in the
453             LICENSE file included with this module.
454              
455             =cut