File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	95	97	97.9
branch	81	88	92.0
condition	77	90	85.5
subroutine	7	8	87.5
pod	2	2	100.0
total	262	285	91.9

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	69			69		78145	use strict;
	69					169
	69					2015
45	69			69		547	use Clone ();
	69					522
	69					815
46	69			69		513	use PPI::Token ();
	69					95
	69					116612
47
48							our $VERSION = '1.291';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	13	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					8	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	57115			57115		64731	my $t = $_[1];
147	57115					84091	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	57115	100				272419	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	7030					17080	$t->_new_token( 'Whitespace', $line );
153	7030					10626	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4351					9649	$t->_new_token( 'Comment', $line );
158	4351					8232	$t->_finalize_token;
159	4351					6802	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	764					1997	$t->_new_token( 'Pod', $line );
164	764	50				2424	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	764					1371	$t->{class} = 'PPI::Token::Pod';
170							}
171	764					1312	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					3	my @perl6;
178	2					2	while ( 1 ) {
179	693					750	my $line6 = $t->_get_line;
180	693	100				786	last unless defined $line6;
181	691					720	push @perl6, $line6;
182							}
183	2					3	push @{ $t->{perl6} }, join '', @perl6;
	2					108
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					35	return 1;
189							}
190
191	44968					80435	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	610556			610556		612505	my $t = $_[1];
196	610556					812934	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	610556					625528	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	610556	100				1195446	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	470723	100				1223484	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	27253	100				69489	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	12244	100				34365	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215	12244	100				33766	return 'Structure' if $t->_current_token_has_signatures_active;
216
217	8426					14647	my @tokens = $t->_previous_significant_tokens(3);
218
219							# A normal subroutine declaration
220	8426					10320	my $p1 = $tokens[1];
221	8426					9837	my $p2 = $tokens[2];
222	8426	50	100			68763	if (
			100
			100
			100
			33
			100
223							$tokens[0]
224							and
225							$tokens[0]->isa('PPI::Token::Word')
226							and
227							$p1
228							and
229							$p1->isa('PPI::Token::Word')
230							and
231							$p1->content eq 'sub'
232							and (
233							not $p2
234							or
235							$p2->isa('PPI::Token::Structure')
236							or (
237							$p2->isa('PPI::Token::Whitespace')
238							and
239							$p2->content eq ''
240							)
241							or (
242							# Lexical subroutine
243							$p2->isa('PPI::Token::Word')
244							and
245							$p2->content =~ /^(?:my\|our\|state)$/
246							)
247							)
248							) {
249							# This is a sub prototype
250	314					1013	return 'Prototype';
251							}
252
253							# A prototyped anonymous subroutine
254	8112					10419	my $p0 = $tokens[0];
255	8112	100	100			31649	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
256							# Maybe it's invoking a method named 'sub'
257							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
258							) {
259	42					186	return 'Prototype';
260							}
261
262							# This is a normal open bracket
263	8070					25849	return 'Structure';
264
265							} elsif ( $char == 60 ) { # $char eq '<'
266							# Finalise any whitespace token...
267	2082	100				5802	$t->_finalize_token if $t->{token};
268
269							# This is either "less than" or "readline quote-like"
270							# Do some context stuff to guess which.
271	2082					4124	my $prev = $t->_last_significant_token;
272
273							# The most common group of less-thans are used like
274							# $foo < $bar
275							# 1 < $bar
276							# $#foo < $bar
277	2082	100	100			10505	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
278	1770	50	66			7463	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
279	1770	100	100			6742	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
280	1739	50	66			6946	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
281
282							# If it is <<... it's a here-doc instead
283	1739					3387	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
284	1739	100				5974	return 'Operator' if $next_char =~ /<[^>]/;
285
286	910	100				1915	return 'Operator' if not $prev;
287
288							# The most common group of readlines are used like
289							# while ( <...> )
290							# while <>;
291	825					1649	my $prec = $prev->content;
292	825	100	100			11774	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
293							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
294							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
295							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
297							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
298
299	713	100	100			2218	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
300							# Could go either way... do a regex check
301							# $foo->{bar} < 2;
302							# grep { .. } ;
303	13					37	pos $t->{line} = $t->{line_cursor};
304	13	100				93	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
305							# Almost definitely readline
306	3					12	return 'QuoteLike::Readline';
307							}
308							}
309
310							# Otherwise, we guess operator, which has been the default up
311							# until this more comprehensive section was created.
312	710					2119	return 'Operator';
313
314							} elsif ( $char == 47 ) { # $char eq '/'
315							# Finalise any whitespace token...
316	1045	100				3011	$t->_finalize_token if $t->{token};
317
318							# This is either a "divided by" or a "start regex"
319							# Do some context stuff to guess ( ack ) which.
320							# Hopefully the guess will be good enough.
321	1045					2421	my $prev = $t->_last_significant_token;
322
323							# Or as the very first thing in a file
324	1045	100				3174	return 'Regexp::Match' if not $prev;
325
326	948					2123	my $prec = $prev->content;
327
328							# Most times following an operator, we are a regex.
329							# This includes cases such as:
330							# , - As an argument in a list
331							# .. - The second condition in a flip flop
332							# =~ - A bound regex
333							# !~ - Ditto
334	948	100				3989	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
335
336							# After a symbol
337	493	100				1888	return 'Operator' if $prev->isa('PPI::Token::Symbol');
338	425	100	66			1244	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
339	16					48	return 'Operator';
340							}
341
342							# After another number
343	409	100				1250	return 'Operator' if $prev->isa('PPI::Token::Number');
344
345							# After going into scope/brackets
346	366	100	100			1535	if (
			100
347							$prev->isa('PPI::Token::Structure')
348							and (
349							$prec eq '('
350							or
351							$prec eq '{'
352							or
353							$prec eq ';'
354							)
355							) {
356	44					115	return 'Regexp::Match';
357							}
358
359							# Functions and keywords
360	322	100	66			984	if (
361							$MATCHWORD{$prec}
362							and
363							$prev->isa('PPI::Token::Word')
364							) {
365	73					212	return 'Regexp::Match';
366							}
367
368							# What about the char after the slash? There's some things
369							# that would be highly illogical to see if it's an operator.
370	249					610	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
371	249	50	33			792	if ( defined $next_char and length $next_char ) {
372	249	100				902	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
373	16					57	return 'Regexp::Match';
374							}
375							}
376
377							# Otherwise... erm... assume operator?
378							# Add more tests here as potential cases come to light
379	233					700	return 'Operator';
380
381							} elsif ( $char == 120 ) { # $char eq 'x'
382							# Could be a word, the x= operator, the x operator
383							# followed by whitespace, or the x operator without any
384							# space between itself and its operand, e.g.: '$a x3',
385							# which is the same as '$a x 3'. _current_x_is_operator
386							# assumes we have a complete 'x' token, but we don't
387							# yet. We may need to split this x character apart from
388							# what follows it.
389	846	100				2102	if ( $t->_current_x_is_operator ) {
390	218					627	pos $t->{line} = $t->{line_cursor} + 1;
391	218	100				1188	return 'Operator' if $t->{line} =~ m/\G(?:
392							\d # x op with no whitespace e.g. 'x3'
393							\|
394							(?!( # negative lookahead
395							=> # not on left of fat comma
396							\|
397							\w # not a word like "xyzzy"
398							\|
399							\s # not x op plus whitespace
400							))
401							)/gcx;
402							}
403
404							# Otherwise, commit like a normal bareword, including x
405							# operator followed by whitespace.
406	723					2203	return PPI::Token::Word->__TOKENIZER__commit($t);
407
408							} elsif ( $char == 45 ) { # $char eq '-'
409							# Look for an obvious operator operand context
410	11030					22468	my $context = $t->_opcontext;
411	11030	100				19115	if ( $context eq 'operator' ) {
412	9008					30127	return 'Operator';
413							} else {
414							# More logic needed
415	2022					5522	return 'Unknown';
416							}
417
418							} elsif ( $char >= 128 ) { # Outside ASCII
419	6	100				36	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
420	1	50				4	return 'Whitespace' if $c =~ /\s/;
421							}
422
423
424							# All the whitespaces are covered, so what to do
425							### For now, die
426	1					10	PPI::Exception->throw("Encountered unexpected character '$char'");
427							}
428
429							sub __TOKENIZER__on_line_end {
430	44588	100		44588		151713	$_[1]->_finalize_token if $_[1]->{token};
431							}
432
433							1;
434
435							=pod
436
437							=head1 SUPPORT
438
439							See the L in the main module.
440
441							=head1 AUTHOR
442
443							Adam Kennedy Eadamk@cpan.orgE
444
445							=head1 COPYRIGHT
446
447							Copyright 2001 - 2011 Adam Kennedy.
448
449							This program is free software; you can redistribute
450							it and/or modify it under the same terms as Perl itself.
451
452							The full text of the license can be found in the
453							LICENSE file included with this module.
454
455							=cut