File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	95	97	97.9
branch	82	88	93.1
condition	78	90	86.6
subroutine	7	8	87.5
pod	2	2	100.0
total	264	285	92.6

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L<PPI::Document> object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C<PPI::Token::Whitespace> objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B<have> to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L<PPI::Node> and L<PPI::Element> methods to do things.
29
30							If you want the nth child element, you should be using C<schild> rather
31							than C<child>, and likewise C<snext_sibling>, C<sprevious_sibling>, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B<not> need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L<PPI::Token> and L<PPI::Element> classes.
41
42							=cut
43
44	67			67		92991	use strict;
	67					97
	67					1981
45	67			67		621	use Clone ();
	67					546
	67					958
46	67			67		618	use PPI::Token ();
	67					128
	67					110398
47
48							our $VERSION = '1.284';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L<PPI> sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C<null> method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C<tidy> is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B<doesn't> include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L<PPI::Token::Pod> object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	12	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					8	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	52803			52803		64054	my $t = $_[1];
147	52803					79595	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	52803	100				294416	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	6855					17957	$t->_new_token( 'Whitespace', $line );
153	6855					12170	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4346					11518	$t->_new_token( 'Comment', $line );
158	4346					10456	$t->_finalize_token;
159	4346					7464	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	766					2561	$t->_new_token( 'Pod', $line );
164	766	50				3275	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	766					1569	$t->{class} = 'PPI::Token::Pod';
170							}
171	766					1617	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					4	my @perl6;
178	2					2	while ( 1 ) {
179	693					743	my $line6 = $t->_get_line;
180	693	100				747	last unless defined $line6;
181	691					693	push @perl6, $line6;
182							}
183	2					3	push @{ $t->{perl6} }, join '', @perl6;
	2					95
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					34	return 1;
189							}
190
191	40834					72645	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	395556			395556		417050	my $t = $_[1];
196	395556					562011	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	395556					420811	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	395556	100				906982	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	275716	100				819943	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	19681	100				65801	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	8448	100				26457	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215	8448					25255	my ( $has_sig, @tokens ) = $t->_current_token_has_signatures_active;
216	8448	100				22291	return 'Structure' if $has_sig;
217
218							# A normal subroutine declaration
219	8412					13528	my $p1 = $tokens[1];
220	8412					12479	my $p2 = $tokens[2];
221	8412	50	100			89462	if (
			100
			100
			100
			33
			100
222							$tokens[0]
223							and
224							$tokens[0]->isa('PPI::Token::Word')
225							and
226							$p1
227							and
228							$p1->isa('PPI::Token::Word')
229							and
230							$p1->content eq 'sub'
231							and (
232							not $p2
233							or
234							$p2->isa('PPI::Token::Structure')
235							or (
236							$p2->isa('PPI::Token::Whitespace')
237							and
238							$p2->content eq ''
239							)
240							or (
241							# Lexical subroutine
242							$p2->isa('PPI::Token::Word')
243							and
244							$p2->content =~ /^(?:my\|our\|state)$/
245							)
246							)
247							) {
248							# This is a sub prototype
249	314					1302	return 'Prototype';
250							}
251
252							# A prototyped anonymous subroutine
253	8098					13440	my $p0 = $tokens[0];
254	8098	100	100			41109	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
255							# Maybe it's invoking a method named 'sub'
256							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257							) {
258	42					182	return 'Prototype';
259							}
260
261							# This is a normal open bracket
262	8056					41538	return 'Structure';
263
264							} elsif ( $char == 60 ) { # $char eq '<'
265							# Finalise any whitespace token...
266	2094	100				6923	$t->_finalize_token if $t->{token};
267
268							# This is either "less than" or "readline quote-like"
269							# Do some context stuff to guess which.
270	2094					4767	my $prev = $t->_last_significant_token;
271
272							# The most common group of less-thans are used like
273							# $foo < $bar
274							# 1 < $bar
275							# $#foo < $bar
276	2094	100	100			11753	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277	1775	50	66			9658	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278	1775	100	100			7605	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279	1735	100	100			7521	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280
281							# If it is <<... it's a here-doc instead
282	1734					4223	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283	1734	100				6975	return 'Operator' if $next_char =~ /<[^>]/;
284
285	910	100				2137	return 'Operator' if not $prev;
286
287							# The most common group of readlines are used like
288							# while ( <...> )
289							# while <>;
290	816					2020	my $prec = $prev->content;
291	816	100	100			14121	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
292							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297
298	716	100	100			2786	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299							# Could go either way... do a regex check
300							# $foo->{bar} < 2;
301							# grep { .. } <foo>;
302	15					73	pos $t->{line} = $t->{line_cursor};
303	15	100				100	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304							# Almost definitely readline
305	3					12	return 'QuoteLike::Readline';
306							}
307							}
308
309							# Otherwise, we guess operator, which has been the default up
310							# until this more comprehensive section was created.
311	713					2376	return 'Operator';
312
313							} elsif ( $char == 47 ) { # $char eq '/'
314							# Finalise any whitespace token...
315	1038	100				3304	$t->_finalize_token if $t->{token};
316
317							# This is either a "divided by" or a "start regex"
318							# Do some context stuff to guess ( ack ) which.
319							# Hopefully the guess will be good enough.
320	1038					2750	my $prev = $t->_last_significant_token;
321
322							# Or as the very first thing in a file
323	1038	100				3167	return 'Regexp::Match' if not $prev;
324
325	946					2313	my $prec = $prev->content;
326
327							# Most times following an operator, we are a regex.
328							# This includes cases such as:
329							# , - As an argument in a list
330							# .. - The second condition in a flip flop
331							# =~ - A bound regex
332							# !~ - Ditto
333	946	100				4699	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334
335							# After a symbol
336	500	100				2101	return 'Operator' if $prev->isa('PPI::Token::Symbol');
337	435	100	66			1380	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338	11					32	return 'Operator';
339							}
340
341							# After another number
342	424	100				1591	return 'Operator' if $prev->isa('PPI::Token::Number');
343
344							# After going into scope/brackets
345	382	100	100			2046	if (
			100
346							$prev->isa('PPI::Token::Structure')
347							and (
348							$prec eq '('
349							or
350							$prec eq '{'
351							or
352							$prec eq ';'
353							)
354							) {
355	57					181	return 'Regexp::Match';
356							}
357
358							# Functions and keywords
359	325	100	66			1210	if (
360							$MATCHWORD{$prec}
361							and
362							$prev->isa('PPI::Token::Word')
363							) {
364	73					274	return 'Regexp::Match';
365							}
366
367							# What about the char after the slash? There's some things
368							# that would be highly illogical to see if it's an operator.
369	252					655	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370	252	50	33			1030	if ( defined $next_char and length $next_char ) {
371	252	100				979	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
372	14					49	return 'Regexp::Match';
373							}
374							}
375
376							# Otherwise... erm... assume operator?
377							# Add more tests here as potential cases come to light
378	238					737	return 'Operator';
379
380							} elsif ( $char == 120 ) { # $char eq 'x'
381							# Could be a word, the x= operator, the x operator
382							# followed by whitespace, or the x operator without any
383							# space between itself and its operand, e.g.: '$a x3',
384							# which is the same as '$a x 3'. _current_x_is_operator
385							# assumes we have a complete 'x' token, but we don't
386							# yet. We may need to split this x character apart from
387							# what follows it.
388	840	100				2287	if ( $t->_current_x_is_operator ) {
389	201					636	pos $t->{line} = $t->{line_cursor} + 1;
390	201	100				1244	return 'Operator' if $t->{line} =~ m/\G(?:
391							\d # x op with no whitespace e.g. 'x3'
392							\|
393							(?!( # negative lookahead
394							=> # not on left of fat comma
395							\|
396							\w # not a word like "xyzzy"
397							\|
398							\s # not x op plus whitespace
399							))
400							)/gcx;
401							}
402
403							# Otherwise, commit like a normal bareword, including x
404							# operator followed by whitespace.
405	721					2457	return PPI::Token::Word->__TOKENIZER__commit($t);
406
407							} elsif ( $char == 45 ) { # $char eq '-'
408							# Look for an obvious operator operand context
409	7255					20972	my $context = $t->_opcontext;
410	7255	100				19034	if ( $context eq 'operator' ) {
411	5288					19876	return 'Operator';
412							} else {
413							# More logic needed
414	1967					6341	return 'Unknown';
415							}
416
417							} elsif ( $char >= 128 ) { # Outside ASCII
418	6	100				40	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419	1	50				3	return 'Whitespace' if $c =~ /\s/;
420							}
421
422
423							# All the whitespaces are covered, so what to do
424							### For now, die
425	1					9	PPI::Exception->throw("Encountered unexpected character '$char'");
426							}
427
428							sub __TOKENIZER__on_line_end {
429	40470	100		40470		160598	$_[1]->_finalize_token if $_[1]->{token};
430							}
431
432							1;
433
434							=pod
435
436							=head1 SUPPORT
437
438							See the L<support section\|PPI/SUPPORT> in the main module.
439
440							=head1 AUTHOR
441
442							Adam Kennedy E<lt>adamk@cpan.orgE<gt>
443
444							=head1 COPYRIGHT
445
446							Copyright 2001 - 2011 Adam Kennedy.
447
448							This program is free software; you can redistribute
449							it and/or modify it under the same terms as Perl itself.
450
451							The full text of the license can be found in the
452							LICENSE file included with this module.
453
454							=cut