File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	95	97	97.9
branch	82	88	93.1
condition	78	90	86.6
subroutine	7	8	87.5
pod	2	2	100.0
total	264	285	92.6

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	69			69		82622	use strict;
	69					120
	69					1924
45	69			69		564	use Clone ();
	69					520
	69					807
46	69			69		527	use PPI::Token ();
	69					117
	69					112198
47
48							our $VERSION = '1.290';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	13	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					9	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	57047			57047		63468	my $t = $_[1];
147	57047					79963	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	57047	100				262752	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	7029					16272	$t->_new_token( 'Whitespace', $line );
153	7029					10999	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4353					9249	$t->_new_token( 'Comment', $line );
158	4353					8134	$t->_finalize_token;
159	4353					6834	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	762					2253	$t->_new_token( 'Pod', $line );
164	762	50				2637	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	762					1423	$t->{class} = 'PPI::Token::Pod';
170							}
171	762					1377	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					4	my @perl6;
178	2					5	while ( 1 ) {
179	693					1429	my $line6 = $t->_get_line;
180	693	100				1401	last unless defined $line6;
181	691					1364	push @perl6, $line6;
182							}
183	2					4	push @{ $t->{perl6} }, join '', @perl6;
	2					128
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					59	return 1;
189							}
190
191	44901					74745	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	609250			609250		600821	my $t = $_[1];
196	609250					800687	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	609250					614220	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	609250	100				1180589	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	470069	100				1214163	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	27134	100				69806	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	12214	100				26345	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215	12214	100				26807	return 'Structure' if $t->_current_token_has_signatures_active;
216
217	8396					14558	my @tokens = $t->_previous_significant_tokens(3);
218
219							# A normal subroutine declaration
220	8396					11815	my $p1 = $tokens[1];
221	8396					9829	my $p2 = $tokens[2];
222	8396	50	100			65636	if (
			100
			100
			100
			33
			100
223							$tokens[0]
224							and
225							$tokens[0]->isa('PPI::Token::Word')
226							and
227							$p1
228							and
229							$p1->isa('PPI::Token::Word')
230							and
231							$p1->content eq 'sub'
232							and (
233							not $p2
234							or
235							$p2->isa('PPI::Token::Structure')
236							or (
237							$p2->isa('PPI::Token::Whitespace')
238							and
239							$p2->content eq ''
240							)
241							or (
242							# Lexical subroutine
243							$p2->isa('PPI::Token::Word')
244							and
245							$p2->content =~ /^(?:my\|our\|state)$/
246							)
247							)
248							) {
249							# This is a sub prototype
250	314					1112	return 'Prototype';
251							}
252
253							# A prototyped anonymous subroutine
254	8082					10194	my $p0 = $tokens[0];
255	8082	100	100			32992	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
256							# Maybe it's invoking a method named 'sub'
257							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
258							) {
259	42					180	return 'Prototype';
260							}
261
262							# This is a normal open bracket
263	8040					26807	return 'Structure';
264
265							} elsif ( $char == 60 ) { # $char eq '<'
266							# Finalise any whitespace token...
267	2070	100				6012	$t->_finalize_token if $t->{token};
268
269							# This is either "less than" or "readline quote-like"
270							# Do some context stuff to guess which.
271	2070					4477	my $prev = $t->_last_significant_token;
272
273							# The most common group of less-thans are used like
274							# $foo < $bar
275							# 1 < $bar
276							# $#foo < $bar
277	2070	100	100			11232	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
278	1756	50	66			7443	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
279	1756	100	100			6950	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
280	1723	100	100			6713	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
281
282							# If it is <<... it's a here-doc instead
283	1722					3600	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
284	1722	100				5989	return 'Operator' if $next_char =~ /<[^>]/;
285
286	897	100				2102	return 'Operator' if not $prev;
287
288							# The most common group of readlines are used like
289							# while ( <...> )
290							# while <>;
291	808					1723	my $prec = $prev->content;
292	808	100	100			12184	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
293							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
294							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
295							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
297							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
298
299	703	100	100			2517	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
300							# Could go either way... do a regex check
301							# $foo->{bar} < 2;
302							# grep { .. } ;
303	11					35	pos $t->{line} = $t->{line_cursor};
304	11	100				77	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
305							# Almost definitely readline
306	3					11	return 'QuoteLike::Readline';
307							}
308							}
309
310							# Otherwise, we guess operator, which has been the default up
311							# until this more comprehensive section was created.
312	700					2048	return 'Operator';
313
314							} elsif ( $char == 47 ) { # $char eq '/'
315							# Finalise any whitespace token...
316	1038	100				3396	$t->_finalize_token if $t->{token};
317
318							# This is either a "divided by" or a "start regex"
319							# Do some context stuff to guess ( ack ) which.
320							# Hopefully the guess will be good enough.
321	1038					2421	my $prev = $t->_last_significant_token;
322
323							# Or as the very first thing in a file
324	1038	100				2903	return 'Regexp::Match' if not $prev;
325
326	941					2369	my $prec = $prev->content;
327
328							# Most times following an operator, we are a regex.
329							# This includes cases such as:
330							# , - As an argument in a list
331							# .. - The second condition in a flip flop
332							# =~ - A bound regex
333							# !~ - Ditto
334	941	100				4304	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
335
336							# After a symbol
337	510	100				2011	return 'Operator' if $prev->isa('PPI::Token::Symbol');
338	440	100	66			1656	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
339	9					28	return 'Operator';
340							}
341
342							# After another number
343	431	100				1458	return 'Operator' if $prev->isa('PPI::Token::Number');
344
345							# After going into scope/brackets
346	389	100	100			1804	if (
			100
347							$prev->isa('PPI::Token::Structure')
348							and (
349							$prec eq '('
350							or
351							$prec eq '{'
352							or
353							$prec eq ';'
354							)
355							) {
356	53					173	return 'Regexp::Match';
357							}
358
359							# Functions and keywords
360	336	100	66			1143	if (
361							$MATCHWORD{$prec}
362							and
363							$prev->isa('PPI::Token::Word')
364							) {
365	73					209	return 'Regexp::Match';
366							}
367
368							# What about the char after the slash? There's some things
369							# that would be highly illogical to see if it's an operator.
370	263					640	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
371	263	50	33			967	if ( defined $next_char and length $next_char ) {
372	263	100				928	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
373	6					22	return 'Regexp::Match';
374							}
375							}
376
377							# Otherwise... erm... assume operator?
378							# Add more tests here as potential cases come to light
379	257					897	return 'Operator';
380
381							} elsif ( $char == 120 ) { # $char eq 'x'
382							# Could be a word, the x= operator, the x operator
383							# followed by whitespace, or the x operator without any
384							# space between itself and its operand, e.g.: '$a x3',
385							# which is the same as '$a x 3'. _current_x_is_operator
386							# assumes we have a complete 'x' token, but we don't
387							# yet. We may need to split this x character apart from
388							# what follows it.
389	863	100				2147	if ( $t->_current_x_is_operator ) {
390	201					628	pos $t->{line} = $t->{line_cursor} + 1;
391	201	100				1166	return 'Operator' if $t->{line} =~ m/\G(?:
392							\d # x op with no whitespace e.g. 'x3'
393							\|
394							(?!( # negative lookahead
395							=> # not on left of fat comma
396							\|
397							\w # not a word like "xyzzy"
398							\|
399							\s # not x op plus whitespace
400							))
401							)/gcx;
402							}
403
404							# Otherwise, commit like a normal bareword, including x
405							# operator followed by whitespace.
406	750					2401	return PPI::Token::Word->__TOKENIZER__commit($t);
407
408							} elsif ( $char == 45 ) { # $char eq '-'
409							# Look for an obvious operator operand context
410	10943					22608	my $context = $t->_opcontext;
411	10943	100				18607	if ( $context eq 'operator' ) {
412	9009					29251	return 'Operator';
413							} else {
414							# More logic needed
415	1934					5648	return 'Unknown';
416							}
417
418							} elsif ( $char >= 128 ) { # Outside ASCII
419	6	100				35	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
420	1	50				5	return 'Whitespace' if $c =~ /\s/;
421							}
422
423
424							# All the whitespaces are covered, so what to do
425							### For now, die
426	1					10	PPI::Exception->throw("Encountered unexpected character '$char'");
427							}
428
429							sub __TOKENIZER__on_line_end {
430	44517	100		44517		155455	$_[1]->_finalize_token if $_[1]->{token};
431							}
432
433							1;
434
435							=pod
436
437							=head1 SUPPORT
438
439							See the L in the main module.
440
441							=head1 AUTHOR
442
443							Adam Kennedy Eadamk@cpan.orgE
444
445							=head1 COPYRIGHT
446
447							Copyright 2001 - 2011 Adam Kennedy.
448
449							This program is free software; you can redistribute
450							it and/or modify it under the same terms as Perl itself.
451
452							The full text of the license can be found in the
453							LICENSE file included with this module.
454
455							=cut