File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	95	97	97.9
branch	81	88	92.0
condition	77	90	85.5
subroutine	7	8	87.5
pod	2	2	100.0
total	262	285	91.9

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	68			68		86581	use strict;
	68					125
	68					1900
45	68			68		590	use Clone ();
	68					519
	68					863
46	68			68		598	use PPI::Token ();
	68					98
	68					114620
47
48							our $VERSION = '1.287';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	18	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					12	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	56869			56869		64528	my $t = $_[1];
147	56869					81759	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	56869	100				276967	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	7000					16818	$t->_new_token( 'Whitespace', $line );
153	7000					14149	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4328					10618	$t->_new_token( 'Comment', $line );
158	4328					8614	$t->_finalize_token;
159	4328					7051	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	761					2156	$t->_new_token( 'Pod', $line );
164	761	50				2775	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	761					1509	$t->{class} = 'PPI::Token::Pod';
170							}
171	761					1536	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					2	my @perl6;
178	2					3	while ( 1 ) {
179	693					798	my $line6 = $t->_get_line;
180	693	100				782	last unless defined $line6;
181	691					692	push @perl6, $line6;
182							}
183	2					3	push @{ $t->{perl6} }, join '', @perl6;
	2					73
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					34	return 1;
189							}
190
191	44778					77134	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	609302			609302		609723	my $t = $_[1];
196	609302					832051	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	609302					623096	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	609302	100				1185263	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	469983	100				1210052	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	27179	100				69836	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	12236	100				30186	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215	12236	100				26400	return 'Structure' if $t->_current_token_has_signatures_active;
216
217	8418					16581	my @tokens = $t->_previous_significant_tokens(3);
218
219							# A normal subroutine declaration
220	8418					10827	my $p1 = $tokens[1];
221	8418					10920	my $p2 = $tokens[2];
222	8418	50	100			71804	if (
			100
			100
			100
			33
			100
223							$tokens[0]
224							and
225							$tokens[0]->isa('PPI::Token::Word')
226							and
227							$p1
228							and
229							$p1->isa('PPI::Token::Word')
230							and
231							$p1->content eq 'sub'
232							and (
233							not $p2
234							or
235							$p2->isa('PPI::Token::Structure')
236							or (
237							$p2->isa('PPI::Token::Whitespace')
238							and
239							$p2->content eq ''
240							)
241							or (
242							# Lexical subroutine
243							$p2->isa('PPI::Token::Word')
244							and
245							$p2->content =~ /^(?:my\|our\|state)$/
246							)
247							)
248							) {
249							# This is a sub prototype
250	314					1203	return 'Prototype';
251							}
252
253							# A prototyped anonymous subroutine
254	8104					11962	my $p0 = $tokens[0];
255	8104	100	100			36595	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
256							# Maybe it's invoking a method named 'sub'
257							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
258							) {
259	42					172	return 'Prototype';
260							}
261
262							# This is a normal open bracket
263	8062					28099	return 'Structure';
264
265							} elsif ( $char == 60 ) { # $char eq '<'
266							# Finalise any whitespace token...
267	2060	100				6349	$t->_finalize_token if $t->{token};
268
269							# This is either "less than" or "readline quote-like"
270							# Do some context stuff to guess which.
271	2060					4238	my $prev = $t->_last_significant_token;
272
273							# The most common group of less-thans are used like
274							# $foo < $bar
275							# 1 < $bar
276							# $#foo < $bar
277	2060	100	100			11585	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
278	1746	50	66			7801	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
279	1746	100	100			6613	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
280	1711	50	66			6717	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
281
282							# If it is <<... it's a here-doc instead
283	1711					3495	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
284	1711	100				6790	return 'Operator' if $next_char =~ /<[^>]/;
285
286	886	100				1885	return 'Operator' if not $prev;
287
288							# The most common group of readlines are used like
289							# while ( <...> )
290							# while <>;
291	799					1761	my $prec = $prev->content;
292	799	100	100			11802	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
293							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
294							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
295							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
297							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
298
299	693	100	100			2559	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
300							# Could go either way... do a regex check
301							# $foo->{bar} < 2;
302							# grep { .. } ;
303	20					75	pos $t->{line} = $t->{line_cursor};
304	20	100				113	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
305							# Almost definitely readline
306	3					9	return 'QuoteLike::Readline';
307							}
308							}
309
310							# Otherwise, we guess operator, which has been the default up
311							# until this more comprehensive section was created.
312	690					2414	return 'Operator';
313
314							} elsif ( $char == 47 ) { # $char eq '/'
315							# Finalise any whitespace token...
316	1073	100				3413	$t->_finalize_token if $t->{token};
317
318							# This is either a "divided by" or a "start regex"
319							# Do some context stuff to guess ( ack ) which.
320							# Hopefully the guess will be good enough.
321	1073					2653	my $prev = $t->_last_significant_token;
322
323							# Or as the very first thing in a file
324	1073	100				3084	return 'Regexp::Match' if not $prev;
325
326	976					2528	my $prec = $prev->content;
327
328							# Most times following an operator, we are a regex.
329							# This includes cases such as:
330							# , - As an argument in a list
331							# .. - The second condition in a flip flop
332							# =~ - A bound regex
333							# !~ - Ditto
334	976	100				4806	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
335
336							# After a symbol
337	509	100				2198	return 'Operator' if $prev->isa('PPI::Token::Symbol');
338	442	100	66			1344	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
339	14					35	return 'Operator';
340							}
341
342							# After another number
343	428	100				1558	return 'Operator' if $prev->isa('PPI::Token::Number');
344
345							# After going into scope/brackets
346	395	100	100			1947	if (
			100
347							$prev->isa('PPI::Token::Structure')
348							and (
349							$prec eq '('
350							or
351							$prec eq '{'
352							or
353							$prec eq ';'
354							)
355							) {
356	58					154	return 'Regexp::Match';
357							}
358
359							# Functions and keywords
360	337	100	66			1142	if (
361							$MATCHWORD{$prec}
362							and
363							$prev->isa('PPI::Token::Word')
364							) {
365	73					222	return 'Regexp::Match';
366							}
367
368							# What about the char after the slash? There's some things
369							# that would be highly illogical to see if it's an operator.
370	264					741	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
371	264	50	33			873	if ( defined $next_char and length $next_char ) {
372	264	100				1069	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
373	14					61	return 'Regexp::Match';
374							}
375							}
376
377							# Otherwise... erm... assume operator?
378							# Add more tests here as potential cases come to light
379	250					837	return 'Operator';
380
381							} elsif ( $char == 120 ) { # $char eq 'x'
382							# Could be a word, the x= operator, the x operator
383							# followed by whitespace, or the x operator without any
384							# space between itself and its operand, e.g.: '$a x3',
385							# which is the same as '$a x 3'. _current_x_is_operator
386							# assumes we have a complete 'x' token, but we don't
387							# yet. We may need to split this x character apart from
388							# what follows it.
389	855	100				2091	if ( $t->_current_x_is_operator ) {
390	227					694	pos $t->{line} = $t->{line_cursor} + 1;
391	227	100				1298	return 'Operator' if $t->{line} =~ m/\G(?:
392							\d # x op with no whitespace e.g. 'x3'
393							\|
394							(?!( # negative lookahead
395							=> # not on left of fat comma
396							\|
397							\w # not a word like "xyzzy"
398							\|
399							\s # not x op plus whitespace
400							))
401							)/gcx;
402							}
403
404							# Otherwise, commit like a normal bareword, including x
405							# operator followed by whitespace.
406	727					2277	return PPI::Token::Word->__TOKENIZER__commit($t);
407
408							} elsif ( $char == 45 ) { # $char eq '-'
409							# Look for an obvious operator operand context
410	10949					21854	my $context = $t->_opcontext;
411	10949	100				17939	if ( $context eq 'operator' ) {
412	8975					30466	return 'Operator';
413							} else {
414							# More logic needed
415	1974					6039	return 'Unknown';
416							}
417
418							} elsif ( $char >= 128 ) { # Outside ASCII
419	6	100				75	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
420	1	50				6	return 'Whitespace' if $c =~ /\s/;
421							}
422
423
424							# All the whitespaces are covered, so what to do
425							### For now, die
426	1					13	PPI::Exception->throw("Encountered unexpected character '$char'");
427							}
428
429							sub __TOKENIZER__on_line_end {
430	44406	100		44406		159075	$_[1]->_finalize_token if $_[1]->{token};
431							}
432
433							1;
434
435							=pod
436
437							=head1 SUPPORT
438
439							See the L in the main module.
440
441							=head1 AUTHOR
442
443							Adam Kennedy Eadamk@cpan.orgE
444
445							=head1 COPYRIGHT
446
447							Copyright 2001 - 2011 Adam Kennedy.
448
449							This program is free software; you can redistribute
450							it and/or modify it under the same terms as Perl itself.
451
452							The full text of the license can be found in the
453							LICENSE file included with this module.
454
455							=cut