File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	96	98	97.9
branch	81	88	92.0
condition	77	90	85.5
subroutine	7	8	87.5
pod	2	2	100.0
total	263	286	91.9

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	67			67		143315	use strict;
	67					190
	67					2766
45	67			67		757	use Clone ();
	67					612
	67					1340
46	67			67		678	use PPI::Token ();
	67					142
	67					161814
47
48							our $VERSION = '1.28401'; # TRIAL
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	18	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					14	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	52860			52860		101209	my $t = $_[1];
147	52860					120917	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	52860	100				448618	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	6867					28759	$t->_new_token( 'Whitespace', $line );
153	6867					18699	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4341					16643	$t->_new_token( 'Comment', $line );
158	4341					13018	$t->_finalize_token;
159	4341					10549	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	762					3294	$t->_new_token( 'Pod', $line );
164	762	50				4851	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	762					1990	$t->{class} = 'PPI::Token::Pod';
170							}
171	762					2009	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					5	my @perl6;
178	2					4	while ( 1 ) {
179	693					1305	my $line6 = $t->_get_line;
180	693	100				1261	last unless defined $line6;
181	691					1082	push @perl6, $line6;
182							}
183	2					5	push @{ $t->{perl6} }, join '', @perl6;
	2					157
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					60	return 1;
189							}
190
191	40888					123497	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	396889			396889		646222	my $t = $_[1];
196	396889					924553	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	396889					662723	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	396889	100				1466752	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	276421	100				1321305	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	19674	100				102161	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	8430	100				34527	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215	8430					30612	my $has_sig = $t->_current_token_has_signatures_active;
216	8430	100				21828	return 'Structure' if $has_sig;
217
218	8394					22413	my @tokens = $t->_previous_significant_tokens(3);
219
220							# A normal subroutine declaration
221	8394					17245	my $p1 = $tokens[1];
222	8394					16338	my $p2 = $tokens[2];
223	8394	50	100			120706	if (
			100
			100
			100
			33
			100
224							$tokens[0]
225							and
226							$tokens[0]->isa('PPI::Token::Word')
227							and
228							$p1
229							and
230							$p1->isa('PPI::Token::Word')
231							and
232							$p1->content eq 'sub'
233							and (
234							not $p2
235							or
236							$p2->isa('PPI::Token::Structure')
237							or (
238							$p2->isa('PPI::Token::Whitespace')
239							and
240							$p2->content eq ''
241							)
242							or (
243							# Lexical subroutine
244							$p2->isa('PPI::Token::Word')
245							and
246							$p2->content =~ /^(?:my\|our\|state)$/
247							)
248							)
249							) {
250							# This is a sub prototype
251	314					2157	return 'Prototype';
252							}
253
254							# A prototyped anonymous subroutine
255	8080					17065	my $p0 = $tokens[0];
256	8080	100	100			59681	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
257							# Maybe it's invoking a method named 'sub'
258							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
259							) {
260	42					320	return 'Prototype';
261							}
262
263							# This is a normal open bracket
264	8038					48226	return 'Structure';
265
266							} elsif ( $char == 60 ) { # $char eq '<'
267							# Finalise any whitespace token...
268	2052	100				11288	$t->_finalize_token if $t->{token};
269
270							# This is either "less than" or "readline quote-like"
271							# Do some context stuff to guess which.
272	2052					8288	my $prev = $t->_last_significant_token;
273
274							# The most common group of less-thans are used like
275							# $foo < $bar
276							# 1 < $bar
277							# $#foo < $bar
278	2052	100	100			21110	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
279	1729	50	66			14855	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
280	1729	100	100			14528	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
281	1697	50	66			12476	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
282
283							# If it is <<... it's a here-doc instead
284	1697					6849	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
285	1697	100				12128	return 'Operator' if $next_char =~ /<[^>]/;
286
287	873	100				3047	return 'Operator' if not $prev;
288
289							# The most common group of readlines are used like
290							# while ( <...> )
291							# while <>;
292	792					2813	my $prec = $prev->content;
293	792	100	100			22358	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
294							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
295							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
296							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
297							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
298							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
299
300	696	100	100			3958	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
301							# Could go either way... do a regex check
302							# $foo->{bar} < 2;
303							# grep { .. } ;
304	17					64	pos $t->{line} = $t->{line_cursor};
305	17	100				105	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
306							# Almost definitely readline
307	3					12	return 'QuoteLike::Readline';
308							}
309							}
310
311							# Otherwise, we guess operator, which has been the default up
312							# until this more comprehensive section was created.
313	693					3516	return 'Operator';
314
315							} elsif ( $char == 47 ) { # $char eq '/'
316							# Finalise any whitespace token...
317	1038	100				5336	$t->_finalize_token if $t->{token};
318
319							# This is either a "divided by" or a "start regex"
320							# Do some context stuff to guess ( ack ) which.
321							# Hopefully the guess will be good enough.
322	1038					4331	my $prev = $t->_last_significant_token;
323
324							# Or as the very first thing in a file
325	1038	100				4344	return 'Regexp::Match' if not $prev;
326
327	938					3353	my $prec = $prev->content;
328
329							# Most times following an operator, we are a regex.
330							# This includes cases such as:
331							# , - As an argument in a list
332							# .. - The second condition in a flip flop
333							# =~ - A bound regex
334							# !~ - Ditto
335	938	100				10880	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
336
337							# After a symbol
338	499	100				3028	return 'Operator' if $prev->isa('PPI::Token::Symbol');
339	441	100	66			2120	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
340	14					55	return 'Operator';
341							}
342
343							# After another number
344	427	100				2384	return 'Operator' if $prev->isa('PPI::Token::Number');
345
346							# After going into scope/brackets
347	391	100	100			2577	if (
			100
348							$prev->isa('PPI::Token::Structure')
349							and (
350							$prec eq '('
351							or
352							$prec eq '{'
353							or
354							$prec eq ';'
355							)
356							) {
357	54					268	return 'Regexp::Match';
358							}
359
360							# Functions and keywords
361	337	100	66			1653	if (
362							$MATCHWORD{$prec}
363							and
364							$prev->isa('PPI::Token::Word')
365							) {
366	73					371	return 'Regexp::Match';
367							}
368
369							# What about the char after the slash? There's some things
370							# that would be highly illogical to see if it's an operator.
371	264					969	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
372	264	50	33			1397	if ( defined $next_char and length $next_char ) {
373	264	100				1673	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
374	16					89	return 'Regexp::Match';
375							}
376							}
377
378							# Otherwise... erm... assume operator?
379							# Add more tests here as potential cases come to light
380	248					1290	return 'Operator';
381
382							} elsif ( $char == 120 ) { # $char eq 'x'
383							# Could be a word, the x= operator, the x operator
384							# followed by whitespace, or the x operator without any
385							# space between itself and its operand, e.g.: '$a x3',
386							# which is the same as '$a x 3'. _current_x_is_operator
387							# assumes we have a complete 'x' token, but we don't
388							# yet. We may need to split this x character apart from
389							# what follows it.
390	843	100				3719	if ( $t->_current_x_is_operator ) {
391	222					1041	pos $t->{line} = $t->{line_cursor} + 1;
392	222	100				2107	return 'Operator' if $t->{line} =~ m/\G(?:
393							\d # x op with no whitespace e.g. 'x3'
394							\|
395							(?!( # negative lookahead
396							=> # not on left of fat comma
397							\|
398							\w # not a word like "xyzzy"
399							\|
400							\s # not x op plus whitespace
401							))
402							)/gcx;
403							}
404
405							# Otherwise, commit like a normal bareword, including x
406							# operator followed by whitespace.
407	715					3686	return PPI::Token::Word->__TOKENIZER__commit($t);
408
409							} elsif ( $char == 45 ) { # $char eq '-'
410							# Look for an obvious operator operand context
411	7305					26841	my $context = $t->_opcontext;
412	7305	100				19115	if ( $context eq 'operator' ) {
413	5334					31909	return 'Operator';
414							} else {
415							# More logic needed
416	1971					9459	return 'Unknown';
417							}
418
419							} elsif ( $char >= 128 ) { # Outside ASCII
420	6	100				78	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
421	1	50				6	return 'Whitespace' if $c =~ /\s/;
422							}
423
424
425							# All the whitespaces are covered, so what to do
426							### For now, die
427	1					111	PPI::Exception->throw("Encountered unexpected character '$char'");
428							}
429
430							sub __TOKENIZER__on_line_end {
431	40519	100		40519		276730	$_[1]->_finalize_token if $_[1]->{token};
432							}
433
434							1;
435
436							=pod
437
438							=head1 SUPPORT
439
440							See the L in the main module.
441
442							=head1 AUTHOR
443
444							Adam Kennedy Eadamk@cpan.orgE
445
446							=head1 COPYRIGHT
447
448							Copyright 2001 - 2011 Adam Kennedy.
449
450							This program is free software; you can redistribute
451							it and/or modify it under the same terms as Perl itself.
452
453							The full text of the license can be found in the
454							LICENSE file included with this module.
455
456							=cut