File Coverage

blib/lib/Text/Shellwords/Cursor.pm

Criterion	Covered	Total	%
statement	111	171	64.9
branch	57	114	50.0
condition	32	54	59.2
subroutine	6	8	75.0
pod	6	6	100.0
total	212	353	60.0

line	stmt	bran	cond	sub	pod	time	code
1							# Text::Shellwords::Cursor.pm
2							# Scott Bronson
3							# 27 Jan 2003
4							# Covered by the MIT license.
5
6							package Text::Shellwords::Cursor;
7
8	1			1		27909	use strict;
	1					2
	1					47
9
10	1			1		6	use vars qw($VERSION);
	1					2
	1					2666
11							$VERSION = '0.81';
12
13							=head1 NAME
14
15							Text::Shellwords::Cursor - Parse a string into tokens
16
17							=head1 SYNOPSIS
18
19							use Text::Shellwords::Cursor;
20							my $parser = Text::Shellwords::Cursor->new();
21							my $str = 'ab cdef "ghi" j"k\"l "';
22							my ($tok1) = $parser->parse_line($str);
23							$tok1 = ['ab', 'cdef', 'ghi', 'j', 'k"l ']
24							my ($tok2, $tokno, $tokoff) = $parser->parse_line($str, cursorpos => 6);
25							as above, but $tokno=1, $tokoff=3 (under the 'f')
26
27							DESCRIPTION
28
29							This module is very similar to Text::Shellwords and Text::ParseWords.
30							However, it has one very significant difference: it keeps track of
31							a character position in the line it's parsing. For instance, if you
32							pass it ("zq fmgb", cursorpos=>6), it would return
33							(['zq', 'fmgb'], 1, 3). The cursorpos parameter
34							tells where in the input string the cursor resides
35							(just before the 'b'), and the result tells you that
36							the cursor was on token 1 ('fmgb'), character 3 ('b').
37							This is very useful when computing command-line completions
38							involving quoting, escaping, and tokenizing characters (like '(' or '=').
39
40							A few helper utilities are included as well. You can escape a string to
41							ensure that parsing it will produce the original string (L).
42							You can also reassemble the tokens with a visually pleasing amount of
43							whitespace between them (L).
44
45							This module started out as an integral part of Term::GDBUI using
46							code loosely based on Text::ParseWords. However,
47							it is now basically a ground-up reimplementation. It was
48							split out of Term::GDBUI for version 0.8.
49
50							=head1 METHODS
51
52							=over 3
53
54							=item new
55
56							Creates a new parser. Takes named arguments on the command line.
57
58							=over 4
59
60							=item keep_quotes
61
62							Normally all unescaped, unnecessary quote marks are stripped.
63							If you specify C1>, however, they are preserved.
64							This is useful if you need to know whether the string was quoted
65							or not (string constants) or what type of quotes was around it
66							(affecting variable interpolation, for instance).
67
68							=item token_chars
69
70							This argument specifies the characters that should be considered
71							tokens all by themselves. For instance, if I pass
72							token_chars=>'=', then 'ab=123' would be parsed to ('ab', '=', '123').
73							Without token_chars, 'ab=123' remains a single string.
74
75							NOTE: you cannot change token_chars after the constructor has been
76							called! The regexps that use it are compiled once (m//o).
77							Also, until the Gnu Readline library can accept "=[]," without
78							diving into an endless loop, we will not tell history expansion
79							to use token_chars (it uses " \t\n()<>;&\|" by default).
80
81							=item debug
82
83							Turns on rather copious debugging to try to show what the parser is
84							thinking at every step.
85
86							=item space_none
87
88							=item space_before
89
90							=item space_after
91
92							These variables affect how whitespace in the line is normalized and
93							it is reassembled into a string. See the L routine.
94
95							=item error
96
97							This is a reference to a routine that should be called to display
98							a parse error. The routine takes two arguments: a reference to the
99							parser, and the error message to display as a string.
100
101							=cut
102
103							sub new
104							{
105	1			1	1	12	my $type = shift;
106	1					16	bless {
107							keep_quotes => 0,
108							token_chars => '',
109							debug => 0,
110							space_none => '(',
111							space_before => '[{',
112							space_after => ',)]}',
113							error => undef,
114							@_
115							}, $type
116							}
117
118							=item parsebail(msg)
119
120							If the parsel routine or any of its subroutines runs into a fatal
121							error, they call parsebail to present a very descriptive diagnostic.
122
123							=cut
124
125							sub parsebail
126							{
127	2			2	1	5	my $self = shift;
128	2					5	my $msg = shift;
129
130	2					38	die "$msg at char " . pos() . ":\n",
131							" $_\n " . (' ' x pos()) . '^' . "\n";
132
133							}
134
135
136							=item parsel
137
138							This is the heinous routine that actually does the parsing.
139							You should never need to call it directly. Call
140							L
141							instead.
142
143							=cut
144
145							sub parsel
146							{
147	65			65	1	83	my $self = shift;
148	65					142	$_ = shift;
149	65					94	my $cursorpos = shift;
150	65					82	my $fixclosequote = shift;
151
152	65					112	my $deb = $self->{debug};
153	65					98	my $tchrs = $self->{token_chars};
154
155	65		66			305	my $usingcp = (defined($cursorpos) && $cursorpos ne '');
156	65					77	my $tokno = undef;
157	65					63	my $tokoff = undef;
158	65					64	my $oldpos;
159
160	65					95	my @pieces = ();
161
162							# Need to special case the empty string. None of the patterns below
163							# will match it yet we need to return an empty token for the cursor.
164	65	100	100			299	return ([''], 0, 0) if $usingcp && $_ eq '';
165
166	64					155	/^/gc; # force scanning to the beginning of the line
167
168	64					79	do {
169	100	0				5023	$deb && print "-- top, pos=" . pos() .
		50
170							($usingcp ? " cursorpos=$cursorpos" : "") . "\n";
171
172							# trim whitespace from the beginning
173	100	100				344	if(/\G(\s+)/gc) {
174	54	0				119	$deb && print "trimmed " . length($1) . " whitespace chars, " .
		50
175							($usingcp ? "cursorpos=$cursorpos" : "") . "\n";
176							# if pos passed cursorpos, then we know that the cursor was
177							# surrounded by ws and we need to create an empty token for it.
178	54	100	100			210	if($usingcp && (pos() >= $cursorpos)) {
179							# if pos == cursorpos and we're not yet at EOL, let next token accept cursor
180	12	100	100			57	unless(pos() == $cursorpos && pos() < length($_)) {
181							# need to special-case at end-of-line as there are no more tokens
182							# to take care of the cursor so we must create an empty one.
183	8	50				20	$deb && print "adding bogus token to handle cursor.\n";
184	8					15	push @pieces, '';
185	8					15	$tokno = $#pieces;
186	8					8	$tokoff = 0;
187	8					15	$usingcp = 0;
188							}
189							}
190							}
191
192							# if there's a quote, then suck to the close quote
193	100					137	$oldpos = pos();
194	100	100				300	if(/\G(['"])/gc) {
195	40					91	my $quote = $1;
196	40					49	my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
197	40					106	my $s;
198
199	40	50				91	$deb && print "Found open quote [$quote] oldpos=$oldpos\n";
200
201							# adjust tokoff unless the cursor sits directly on the open quote
202	40	100	100			196	if($usingcp && pos()-1 < $cursorpos) {
203	31	50				63	$deb && print " lead quote increment pos=".pos()." cursorpos=$cursorpos\n";
204	31					51	$adjust += 1;
205							}
206
207	40	100				87	if($quote eq '"') {
208	20	100				105	if(/\G((?:\\.\|(?!["])[^\\])*)["]/gc) {
209	19					38	$s = $1; # string without quotes
210							} else {
211	1	50				5	unless($fixclosequote) {
212	1					4	pos() -= 1;
213	1					9	$self->parsebail("need closing quote [\"]");
214							}
215	0					0	/\G(.*)$/gc; # if no close quote, just suck to the end of the string
216	0					0	$s = $1; # string without quotes
217	0	0	0			0	if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
	0					0
218							}
219	19	50				71	$deb && print " quoted string is \"$s\"\n";
220	19					68	while($s =~ /\\./g) {
221	14					26	my $ps = pos($s) - 2; # points to the start of the sub
222	14	50				31	$deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
223	14	100	66			64	$adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
224	14					26	substr($s, $ps, 1) = '';
225	14					32	pos($s) = $ps + 1;
226	14	50				66	$deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
227							}
228							} else {
229	20	100				111	if(/\G((?:\\.\|(?!['])[^\\])*)[']/gc) {
230	19					40	$s = $1; # string without quotes
231							} else {
232	1	50				5	unless($fixclosequote) {
233	1					4	pos() -= 1;
234	1					10	$self->parsebail("need closing quote [']");
235							}
236	0					0	/\G(.*)$/gc; # if no close quote, just suck to the end of the string
237	0					0	$s = $1;
238	0	0	0			0	if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
	0					0
239							}
240	19	50				40	$deb && print " quoted string is '$s'\n";
241	19					75	while($s =~ /\\[\\']/g) {
242	12					20	my $ps = pos($s) - 2; # points to the start of the sub
243	12	50				26	$deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
244	12	100	66			63	$adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
245	12					25	substr($s, $ps, 1) = '';
246	12					483	pos($s) = $ps + 1;
247	12	50				176	$deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
248							}
249							}
250
251							# adjust tokoff if the cursor if it sits directly on the close quote
252	38	100	100			185	if($usingcp && pos() == $cursorpos) {
253	8	50				23	$deb && print " trail quote increment pos=".pos()." cursorpos=$cursorpos\n";
254	8					12	$adjust += 1;
255							}
256
257	38	50				92	$deb && print " Found close, pushing '$s' oldpos=$oldpos\n";
258	38	50				94	if($self->{keep_quotes}) {
259	0					0	$adjust -= 1; # need to move right 1 for opening quote
260	0					0	$s = $quote.$s.$quote;
261							}
262	38					72	push @pieces, $s;
263
264							# Set tokno and tokoff if this token contained the cursor
265	38	100	100			165	if($usingcp && pos() >= $cursorpos) {
266							# Previous block contains the cursor
267	34					51	$tokno = $#pieces;
268	34					48	$tokoff = $cursorpos - $oldpos - $adjust;
269	34					66	$usingcp = 0;
270							}
271							}
272
273							# suck up as much unquoted text as we can
274	98					126	$oldpos = pos();
275	98	100				368	if(/\G((?:\\.\|[^\s\\"'\Q$tchrs\E])+)/gco) {
276	40					79	my $s = $1; # the unquoted string
277	40					45	my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
278
279	40	50				80	$deb && print "Found unquoted string '$s'\n";
280	40					119	while($s =~ /\\./g) {
281	9					15	my $ps = pos($s) - 2; # points to the start of substitution
282	9	50				18	$deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
283	9	100	100			43	$adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
284	9					17	substr($s, $ps, 1) = '';
285	9					18	pos($s) = $ps + 1;
286	9	50				41	$deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
287							}
288	40	50				82	$deb && print " pushing '$s'\n";
289	40					66	push @pieces, $s;
290
291							# Set tokno and tokoff if this token contained the cursor
292	40	100	100			153	if($usingcp && pos() >= $cursorpos) {
293							# Previous block contains the cursor
294	12					23	$tokno = $#pieces;
295	12					18	$tokoff = $cursorpos - $oldpos - $adjust;
296	12					24	$usingcp = 0;
297							}
298							}
299
300	98	50	33			443	if(length($tchrs) && /\G([\Q$tchrs\E])/gco) {
301	0					0	my $s = $1; # the token char
302	0	0				0	$deb && print " pushing '$s'\n";
303	0					0	push @pieces, $s;
304
305	0	0	0			0	if($usingcp && pos() == $cursorpos) {
306							# Previous block contains the cursor
307	0					0	$tokno = $#pieces;
308	0					0	$tokoff = 0;
309	0					0	$usingcp = 0;
310							}
311							}
312							} until(pos() >= length($_));
313
314	62	0				123	$deb && print "Result: (", join(", ", @pieces), ") " .
		0
		50
315							(defined($tokno) ? $tokno : 'undef') . " " .
316							(defined($tokoff) ? $tokoff : 'undef') . "\n";
317
318	62					336	return ([@pieces], $tokno, $tokoff);
319							}
320
321
322							=item parse_line(line, I)
323
324							This is the entrypoint to this module's parsing functionality. It converts
325							a line into tokens, respecting quoted text, escaped characters,
326							etc. It also keeps track of a cursor position on the input text,
327							returning the token number and offset within the token where that position
328							can be found in the output.
329
330							This routine originally bore some resemblance to Text::ParseWords.
331							It has changed almost completely, however, to support keeping track
332							of the cursor position. It also has nicer failure modes, modular
333							quoting, token characters (see token_chars in L), etc. This
334							routine now does much more.
335
336							Arguments:
337
338							=over 3
339
340							=item line
341
342							This is a string containing the command-line to parse.
343
344							=back
345
346							This routine also accepts the following named parameters:
347
348							=over 3
349
350							=item cursorpos
351
352							This is the character position in the line to keep track of.
353							Pass undef (by not specifying it) or the empty string to have
354							the line processed with cursorpos ignored.
355
356							Note that passing undef is I the same as passing
357							some random number and ignoring the result! For instance, if you
358							pass 0 and the line begins with whitespace, you'll get a 0-length token at
359							the beginning of the line to represent the cursor in
360							the middle of the whitespace. This allows command completion
361							to work even when the cursor is not near any tokens.
362							If you pass undef, all whitespace at the beginning and end of
363							the line will be trimmed as you would expect.
364
365							If it is ambiguous whether the cursor should belong to the previous
366							token or to the following one (i.e. if it's between two quoted
367							strings, say "a""b" or a token_char), it always gravitates to
368							the previous token. This makes more sense when completing.
369
370							=item fixclosequote
371
372							Sometimes you want to try to recover from a missing close quote
373							(for instance, when calculating completions), but usually you
374							want a missing close quote to be a fatal error. fixclosequote=>1
375							will implicitly insert the correct quote if it's missing.
376							fixclosequote=>0 is the default.
377
378							=item messages
379
380							parse_line is capable of printing very informative error messages.
381							However, sometimes you don't care enough to print a message (like
382							when calculating completions). Messages are printed by default,
383							so pass messages=>0 to turn them off.
384
385							=back
386
387							This function returns a reference to an array containing three
388							items:
389
390							=over 3
391
392							=item tokens
393
394							A the tokens that the line was separated into (ref to an array of strings).
395
396							=item tokno
397
398							The number of the token (index into the previous array) that contains
399							cursorpos.
400
401							=item tokoff
402
403							The character offet into tokno of cursorpos.
404
405							=back
406
407							If the cursor is at the end of the token, tokoff will point to 1
408							character past the last character in tokno, a non-existant character.
409							If the cursor is between tokens (surrounded by whitespace), a zero-length
410							token will be created for it.
411
412							=cut
413
414							sub parse_line
415							{
416	65			65	1	36007	my $self = shift;
417	65					100	my $line = shift;
418	65					320	my %args = (
419							messages => 1, # true if we should print errors, etc.
420							cursorpos => undef, # cursor to keep track of, undef to ignore.
421							fixclosequote => 0,
422							@_
423							);
424
425	65					95	my @result = eval { $self->parsel($line,
	65					194
426							$args{'cursorpos'}, $args{'fixclosequote'}) };
427	65	100				357	if($@) {
428	2	50	33			11	$self->{error}->($self, $@) if $args{'messages'} && $self->{error};
429	2					6	@result = (undef, undef, undef);
430							}
431
432	65					277	return @result;
433							}
434
435
436							=item parse_escape(lines)
437
438							Escapes characters that would be otherwise interpreted by the parser.
439							Will accept either a single string or an arrayref of strings (which
440							will be modified in-place).
441
442							=cut
443
444							sub parse_escape
445							{
446	0			0	1		my $self = shift;
447	0						my $arr = shift; # either a string or an arrayref of strings
448
449	0						my $wantstr = 0;
450	0	0					if(ref($arr) ne 'ARRAY') {
451	0						$arr = [$arr];
452	0						$wantstr = 1;
453							}
454
455	0						foreach(@$arr) {
456	0						my $quote;
457	0	0	0				if($self->{keep_quotes} && /^(['"])(.*)\1$/) {
458	0						($quote, $_) = ($1, $2);
459							}
460	0						s/([ \\"'])/\\$1/g;
461	0	0					$_ = $quote.$_.$quote if $quote;
462							}
463
464	0	0					return $wantstr ? $arr->[0] : $arr;
465							}
466
467
468							=item join_line(tokens)
469
470							This routine does a somewhat intelligent job of joining tokens
471							back into a command line. If token_chars (see L) is empty
472							(the default), then it just escapes backslashes and quotes, and
473							joins the tokens with spaces.
474
475							However, if token_chars is nonempty, it tries to insert a visually
476							pleasing amount of space between the tokens. For instance, rather
477							than 'a ( b , c )', it tries to produce 'a (b, c)'. It won't reformat
478							any tokens that aren't found in $self->{token_chars}, of course.
479
480							To change the formatting, you can redefine the variables
481							$self->{space_none}, $self->{space_before}, and $self->{space_after}.
482							Each variable is a string containing all characters that should
483							not be surrounded by whitespace, should have whitespace before,
484							and should have whitespace after, respectively. Any character
485							found in token_chars, but non in any of these space_ variables,
486							will have space placed both before and after.
487
488							=cut
489
490							sub join_line
491							{
492	0			0	1		my $self = shift;
493	0						my $intoks = shift;
494
495	0						my $tchrs = $self->{token_chars};
496	0						my $s_none = $self->{space_none};
497	0						my $s_before = $self->{space_before};
498	0						my $s_after = $self->{space_after};
499
500							# copy the input array so we don't modify it
501	0						my $tokens = $self->parse_escape([@$intoks]);
502
503	0						my $str = '';
504	0						my $sw = 0; # space if space wanted after token.
505	0						my $sf = 0; # space if space should be forced after token.
506	0						for(@$tokens) {
507	0	0	0				if(length == 1 && index($tchrs,$_) >= 0) {
508	0	0					if(index($s_none,$_) >= 0) { $str .= $_; $sw=0; next; }
	0
	0
	0
509	0	0					if(index($s_before,$_) >= 0) { $str .= $sw.$_; $sw=0; next; }
	0
	0
	0
510	0	0					if(index($s_after,$_) >= 0) { $str .= $_; $sw=1; next; }
	0
	0
	0
511							# default: force space on both sides of operator.
512	0						$str .= " $_ "; $sw = 0; next;
	0
	0
513							}
514	0	0					$str .= ($sw ? ' ' : '') . $_;
515	0						$sw = 1;
516							}
517
518	0						return $str;
519							}
520
521
522							=back
523
524							=back
525
526							=head1 BUGS
527
528							None known.
529
530							=head1 LICENSE
531
532							Copyright (c) 2003-2011 Scott Bronson, all rights reserved.
533							This program is covered by the MIT license.
534
535							=head1 AUTHOR
536
537							Scott Bronson Ebronson@rinspin.comE
538
539							=cut
540
541							1;