line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package URI::Escape; |
2
|
|
|
|
|
|
|
|
3
|
42
|
|
|
42
|
|
76087
|
use strict; |
|
42
|
|
|
|
|
83
|
|
|
42
|
|
|
|
|
1172
|
|
4
|
42
|
|
|
42
|
|
192
|
use warnings; |
|
42
|
|
|
|
|
76
|
|
|
42
|
|
|
|
|
2040
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
URI::Escape - Percent-encode and percent-decode unsafe characters |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use URI::Escape; |
13
|
|
|
|
|
|
|
$safe = uri_escape("10% is enough\n"); |
14
|
|
|
|
|
|
|
$verysafe = uri_escape("foo", "\0-\377"); |
15
|
|
|
|
|
|
|
$str = uri_unescape($safe); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 DESCRIPTION |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
This module provides functions to percent-encode and percent-decode URI strings as |
20
|
|
|
|
|
|
|
defined by RFC 3986. Percent-encoding URI's is informally called "URI escaping". |
21
|
|
|
|
|
|
|
This is the terminology used by this module, which predates the formalization of the |
22
|
|
|
|
|
|
|
terms by the RFC by several years. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
A URI consists of a restricted set of characters. The restricted set |
25
|
|
|
|
|
|
|
of characters consists of digits, letters, and a few graphic symbols |
26
|
|
|
|
|
|
|
chosen from those common to most of the character encodings and input |
27
|
|
|
|
|
|
|
facilities available to Internet users. They are made up of the |
28
|
|
|
|
|
|
|
"unreserved" and "reserved" character sets as defined in RFC 3986. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
31
|
|
|
|
|
|
|
reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
32
|
|
|
|
|
|
|
"!" / "$" / "&" / "'" / "(" / ")" |
33
|
|
|
|
|
|
|
/ "*" / "+" / "," / ";" / "=" |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
In addition, any byte (octet) can be represented in a URI by an escape |
36
|
|
|
|
|
|
|
sequence: a triplet consisting of the character "%" followed by two |
37
|
|
|
|
|
|
|
hexadecimal digits. A byte can also be represented directly by a |
38
|
|
|
|
|
|
|
character, using the US-ASCII character for that octet. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Some of the characters are I for use as delimiters or as |
41
|
|
|
|
|
|
|
part of certain URI components. These must be escaped if they are to |
42
|
|
|
|
|
|
|
be treated as ordinary data. Read RFC 3986 for further details. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The functions provided (and exported by default) from this module are: |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over 4 |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item uri_escape( $string ) |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=item uri_escape( $string, $unsafe ) |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Replaces each unsafe character in the $string with the corresponding |
53
|
|
|
|
|
|
|
escape sequence and returns the result. The $string argument should |
54
|
|
|
|
|
|
|
be a string of bytes. The uri_escape() function will croak if given a |
55
|
|
|
|
|
|
|
characters with code above 255. Use uri_escape_utf8() if you know you |
56
|
|
|
|
|
|
|
have such chars or/and want chars in the 128 .. 255 range treated as |
57
|
|
|
|
|
|
|
UTF-8. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The uri_escape() function takes an optional second argument that |
60
|
|
|
|
|
|
|
overrides the set of characters that are to be escaped. The set is |
61
|
|
|
|
|
|
|
specified as a string that can be used in a regular expression |
62
|
|
|
|
|
|
|
character class (between [ ]). E.g.: |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
"\x00-\x1f\x7f-\xff" # all control and hi-bit characters |
65
|
|
|
|
|
|
|
"a-z" # all lower case characters |
66
|
|
|
|
|
|
|
"^A-Za-z" # everything not a letter |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The default set of characters to be escaped is all those which are |
69
|
|
|
|
|
|
|
I part of the C character class shown above as well |
70
|
|
|
|
|
|
|
as the reserved characters. I.e. the default is: |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
"^A-Za-z0-9\-\._~" |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
The second argument can also be specified as a regular expression object: |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
qr/[^A-Za-z]/ |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
Any strings matched by this regular expression will have all of their |
79
|
|
|
|
|
|
|
characters escaped. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item uri_escape_utf8( $string ) |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=item uri_escape_utf8( $string, $unsafe ) |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
Works like uri_escape(), but will encode chars as UTF-8 before |
86
|
|
|
|
|
|
|
escaping them. This makes this function able to deal with characters |
87
|
|
|
|
|
|
|
with code above 255 in $string. Note that chars in the 128 .. 255 |
88
|
|
|
|
|
|
|
range will be escaped differently by this function compared to what |
89
|
|
|
|
|
|
|
uri_escape() would. For chars in the 0 .. 127 range there is no |
90
|
|
|
|
|
|
|
difference. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Equivalent to: |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
utf8::encode($string); |
95
|
|
|
|
|
|
|
my $uri = uri_escape($string); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Note: JavaScript has a function called escape() that produces the |
98
|
|
|
|
|
|
|
sequence "%uXXXX" for chars in the 256 .. 65535 range. This function |
99
|
|
|
|
|
|
|
has really nothing to do with URI escaping but some folks got confused |
100
|
|
|
|
|
|
|
since it "does the right thing" in the 0 .. 255 range. Because of |
101
|
|
|
|
|
|
|
this you sometimes see "URIs" with these kind of escapes. The |
102
|
|
|
|
|
|
|
JavaScript encodeURIComponent() function is similar to uri_escape_utf8(). |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item uri_unescape($string,...) |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Returns a string with each %XX sequence replaced with the actual byte |
107
|
|
|
|
|
|
|
(octet). |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
This does the same as: |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
$string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
but does not modify the string in-place as this RE would. Using the |
114
|
|
|
|
|
|
|
uri_unescape() function instead of the RE might make the code look |
115
|
|
|
|
|
|
|
cleaner and is a few characters less to type. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
In a simple benchmark test I did, |
118
|
|
|
|
|
|
|
calling the function (instead of the inline RE above) if a few chars |
119
|
|
|
|
|
|
|
were unescaped was something like 40% slower, and something like 700% slower if none were. If |
120
|
|
|
|
|
|
|
you are going to unescape a lot of times it might be a good idea to |
121
|
|
|
|
|
|
|
inline the RE. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
If the uri_unescape() function is passed multiple strings, then each |
124
|
|
|
|
|
|
|
one is returned unescaped. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=back |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
The module can also export the C<%escapes> hash, which contains the |
129
|
|
|
|
|
|
|
mapping from all 256 bytes to the corresponding escape codes. Lookup |
130
|
|
|
|
|
|
|
in this hash is faster than evaluating C |
131
|
|
|
|
|
|
|
each time. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head1 SEE ALSO |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
L |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 COPYRIGHT |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Copyright 1995-2004 Gisle Aas. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
143
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
146
|
|
|
|
|
|
|
|
147
|
42
|
|
|
42
|
|
226
|
use Exporter 5.57 'import'; |
|
42
|
|
|
|
|
1000
|
|
|
42
|
|
|
|
|
3637
|
|
148
|
|
|
|
|
|
|
our %escapes; |
149
|
|
|
|
|
|
|
our @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8); |
150
|
|
|
|
|
|
|
our @EXPORT_OK = qw(%escapes); |
151
|
|
|
|
|
|
|
our $VERSION = '5.19'; |
152
|
|
|
|
|
|
|
|
153
|
42
|
|
|
42
|
|
328
|
use Carp (); |
|
42
|
|
|
|
|
117
|
|
|
42
|
|
|
|
|
15009
|
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Build a char->hex map |
156
|
|
|
|
|
|
|
for (0..255) { |
157
|
|
|
|
|
|
|
$escapes{chr($_)} = sprintf("%%%02X", $_); |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
my %subst; # compiled patterns |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
my %Unsafe = ( |
163
|
|
|
|
|
|
|
RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/, |
164
|
|
|
|
|
|
|
RFC3986 => qr/[^A-Za-z0-9\-\._~]/, |
165
|
|
|
|
|
|
|
); |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub uri_escape { |
168
|
29
|
|
|
29
|
1
|
17431
|
my($text, $patn) = @_; |
169
|
29
|
100
|
|
|
|
146
|
return undef unless defined $text; |
170
|
28
|
|
|
|
|
47
|
my $re; |
171
|
28
|
100
|
|
|
|
76
|
if (defined $patn){ |
172
|
20
|
100
|
|
|
|
158
|
if (ref $patn eq 'Regexp') { |
173
|
2
|
|
|
|
|
38
|
$text =~ s{($patn)}{ |
174
|
4
|
|
33
|
|
|
33
|
join('', map +($escapes{$_} || _fail_hi($_)), split //, "$1") |
175
|
|
|
|
|
|
|
}ge; |
176
|
2
|
|
|
|
|
13
|
return $text; |
177
|
|
|
|
|
|
|
} |
178
|
18
|
|
|
|
|
51
|
$re = $subst{$patn}; |
179
|
18
|
100
|
|
|
|
100
|
if (!defined $re) { |
180
|
15
|
|
|
|
|
41
|
$re = $patn; |
181
|
|
|
|
|
|
|
# we need to escape the [] characters, except for those used in |
182
|
|
|
|
|
|
|
# posix classes. if they are prefixed by a backslash, allow them |
183
|
|
|
|
|
|
|
# through unmodified. |
184
|
15
|
|
|
|
|
121
|
$re =~ s{(\[:\w+:\])|(\\)?([\[\]]|\\\z)}{ |
185
|
13
|
50
|
|
|
|
83
|
defined $1 ? $1 : defined $2 ? "$2$3" : "\\$3" |
|
|
100
|
|
|
|
|
|
186
|
|
|
|
|
|
|
}ge; |
187
|
15
|
100
|
|
|
|
39
|
eval { |
188
|
|
|
|
|
|
|
# disable the warnings here, since they will trigger later |
189
|
|
|
|
|
|
|
# when used, and we only want them to appear once per call, |
190
|
|
|
|
|
|
|
# but every time the same pattern is used. |
191
|
42
|
|
|
42
|
|
322
|
no warnings 'regexp'; |
|
42
|
|
|
|
|
89
|
|
|
42
|
|
|
|
|
22682
|
|
192
|
15
|
|
|
|
|
569
|
$re = $subst{$patn} = qr{[$re]}; |
193
|
14
|
|
|
|
|
79
|
1; |
194
|
|
|
|
|
|
|
} or Carp::croak("uri_escape: $@"); |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
else { |
198
|
8
|
|
|
|
|
20
|
$re = $Unsafe{RFC3986}; |
199
|
|
|
|
|
|
|
} |
200
|
25
|
100
|
|
|
|
542
|
$text =~ s/($re)/$escapes{$1} || _fail_hi($1)/ge; |
|
257
|
|
|
|
|
914
|
|
201
|
24
|
|
|
|
|
201
|
$text; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub _fail_hi { |
205
|
1
|
|
|
1
|
|
4
|
my $chr = shift; |
206
|
1
|
|
|
|
|
129
|
Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr)); |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub uri_escape_utf8 { |
210
|
2
|
|
|
2
|
1
|
7
|
my $text = shift; |
211
|
2
|
50
|
|
|
|
6
|
return undef unless defined $text; |
212
|
2
|
|
|
|
|
9
|
utf8::encode($text); |
213
|
2
|
|
|
|
|
5
|
return uri_escape($text, @_); |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub uri_unescape { |
217
|
|
|
|
|
|
|
# Note from RFC1630: "Sequences which start with a percent sign |
218
|
|
|
|
|
|
|
# but are not followed by two hexadecimal characters are reserved |
219
|
|
|
|
|
|
|
# for future extension" |
220
|
1069
|
|
|
1069
|
1
|
1649
|
my $str = shift; |
221
|
1069
|
100
|
66
|
|
|
2366
|
if (@_ && wantarray) { |
222
|
|
|
|
|
|
|
# not executed for the common case of a single argument |
223
|
1
|
|
|
|
|
3
|
my @str = ($str, @_); # need to copy |
224
|
1
|
|
|
|
|
3
|
for (@str) { |
225
|
3
|
|
|
|
|
12
|
s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
3
|
|
|
|
|
12
|
|
226
|
|
|
|
|
|
|
} |
227
|
1
|
|
|
|
|
11
|
return @str; |
228
|
|
|
|
|
|
|
} |
229
|
1068
|
100
|
|
|
|
2494
|
$str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str; |
|
891
|
|
|
|
|
2280
|
|
230
|
1068
|
|
|
|
|
3194
|
$str; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# XXX FIXME escape_char is buggy as it assigns meaning to the string's storage format. |
234
|
|
|
|
|
|
|
sub escape_char { |
235
|
|
|
|
|
|
|
# Old versions of utf8::is_utf8() didn't properly handle magical vars (e.g. $1). |
236
|
|
|
|
|
|
|
# The following forces a fetch to occur beforehand. |
237
|
601
|
|
|
601
|
0
|
1094
|
my $dummy = substr($_[0], 0, 0); |
238
|
|
|
|
|
|
|
|
239
|
601
|
100
|
|
|
|
1406
|
if (utf8::is_utf8($_[0])) { |
240
|
134
|
|
|
|
|
182
|
my $s = shift; |
241
|
134
|
|
|
|
|
230
|
utf8::encode($s); |
242
|
134
|
|
|
|
|
202
|
unshift(@_, $s); |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
601
|
|
|
|
|
2671
|
return join '', @URI::Escape::escapes{split //, $_[0]}; |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
1; |