| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package URI::Escape; |
|
2
|
|
|
|
|
|
|
|
|
3
|
43
|
|
|
43
|
|
77542
|
use strict; |
|
|
43
|
|
|
|
|
75
|
|
|
|
43
|
|
|
|
|
1242
|
|
|
4
|
43
|
|
|
43
|
|
212
|
use warnings; |
|
|
43
|
|
|
|
|
110
|
|
|
|
43
|
|
|
|
|
2115
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
URI::Escape - Percent-encode and percent-decode unsafe characters |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use URI::Escape; |
|
13
|
|
|
|
|
|
|
$safe = uri_escape("10% is enough\n"); |
|
14
|
|
|
|
|
|
|
$verysafe = uri_escape("foo", "\0-\377"); |
|
15
|
|
|
|
|
|
|
$str = uri_unescape($safe); |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
This module provides functions to percent-encode and percent-decode URI strings as |
|
20
|
|
|
|
|
|
|
defined by RFC 3986. Percent-encoding URI's is informally called "URI escaping". |
|
21
|
|
|
|
|
|
|
This is the terminology used by this module, which predates the formalization of the |
|
22
|
|
|
|
|
|
|
terms by the RFC by several years. |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
A URI consists of a restricted set of characters. The restricted set |
|
25
|
|
|
|
|
|
|
of characters consists of digits, letters, and a few graphic symbols |
|
26
|
|
|
|
|
|
|
chosen from those common to most of the character encodings and input |
|
27
|
|
|
|
|
|
|
facilities available to Internet users. They are made up of the |
|
28
|
|
|
|
|
|
|
"unreserved" and "reserved" character sets as defined in RFC 3986. |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
|
31
|
|
|
|
|
|
|
reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
|
32
|
|
|
|
|
|
|
"!" / "$" / "&" / "'" / "(" / ")" |
|
33
|
|
|
|
|
|
|
/ "*" / "+" / "," / ";" / "=" |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
In addition, any byte (octet) can be represented in a URI by an escape |
|
36
|
|
|
|
|
|
|
sequence: a triplet consisting of the character "%" followed by two |
|
37
|
|
|
|
|
|
|
hexadecimal digits. A byte can also be represented directly by a |
|
38
|
|
|
|
|
|
|
character, using the US-ASCII character for that octet. |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Some of the characters are I for use as delimiters or as |
|
41
|
|
|
|
|
|
|
part of certain URI components. These must be escaped if they are to |
|
42
|
|
|
|
|
|
|
be treated as ordinary data. Read RFC 3986 for further details. |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The functions provided (and exported by default) from this module are: |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over 4 |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item uri_escape( $string ) |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=item uri_escape( $string, $unsafe ) |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Replaces each unsafe character in the $string with the corresponding |
|
53
|
|
|
|
|
|
|
escape sequence and returns the result. The $string argument should |
|
54
|
|
|
|
|
|
|
be a string of bytes. The uri_escape() function will croak if given a |
|
55
|
|
|
|
|
|
|
characters with code above 255. Use uri_escape_utf8() if you know you |
|
56
|
|
|
|
|
|
|
have such chars or/and want chars in the 128 .. 255 range treated as |
|
57
|
|
|
|
|
|
|
UTF-8. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The uri_escape() function takes an optional second argument that |
|
60
|
|
|
|
|
|
|
overrides the set of characters that are to be escaped. The set is |
|
61
|
|
|
|
|
|
|
specified as a string that can be used in a regular expression |
|
62
|
|
|
|
|
|
|
character class (between [ ]). E.g.: |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
"\x00-\x1f\x7f-\xff" # all control and hi-bit characters |
|
65
|
|
|
|
|
|
|
"a-z" # all lower case characters |
|
66
|
|
|
|
|
|
|
"^A-Za-z" # everything not a letter |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The default set of characters to be escaped is all those which are |
|
69
|
|
|
|
|
|
|
I part of the C character class shown above as well |
|
70
|
|
|
|
|
|
|
as the reserved characters. I.e. the default is: |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
"^A-Za-z0-9\-\._~" |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
The second argument can also be specified as a regular expression object: |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
qr/[^A-Za-z]/ |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
Any strings matched by this regular expression will have all of their |
|
79
|
|
|
|
|
|
|
characters escaped. |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item uri_escape_utf8( $string ) |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=item uri_escape_utf8( $string, $unsafe ) |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
Works like uri_escape(), but will encode chars as UTF-8 before |
|
86
|
|
|
|
|
|
|
escaping them. This makes this function able to deal with characters |
|
87
|
|
|
|
|
|
|
with code above 255 in $string. Note that chars in the 128 .. 255 |
|
88
|
|
|
|
|
|
|
range will be escaped differently by this function compared to what |
|
89
|
|
|
|
|
|
|
uri_escape() would. For chars in the 0 .. 127 range there is no |
|
90
|
|
|
|
|
|
|
difference. |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Equivalent to: |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
utf8::encode($string); |
|
95
|
|
|
|
|
|
|
my $uri = uri_escape($string); |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Note: JavaScript has a function called escape() that produces the |
|
98
|
|
|
|
|
|
|
sequence "%uXXXX" for chars in the 256 .. 65535 range. This function |
|
99
|
|
|
|
|
|
|
has really nothing to do with URI escaping but some folks got confused |
|
100
|
|
|
|
|
|
|
since it "does the right thing" in the 0 .. 255 range. Because of |
|
101
|
|
|
|
|
|
|
this you sometimes see "URIs" with these kind of escapes. The |
|
102
|
|
|
|
|
|
|
JavaScript encodeURIComponent() function is similar to uri_escape_utf8(). |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item uri_unescape($string,...) |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Returns a string with each %XX sequence replaced with the actual byte |
|
107
|
|
|
|
|
|
|
(octet). |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
This does the same as: |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
$string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
but does not modify the string in-place as this RE would. Using the |
|
114
|
|
|
|
|
|
|
uri_unescape() function instead of the RE might make the code look |
|
115
|
|
|
|
|
|
|
cleaner and is a few characters less to type. |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
In a simple benchmark test I did, |
|
118
|
|
|
|
|
|
|
calling the function (instead of the inline RE above) if a few chars |
|
119
|
|
|
|
|
|
|
were unescaped was something like 40% slower, and something like 700% slower if none were. If |
|
120
|
|
|
|
|
|
|
you are going to unescape a lot of times it might be a good idea to |
|
121
|
|
|
|
|
|
|
inline the RE. |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
If the uri_unescape() function is passed multiple strings, then each |
|
124
|
|
|
|
|
|
|
one is returned unescaped. |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=back |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
The module can also export the C<%escapes> hash, which contains the |
|
129
|
|
|
|
|
|
|
mapping from all 256 bytes to the corresponding escape codes. Lookup |
|
130
|
|
|
|
|
|
|
in this hash is faster than evaluating C |
|
131
|
|
|
|
|
|
|
each time. |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
L |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Copyright 1995-2004 Gisle Aas. |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
|
143
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
|
146
|
|
|
|
|
|
|
|
|
147
|
43
|
|
|
43
|
|
212
|
use Exporter 5.57 'import'; |
|
|
43
|
|
|
|
|
1171
|
|
|
|
43
|
|
|
|
|
3752
|
|
|
148
|
|
|
|
|
|
|
our %escapes; |
|
149
|
|
|
|
|
|
|
our @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8); |
|
150
|
|
|
|
|
|
|
our @EXPORT_OK = qw(%escapes); |
|
151
|
|
|
|
|
|
|
our $VERSION = '5.21'; |
|
152
|
|
|
|
|
|
|
|
|
153
|
43
|
|
|
43
|
|
306
|
use Carp (); |
|
|
43
|
|
|
|
|
114
|
|
|
|
43
|
|
|
|
|
16196
|
|
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Build a char->hex map |
|
156
|
|
|
|
|
|
|
for (0..255) { |
|
157
|
|
|
|
|
|
|
$escapes{chr($_)} = sprintf("%%%02X", $_); |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
my %subst; # compiled patterns |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
my %Unsafe = ( |
|
163
|
|
|
|
|
|
|
RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/, |
|
164
|
|
|
|
|
|
|
RFC3986 => qr/[^A-Za-z0-9\-\._~]/, |
|
165
|
|
|
|
|
|
|
); |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub uri_escape { |
|
168
|
29
|
|
|
29
|
1
|
14907
|
my($text, $patn) = @_; |
|
169
|
29
|
100
|
|
|
|
118
|
return undef unless defined $text; |
|
170
|
28
|
|
|
|
|
57
|
my $re; |
|
171
|
28
|
100
|
|
|
|
99
|
if (defined $patn){ |
|
172
|
20
|
100
|
|
|
|
145
|
if (ref $patn eq 'Regexp') { |
|
173
|
2
|
|
|
|
|
36
|
$text =~ s{($patn)}{ |
|
174
|
4
|
|
33
|
|
|
41
|
join('', map +($escapes{$_} || _fail_hi($_)), split //, "$1") |
|
175
|
|
|
|
|
|
|
}ge; |
|
176
|
2
|
|
|
|
|
12
|
return $text; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
18
|
|
|
|
|
57
|
$re = $subst{$patn}; |
|
179
|
18
|
100
|
|
|
|
95
|
if (!defined $re) { |
|
180
|
15
|
|
|
|
|
37
|
$re = $patn; |
|
181
|
|
|
|
|
|
|
# we need to escape the [] characters, except for those used in |
|
182
|
|
|
|
|
|
|
# posix classes. if they are prefixed by a backslash, allow them |
|
183
|
|
|
|
|
|
|
# through unmodified. |
|
184
|
15
|
|
|
|
|
113
|
$re =~ s{(\[:\w+:\])|(\\)?([\[\]]|\\\z)}{ |
|
185
|
13
|
50
|
|
|
|
81
|
defined $1 ? $1 : defined $2 ? "$2$3" : "\\$3" |
|
|
|
100
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
}ge; |
|
187
|
15
|
100
|
|
|
|
33
|
eval { |
|
188
|
|
|
|
|
|
|
# disable the warnings here, since they will trigger later |
|
189
|
|
|
|
|
|
|
# when used, and we only want them to appear once per call, |
|
190
|
|
|
|
|
|
|
# but every time the same pattern is used. |
|
191
|
43
|
|
|
43
|
|
327
|
no warnings 'regexp'; |
|
|
43
|
|
|
|
|
78
|
|
|
|
43
|
|
|
|
|
23432
|
|
|
192
|
15
|
|
|
|
|
488
|
$re = $subst{$patn} = qr{[$re]}; |
|
193
|
14
|
|
|
|
|
64
|
1; |
|
194
|
|
|
|
|
|
|
} or Carp::croak("uri_escape: $@"); |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
else { |
|
198
|
8
|
|
|
|
|
16
|
$re = $Unsafe{RFC3986}; |
|
199
|
|
|
|
|
|
|
} |
|
200
|
25
|
100
|
|
|
|
488
|
$text =~ s/($re)/$escapes{$1} || _fail_hi($1)/ge; |
|
|
257
|
|
|
|
|
902
|
|
|
201
|
24
|
|
|
|
|
186
|
$text; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub _fail_hi { |
|
205
|
1
|
|
|
1
|
|
3
|
my $chr = shift; |
|
206
|
1
|
|
|
|
|
136
|
Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr)); |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub uri_escape_utf8 { |
|
210
|
2
|
|
|
2
|
1
|
5
|
my $text = shift; |
|
211
|
2
|
50
|
|
|
|
7
|
return undef unless defined $text; |
|
212
|
2
|
|
|
|
|
7
|
utf8::encode($text); |
|
213
|
2
|
|
|
|
|
6
|
return uri_escape($text, @_); |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub uri_unescape { |
|
217
|
|
|
|
|
|
|
# Note from RFC1630: "Sequences which start with a percent sign |
|
218
|
|
|
|
|
|
|
# but are not followed by two hexadecimal characters are reserved |
|
219
|
|
|
|
|
|
|
# for future extension" |
|
220
|
1073
|
|
|
1073
|
1
|
1717
|
my $str = shift; |
|
221
|
1073
|
100
|
66
|
|
|
2265
|
if (@_ && wantarray) { |
|
222
|
|
|
|
|
|
|
# not executed for the common case of a single argument |
|
223
|
1
|
|
|
|
|
3
|
my @str = ($str, @_); # need to copy |
|
224
|
1
|
|
|
|
|
3
|
for (@str) { |
|
225
|
3
|
|
|
|
|
12
|
s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
|
3
|
|
|
|
|
12
|
|
|
226
|
|
|
|
|
|
|
} |
|
227
|
1
|
|
|
|
|
8
|
return @str; |
|
228
|
|
|
|
|
|
|
} |
|
229
|
1072
|
100
|
|
|
|
2422
|
$str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str; |
|
|
892
|
|
|
|
|
2410
|
|
|
230
|
1072
|
|
|
|
|
3117
|
$str; |
|
231
|
|
|
|
|
|
|
} |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# XXX FIXME escape_char is buggy as it assigns meaning to the string's storage format. |
|
234
|
|
|
|
|
|
|
sub escape_char { |
|
235
|
|
|
|
|
|
|
# Old versions of utf8::is_utf8() didn't properly handle magical vars (e.g. $1). |
|
236
|
|
|
|
|
|
|
# The following forces a fetch to occur beforehand. |
|
237
|
602
|
|
|
602
|
0
|
1076
|
my $dummy = substr($_[0], 0, 0); |
|
238
|
|
|
|
|
|
|
|
|
239
|
602
|
100
|
|
|
|
1404
|
if (utf8::is_utf8($_[0])) { |
|
240
|
134
|
|
|
|
|
183
|
my $s = shift; |
|
241
|
134
|
|
|
|
|
243
|
utf8::encode($s); |
|
242
|
134
|
|
|
|
|
235
|
unshift(@_, $s); |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
|
|
245
|
602
|
|
|
|
|
2739
|
return join '', @URI::Escape::escapes{split //, $_[0]}; |
|
246
|
|
|
|
|
|
|
} |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
1; |