line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/************************************************* |
2
|
|
|
|
|
|
|
* Perl-Compatible Regular Expressions * |
3
|
|
|
|
|
|
|
*************************************************/ |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
/* PCRE is a library of functions to support regular expressions whose syntax |
6
|
|
|
|
|
|
|
and semantics are as close as possible to those of the Perl 5 language. |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
Written by Philip Hazel |
9
|
|
|
|
|
|
|
Copyright (c) 1997-2013 University of Cambridge |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
----------------------------------------------------------------------------- |
12
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without |
13
|
|
|
|
|
|
|
modification, are permitted provided that the following conditions are met: |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
* Redistributions of source code must retain the above copyright notice, |
16
|
|
|
|
|
|
|
this list of conditions and the following disclaimer. |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
* Redistributions in binary form must reproduce the above copyright |
19
|
|
|
|
|
|
|
notice, this list of conditions and the following disclaimer in the |
20
|
|
|
|
|
|
|
documentation and/or other materials provided with the distribution. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
* Neither the name of the University of Cambridge nor the names of its |
23
|
|
|
|
|
|
|
contributors may be used to endorse or promote products derived from |
24
|
|
|
|
|
|
|
this software without specific prior written permission. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
27
|
|
|
|
|
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
28
|
|
|
|
|
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
29
|
|
|
|
|
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
30
|
|
|
|
|
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
31
|
|
|
|
|
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
32
|
|
|
|
|
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
33
|
|
|
|
|
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
34
|
|
|
|
|
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
35
|
|
|
|
|
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
36
|
|
|
|
|
|
|
POSSIBILITY OF SUCH DAMAGE. |
37
|
|
|
|
|
|
|
----------------------------------------------------------------------------- |
38
|
|
|
|
|
|
|
*/ |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
/* This module contains an internal function for validating UTF-8 character |
42
|
|
|
|
|
|
|
strings. */ |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H |
46
|
|
|
|
|
|
|
#include "config.h" |
47
|
|
|
|
|
|
|
#endif |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
#include "pcre_internal.h" |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
/************************************************* |
53
|
|
|
|
|
|
|
* Validate a UTF-8 string * |
54
|
|
|
|
|
|
|
*************************************************/ |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
/* This function is called (optionally) at the start of compile or match, to |
57
|
|
|
|
|
|
|
check that a supposed UTF-8 string is actually valid. The early check means |
58
|
|
|
|
|
|
|
that subsequent code can assume it is dealing with a valid string. The check |
59
|
|
|
|
|
|
|
can be turned off for maximum performance, but the consequences of supplying an |
60
|
|
|
|
|
|
|
invalid string are then undefined. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Originally, this function checked according to RFC 2279, allowing for values in |
63
|
|
|
|
|
|
|
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in |
64
|
|
|
|
|
|
|
the canonical format. Once somebody had pointed out RFC 3629 to me (it |
65
|
|
|
|
|
|
|
obsoletes 2279), additional restrictions were applied. The values are now |
66
|
|
|
|
|
|
|
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
67
|
|
|
|
|
|
|
subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte |
68
|
|
|
|
|
|
|
characters is still checked. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
From release 8.13 more information about the details of the error is passed |
71
|
|
|
|
|
|
|
back in the returned value: |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
PCRE_UTF8_ERR0 No error |
74
|
|
|
|
|
|
|
PCRE_UTF8_ERR1 Missing 1 byte at the end of the string |
75
|
|
|
|
|
|
|
PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string |
76
|
|
|
|
|
|
|
PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string |
77
|
|
|
|
|
|
|
PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string |
78
|
|
|
|
|
|
|
PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string |
79
|
|
|
|
|
|
|
PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80 |
80
|
|
|
|
|
|
|
PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80 |
81
|
|
|
|
|
|
|
PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80 |
82
|
|
|
|
|
|
|
PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80 |
83
|
|
|
|
|
|
|
PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80 |
84
|
|
|
|
|
|
|
PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629 |
85
|
|
|
|
|
|
|
PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629 |
86
|
|
|
|
|
|
|
PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted |
87
|
|
|
|
|
|
|
PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted |
88
|
|
|
|
|
|
|
PCRE_UTF8_ERR15 Overlong 2-byte sequence |
89
|
|
|
|
|
|
|
PCRE_UTF8_ERR16 Overlong 3-byte sequence |
90
|
|
|
|
|
|
|
PCRE_UTF8_ERR17 Overlong 4-byte sequence |
91
|
|
|
|
|
|
|
PCRE_UTF8_ERR18 Overlong 5-byte sequence (diagnosed in preference to ERR11) |
92
|
|
|
|
|
|
|
PCRE_UTF8_ERR19 Overlong 6-byte sequence (diagnosed in preference to ERR12) |
93
|
|
|
|
|
|
|
PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) |
94
|
|
|
|
|
|
|
PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff |
95
|
|
|
|
|
|
|
PCRE_UTF8_ERR22 Unused (was non-character) |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Arguments: |
98
|
|
|
|
|
|
|
string points to the string |
99
|
|
|
|
|
|
|
length length of string, or -1 if the string is zero-terminated |
100
|
|
|
|
|
|
|
erroroffset pointer to an error position offset variable |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Returns: = 0 if the string is a valid UTF-8 string |
103
|
|
|
|
|
|
|
> 0 otherwise, setting the offset of the bad character |
104
|
|
|
|
|
|
|
*/ |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
int |
107
|
0
|
|
|
|
|
|
PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) |
108
|
|
|
|
|
|
|
{ |
109
|
|
|
|
|
|
|
#ifdef SUPPORT_UTF |
110
|
|
|
|
|
|
|
register PCRE_PUCHAR p; |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
if (length < 0) |
113
|
|
|
|
|
|
|
{ |
114
|
|
|
|
|
|
|
for (p = string; *p != 0; p++); |
115
|
|
|
|
|
|
|
length = (int)(p - string); |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
for (p = string; length-- > 0; p++) |
119
|
|
|
|
|
|
|
{ |
120
|
|
|
|
|
|
|
register pcre_uchar ab, c, d; |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
c = *p; |
123
|
|
|
|
|
|
|
if (c < 128) continue; /* ASCII character */ |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
if (c < 0xc0) /* Isolated 10xx xxxx byte */ |
126
|
|
|
|
|
|
|
{ |
127
|
|
|
|
|
|
|
*erroroffset = (int)(p - string); |
128
|
|
|
|
|
|
|
return PCRE_UTF8_ERR20; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ |
132
|
|
|
|
|
|
|
{ |
133
|
|
|
|
|
|
|
*erroroffset = (int)(p - string); |
134
|
|
|
|
|
|
|
return PCRE_UTF8_ERR21; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */ |
138
|
|
|
|
|
|
|
if (length < ab) |
139
|
|
|
|
|
|
|
{ |
140
|
|
|
|
|
|
|
*erroroffset = (int)(p - string); /* Missing bytes */ |
141
|
|
|
|
|
|
|
return ab - length; /* Codes ERR1 to ERR5 */ |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
length -= ab; /* Length remaining */ |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
/* Check top bits in the second byte */ |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
if (((d = *(++p)) & 0xc0) != 0x80) |
148
|
|
|
|
|
|
|
{ |
149
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 1; |
150
|
|
|
|
|
|
|
return PCRE_UTF8_ERR6; |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
/* For each length, check that the remaining bytes start with the 0x80 bit |
154
|
|
|
|
|
|
|
set and not the 0x40 bit. Then check for an overlong sequence, and for the |
155
|
|
|
|
|
|
|
excluded range 0xd800 to 0xdfff. */ |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
switch (ab) |
158
|
|
|
|
|
|
|
{ |
159
|
|
|
|
|
|
|
/* 2-byte character. No further bytes to check for 0x80. Check first byte |
160
|
|
|
|
|
|
|
for for xx00 000x (overlong sequence). */ |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
case 1: if ((c & 0x3e) == 0) |
163
|
|
|
|
|
|
|
{ |
164
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 1; |
165
|
|
|
|
|
|
|
return PCRE_UTF8_ERR15; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
break; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes |
170
|
|
|
|
|
|
|
for 1110 0000, xx0x xxxx (overlong sequence) or |
171
|
|
|
|
|
|
|
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
case 2: |
174
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
175
|
|
|
|
|
|
|
{ |
176
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 2; |
177
|
|
|
|
|
|
|
return PCRE_UTF8_ERR7; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
if (c == 0xe0 && (d & 0x20) == 0) |
180
|
|
|
|
|
|
|
{ |
181
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 2; |
182
|
|
|
|
|
|
|
return PCRE_UTF8_ERR16; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
if (c == 0xed && d >= 0xa0) |
185
|
|
|
|
|
|
|
{ |
186
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 2; |
187
|
|
|
|
|
|
|
return PCRE_UTF8_ERR14; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
break; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2 |
192
|
|
|
|
|
|
|
bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a |
193
|
|
|
|
|
|
|
character greater than 0x0010ffff (f4 8f bf bf) */ |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
case 3: |
196
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
197
|
|
|
|
|
|
|
{ |
198
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 2; |
199
|
|
|
|
|
|
|
return PCRE_UTF8_ERR7; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
202
|
|
|
|
|
|
|
{ |
203
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 3; |
204
|
|
|
|
|
|
|
return PCRE_UTF8_ERR8; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
if (c == 0xf0 && (d & 0x30) == 0) |
207
|
|
|
|
|
|
|
{ |
208
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 3; |
209
|
|
|
|
|
|
|
return PCRE_UTF8_ERR17; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) |
212
|
|
|
|
|
|
|
{ |
213
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 3; |
214
|
|
|
|
|
|
|
return PCRE_UTF8_ERR13; |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
break; |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be |
219
|
|
|
|
|
|
|
rejected by the length test below. However, we do the appropriate tests |
220
|
|
|
|
|
|
|
here so that overlong sequences get diagnosed, and also in case there is |
221
|
|
|
|
|
|
|
ever an option for handling these larger code points. */ |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for |
224
|
|
|
|
|
|
|
1111 1000, xx00 0xxx */ |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
case 4: |
227
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
228
|
|
|
|
|
|
|
{ |
229
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 2; |
230
|
|
|
|
|
|
|
return PCRE_UTF8_ERR7; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
233
|
|
|
|
|
|
|
{ |
234
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 3; |
235
|
|
|
|
|
|
|
return PCRE_UTF8_ERR8; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
238
|
|
|
|
|
|
|
{ |
239
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 4; |
240
|
|
|
|
|
|
|
return PCRE_UTF8_ERR9; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
if (c == 0xf8 && (d & 0x38) == 0) |
243
|
|
|
|
|
|
|
{ |
244
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 4; |
245
|
|
|
|
|
|
|
return PCRE_UTF8_ERR18; |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
break; |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for |
250
|
|
|
|
|
|
|
1111 1100, xx00 00xx. */ |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
case 5: |
253
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
254
|
|
|
|
|
|
|
{ |
255
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 2; |
256
|
|
|
|
|
|
|
return PCRE_UTF8_ERR7; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
259
|
|
|
|
|
|
|
{ |
260
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 3; |
261
|
|
|
|
|
|
|
return PCRE_UTF8_ERR8; |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
264
|
|
|
|
|
|
|
{ |
265
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 4; |
266
|
|
|
|
|
|
|
return PCRE_UTF8_ERR9; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ |
269
|
|
|
|
|
|
|
{ |
270
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 5; |
271
|
|
|
|
|
|
|
return PCRE_UTF8_ERR10; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
if (c == 0xfc && (d & 0x3c) == 0) |
274
|
|
|
|
|
|
|
{ |
275
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - 5; |
276
|
|
|
|
|
|
|
return PCRE_UTF8_ERR19; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
break; |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
/* Character is valid under RFC 2279, but 4-byte and 5-byte characters are |
282
|
|
|
|
|
|
|
excluded by RFC 3629. The pointer p is currently at the last byte of the |
283
|
|
|
|
|
|
|
character. */ |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
if (ab > 3) |
286
|
|
|
|
|
|
|
{ |
287
|
|
|
|
|
|
|
*erroroffset = (int)(p - string) - ab; |
288
|
|
|
|
|
|
|
return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
#else /* Not SUPPORT_UTF */ |
293
|
|
|
|
|
|
|
(void)(string); /* Keep picky compilers happy */ |
294
|
|
|
|
|
|
|
(void)(length); |
295
|
|
|
|
|
|
|
(void)(erroroffset); |
296
|
|
|
|
|
|
|
#endif |
297
|
|
|
|
|
|
|
|
298
|
0
|
|
|
|
|
|
return PCRE_UTF8_ERR0; /* This indicates success */ |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
/* End of pcre_valid_utf8.c */ |