line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* This is a Unicode library in the programming language C which deals |
2
|
|
|
|
|
|
|
with conversions to and from the UTF-8 format. */ |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
/* |
5
|
|
|
|
|
|
|
Author: |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Ben Bullock , |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Repository: |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
https://github.com/benkasminbullock/unicode-c |
12
|
|
|
|
|
|
|
*/ |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
#include |
15
|
|
|
|
|
|
|
#include |
16
|
|
|
|
|
|
|
#include "unicode.h" |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
#ifdef HEADER |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
/* _ _ _ _ |
21
|
|
|
|
|
|
|
| | (_)_ __ ___ (_) |_ ___ |
22
|
|
|
|
|
|
|
| | | | '_ ` _ \| | __/ __| |
23
|
|
|
|
|
|
|
| |___| | | | | | | | |_\__ \ |
24
|
|
|
|
|
|
|
|_____|_|_| |_| |_|_|\__|___/ */ |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
/* Size in bytes of a buffer which can hold the UTF-8 encoding of any
   single Unicode code point as a C string. This counts the trailing
   nul byte as well as the UTF-8 bytes themselves. */

#define UTF8_MAX_LENGTH 5

/* The largest value which a Unicode code point may take. See
   http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */

#define UNICODE_MAXIMUM 0x10FFFF

/* The largest value which can be stored in four bytes of UTF-8. Note
   that this is bigger than UNICODE_MAXIMUM. */

#define UNICODE_UTF8_4 0x1FFFFF
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
/* ____ _ _ |
45
|
|
|
|
|
|
|
| _ \ ___| |_ _ _ _ __ _ __ __ ____ _| |_ _ ___ ___ |
46
|
|
|
|
|
|
|
| |_) / _ \ __| | | | '__| '_ \ \ \ / / _` | | | | |/ _ \/ __| |
47
|
|
|
|
|
|
|
| _ < __/ |_| |_| | | | | | | \ V / (_| | | |_| | __/\__ \ |
48
|
|
|
|
|
|
|
|_| \_\___|\__|\__,_|_| |_| |_| \_/ \__,_|_|\__,_|\___||___/ */ |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
/* Every function in this library returns an "int32_t". Errors are
   reported as negative values. */

/* Success, for routines which do not use the return value to hand
   data back to the caller. */

#define UNICODE_OK (0)

/* The leading byte of a UTF-8 sequence was not valid. */

#define UTF8_BAD_LEADING_BYTE (-1)

/* The caller tried to convert a code point belonging to a surrogate
   pair to or from UTF-8. */

#define UNICODE_SURROGATE_PAIR (-2)

/* Code points which do not form a surrogate pair were passed to a
   conversion which treats them as a surrogate pair. */

#define UNICODE_NOT_SURROGATE_PAIR (-3)

/* Input which was supposed to be UTF-8 contained an invalid
   continuation byte. If it is the leading byte of the sequence which
   is invalid, UTF8_BAD_LEADING_BYTE is returned instead of this. */

#define UTF8_BAD_CONTINUATION_BYTE (-4)

/* A zero byte was found in a string which was supposed to contain
   UTF-8 bytes. Only the functions documented as not allowing zero
   bytes return this. */

#define UNICODE_EMPTY_INPUT (-5)

/* The UTF-8 bytes were not in the shortest possible form. See
   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8.

   This value is currently unused. When a character is not in the
   shortest form, UTF8_BAD_CONTINUATION_BYTE is returned. */

#define UTF8_NON_SHORTEST (-6)

/* There was an attempt to convert into UTF-8 bytes a code point
   greater than UNICODE_MAXIMUM or UNICODE_UTF8_4. */

#define UNICODE_TOO_BIG (-7)

/* The code point ended in 0xFFFF or 0xFFFE, so it cannot be used as a
   character code point, or it was in the disallowed range FDD0 to
   FDEF. */

#define UNICODE_NOT_CHARACTER (-8)

/* The UTF-8 is valid. Used only by "valid_utf8". */

#define UTF8_VALID (1)

/* The UTF-8 is not valid. Used only by "valid_utf8". */

#define UTF8_INVALID (0)
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
#endif /* def HEADER */ |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
/* Lookup table indexed by the first byte of a UTF-8 sequence. Each
   entry is the total length in bytes of the sequence which that byte
   starts, or zero if the byte cannot start a UTF-8 sequence. */

/* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */

const uint8_t utf8_sequence_len[0x100] =
{
    /* 0x00-0x7F: single bytes. */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x80-0xBF: cannot begin a sequence. */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0-0xDF: two bytes, except 0xC0 and 0xC1 which are not
       allowed. */
    0,0,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    /* 0xE0-0xEF: three bytes. */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
    /* 0xF0-0xFF: four bytes for 0xF0 to 0xF4, the rest are not
       allowed. */
    4,4,4,4,4,0,0,0, 0,0,0,0,0,0,0,0,
};
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
/* This function returns the number of bytes of UTF-8 a sequence |
149
|
|
|
|
|
|
|
starting with byte "c" will become, either 1 (c = 0000xxxx), 2 (c = |
150
|
|
|
|
|
|
|
110xxxxx), 3 (c = 1110xxxx), or 4 (c = 111100xx or c = |
151
|
|
|
|
|
|
|
11110100). If "c" is not a valid UTF-8 first byte, the value |
152
|
|
|
|
|
|
|
UTF8_BAD_LEADING_BYTE is returned. */ |
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
|
int32_t utf8_bytes (uint8_t c) |
155
|
|
|
|
|
|
|
{ |
156
|
|
|
|
|
|
|
int32_t r; |
157
|
0
|
|
|
|
|
|
r = utf8_sequence_len[c]; |
158
|
0
|
0
|
|
|
|
|
if (r == 0) { |
159
|
0
|
|
|
|
|
|
return UTF8_BAD_LEADING_BYTE; |
160
|
|
|
|
|
|
|
} |
161
|
0
|
|
|
|
|
|
return r; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
/* This macro converts the four bytes of UTF-8 starting at "x" into
   the corresponding code point. The bytes are not validated: the low
   three bits of x[0] and the low six bits of x[1], x[2] and x[3] are
   combined into a 21-bit value.

   "x" is evaluated four times, so it must not have side effects. The
   argument and the whole expansion are parenthesized so that the
   macro can be given an expression such as "p + 1" and can be used
   inside a larger expression. */

#define FOUR(x)                                 \
    ((((int32_t) ((x)[0] & 0x07)) << 18)        \
     | (((int32_t) ((x)[1] & 0x3F)) << 12)      \
     | (((int32_t) ((x)[2] & 0x3F)) << 6)       \
     | (((int32_t) ((x)[3] & 0x3F))))
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
/* Reject code points which end in either FFFE or FFFF. If the lower
   sixteen bits of "x" are 0xFFFE or 0xFFFF, this makes the enclosing
   function return UNICODE_NOT_CHARACTER.

   The body is wrapped in "do { } while (0)" so that the macro acts as
   a single statement, for example in an unbraced "if ... else". It
   must be followed by a semicolon. "x" is evaluated once. */

#define REJECT_FFFF(x)                          \
    do {                                        \
        if (((x) & 0xFFFF) >= 0xFFFE) {         \
            return UNICODE_NOT_CHARACTER;       \
        }                                       \
    } while (0)
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
/* Reject code points in a certain range. If "r" lies between
   UNI_NOT_CHAR_MIN and UNI_NOT_CHAR_MAX inclusive, this makes the
   enclosing function return UNICODE_NOT_CHARACTER.

   The body is wrapped in "do { } while (0)" so that the macro acts as
   a single statement. It must be followed by a semicolon. "r" may be
   evaluated twice, so it must not have side effects. */

#define REJECT_NOT_CHAR(r)                                              \
    do {                                                                \
        if ((r) >= UNI_NOT_CHAR_MIN && (r) <= UNI_NOT_CHAR_MAX) {       \
            return UNICODE_NOT_CHARACTER;                               \
        }                                                               \
    } while (0)
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
/* Reject surrogates. If "ucs2" lies between UNI_SUR_HIGH_START and
   UNI_SUR_LOW_END inclusive, this makes the enclosing function return
   UNICODE_SURROGATE_PAIR.

   The body is wrapped in "do { } while (0)" so that the macro acts as
   a single statement. It must be followed by a semicolon. "ucs2" may
   be evaluated twice, so it must not have side effects. */

#define REJECT_SURROGATE(ucs2)                                          \
    do {                                                                \
        if ((ucs2) >= UNI_SUR_HIGH_START && (ucs2) <= UNI_SUR_LOW_END) { \
            /* Ill-formed. */                                           \
            return UNICODE_SURROGATE_PAIR;                              \
        }                                                               \
    } while (0)
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
/* Try to convert "input" from UTF-8 to UCS-2, and return a value even |
196
|
|
|
|
|
|
|
if the input is partly broken. This checks the first byte of the |
197
|
|
|
|
|
|
|
input, but it doesn't check the subsequent bytes. */ |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
int32_t |
200
|
0
|
|
|
|
|
|
utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr) |
201
|
|
|
|
|
|
|
{ |
202
|
|
|
|
|
|
|
uint8_t c; |
203
|
0
|
|
|
|
|
|
c = input[0]; |
204
|
0
|
|
|
|
|
|
switch (utf8_sequence_len[c]) { |
205
|
|
|
|
|
|
|
case 1: |
206
|
0
|
|
|
|
|
|
* end_ptr = input + 1; |
207
|
0
|
|
|
|
|
|
return c; |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
case 2: |
210
|
0
|
|
|
|
|
|
* end_ptr = input + 2; |
211
|
|
|
|
|
|
|
return |
212
|
0
|
|
|
|
|
|
(c & 0x1F) << 6 | |
213
|
0
|
|
|
|
|
|
(input[1] & 0x3F); |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
case 3: |
216
|
0
|
|
|
|
|
|
* end_ptr = input + 3; |
217
|
|
|
|
|
|
|
return |
218
|
0
|
|
|
|
|
|
(c & 0x0F) << 12 | |
219
|
0
|
|
|
|
|
|
(input[1] & 0x3F) << 6 | |
220
|
0
|
|
|
|
|
|
(input[2] & 0x3F); |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
case 4: |
223
|
0
|
|
|
|
|
|
* end_ptr = input + 4; |
224
|
0
|
|
|
|
|
|
return FOUR (input); |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
case 0: |
227
|
|
|
|
|
|
|
/* fall through */ |
228
|
|
|
|
|
|
|
default: |
229
|
0
|
|
|
|
|
|
return UTF8_BAD_LEADING_BYTE; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
/* The surrogate zone. The first ("high") half of a surrogate pair is
   taken from UNI_SUR_HIGH_START to UNI_SUR_HIGH_END, and the second
   ("low") half from UNI_SUR_LOW_START to UNI_SUR_LOW_END. */

#define UNI_SUR_HIGH_START 0xD800
#define UNI_SUR_HIGH_END   0xDBFF
#define UNI_SUR_LOW_START  0xDC00
#define UNI_SUR_LOW_END    0xDFFF

/* First code point of the "not character" range. */

#define UNI_NOT_CHAR_MIN 0xFDD0

/* Last code point of the "not character" range. */

#define UNI_NOT_CHAR_MAX 0xFDEF
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
/* This function converts UTF-8 encoded bytes in "input" into the |
249
|
|
|
|
|
|
|
equivalent Unicode code point. The return value is the Unicode |
250
|
|
|
|
|
|
|
code point corresponding to the UTF-8 character in "input" if |
251
|
|
|
|
|
|
|
successful, and a negative number if not successful. Nul bytes are |
252
|
|
|
|
|
|
|
rejected. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
"*end_ptr" is set to the next character after the read character on |
255
|
|
|
|
|
|
|
success. "*end_ptr" is set to the start of input on all failures. |
256
|
|
|
|
|
|
|
"end_ptr" may not be NULL. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
If the first byte of "input" is zero, in other words a NUL or '\0', |
259
|
|
|
|
|
|
|
UNICODE_EMPTY_INPUT is returned. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
If the first byte of "input" is not valid UTF-8, |
262
|
|
|
|
|
|
|
UTF8_BAD_LEADING_BYTE is returned. |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
If the second or later bytes of "input" are not valid UTF-8, |
265
|
|
|
|
|
|
|
including NUL, UTF8_BAD_CONTINUATION_BYTE is returned. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
If the value extrapolated from "input" is greater than |
268
|
|
|
|
|
|
|
UNICODE_MAXIMUM, UNICODE_TOO_BIG is returned. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE, |
271
|
|
|
|
|
|
|
UNICODE_NOT_CHARACTER is returned. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
If the value extrapolated from "input" is between 0xFDD0 and 0xFDEF, |
274
|
|
|
|
|
|
|
UNICODE_NOT_CHARACTER is returned. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
If the value is within the range of surrogate pairs, the error |
277
|
|
|
|
|
|
|
UNICODE_SURROGATE_PAIR is returned. |
278
|
|
|
|
|
|
|
*/ |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
int32_t |
281
|
0
|
|
|
|
|
|
utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr) |
282
|
|
|
|
|
|
|
{ |
283
|
|
|
|
|
|
|
uint8_t c; |
284
|
|
|
|
|
|
|
uint8_t l; |
285
|
|
|
|
|
|
|
|
286
|
0
|
|
|
|
|
|
*end_ptr = input; |
287
|
0
|
|
|
|
|
|
c = input[0]; |
288
|
0
|
0
|
|
|
|
|
if (c == 0) { |
289
|
0
|
|
|
|
|
|
return UNICODE_EMPTY_INPUT; |
290
|
|
|
|
|
|
|
} |
291
|
0
|
|
|
|
|
|
l = utf8_sequence_len[c]; |
292
|
0
|
0
|
|
|
|
|
if (l == 1) { |
293
|
0
|
|
|
|
|
|
* end_ptr = input + 1; |
294
|
0
|
|
|
|
|
|
return (int32_t) c; |
295
|
|
|
|
|
|
|
} |
296
|
0
|
0
|
|
|
|
|
if (l == 2) { |
297
|
|
|
|
|
|
|
uint8_t d; |
298
|
0
|
|
|
|
|
|
d = input[1]; |
299
|
|
|
|
|
|
|
/* Two byte case. */ |
300
|
0
|
0
|
|
|
|
|
if (d < 0x80 || d > 0xBF) { |
|
|
0
|
|
|
|
|
|
301
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
302
|
|
|
|
|
|
|
} |
303
|
0
|
0
|
|
|
|
|
if (c <= 0xC1) { |
304
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
305
|
|
|
|
|
|
|
} |
306
|
0
|
|
|
|
|
|
* end_ptr = input + 2; |
307
|
|
|
|
|
|
|
return |
308
|
0
|
|
|
|
|
|
((int32_t) (c & 0x1F) << 6) | |
309
|
0
|
|
|
|
|
|
((int32_t) (d & 0x3F)); |
310
|
|
|
|
|
|
|
} |
311
|
0
|
0
|
|
|
|
|
if (l == 3) { |
312
|
|
|
|
|
|
|
uint8_t d; |
313
|
|
|
|
|
|
|
uint8_t e; |
314
|
|
|
|
|
|
|
int32_t r; |
315
|
|
|
|
|
|
|
|
316
|
0
|
|
|
|
|
|
d = input[1]; |
317
|
0
|
|
|
|
|
|
e = input[2]; |
318
|
|
|
|
|
|
|
/* Three byte case. */ |
319
|
0
|
0
|
|
|
|
|
if (d < 0x80 || d > 0xBF || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
320
|
0
|
0
|
|
|
|
|
e < 0x80 || e > 0xBF) { |
321
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
322
|
|
|
|
|
|
|
} |
323
|
0
|
0
|
|
|
|
|
if (c == 0xe0 && d < 0xa0) { |
|
|
0
|
|
|
|
|
|
324
|
|
|
|
|
|
|
/* We don't need to check the value of input[2], because |
325
|
|
|
|
|
|
|
the if statement above this one already guarantees that |
326
|
|
|
|
|
|
|
it is 10xxxxxx. */ |
327
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
328
|
|
|
|
|
|
|
} |
329
|
0
|
|
|
|
|
|
r = ((int32_t) (c & 0x0F)) << 12 | |
330
|
0
|
|
|
|
|
|
((int32_t) (d & 0x3F)) << 6 | |
331
|
0
|
|
|
|
|
|
((int32_t) (e & 0x3F)); |
332
|
0
|
0
|
|
|
|
|
REJECT_SURROGATE(r); |
|
|
0
|
|
|
|
|
|
333
|
0
|
0
|
|
|
|
|
REJECT_FFFF(r); |
334
|
0
|
0
|
|
|
|
|
REJECT_NOT_CHAR(r); |
|
|
0
|
|
|
|
|
|
335
|
0
|
|
|
|
|
|
* end_ptr = input + 3; |
336
|
0
|
|
|
|
|
|
return r; |
337
|
|
|
|
|
|
|
} |
338
|
0
|
0
|
|
|
|
|
else if (l == 4) { |
339
|
|
|
|
|
|
|
/* Four byte case. */ |
340
|
|
|
|
|
|
|
uint8_t d; |
341
|
|
|
|
|
|
|
uint8_t e; |
342
|
|
|
|
|
|
|
uint8_t f; |
343
|
|
|
|
|
|
|
int32_t v; |
344
|
|
|
|
|
|
|
|
345
|
0
|
|
|
|
|
|
d = input[1]; |
346
|
0
|
|
|
|
|
|
e = input[2]; |
347
|
0
|
|
|
|
|
|
f = input[3]; |
348
|
|
|
|
|
|
|
|
349
|
0
|
0
|
|
|
|
|
if (/* c must be 11110xxx. */ |
350
|
0
|
0
|
|
|
|
|
c >= 0xf8 || |
351
|
|
|
|
|
|
|
/* d, e, f must be 10xxxxxx. */ |
352
|
0
|
0
|
|
|
|
|
d < 0x80 || d >= 0xC0 || |
|
|
0
|
|
|
|
|
|
353
|
0
|
0
|
|
|
|
|
e < 0x80 || e >= 0xC0 || |
|
|
0
|
|
|
|
|
|
354
|
0
|
0
|
|
|
|
|
f < 0x80 || f >= 0xC0) { |
355
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
|
358
|
0
|
0
|
|
|
|
|
if (c == 0xf0 && d < 0x90) { |
|
|
0
|
|
|
|
|
|
359
|
|
|
|
|
|
|
/* We don't need to check the values of e and f, because |
360
|
|
|
|
|
|
|
the if statement above this one already guarantees that |
361
|
|
|
|
|
|
|
e and f are 10xxxxxx. */ |
362
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
/* Calculate the code point. */ |
365
|
0
|
|
|
|
|
|
v = FOUR (input); |
366
|
|
|
|
|
|
|
/* Greater than U+10FFFF */ |
367
|
0
|
0
|
|
|
|
|
if (v > UNICODE_MAXIMUM) { |
368
|
0
|
|
|
|
|
|
return UNICODE_TOO_BIG; |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ |
371
|
0
|
0
|
|
|
|
|
REJECT_FFFF(v); |
372
|
|
|
|
|
|
|
/* We don't need to check for surrogate pairs here, since the |
373
|
|
|
|
|
|
|
minimum value of UCS2 if there are four bytes of UTF-8 is |
374
|
|
|
|
|
|
|
0x10000. */ |
375
|
0
|
|
|
|
|
|
* end_ptr = input + 4; |
376
|
0
|
|
|
|
|
|
return v; |
377
|
|
|
|
|
|
|
} |
378
|
0
|
|
|
|
|
|
return UTF8_BAD_LEADING_BYTE; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
/* Input: a Unicode code point, "ucs2". |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
Output: UTF-8 characters in buffer "utf8". |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
Return value: the number of bytes written into "utf8", or a |
387
|
|
|
|
|
|
|
negative number if there was an error. |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
If the value of "ucs2" is invalid because of being in the surrogate |
390
|
|
|
|
|
|
|
pair range from 0xD800 to 0xDFFF, the return value is |
391
|
|
|
|
|
|
|
UNICODE_SURROGATE_PAIR. |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
If the value of "ucs2" is in the range 0xFDD0 to 0xFDEF inclusive, |
394
|
|
|
|
|
|
|
the return value is UNICODE_NOT_CHARACTER. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
If the lower two bytes of "ucs2" are either 0xFFFE or 0xFFFF, the |
397
|
|
|
|
|
|
|
return value is UNICODE_NOT_CHARACTER. |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
If the value is too big to fit into four bytes of UTF-8, |
400
|
|
|
|
|
|
|
UNICODE_UTF8_4, the return value is UNICODE_TOO_BIG. |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
However, it does not insist on ucs2 being less than |
403
|
|
|
|
|
|
|
UNICODE_MAXIMUM, so the user needs to check that "ucs2" is a valid |
404
|
|
|
|
|
|
|
code point. |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
This adds a zero byte to the end of the string. It assumes that the |
407
|
|
|
|
|
|
|
buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to |
408
|
|
|
|
|
|
|
write to, without checking. */ |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
int32_t |
411
|
0
|
|
|
|
|
|
ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8) |
412
|
|
|
|
|
|
|
{ |
413
|
0
|
0
|
|
|
|
|
REJECT_FFFF(ucs2); |
414
|
0
|
0
|
|
|
|
|
if (ucs2 < 0x80) { |
415
|
0
|
|
|
|
|
|
utf8[0] = ucs2; |
416
|
0
|
|
|
|
|
|
utf8[1] = '\0'; |
417
|
0
|
|
|
|
|
|
return 1; |
418
|
|
|
|
|
|
|
} |
419
|
0
|
0
|
|
|
|
|
if (ucs2 < 0x800) { |
420
|
0
|
|
|
|
|
|
utf8[0] = (ucs2 >> 6) | 0xC0; |
421
|
0
|
|
|
|
|
|
utf8[1] = (ucs2 & 0x3F) | 0x80; |
422
|
0
|
|
|
|
|
|
utf8[2] = '\0'; |
423
|
0
|
|
|
|
|
|
return 2; |
424
|
|
|
|
|
|
|
} |
425
|
0
|
0
|
|
|
|
|
if (ucs2 < 0xFFFF) { |
426
|
0
|
|
|
|
|
|
utf8[0] = ((ucs2 >> 12) ) | 0xE0; |
427
|
0
|
|
|
|
|
|
utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80; |
428
|
0
|
|
|
|
|
|
utf8[2] = ((ucs2 ) & 0x3F) | 0x80; |
429
|
0
|
|
|
|
|
|
utf8[3] = '\0'; |
430
|
0
|
0
|
|
|
|
|
REJECT_SURROGATE(ucs2); |
|
|
0
|
|
|
|
|
|
431
|
0
|
0
|
|
|
|
|
REJECT_NOT_CHAR(ucs2); |
|
|
0
|
|
|
|
|
|
432
|
0
|
|
|
|
|
|
return 3; |
433
|
|
|
|
|
|
|
} |
434
|
0
|
0
|
|
|
|
|
if (ucs2 <= UNICODE_UTF8_4) { |
435
|
|
|
|
|
|
|
/* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */ |
436
|
0
|
|
|
|
|
|
utf8[0] = 0xF0 | (ucs2 >> 18); |
437
|
0
|
|
|
|
|
|
utf8[1] = 0x80 | ((ucs2 >> 12) & 0x3F); |
438
|
0
|
|
|
|
|
|
utf8[2] = 0x80 | ((ucs2 >> 6) & 0x3F); |
439
|
0
|
|
|
|
|
|
utf8[3] = 0x80 | ((ucs2 & 0x3F)); |
440
|
0
|
|
|
|
|
|
utf8[4] = '\0'; |
441
|
0
|
|
|
|
|
|
return 4; |
442
|
|
|
|
|
|
|
} |
443
|
0
|
|
|
|
|
|
return UNICODE_TOO_BIG; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
/* Shift width used when splitting or joining a surrogate pair. */
#define TEN_BITS 10
/* The amount taken off a code point before it is split into a
   surrogate pair, and added back when a pair is joined. */
#define HALF_BASE 0x0010000UL
/* Mask for the lowest ten bits, 0b1111111111. */
#define LOW_TEN_BITS 0x3FF
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
/* This converts the Unicode code point in "unicode" into a surrogate |
453
|
|
|
|
|
|
|
pair, and returns the two parts in "* hi_ptr" and "* lo_ptr". |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
Return value: |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
If "unicode" does not need to be a surrogate pair, the error |
458
|
|
|
|
|
|
|
UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr" |
459
|
|
|
|
|
|
|
and "*lo_ptr" are undefined. If the conversion is successful, |
460
|
|
|
|
|
|
|
UNICODE_OK is returned. */ |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
int32_t |
463
|
9
|
|
|
|
|
|
unicode_to_surrogates (int32_t unicode, int32_t * hi_ptr, int32_t * lo_ptr) |
464
|
|
|
|
|
|
|
{ |
465
|
9
|
|
|
|
|
|
int32_t hi = UNI_SUR_HIGH_START; |
466
|
9
|
|
|
|
|
|
int32_t lo = UNI_SUR_LOW_START; |
467
|
9
|
50
|
|
|
|
|
if (unicode < HALF_BASE) { |
468
|
|
|
|
|
|
|
/* Doesn't need to be a surrogate pair. */ |
469
|
0
|
|
|
|
|
|
return UNICODE_NOT_SURROGATE_PAIR; |
470
|
|
|
|
|
|
|
} |
471
|
9
|
|
|
|
|
|
unicode -= HALF_BASE; |
472
|
9
|
|
|
|
|
|
hi |= ((unicode >> TEN_BITS) & LOW_TEN_BITS); |
473
|
9
|
|
|
|
|
|
lo |= ((unicode) & LOW_TEN_BITS); |
474
|
9
|
|
|
|
|
|
* hi_ptr = hi; |
475
|
9
|
|
|
|
|
|
* lo_ptr = lo; |
476
|
9
|
|
|
|
|
|
return UNICODE_OK; |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
/* Convert a surrogate pair in "hi" and "lo" to a single Unicode |
480
|
|
|
|
|
|
|
value. The return value is the Unicode value. If the return value |
481
|
|
|
|
|
|
|
is negative, an error has occurred. If "hi" and "lo" do not form a |
482
|
|
|
|
|
|
|
surrogate pair, the error value UNICODE_NOT_SURROGATE_PAIR is |
483
|
|
|
|
|
|
|
returned. |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */ |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
int32_t |
488
|
0
|
|
|
|
|
|
surrogates_to_unicode (int32_t hi, int32_t lo) |
489
|
|
|
|
|
|
|
{ |
490
|
|
|
|
|
|
|
int32_t u; |
491
|
0
|
0
|
|
|
|
|
if (hi < UNI_SUR_HIGH_START || hi > UNI_SUR_HIGH_END || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
492
|
0
|
0
|
|
|
|
|
lo < UNI_SUR_LOW_START || lo > UNI_SUR_LOW_END) { |
493
|
0
|
|
|
|
|
|
return UNICODE_NOT_SURROGATE_PAIR; |
494
|
|
|
|
|
|
|
} |
495
|
0
|
|
|
|
|
|
u = ((hi - UNI_SUR_HIGH_START) << TEN_BITS) |
496
|
0
|
|
|
|
|
|
+ (lo - UNI_SUR_LOW_START) + HALF_BASE; |
497
|
0
|
|
|
|
|
|
return u; |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
#undef UNI_SUR_HIGH_START |
501
|
|
|
|
|
|
|
#undef UNI_SUR_HIGH_END |
502
|
|
|
|
|
|
|
#undef UNI_SUR_LOW_START |
503
|
|
|
|
|
|
|
#undef UNI_SUR_LOW_END |
504
|
|
|
|
|
|
|
#undef TEN_BITS |
505
|
|
|
|
|
|
|
#undef HALF_BASE |
506
|
|
|
|
|
|
|
#undef LOW_TEN_BITS |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
/* Write the UTF-8 for the surrogate pair "hi" and "lo" into "utf8".
   The pair is first joined with "surrogates_to_unicode", and if that
   fails its error is returned. Otherwise the result is handed to
   "ucs2_to_utf8", whose return value is returned. "utf8" is under the
   same restriction as for "ucs2_to_utf8". */

int32_t
surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t * utf8)
{
    const int32_t code_point = surrogates_to_unicode (hi, lo);

    return code_point < 0 ? code_point : ucs2_to_utf8 (code_point, utf8);
}
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
/* Given a nul-terminated string "utf8" and a number of Unicode |
525
|
|
|
|
|
|
|
characters "n_chars", return the number of bytes into "utf8" at |
526
|
|
|
|
|
|
|
which the end of the characters occurs. A negative value indicates |
527
|
|
|
|
|
|
|
some kind of error. If "utf8" contains a zero byte, the return |
528
|
|
|
|
|
|
|
value is UNICODE_EMPTY_INPUT. This may also return any of the error |
529
|
|
|
|
|
|
|
values of "utf8_to_ucs2". */ |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
int32_t |
532
|
0
|
|
|
|
|
|
unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars) |
533
|
|
|
|
|
|
|
{ |
534
|
|
|
|
|
|
|
int32_t i; |
535
|
0
|
|
|
|
|
|
const uint8_t * p = utf8; |
536
|
0
|
|
|
|
|
|
int32_t len = strlen ((const char *) utf8); |
537
|
0
|
0
|
|
|
|
|
if (len == 0 && n_chars != 0) { |
|
|
0
|
|
|
|
|
|
538
|
0
|
|
|
|
|
|
return UNICODE_EMPTY_INPUT; |
539
|
|
|
|
|
|
|
} |
540
|
0
|
0
|
|
|
|
|
for (i = 0; i < n_chars; i++) { |
541
|
0
|
|
|
|
|
|
int32_t ucs2 = utf8_to_ucs2 (p, & p); |
542
|
0
|
0
|
|
|
|
|
if (ucs2 < 0) { |
543
|
0
|
|
|
|
|
|
return ucs2; |
544
|
|
|
|
|
|
|
} |
545
|
|
|
|
|
|
|
} |
546
|
0
|
|
|
|
|
|
return p - utf8; |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
/* Like unicode_count_chars, but without error checks or validation of |
550
|
|
|
|
|
|
|
the input. This only checks the first byte of each UTF-8 sequence, |
551
|
|
|
|
|
|
|
then jumps over the succeeding bytes. It may return |
552
|
|
|
|
|
|
|
UTF8_BAD_LEADING_BYTE if the first byte is invalid. */ |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
int32_t |
555
|
0
|
|
|
|
|
|
unicode_count_chars_fast (const uint8_t * utf8) |
556
|
|
|
|
|
|
|
{ |
557
|
|
|
|
|
|
|
int32_t chars; |
558
|
|
|
|
|
|
|
const uint8_t * p; |
559
|
0
|
|
|
|
|
|
chars = 0; |
560
|
0
|
|
|
|
|
|
p = utf8; |
561
|
0
|
0
|
|
|
|
|
while (*p) { |
562
|
|
|
|
|
|
|
int32_t len; |
563
|
0
|
|
|
|
|
|
len = utf8_sequence_len[*p]; |
564
|
0
|
0
|
|
|
|
|
if (len == 0) { |
565
|
|
|
|
|
|
|
/* The first byte of a UTF-8 sequence is bad, so return |
566
|
|
|
|
|
|
|
this, not BAD_UTF8. */ |
567
|
0
|
|
|
|
|
|
return UTF8_BAD_LEADING_BYTE; |
568
|
|
|
|
|
|
|
} |
569
|
0
|
|
|
|
|
|
p += len; |
570
|
0
|
|
|
|
|
|
chars++; |
571
|
|
|
|
|
|
|
} |
572
|
0
|
|
|
|
|
|
return chars; |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
/* Given a nul-terminated string "utf8", return the total number of
   Unicode characters it contains. An empty string contains zero
   characters.

   Return value

   If an error occurs, this returns the error from "utf8_to_ucs2",
   which may be UTF8_BAD_LEADING_BYTE or any of its other errors. */

int32_t
unicode_count_chars (const uint8_t * utf8)
{
    int32_t chars = 0;
    const uint8_t * p = utf8;

    /* Stop at the nul terminator. "utf8_to_ucs2" moves "p" to the
       start of the next character each time it succeeds, so the
       length of the string does not need to be measured first. */
    while (*p != '\0') {
        int32_t ucs2 = utf8_to_ucs2 (p, & p);
        if (ucs2 < 0) {
            /* Return the error from utf8_to_ucs2. */
            return ucs2;
        }
        chars++;
    }
    return chars;
}
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
#ifdef HEADER |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
/* These are intended for use in switch statements, for example |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
switch (c) { |
614
|
|
|
|
|
|
|
case BYTE_80_8F: |
615
|
|
|
|
|
|
|
do_something; |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
They originally come from the Json3 project. */ |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
/* Case labels for the bytes 0x80 to 0x8F. */
#define BYTE_80_8F \
         0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: \
    case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F
623
|
|
|
|
|
|
|
/* Case labels for the bytes 0x80 to 0x9F. */
#define BYTE_80_9F \
         0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: \
    case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F: \
    case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: \
    case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F
629
|
|
|
|
|
|
|
/* Case labels for the bytes 0x80 to 0xBF. */
#define BYTE_80_BF \
         0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: \
    case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F: \
    case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: \
    case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F: \
    case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: \
    case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: \
    case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
    case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
640
|
|
|
|
|
|
|
/* Case labels for the bytes 0x80 to 0x8F and 0xB0 to 0xBF. */
#define BYTE_80_8F_B0_BF \
         0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: \
    case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F: \
    case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
    case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
647
|
|
|
|
|
|
|
/* Case labels for the bytes 0x80 to 0xB6 and 0xB8 to 0xBF, that is,
   0x80 to 0xBF without 0xB7. */
#define BYTE_80_B6_B8_BF \
         0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: \
    case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F: \
    case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: \
    case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F: \
    case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: \
    case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: \
    case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \
    case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
658
|
|
|
|
|
|
|
/* Case-label list for the bytes 0x80-0xBD. Write it as
   "case BYTE_80_BD:"; the expansion starts without "case" and ends
   without ":". */
#define BYTE_80_BD \
      0x80: case 0x81: case 0x82: case 0x83: \
 case 0x84: case 0x85: case 0x86: case 0x87: \
 case 0x88: case 0x89: case 0x8A: case 0x8B: \
 case 0x8C: case 0x8D: case 0x8E: case 0x8F: \
 case 0x90: case 0x91: case 0x92: case 0x93: \
 case 0x94: case 0x95: case 0x96: case 0x97: \
 case 0x98: case 0x99: case 0x9A: case 0x9B: \
 case 0x9C: case 0x9D: case 0x9E: case 0x9F: \
 case 0xA0: case 0xA1: case 0xA2: case 0xA3: \
 case 0xA4: case 0xA5: case 0xA6: case 0xA7: \
 case 0xA8: case 0xA9: case 0xAA: case 0xAB: \
 case 0xAC: case 0xAD: case 0xAE: case 0xAF: \
 case 0xB0: case 0xB1: case 0xB2: case 0xB3: \
 case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
 case 0xB8: case 0xB9: case 0xBA: case 0xBB: \
 case 0xBC: case 0xBD
668
|
|
|
|
|
|
|
/* Case-label list for the bytes 0x90-0xBF. Write it as
   "case BYTE_90_BF:"; the expansion starts without "case" and ends
   without ":". */
#define BYTE_90_BF \
      0x90: case 0x91: case 0x92: case 0x93: \
 case 0x94: case 0x95: case 0x96: case 0x97: \
 case 0x98: case 0x99: case 0x9A: case 0x9B: \
 case 0x9C: case 0x9D: case 0x9E: case 0x9F: \
 case 0xA0: case 0xA1: case 0xA2: case 0xA3: \
 case 0xA4: case 0xA5: case 0xA6: case 0xA7: \
 case 0xA8: case 0xA9: case 0xAA: case 0xAB: \
 case 0xAC: case 0xAD: case 0xAE: case 0xAF: \
 case 0xB0: case 0xB1: case 0xB2: case 0xB3: \
 case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
 case 0xB8: case 0xB9: case 0xBA: case 0xBB: \
 case 0xBC: case 0xBD: case 0xBE: case 0xBF
676
|
|
|
|
|
|
|
/* Case-label list for the bytes 0xA0-0xBF. Write it as
   "case BYTE_A0_BF:"; the expansion starts without "case" and ends
   without ":". */
#define BYTE_A0_BF \
      0xA0: case 0xA1: case 0xA2: case 0xA3: \
 case 0xA4: case 0xA5: case 0xA6: case 0xA7: \
 case 0xA8: case 0xA9: case 0xAA: case 0xAB: \
 case 0xAC: case 0xAD: case 0xAE: case 0xAF: \
 case 0xB0: case 0xB1: case 0xB2: case 0xB3: \
 case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
 case 0xB8: case 0xB9: case 0xBA: case 0xBB: \
 case 0xBC: case 0xBD: case 0xBE: case 0xBF
682
|
|
|
|
|
|
|
/* Case-label list for the bytes 0xC2-0xDF. Write it as
   "case BYTE_C2_DF:"; the expansion starts without "case" and ends
   without ":". */
#define BYTE_C2_DF \
      0xC2: case 0xC3: case 0xC4: case 0xC5: \
 case 0xC6: case 0xC7: case 0xC8: case 0xC9: \
 case 0xCA: case 0xCB: case 0xCC: case 0xCD: \
 case 0xCE: case 0xCF: case 0xD0: case 0xD1: \
 case 0xD2: case 0xD3: case 0xD4: case 0xD5: \
 case 0xD6: case 0xD7: case 0xD8: case 0xD9: \
 case 0xDA: case 0xDB: case 0xDC: case 0xDD: \
 case 0xDE: case 0xDF
688
|
|
|
|
|
|
|
/* Case-label list for the bytes 0xE1-0xEC. Write it as
   "case BYTE_E1_EC:"; the expansion starts without "case" and ends
   without ":". */
#define BYTE_E1_EC \
      0xE1: case 0xE2: case 0xE3: case 0xE4: \
 case 0xE5: case 0xE6: case 0xE7: case 0xE8: \
 case 0xE9: case 0xEA: case 0xEB: case 0xEC
691
|
|
|
|
|
|
|
/* Case-label list for the bytes 0xF1-0xF3. Write it as
   "case BYTE_F1_F3:"; the expansion starts without "case" and ends
   without ":". */
#define BYTE_F1_F3 0xF1: case 0xF2: case 0xF3
693
|
|
|
|
|
|
|
#endif /* def HEADER */ |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
/* Helper macros for the byte-by-byte UTF-8 validator below. They
   expect an index "i", a byte variable "c" and a byte array "input"
   to be in scope at the point of use. */

/* Step past the byte which has just been accepted. */
#define UNICODEADDBYTE (i++)

/* Give up with UTF8_INVALID. The "want" argument is ignored by this
   definition. */
#define UNICODEFAILUTF8(want) return UTF8_INVALID

/* Load the byte at the current index into "c". The value of the
   expression is that byte, so it can be used as the operand of a
   "switch". */
#define UNICODENEXTBYTE (c = input[i])
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
/* Given "input" and "input_length", validate "input" byte by byte up |
702
|
|
|
|
|
|
|
to "input_length". The return value may be UTF8_VALID or |
703
|
|
|
|
|
|
|
UTF8_INVALID. */ |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
int32_t |
706
|
0
|
|
|
|
|
|
valid_utf8 (const uint8_t * input, int32_t input_length) |
707
|
|
|
|
|
|
|
{ |
708
|
|
|
|
|
|
|
int32_t error; |
709
|
|
|
|
|
|
|
utf8_info_t info; |
710
|
0
|
|
|
|
|
|
error = validate_utf8 (input, input_length, & info); |
711
|
0
|
0
|
|
|
|
|
if (error < 0) { |
712
|
0
|
|
|
|
|
|
return UTF8_INVALID; |
713
|
|
|
|
|
|
|
} |
714
|
0
|
|
|
|
|
|
return UTF8_VALID; |
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
/* Abandon validation: store the current byte offset "i" in
   "info->len_read" and return the error code "x" from the enclosing
   function. Both "info" and "i" must be in scope. The do/while (0)
   wrapper makes the two statements behave as one, so the macro is
   safe as the body of an unbraced "if". */
#define FAIL(x)                         \
    do {                                \
        info->len_read = i;             \
        return (x);                     \
    } while (0)
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
#ifdef HEADER |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
/* Progress report filled in by "validate_utf8". */
typedef struct utf8_info {
    /* Number of bytes processed. */
    int32_t len_read;
    /* Number of Unicode code points read. */
    int32_t runes_read;
} utf8_info_t;
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
#endif /* def HEADER */ |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
/* Given "input" and "len", validate "input" byte by byte up to |
733
|
|
|
|
|
|
|
"len". The return value is "UNICODE_OK" (zero) on success or the |
734
|
|
|
|
|
|
|
error found (a negative number) on failure. |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
utf8_info_t is defined in "unicode.h". |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
The value of "info.len_read" is the number of bytes processed. the |
739
|
|
|
|
|
|
|
value of "info.runes_read" is the number of Unicode code points in |
740
|
|
|
|
|
|
|
the input. */ |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
int32_t |
743
|
0
|
|
|
|
|
|
validate_utf8 (const uint8_t * input, int32_t len, utf8_info_t * info) |
744
|
|
|
|
|
|
|
{ |
745
|
|
|
|
|
|
|
int32_t i; |
746
|
|
|
|
|
|
|
uint8_t c; |
747
|
|
|
|
|
|
|
|
748
|
0
|
|
|
|
|
|
info->len_read = 0; |
749
|
|
|
|
|
|
|
/* We want to increment the runes after "string_start", but that |
750
|
|
|
|
|
|
|
would give us one too many. */ |
751
|
0
|
|
|
|
|
|
info->runes_read = -1; |
752
|
0
|
|
|
|
|
|
i = 0; |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
string_start: |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
/* We get here after successfully reading a "rune". */ |
757
|
|
|
|
|
|
|
|
758
|
0
|
|
|
|
|
|
info->runes_read++; |
759
|
0
|
0
|
|
|
|
|
if (i >= len) { |
760
|
0
|
|
|
|
|
|
info->len_read = len; |
761
|
0
|
|
|
|
|
|
return UNICODE_OK; /* 0 */ |
762
|
|
|
|
|
|
|
} |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
/* Set c separately here since we use a range comparison before |
765
|
|
|
|
|
|
|
the switch statement. */ |
766
|
|
|
|
|
|
|
|
767
|
0
|
|
|
|
|
|
c = input[i]; |
768
|
|
|
|
|
|
|
|
769
|
0
|
0
|
|
|
|
|
if (c == 0) { |
770
|
0
|
|
|
|
|
|
FAIL (UNICODE_EMPTY_INPUT); |
771
|
|
|
|
|
|
|
} |
772
|
|
|
|
|
|
|
/* Admit all bytes < 0x80. */ |
773
|
0
|
0
|
|
|
|
|
if (c < 0x80) { |
774
|
0
|
|
|
|
|
|
i++; |
775
|
0
|
|
|
|
|
|
goto string_start; |
776
|
|
|
|
|
|
|
} |
777
|
0
|
|
|
|
|
|
switch (c) { |
778
|
|
|
|
|
|
|
case BYTE_C2_DF: |
779
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
780
|
0
|
|
|
|
|
|
goto byte_last_80_bf; |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
case 0xE0: |
783
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
784
|
0
|
|
|
|
|
|
goto byte23_a0_bf; |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
case BYTE_E1_EC: |
787
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
788
|
0
|
|
|
|
|
|
goto byte_penultimate_80_bf; |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
case 0xED: |
791
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
792
|
0
|
|
|
|
|
|
goto byte23_80_9f; |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
case 0xEE: |
795
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
796
|
0
|
|
|
|
|
|
goto byte_penultimate_80_bf; |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
case 0xEF: |
799
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
800
|
0
|
|
|
|
|
|
goto byte_ef_80_bf; |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
case 0xF0: |
803
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
804
|
0
|
|
|
|
|
|
goto byte24_90_bf; |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
case BYTE_F1_F3: |
807
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
808
|
0
|
|
|
|
|
|
goto byte24_80_bf; |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
case 0xF4: |
811
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
812
|
0
|
|
|
|
|
|
goto byte24_80_8f; |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
default: |
815
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_LEADING_BYTE); |
816
|
|
|
|
|
|
|
} |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
byte_last_80_bf: |
819
|
|
|
|
|
|
|
|
820
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
821
|
|
|
|
|
|
|
case BYTE_80_BF: |
822
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
823
|
0
|
|
|
|
|
|
goto string_start; |
824
|
|
|
|
|
|
|
default: |
825
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
byte_ef_b7: |
829
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
830
|
|
|
|
|
|
|
case BYTE_80_8F_B0_BF: |
831
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
832
|
0
|
|
|
|
|
|
goto string_start; |
833
|
|
|
|
|
|
|
default: |
834
|
0
|
0
|
|
|
|
|
if (c >= 0x90 && c <= 0xAF) { |
|
|
0
|
|
|
|
|
|
835
|
0
|
|
|
|
|
|
FAIL (UNICODE_NOT_CHARACTER); |
836
|
|
|
|
|
|
|
} |
837
|
|
|
|
|
|
|
else { |
838
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
839
|
|
|
|
|
|
|
} |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
byte_last_80_bd: |
843
|
|
|
|
|
|
|
|
844
|
0
|
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
845
|
|
|
|
|
|
|
case BYTE_80_BD: |
846
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
847
|
0
|
|
|
|
|
|
goto string_start; |
848
|
|
|
|
|
|
|
case 0xBE: |
849
|
|
|
|
|
|
|
case 0xBF: |
850
|
0
|
|
|
|
|
|
FAIL (UNICODE_NOT_CHARACTER); |
851
|
|
|
|
|
|
|
default: |
852
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
853
|
|
|
|
|
|
|
} |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
byte_penultimate_80_bf: |
856
|
|
|
|
|
|
|
|
857
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
858
|
|
|
|
|
|
|
case BYTE_80_BF: |
859
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
860
|
0
|
|
|
|
|
|
goto byte_last_80_bf; |
861
|
|
|
|
|
|
|
default: |
862
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
863
|
|
|
|
|
|
|
} |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
byte_ef_80_bf: |
866
|
0
|
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
867
|
|
|
|
|
|
|
case BYTE_80_B6_B8_BF: |
868
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
869
|
0
|
|
|
|
|
|
goto byte_last_80_bd; |
870
|
|
|
|
|
|
|
case 0xB7: |
871
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
872
|
|
|
|
|
|
|
/* FDD0 - FDE7 */ |
873
|
0
|
|
|
|
|
|
goto byte_ef_b7; |
874
|
|
|
|
|
|
|
default: |
875
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
876
|
|
|
|
|
|
|
} |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
byte24_90_bf: |
879
|
|
|
|
|
|
|
|
880
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
881
|
|
|
|
|
|
|
case BYTE_90_BF: |
882
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
883
|
0
|
|
|
|
|
|
goto byte_penultimate_80_bf; |
884
|
|
|
|
|
|
|
default: |
885
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
886
|
|
|
|
|
|
|
} |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
byte23_80_9f: |
889
|
|
|
|
|
|
|
|
890
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
891
|
|
|
|
|
|
|
case BYTE_80_9F: |
892
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
893
|
0
|
|
|
|
|
|
goto byte_last_80_bf; |
894
|
|
|
|
|
|
|
default: |
895
|
0
|
0
|
|
|
|
|
if (c >= 0xA0 && c <= 0xBF) { |
|
|
0
|
|
|
|
|
|
896
|
0
|
|
|
|
|
|
FAIL (UNICODE_SURROGATE_PAIR); |
897
|
|
|
|
|
|
|
} |
898
|
|
|
|
|
|
|
else { |
899
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
900
|
|
|
|
|
|
|
} |
901
|
|
|
|
|
|
|
} |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
byte23_a0_bf: |
904
|
|
|
|
|
|
|
|
905
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
906
|
|
|
|
|
|
|
case BYTE_A0_BF: |
907
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
908
|
0
|
|
|
|
|
|
goto byte_last_80_bf; |
909
|
|
|
|
|
|
|
default: |
910
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
911
|
|
|
|
|
|
|
} |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
byte24_80_bf: |
914
|
|
|
|
|
|
|
|
915
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
916
|
|
|
|
|
|
|
case BYTE_80_BF: |
917
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
918
|
0
|
|
|
|
|
|
goto byte_ef_80_bf; |
919
|
|
|
|
|
|
|
default: |
920
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
921
|
|
|
|
|
|
|
} |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
byte24_80_8f: |
924
|
|
|
|
|
|
|
|
925
|
0
|
0
|
|
|
|
|
switch (UNICODENEXTBYTE) { |
926
|
|
|
|
|
|
|
case BYTE_80_8F: |
927
|
0
|
|
|
|
|
|
UNICODEADDBYTE; |
928
|
0
|
|
|
|
|
|
goto byte_ef_80_bf; |
929
|
|
|
|
|
|
|
default: |
930
|
0
|
0
|
|
|
|
|
if (c >= 0x90) { |
931
|
0
|
|
|
|
|
|
FAIL (UNICODE_TOO_BIG); |
932
|
|
|
|
|
|
|
} |
933
|
|
|
|
|
|
|
else { |
934
|
0
|
|
|
|
|
|
FAIL (UTF8_BAD_CONTINUATION_BYTE); |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
} |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
/* Return UNICODE_NOT_CHARACTER from the enclosing function if the
   byte "c" is 0xFE or 0xFF. The argument is evaluated up to two
   times. The do/while (0) wrapper makes the macro behave as a single
   statement, so it is safe before an "else". */
#define REJECT_FE_FF(c)                                 \
    do {                                                \
        if ((c) == 0xFF || (c) == 0xFE) {               \
            return UNICODE_NOT_CHARACTER;               \
        }                                               \
    } while (0)
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
/* Make "* ptr" point to the start of the first UTF-8 character after |
945
|
|
|
|
|
|
|
its initial value. This assumes that there are at least four bytes |
946
|
|
|
|
|
|
|
which can be read, and that "* ptr" points to valid UTF-8. |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
If "** ptr" does not have its top bit set, 00xx_xxxx, this does not |
949
|
|
|
|
|
|
|
change the value of "* ptr", and it returns UNICODE_OK. If "** ptr" |
950
|
|
|
|
|
|
|
has its top two bits set, 11xx_xxxx, this does not change the value |
951
|
|
|
|
|
|
|
of "* ptr" and it returns UNICODE_OK. If "**ptr" has its top bit |
952
|
|
|
|
|
|
|
set but its second-to-top bit unset, 10xx_xxxx, so it is the |
953
|
|
|
|
|
|
|
second, third, or fourth byte of a multibyte sequence, "* ptr" is |
954
|
|
|
|
|
|
|
incremented until either "** ptr" is a valid first byte of a UTF-8 |
955
|
|
|
|
|
|
|
sequence, or too many bytes have passed for it to be valid |
956
|
|
|
|
|
|
|
UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is |
957
|
|
|
|
|
|
|
returned and "*ptr" is left unchanged. |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
If a valid UTF-8 first byte was found, either 11xx_xxxx or |
960
|
|
|
|
|
|
|
00xx_xxxx, UNICODE_OK is returned, and "*ptr" is set to the address |
961
|
|
|
|
|
|
|
of the valid byte. Nul bytes (bytes containing zero) are considered |
962
|
|
|
|
|
|
|
valid. |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
If any of the bytes read contains invalid UTF-8 bytes 0xFE and |
965
|
|
|
|
|
|
|
0xFF, the error code UNICODE_NOT_CHARACTER is returned and "*ptr" |
966
|
|
|
|
|
|
|
is left unchanged. */ |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
int32_t |
969
|
0
|
|
|
|
|
|
trim_to_utf8_start (const uint8_t ** ptr) |
970
|
|
|
|
|
|
|
{ |
971
|
0
|
|
|
|
|
|
const uint8_t * p = *ptr; |
972
|
|
|
|
|
|
|
uint8_t c; |
973
|
|
|
|
|
|
|
int32_t i; |
974
|
|
|
|
|
|
|
|
975
|
0
|
|
|
|
|
|
c = * p; |
976
|
0
|
0
|
|
|
|
|
REJECT_FE_FF (c); |
|
|
0
|
|
|
|
|
|
977
|
|
|
|
|
|
|
/* 0xC0 = 1100_0000. */ |
978
|
0
|
|
|
|
|
|
c &= 0xC0; |
979
|
0
|
0
|
|
|
|
|
if (c == 0xC0 || c == 0x00) { |
|
|
0
|
|
|
|
|
|
980
|
0
|
|
|
|
|
|
return UNICODE_OK; |
981
|
|
|
|
|
|
|
} |
982
|
0
|
0
|
|
|
|
|
for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) { |
983
|
0
|
|
|
|
|
|
c = p[i]; |
984
|
0
|
0
|
|
|
|
|
REJECT_FE_FF (c); |
|
|
0
|
|
|
|
|
|
985
|
0
|
0
|
|
|
|
|
if ((c & 0x80) != 0x80 || (c & 0x40) != 0) { |
|
|
0
|
|
|
|
|
|
986
|
0
|
|
|
|
|
|
* ptr = p + i; |
987
|
0
|
|
|
|
|
|
return UNICODE_OK; |
988
|
|
|
|
|
|
|
} |
989
|
|
|
|
|
|
|
} |
990
|
0
|
|
|
|
|
|
return UTF8_BAD_CONTINUATION_BYTE; |
991
|
|
|
|
|
|
|
} |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
/* Given a return value "code" which is negative or zero, return a |
994
|
|
|
|
|
|
|
string which describes what the return value means. Positive |
995
|
|
|
|
|
|
|
non-zero return values never indicate errors or statuses in this |
996
|
|
|
|
|
|
|
library. Unknown error codes result in a default string being |
997
|
|
|
|
|
|
|
returned. */ |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
const char * |
1000
|
0
|
|
|
|
|
|
unicode_code_to_error (int32_t code) |
1001
|
|
|
|
|
|
|
{ |
1002
|
0
|
|
|
|
|
|
switch (code) { |
1003
|
|
|
|
|
|
|
case UTF8_BAD_LEADING_BYTE: |
1004
|
0
|
|
|
|
|
|
return "The leading byte of a UTF-8 sequence was invalid"; |
1005
|
|
|
|
|
|
|
case UTF8_BAD_CONTINUATION_BYTE: |
1006
|
0
|
|
|
|
|
|
return "A continuation byte of a UTF-8 sequence was invalid"; |
1007
|
|
|
|
|
|
|
case UNICODE_SURROGATE_PAIR: |
1008
|
0
|
|
|
|
|
|
return "A surrogate pair code point could not be converted to UTF-8"; |
1009
|
|
|
|
|
|
|
case UNICODE_NOT_SURROGATE_PAIR: |
1010
|
0
|
|
|
|
|
|
return "Input code points did not form a surrogate pair"; |
1011
|
|
|
|
|
|
|
case UNICODE_OK: |
1012
|
0
|
|
|
|
|
|
return "Successful completion"; |
1013
|
|
|
|
|
|
|
case UNICODE_TOO_BIG: |
1014
|
0
|
|
|
|
|
|
return "A code point was beyond limits"; |
1015
|
|
|
|
|
|
|
case UNICODE_NOT_CHARACTER: |
1016
|
0
|
|
|
|
|
|
return "A number ending in hex FFFF or FFFE is not valid Unicode"; |
1017
|
|
|
|
|
|
|
case UTF8_NON_SHORTEST: |
1018
|
0
|
|
|
|
|
|
return "A UTF-8 input was not in the shortest form"; |
1019
|
|
|
|
|
|
|
case UNICODE_EMPTY_INPUT: |
1020
|
0
|
|
|
|
|
|
return "A byte with value zero was found in UTF-8 input"; |
1021
|
|
|
|
|
|
|
default: |
1022
|
0
|
|
|
|
|
|
return "Unknown/invalid error code"; |
1023
|
|
|
|
|
|
|
} |
1024
|
|
|
|
|
|
|
} |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
/* _____ _ |
1027
|
|
|
|
|
|
|
|_ _|__ ___| |_ ___ |
1028
|
|
|
|
|
|
|
| |/ _ \/ __| __/ __| |
1029
|
|
|
|
|
|
|
| | __/\__ \ |_\__ \ |
1030
|
|
|
|
|
|
|
|_|\___||___/\__|___/ |
1031
|
|
|
|
|
|
|
*/ |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
/* Below this is code for testing which is not normally compiled. Use |
1034
|
|
|
|
|
|
|
"make test" to compile the testing version. */ |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
#ifdef TEST |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
#include |
1039
|
|
|
|
|
|
|
#include |
1040
|
|
|
|
|
|
|
#include "c-tap-test.h" |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
/* Sample text shared by the tests. The counting tests below expect it
   to contain seven characters. */
static const uint8_t * utf8 = (const uint8_t *) "漢数字ÔÕÖX";

/* Six 0x99 bytes followed by a nul, used by the tests as input which
   is not UTF-8. */
static const uint8_t bad[] = {
    0x99, 0x99, 0x99, 0x99, 0x99, 0x99,
    0x0
};

/* Size in bytes of the output buffer in "test_ucs2_to_utf8". */
#define BUFFSIZE 0x100
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
static void test_ucs2_to_utf8 () |
1048
|
|
|
|
|
|
|
{ |
1049
|
|
|
|
|
|
|
/* Buffer to print utf8 out into. */ |
1050
|
|
|
|
|
|
|
uint8_t buffer[BUFFSIZE]; |
1051
|
|
|
|
|
|
|
/* Offset into buffer. */ |
1052
|
|
|
|
|
|
|
uint8_t * offset; |
1053
|
|
|
|
|
|
|
const uint8_t * start = utf8; |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
offset = buffer; |
1056
|
|
|
|
|
|
|
while (1) { |
1057
|
|
|
|
|
|
|
int32_t unicode; |
1058
|
|
|
|
|
|
|
int32_t bytes; |
1059
|
|
|
|
|
|
|
const uint8_t * end; |
1060
|
|
|
|
|
|
|
unicode = utf8_to_ucs2 (start, & end); |
1061
|
|
|
|
|
|
|
if (unicode == UNICODE_EMPTY_INPUT) { |
1062
|
|
|
|
|
|
|
break; |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
if (unicode < 0) { |
1065
|
|
|
|
|
|
|
fprintf (stderr, |
1066
|
|
|
|
|
|
|
"%s:%d: unexpected error %s converting unicode.\n", |
1067
|
|
|
|
|
|
|
__FILE__, __LINE__, unicode_code_to_error (unicode)); |
1068
|
|
|
|
|
|
|
// exit ok in test |
1069
|
|
|
|
|
|
|
exit (EXIT_FAILURE); |
1070
|
|
|
|
|
|
|
} |
1071
|
|
|
|
|
|
|
bytes = ucs2_to_utf8 (unicode, offset); |
1072
|
|
|
|
|
|
|
TAP_TEST_MSG (bytes > 0, "no bad conversion"); |
1073
|
|
|
|
|
|
|
TAP_TEST_MSG (strncmp ((const char *) offset, |
1074
|
|
|
|
|
|
|
(const char *) start, bytes) == 0, |
1075
|
|
|
|
|
|
|
"round trip OK for %X (%d bytes)", unicode, bytes); |
1076
|
|
|
|
|
|
|
start = end; |
1077
|
|
|
|
|
|
|
offset += bytes; |
1078
|
|
|
|
|
|
|
if (offset - buffer >= BUFFSIZE) { |
1079
|
|
|
|
|
|
|
fprintf (stderr, "%s:%d: out of space in buffer.\n", |
1080
|
|
|
|
|
|
|
__FILE__, __LINE__); |
1081
|
|
|
|
|
|
|
// exit ok |
1082
|
|
|
|
|
|
|
exit (EXIT_FAILURE); |
1083
|
|
|
|
|
|
|
} |
1084
|
|
|
|
|
|
|
} |
1085
|
|
|
|
|
|
|
* offset = '\0'; |
1086
|
|
|
|
|
|
|
TAP_TEST_MSG (strcmp ((const char *) buffer, (const char *) utf8) == 0, |
1087
|
|
|
|
|
|
|
"input %s resulted in identical output %s", |
1088
|
|
|
|
|
|
|
utf8, buffer); |
1089
|
|
|
|
|
|
|
} |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
static void |
1092
|
|
|
|
|
|
|
test_invalid_utf8 () |
1093
|
|
|
|
|
|
|
{ |
1094
|
|
|
|
|
|
|
uint8_t invalid_utf8[UTF8_MAX_LENGTH]; |
1095
|
|
|
|
|
|
|
int32_t unicode; |
1096
|
|
|
|
|
|
|
int32_t valid; |
1097
|
|
|
|
|
|
|
const uint8_t * end; |
1098
|
|
|
|
|
|
|
snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1, |
1099
|
|
|
|
|
|
|
"%c%c%c", 0xe8, 0xe4, 0xe5); |
1100
|
|
|
|
|
|
|
unicode = utf8_to_ucs2 (invalid_utf8, & end); |
1101
|
|
|
|
|
|
|
TAP_TEST_MSG (unicode == UTF8_BAD_CONTINUATION_BYTE, |
1102
|
|
|
|
|
|
|
"invalid UTF-8 gives incorrect result"); |
1103
|
|
|
|
|
|
|
valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8)); |
1104
|
|
|
|
|
|
|
TAP_TEST_MSG (valid == UTF8_INVALID, "Invalid UTF-8 fails valid_utf8"); |
1105
|
|
|
|
|
|
|
} |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
static void |
1108
|
|
|
|
|
|
|
test_surrogate_pairs () |
1109
|
|
|
|
|
|
|
{ |
1110
|
|
|
|
|
|
|
int32_t status; |
1111
|
|
|
|
|
|
|
int32_t hi; |
1112
|
|
|
|
|
|
|
int32_t lo; |
1113
|
|
|
|
|
|
|
int32_t rt; |
1114
|
|
|
|
|
|
|
/* This is the wide character space, which does not require |
1115
|
|
|
|
|
|
|
representation as a surrogate pair. */ |
1116
|
|
|
|
|
|
|
int32_t nogood = 0x3000; |
1117
|
|
|
|
|
|
|
/* |
1118
|
|
|
|
|
|
|
Two examples from the Wikipedia article on UTF-16 |
1119
|
|
|
|
|
|
|
https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. */ |
1120
|
|
|
|
|
|
|
int32_t wikipedia_1 = 0x10437; |
1121
|
|
|
|
|
|
|
int32_t wikipedia_2 = 0x24b62; |
1122
|
|
|
|
|
|
|
/* |
1123
|
|
|
|
|
|
|
An example from the JSON RFC |
1124
|
|
|
|
|
|
|
http://rfc7159.net/rfc7159#rfc.section.7 |
1125
|
|
|
|
|
|
|
*/ |
1126
|
|
|
|
|
|
|
int32_t json_spec = 0x1D11E; |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
status = unicode_to_surrogates (nogood, & hi, & lo); |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UNICODE_NOT_SURROGATE_PAIR, |
1131
|
|
|
|
|
|
|
"low value to surrogate pair breaker returns error"); |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
status = unicode_to_surrogates (wikipedia_1, & hi, & lo); |
1134
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1); |
1135
|
|
|
|
|
|
|
TAP_TEST_MSG (hi == 0xD801, "Got expected %X == 0xD801", hi); |
1136
|
|
|
|
|
|
|
TAP_TEST_MSG (lo == 0xDC37, "Got expected %X == 0xDC37", lo); |
1137
|
|
|
|
|
|
|
rt = surrogates_to_unicode (hi, lo); |
1138
|
|
|
|
|
|
|
TAP_TEST_MSG (rt == wikipedia_1, "Round trip %X == initial %X", |
1139
|
|
|
|
|
|
|
rt, wikipedia_1); |
1140
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
status = unicode_to_surrogates (wikipedia_2, & hi, & lo); |
1142
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1); |
1143
|
|
|
|
|
|
|
TAP_TEST_MSG (hi == 0xD852, "Got expected %X == 0xD852", hi); |
1144
|
|
|
|
|
|
|
TAP_TEST_MSG (lo == 0xDF62, "Got expected %X == 0xDF62", lo); |
1145
|
|
|
|
|
|
|
rt = surrogates_to_unicode (hi, lo); |
1146
|
|
|
|
|
|
|
TAP_TEST_MSG (rt == wikipedia_2, "Round trip %X == initial %X", |
1147
|
|
|
|
|
|
|
rt, wikipedia_2); |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
status = unicode_to_surrogates (json_spec, & hi, & lo); |
1150
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", json_spec); |
1151
|
|
|
|
|
|
|
TAP_TEST_MSG (hi == 0xD834, "Got expected %X == 0xD834", hi); |
1152
|
|
|
|
|
|
|
TAP_TEST_MSG (lo == 0xDd1e, "Got expected %X == 0xDD1e", lo); |
1153
|
|
|
|
|
|
|
rt = surrogates_to_unicode (hi, lo); |
1154
|
|
|
|
|
|
|
TAP_TEST_MSG (rt == json_spec, "Round trip %X == initial %X", |
1155
|
|
|
|
|
|
|
rt, json_spec); |
1156
|
|
|
|
|
|
|
} |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
/* Test sending various bytes into "utf8_bytes" and seeing whether the |
1159
|
|
|
|
|
|
|
return value is what we expected. */ |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
static void |
1162
|
|
|
|
|
|
|
test_utf8_bytes () |
1163
|
|
|
|
|
|
|
{ |
1164
|
|
|
|
|
|
|
struct tub { |
1165
|
|
|
|
|
|
|
int32_t first; |
1166
|
|
|
|
|
|
|
int32_t expect; |
1167
|
|
|
|
|
|
|
} tests[] = { |
1168
|
|
|
|
|
|
|
{'a', 1}, |
1169
|
|
|
|
|
|
|
{0xb0, UTF8_BAD_LEADING_BYTE}, |
1170
|
|
|
|
|
|
|
{0xc2, 2}, |
1171
|
|
|
|
|
|
|
{0xff, UTF8_BAD_LEADING_BYTE}, |
1172
|
|
|
|
|
|
|
}; |
1173
|
|
|
|
|
|
|
int32_t n_tests = sizeof (tests) / sizeof (struct tub); |
1174
|
|
|
|
|
|
|
int32_t i; |
1175
|
|
|
|
|
|
|
for (i = 0; i < n_tests; i++) { |
1176
|
|
|
|
|
|
|
/* Expected bytes. */ |
1177
|
|
|
|
|
|
|
int32_t xbytes; |
1178
|
|
|
|
|
|
|
int32_t firstbyte; |
1179
|
|
|
|
|
|
|
firstbyte = tests[i].first; |
1180
|
|
|
|
|
|
|
xbytes = utf8_bytes (firstbyte); |
1181
|
|
|
|
|
|
|
TAP_TEST_MSG (xbytes == tests[i].expect, "Got %d (%d) with input %d", |
1182
|
|
|
|
|
|
|
xbytes, tests[i].expect, firstbyte); |
1183
|
|
|
|
|
|
|
} |
1184
|
|
|
|
|
|
|
} |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
/* Test the conversion from utf-8 to ucs-2 (UTF-16). */ |
1187
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
static void |
1189
|
|
|
|
|
|
|
test_utf8_to_ucs2 () |
1190
|
|
|
|
|
|
|
{ |
1191
|
|
|
|
|
|
|
const uint8_t * start = utf8; |
1192
|
|
|
|
|
|
|
while (*start) { |
1193
|
|
|
|
|
|
|
int32_t unicode; |
1194
|
|
|
|
|
|
|
const uint8_t * end; |
1195
|
|
|
|
|
|
|
unicode = utf8_to_ucs2 (start, & end); |
1196
|
|
|
|
|
|
|
TAP_TEST_MSG (unicode > 0, "no bad value at %s", start); |
1197
|
|
|
|
|
|
|
printf ("# %s is %04X, length is %d\n", |
1198
|
|
|
|
|
|
|
start, unicode, (int) (end - start)); |
1199
|
|
|
|
|
|
|
start = end; |
1200
|
|
|
|
|
|
|
} |
1201
|
|
|
|
|
|
|
} |
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
/* Test counting of unicode characters. */ |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
static void |
1206
|
|
|
|
|
|
|
test_unicode_count_chars () |
1207
|
|
|
|
|
|
|
{ |
1208
|
|
|
|
|
|
|
int32_t cc; |
1209
|
|
|
|
|
|
|
cc = unicode_count_chars (utf8); |
1210
|
|
|
|
|
|
|
TAP_TEST_MSG (cc == 7, "unicode_count_chars gets seven characters for utf8"); |
1211
|
|
|
|
|
|
|
cc = unicode_count_chars_fast (utf8); |
1212
|
|
|
|
|
|
|
TAP_TEST_MSG (cc == 7, "unicode_count_chars_fast gets seven characters for utf8"); |
1213
|
|
|
|
|
|
|
} |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
static void |
1216
|
|
|
|
|
|
|
test_valid_utf8 () |
1217
|
|
|
|
|
|
|
{ |
1218
|
|
|
|
|
|
|
int32_t valid; |
1219
|
|
|
|
|
|
|
valid = valid_utf8 (utf8, strlen ((const char *) utf8)); |
1220
|
|
|
|
|
|
|
TAP_TEST_MSG (valid == UTF8_VALID, "Valid UTF-8 passes valid_utf8"); |
1221
|
|
|
|
|
|
|
} |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
static void |
1224
|
|
|
|
|
|
|
test_trim_to_utf8_start () |
1225
|
|
|
|
|
|
|
{ |
1226
|
|
|
|
|
|
|
int32_t status; |
1227
|
|
|
|
|
|
|
const uint8_t * p; |
1228
|
|
|
|
|
|
|
/* Invalid UTF-8. */ |
1229
|
|
|
|
|
|
|
/* Valid UTF-8. */ |
1230
|
|
|
|
|
|
|
uint8_t good[] = "化苦"; |
1231
|
|
|
|
|
|
|
uint8_t good2[] = "化abc"; |
1232
|
|
|
|
|
|
|
p = bad; |
1233
|
|
|
|
|
|
|
status = trim_to_utf8_start (& p); |
1234
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UTF8_BAD_CONTINUATION_BYTE, |
1235
|
|
|
|
|
|
|
"Non-UTF-8 causes error"); |
1236
|
|
|
|
|
|
|
TAP_TEST_MSG (p == bad, "Did not change pointer"); |
1237
|
|
|
|
|
|
|
p = good + 1; |
1238
|
|
|
|
|
|
|
status = trim_to_utf8_start (& p); |
1239
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result"); |
1240
|
|
|
|
|
|
|
TAP_TEST_MSG (p != good + 1, "Moved p"); |
1241
|
|
|
|
|
|
|
TAP_TEST_MSG (p == good + 3, "Moved p to the right position"); |
1242
|
|
|
|
|
|
|
p = good2 + 1; |
1243
|
|
|
|
|
|
|
status = trim_to_utf8_start (& p); |
1244
|
|
|
|
|
|
|
TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result"); |
1245
|
|
|
|
|
|
|
TAP_TEST_MSG (p != good2 + 1, "Moved p"); |
1246
|
|
|
|
|
|
|
TAP_TEST_MSG (p == good2 + 3, "Moved p to the right position"); |
1247
|
|
|
|
|
|
|
} |
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
static void |
1250
|
|
|
|
|
|
|
test_constants () |
1251
|
|
|
|
|
|
|
{ |
1252
|
|
|
|
|
|
|
TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM); |
1253
|
|
|
|
|
|
|
} |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
static void |
1256
|
|
|
|
|
|
|
test_utf8_validate () |
1257
|
|
|
|
|
|
|
{ |
1258
|
|
|
|
|
|
|
int r; |
1259
|
|
|
|
|
|
|
int l; |
1260
|
|
|
|
|
|
|
utf8_info_t info; |
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
r = validate_utf8 ((const uint8_t *) "", 0, & info); |
1263
|
|
|
|
|
|
|
TAP_TEST_EQUAL (r, UNICODE_OK); |
1264
|
|
|
|
|
|
|
TAP_TEST_EQUAL (info.len_read, 0); |
1265
|
|
|
|
|
|
|
TAP_TEST_EQUAL (info.runes_read, 0); |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
l = strlen ((const char *) utf8); |
1268
|
|
|
|
|
|
|
r = validate_utf8 (utf8, l, & info); |
1269
|
|
|
|
|
|
|
TAP_TEST_EQUAL (r, UNICODE_OK); |
1270
|
|
|
|
|
|
|
TAP_TEST_EQUAL (info.len_read, l); |
1271
|
|
|
|
|
|
|
TAP_TEST_EQUAL (info.runes_read, 7); |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
l = strlen ((const char *) bad); |
1274
|
|
|
|
|
|
|
r = validate_utf8 (bad, l, & info); |
1275
|
|
|
|
|
|
|
TAP_TEST (r != UNICODE_OK); |
1276
|
|
|
|
|
|
|
} |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
int main () |
1279
|
|
|
|
|
|
|
{ |
1280
|
|
|
|
|
|
|
test_utf8_to_ucs2 (); |
1281
|
|
|
|
|
|
|
test_ucs2_to_utf8 (); |
1282
|
|
|
|
|
|
|
test_invalid_utf8 (); |
1283
|
|
|
|
|
|
|
test_unicode_count_chars (); |
1284
|
|
|
|
|
|
|
test_surrogate_pairs (); |
1285
|
|
|
|
|
|
|
test_utf8_bytes (); |
1286
|
|
|
|
|
|
|
test_valid_utf8 (); |
1287
|
|
|
|
|
|
|
test_trim_to_utf8_start (); |
1288
|
|
|
|
|
|
|
test_constants (); |
1289
|
|
|
|
|
|
|
test_utf8_validate (); |
1290
|
|
|
|
|
|
|
TAP_PLAN; |
1291
|
|
|
|
|
|
|
} |
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
#endif /* def TEST */ |