line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* |
2
|
|
|
|
|
|
|
* Copyright 2013 MongoDB, Inc. |
3
|
|
|
|
|
|
|
* |
4
|
|
|
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
5
|
|
|
|
|
|
|
* you may not use this file except in compliance with the License. |
6
|
|
|
|
|
|
|
* You may obtain a copy of the License at |
7
|
|
|
|
|
|
|
* |
8
|
|
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0 |
9
|
|
|
|
|
|
|
* |
10
|
|
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software |
11
|
|
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS, |
12
|
|
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13
|
|
|
|
|
|
|
* See the License for the specific language governing permissions and |
14
|
|
|
|
|
|
|
* limitations under the License. |
15
|
|
|
|
|
|
|
*/ |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
#include |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
#include "bson-memory.h" |
21
|
|
|
|
|
|
|
#include "bson-string.h" |
22
|
|
|
|
|
|
|
#include "bson-utf8.h" |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
/* |
26
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
27
|
|
|
|
|
|
|
* |
28
|
|
|
|
|
|
|
* _bson_utf8_get_sequence -- |
29
|
|
|
|
|
|
|
* |
30
|
|
|
|
|
|
|
* Determine the sequence length of the first UTF-8 character in |
31
|
|
|
|
|
|
|
* @utf8. The sequence length is stored in @seq_length and the mask |
32
|
|
|
|
|
|
|
* for the first character is stored in @first_mask. |
33
|
|
|
|
|
|
|
* |
34
|
|
|
|
|
|
|
* Returns: |
35
|
|
|
|
|
|
|
* None. |
36
|
|
|
|
|
|
|
* |
37
|
|
|
|
|
|
|
* Side effects: |
38
|
|
|
|
|
|
|
* @seq_length is set. |
39
|
|
|
|
|
|
|
* @first_mask is set. |
40
|
|
|
|
|
|
|
* |
41
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
42
|
|
|
|
|
|
|
*/ |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
static BSON_INLINE void |
45
|
2976
|
|
|
|
|
|
_bson_utf8_get_sequence (const char *utf8, /* IN */ |
46
|
|
|
|
|
|
|
uint8_t *seq_length, /* OUT */ |
47
|
|
|
|
|
|
|
uint8_t *first_mask) /* OUT */ |
48
|
|
|
|
|
|
|
{ |
49
|
2976
|
|
|
|
|
|
unsigned char c = *(const unsigned char *)utf8; |
50
|
|
|
|
|
|
|
uint8_t m; |
51
|
|
|
|
|
|
|
uint8_t n; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
/* |
54
|
|
|
|
|
|
|
* See the following[1] for a description of what the given multi-byte |
55
|
|
|
|
|
|
|
* sequences will be based on the bits set of the first byte. We also need |
56
|
|
|
|
|
|
|
* to mask the first byte based on that. All subsequent bytes are masked |
57
|
|
|
|
|
|
|
* against 0x3F. |
58
|
|
|
|
|
|
|
* |
59
|
|
|
|
|
|
|
* [1] http://www.joelonsoftware.com/articles/Unicode.html |
60
|
|
|
|
|
|
|
*/ |
61
|
|
|
|
|
|
|
|
62
|
2976
|
100
|
|
|
|
|
if ((c & 0x80) == 0) { |
63
|
2920
|
|
|
|
|
|
n = 1; |
64
|
2920
|
|
|
|
|
|
m = 0x7F; |
65
|
56
|
100
|
|
|
|
|
} else if ((c & 0xE0) == 0xC0) { |
66
|
36
|
|
|
|
|
|
n = 2; |
67
|
36
|
|
|
|
|
|
m = 0x1F; |
68
|
20
|
50
|
|
|
|
|
} else if ((c & 0xF0) == 0xE0) { |
69
|
20
|
|
|
|
|
|
n = 3; |
70
|
20
|
|
|
|
|
|
m = 0x0F; |
71
|
0
|
0
|
|
|
|
|
} else if ((c & 0xF8) == 0xF0) { |
72
|
0
|
|
|
|
|
|
n = 4; |
73
|
0
|
|
|
|
|
|
m = 0x07; |
74
|
0
|
0
|
|
|
|
|
} else if ((c & 0xFC) == 0xF8) { |
75
|
0
|
|
|
|
|
|
n = 5; |
76
|
0
|
|
|
|
|
|
m = 0x03; |
77
|
0
|
0
|
|
|
|
|
} else if ((c & 0xFE) == 0xFC) { |
78
|
0
|
|
|
|
|
|
n = 6; |
79
|
0
|
|
|
|
|
|
m = 0x01; |
80
|
|
|
|
|
|
|
} else { |
81
|
0
|
|
|
|
|
|
n = 0; |
82
|
0
|
|
|
|
|
|
m = 0; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
2976
|
|
|
|
|
|
*seq_length = n; |
86
|
2976
|
|
|
|
|
|
*first_mask = m; |
87
|
2976
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
/* |
91
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
92
|
|
|
|
|
|
|
* |
93
|
|
|
|
|
|
|
* bson_utf8_validate -- |
94
|
|
|
|
|
|
|
* |
95
|
|
|
|
|
|
|
* Validates that @utf8 is a valid UTF-8 string. |
96
|
|
|
|
|
|
|
* |
97
|
|
|
|
|
|
|
* If @allow_null is true, then \0 is allowed within @utf8_len bytes |
98
|
|
|
|
|
|
|
* of @utf8. Generally, this is bad practice since the main point of |
99
|
|
|
|
|
|
|
* UTF-8 strings is that they can be used with strlen() and friends. |
100
|
|
|
|
|
|
|
* However, some languages such as Python can send UTF-8 encoded |
101
|
|
|
|
|
|
|
* strings with NUL's in them. |
102
|
|
|
|
|
|
|
* |
103
|
|
|
|
|
|
|
* Parameters: |
104
|
|
|
|
|
|
|
* @utf8: A UTF-8 encoded string. |
105
|
|
|
|
|
|
|
* @utf8_len: The length of @utf8 in bytes. |
106
|
|
|
|
|
|
|
* @allow_null: If \0 is allowed within @utf8, exclusing trailing \0. |
107
|
|
|
|
|
|
|
* |
108
|
|
|
|
|
|
|
* Returns: |
109
|
|
|
|
|
|
|
* true if @utf8 is valid UTF-8. otherwise false. |
110
|
|
|
|
|
|
|
* |
111
|
|
|
|
|
|
|
* Side effects: |
112
|
|
|
|
|
|
|
* None. |
113
|
|
|
|
|
|
|
* |
114
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
115
|
|
|
|
|
|
|
*/ |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
bool |
118
|
1988
|
|
|
|
|
|
bson_utf8_validate (const char *utf8, /* IN */ |
119
|
|
|
|
|
|
|
size_t utf8_len, /* IN */ |
120
|
|
|
|
|
|
|
bool allow_null) /* IN */ |
121
|
|
|
|
|
|
|
{ |
122
|
|
|
|
|
|
|
bson_unichar_t c; |
123
|
|
|
|
|
|
|
uint8_t first_mask; |
124
|
|
|
|
|
|
|
uint8_t seq_length; |
125
|
|
|
|
|
|
|
unsigned i; |
126
|
|
|
|
|
|
|
unsigned j; |
127
|
|
|
|
|
|
|
|
128
|
1988
|
50
|
|
|
|
|
BSON_ASSERT (utf8); |
129
|
|
|
|
|
|
|
|
130
|
4961
|
100
|
|
|
|
|
for (i = 0; i < utf8_len; i += seq_length) { |
131
|
2976
|
|
|
|
|
|
_bson_utf8_get_sequence (&utf8[i], &seq_length, &first_mask); |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
/* |
134
|
|
|
|
|
|
|
* Ensure we have a valid multi-byte sequence length. |
135
|
|
|
|
|
|
|
*/ |
136
|
2976
|
50
|
|
|
|
|
if (!seq_length) { |
137
|
0
|
|
|
|
|
|
return false; |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
/* |
141
|
|
|
|
|
|
|
* Ensure we have enough bytes left. |
142
|
|
|
|
|
|
|
*/ |
143
|
2976
|
100
|
|
|
|
|
if ((utf8_len - i) < seq_length) { |
144
|
3
|
|
|
|
|
|
return false; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
/* |
148
|
|
|
|
|
|
|
* Also calculate the next char as a unichar so we can |
149
|
|
|
|
|
|
|
* check code ranges for non-shortest form. |
150
|
|
|
|
|
|
|
*/ |
151
|
2973
|
|
|
|
|
|
c = utf8 [i] & first_mask; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
/* |
154
|
|
|
|
|
|
|
* Check the high-bits for each additional sequence byte. |
155
|
|
|
|
|
|
|
*/ |
156
|
3043
|
100
|
|
|
|
|
for (j = i + 1; j < (i + seq_length); j++) { |
157
|
70
|
|
|
|
|
|
c = (c << 6) | (utf8 [j] & 0x3F); |
158
|
70
|
50
|
|
|
|
|
if ((utf8[j] & 0xC0) != 0x80) { |
159
|
0
|
|
|
|
|
|
return false; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
/* |
164
|
|
|
|
|
|
|
* Check for NULL bytes afterwards. |
165
|
|
|
|
|
|
|
* |
166
|
|
|
|
|
|
|
* Hint: if you want to optimize this function, starting here to do |
167
|
|
|
|
|
|
|
* this in the same pass as the data above would probably be a good |
168
|
|
|
|
|
|
|
* idea. You would add a branch into the inner loop, but save possibly |
169
|
|
|
|
|
|
|
* on cache-line bouncing on larger strings. Just a thought. |
170
|
|
|
|
|
|
|
*/ |
171
|
2973
|
100
|
|
|
|
|
if (!allow_null) { |
172
|
4962
|
100
|
|
|
|
|
for (j = 0; j < seq_length; j++) { |
173
|
2482
|
50
|
|
|
|
|
if (((i + j) > utf8_len) || !utf8[i + j]) { |
|
|
50
|
|
|
|
|
|
174
|
0
|
|
|
|
|
|
return false; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
/* |
180
|
|
|
|
|
|
|
* Code point wont fit in utf-16, not allowed. |
181
|
|
|
|
|
|
|
*/ |
182
|
2973
|
50
|
|
|
|
|
if (c > 0x0010FFFF) { |
183
|
0
|
|
|
|
|
|
return false; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
/* |
187
|
|
|
|
|
|
|
* Byte is in reserved range for UTF-16 high-marks |
188
|
|
|
|
|
|
|
* for surrogate pairs. |
189
|
|
|
|
|
|
|
*/ |
190
|
2973
|
50
|
|
|
|
|
if ((c & 0xFFFFF800) == 0xD800) { |
191
|
0
|
|
|
|
|
|
return false; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
/* |
195
|
|
|
|
|
|
|
* Check non-shortest form unicode. |
196
|
|
|
|
|
|
|
*/ |
197
|
2973
|
|
|
|
|
|
switch (seq_length) { |
198
|
|
|
|
|
|
|
case 1: |
199
|
2920
|
50
|
|
|
|
|
if (c <= 0x007F) { |
200
|
2920
|
|
|
|
|
|
continue; |
201
|
|
|
|
|
|
|
} |
202
|
0
|
|
|
|
|
|
return false; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
case 2: |
205
|
36
|
50
|
|
|
|
|
if ((c >= 0x0080) && (c <= 0x07FF)) { |
|
|
50
|
|
|
|
|
|
206
|
36
|
|
|
|
|
|
continue; |
207
|
0
|
0
|
|
|
|
|
} else if (c == 0) { |
208
|
|
|
|
|
|
|
/* Two-byte representation for NULL. */ |
209
|
0
|
|
|
|
|
|
continue; |
210
|
|
|
|
|
|
|
} |
211
|
0
|
|
|
|
|
|
return false; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
case 3: |
214
|
17
|
50
|
|
|
|
|
if (((c >= 0x0800) && (c <= 0x0FFF)) || |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
215
|
17
|
50
|
|
|
|
|
((c >= 0x1000) && (c <= 0xFFFF))) { |
216
|
17
|
|
|
|
|
|
continue; |
217
|
|
|
|
|
|
|
} |
218
|
0
|
|
|
|
|
|
return false; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
case 4: |
221
|
0
|
0
|
|
|
|
|
if (((c >= 0x10000) && (c <= 0x3FFFF)) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
222
|
0
|
0
|
|
|
|
|
((c >= 0x40000) && (c <= 0xFFFFF)) || |
|
|
0
|
|
|
|
|
|
223
|
0
|
0
|
|
|
|
|
((c >= 0x100000) && (c <= 0x10FFFF))) { |
224
|
0
|
|
|
|
|
|
continue; |
225
|
|
|
|
|
|
|
} |
226
|
0
|
|
|
|
|
|
return false; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
default: |
229
|
0
|
|
|
|
|
|
return false; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
1988
|
|
|
|
|
|
return true; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
/* |
238
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
239
|
|
|
|
|
|
|
* |
240
|
|
|
|
|
|
|
* bson_utf8_escape_for_json -- |
241
|
|
|
|
|
|
|
* |
242
|
|
|
|
|
|
|
* Allocates a new string matching @utf8 except that special |
243
|
|
|
|
|
|
|
* characters in JSON will be escaped. The resulting string is also |
244
|
|
|
|
|
|
|
* UTF-8 encoded. |
245
|
|
|
|
|
|
|
* |
246
|
|
|
|
|
|
|
* Both " and \ characters will be escaped. Additionally, if a NUL |
247
|
|
|
|
|
|
|
* byte is found before @utf8_len bytes, it will be converted to the |
248
|
|
|
|
|
|
|
* two byte UTF-8 sequence. |
249
|
|
|
|
|
|
|
* |
250
|
|
|
|
|
|
|
* Parameters: |
251
|
|
|
|
|
|
|
* @utf8: A UTF-8 encoded string. |
252
|
|
|
|
|
|
|
* @utf8_len: The length of @utf8 in bytes or -1 if NUL terminated. |
253
|
|
|
|
|
|
|
* |
254
|
|
|
|
|
|
|
* Returns: |
255
|
|
|
|
|
|
|
* A newly allocated string that should be freed with bson_free(). |
256
|
|
|
|
|
|
|
* |
257
|
|
|
|
|
|
|
* Side effects: |
258
|
|
|
|
|
|
|
* None. |
259
|
|
|
|
|
|
|
* |
260
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
261
|
|
|
|
|
|
|
*/ |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
char * |
264
|
0
|
|
|
|
|
|
bson_utf8_escape_for_json (const char *utf8, /* IN */ |
265
|
|
|
|
|
|
|
ssize_t utf8_len) /* IN */ |
266
|
|
|
|
|
|
|
{ |
267
|
|
|
|
|
|
|
bson_unichar_t c; |
268
|
|
|
|
|
|
|
bson_string_t *str; |
269
|
0
|
|
|
|
|
|
bool length_provided = true; |
270
|
|
|
|
|
|
|
const char *end; |
271
|
|
|
|
|
|
|
|
272
|
0
|
0
|
|
|
|
|
BSON_ASSERT (utf8); |
273
|
|
|
|
|
|
|
|
274
|
0
|
|
|
|
|
|
str = bson_string_new (NULL); |
275
|
|
|
|
|
|
|
|
276
|
0
|
0
|
|
|
|
|
if (utf8_len < 0) { |
277
|
0
|
|
|
|
|
|
length_provided = false; |
278
|
0
|
|
|
|
|
|
utf8_len = strlen (utf8); |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
0
|
|
|
|
|
|
end = utf8 + utf8_len; |
282
|
|
|
|
|
|
|
|
283
|
0
|
0
|
|
|
|
|
while (utf8 < end) { |
284
|
0
|
|
|
|
|
|
c = bson_utf8_get_char (utf8); |
285
|
|
|
|
|
|
|
|
286
|
0
|
|
|
|
|
|
switch (c) { |
287
|
|
|
|
|
|
|
case '\\': |
288
|
|
|
|
|
|
|
case '"': |
289
|
|
|
|
|
|
|
case '/': |
290
|
0
|
|
|
|
|
|
bson_string_append_c (str, '\\'); |
291
|
0
|
|
|
|
|
|
bson_string_append_unichar (str, c); |
292
|
0
|
|
|
|
|
|
break; |
293
|
|
|
|
|
|
|
case '\b': |
294
|
0
|
|
|
|
|
|
bson_string_append (str, "\\b"); |
295
|
0
|
|
|
|
|
|
break; |
296
|
|
|
|
|
|
|
case '\f': |
297
|
0
|
|
|
|
|
|
bson_string_append (str, "\\f"); |
298
|
0
|
|
|
|
|
|
break; |
299
|
|
|
|
|
|
|
case '\n': |
300
|
0
|
|
|
|
|
|
bson_string_append (str, "\\n"); |
301
|
0
|
|
|
|
|
|
break; |
302
|
|
|
|
|
|
|
case '\r': |
303
|
0
|
|
|
|
|
|
bson_string_append (str, "\\r"); |
304
|
0
|
|
|
|
|
|
break; |
305
|
|
|
|
|
|
|
case '\t': |
306
|
0
|
|
|
|
|
|
bson_string_append (str, "\\t"); |
307
|
0
|
|
|
|
|
|
break; |
308
|
|
|
|
|
|
|
default: |
309
|
0
|
0
|
|
|
|
|
if (c < ' ') { |
310
|
0
|
|
|
|
|
|
bson_string_append_printf (str, "\\u%04u", (unsigned)c); |
311
|
|
|
|
|
|
|
} else { |
312
|
0
|
|
|
|
|
|
bson_string_append_unichar (str, c); |
313
|
|
|
|
|
|
|
} |
314
|
0
|
|
|
|
|
|
break; |
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
|
317
|
0
|
0
|
|
|
|
|
if (c) { |
318
|
0
|
|
|
|
|
|
utf8 = bson_utf8_next_char (utf8); |
319
|
|
|
|
|
|
|
} else { |
320
|
0
|
0
|
|
|
|
|
if (length_provided && !*utf8) { |
|
|
0
|
|
|
|
|
|
321
|
|
|
|
|
|
|
/* we escaped nil as '\u0000', now advance past it */ |
322
|
0
|
|
|
|
|
|
utf8++; |
323
|
|
|
|
|
|
|
} else { |
324
|
|
|
|
|
|
|
/* invalid UTF-8 */ |
325
|
0
|
|
|
|
|
|
bson_string_free (str, true); |
326
|
0
|
|
|
|
|
|
return NULL; |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
0
|
|
|
|
|
|
return bson_string_free (str, false); |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
/* |
336
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
337
|
|
|
|
|
|
|
* |
338
|
|
|
|
|
|
|
* bson_utf8_get_char -- |
339
|
|
|
|
|
|
|
* |
340
|
|
|
|
|
|
|
* Fetches the next UTF-8 character from the UTF-8 sequence. |
341
|
|
|
|
|
|
|
* |
342
|
|
|
|
|
|
|
* Parameters: |
343
|
|
|
|
|
|
|
* @utf8: A string containing validated UTF-8. |
344
|
|
|
|
|
|
|
* |
345
|
|
|
|
|
|
|
* Returns: |
346
|
|
|
|
|
|
|
* A 32-bit bson_unichar_t reprsenting the multi-byte sequence. |
347
|
|
|
|
|
|
|
* |
348
|
|
|
|
|
|
|
* Side effects: |
349
|
|
|
|
|
|
|
* None. |
350
|
|
|
|
|
|
|
* |
351
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
352
|
|
|
|
|
|
|
*/ |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
bson_unichar_t |
355
|
0
|
|
|
|
|
|
bson_utf8_get_char (const char *utf8) /* IN */ |
356
|
|
|
|
|
|
|
{ |
357
|
|
|
|
|
|
|
bson_unichar_t c; |
358
|
|
|
|
|
|
|
uint8_t mask; |
359
|
|
|
|
|
|
|
uint8_t num; |
360
|
|
|
|
|
|
|
int i; |
361
|
|
|
|
|
|
|
|
362
|
0
|
0
|
|
|
|
|
BSON_ASSERT (utf8); |
363
|
|
|
|
|
|
|
|
364
|
0
|
|
|
|
|
|
_bson_utf8_get_sequence (utf8, &num, &mask); |
365
|
0
|
|
|
|
|
|
c = (*utf8) & mask; |
366
|
|
|
|
|
|
|
|
367
|
0
|
0
|
|
|
|
|
for (i = 1; i < num; i++) { |
368
|
0
|
|
|
|
|
|
c = (c << 6) | (utf8[i] & 0x3F); |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
|
371
|
0
|
|
|
|
|
|
return c; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
/*
 *--------------------------------------------------------------------------
 *
 * bson_utf8_next_char --
 *
 *       Steps over the multi-byte sequence that starts at @utf8 and
 *       returns a pointer to the byte that follows it.
 *
 *       The distance is derived from the lead byte only. If that byte
 *       cannot start a sequence the distance is zero and @utf8 itself
 *       is returned.
 *
 * Parameters:
 *       @utf8: A string containing validated UTF-8.
 *
 * Returns:
 *       An incremented pointer in @utf8.
 *
 * Side effects:
 *       None.
 *
 *--------------------------------------------------------------------------
 */

const char *
bson_utf8_next_char (const char *utf8) /* IN */
{
   uint8_t unused_mask;
   uint8_t width;

   BSON_ASSERT (utf8);

   /* Only the width is of interest here; the mask is discarded. */
   _bson_utf8_get_sequence (utf8, &width, &unused_mask);

   return &utf8[width];
}
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
/* |
410
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
411
|
|
|
|
|
|
|
* |
412
|
|
|
|
|
|
|
* bson_utf8_from_unichar -- |
413
|
|
|
|
|
|
|
* |
414
|
|
|
|
|
|
|
* Converts the unichar to a sequence of utf8 bytes and stores those |
415
|
|
|
|
|
|
|
* in @utf8. The number of bytes in the sequence are stored in @len. |
416
|
|
|
|
|
|
|
* |
417
|
|
|
|
|
|
|
* Parameters: |
418
|
|
|
|
|
|
|
* @unichar: A bson_unichar_t. |
419
|
|
|
|
|
|
|
* @utf8: A location for the multi-byte sequence. |
420
|
|
|
|
|
|
|
* @len: A location for number of bytes stored in @utf8. |
421
|
|
|
|
|
|
|
* |
422
|
|
|
|
|
|
|
* Returns: |
423
|
|
|
|
|
|
|
* None. |
424
|
|
|
|
|
|
|
* |
425
|
|
|
|
|
|
|
* Side effects: |
426
|
|
|
|
|
|
|
* @utf8 is set. |
427
|
|
|
|
|
|
|
* @len is set. |
428
|
|
|
|
|
|
|
* |
429
|
|
|
|
|
|
|
*-------------------------------------------------------------------------- |
430
|
|
|
|
|
|
|
*/ |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
void |
433
|
0
|
|
|
|
|
|
bson_utf8_from_unichar ( |
434
|
|
|
|
|
|
|
bson_unichar_t unichar, /* IN */ |
435
|
|
|
|
|
|
|
char utf8[BSON_ENSURE_ARRAY_PARAM_SIZE(6)], /* OUT */ |
436
|
|
|
|
|
|
|
uint32_t *len) /* OUT */ |
437
|
|
|
|
|
|
|
{ |
438
|
0
|
0
|
|
|
|
|
BSON_ASSERT (utf8); |
439
|
0
|
0
|
|
|
|
|
BSON_ASSERT (len); |
440
|
|
|
|
|
|
|
|
441
|
0
|
0
|
|
|
|
|
if (unichar <= 0x7F) { |
442
|
0
|
|
|
|
|
|
utf8[0] = unichar; |
443
|
0
|
|
|
|
|
|
*len = 1; |
444
|
0
|
0
|
|
|
|
|
} else if (unichar <= 0x7FF) { |
445
|
0
|
|
|
|
|
|
*len = 2; |
446
|
0
|
|
|
|
|
|
utf8[0] = 0xC0 | ((unichar >> 6) & 0x3F); |
447
|
0
|
|
|
|
|
|
utf8[1] = 0x80 | ((unichar) & 0x3F); |
448
|
0
|
0
|
|
|
|
|
} else if (unichar <= 0xFFFF) { |
449
|
0
|
|
|
|
|
|
*len = 3; |
450
|
0
|
|
|
|
|
|
utf8[0] = 0xE0 | ((unichar >> 12) & 0xF); |
451
|
0
|
|
|
|
|
|
utf8[1] = 0x80 | ((unichar >> 6) & 0x3F); |
452
|
0
|
|
|
|
|
|
utf8[2] = 0x80 | ((unichar) & 0x3F); |
453
|
0
|
0
|
|
|
|
|
} else if (unichar <= 0x1FFFFF) { |
454
|
0
|
|
|
|
|
|
*len = 4; |
455
|
0
|
|
|
|
|
|
utf8[0] = 0xF0 | ((unichar >> 18) & 0x7); |
456
|
0
|
|
|
|
|
|
utf8[1] = 0x80 | ((unichar >> 12) & 0x3F); |
457
|
0
|
|
|
|
|
|
utf8[2] = 0x80 | ((unichar >> 6) & 0x3F); |
458
|
0
|
|
|
|
|
|
utf8[3] = 0x80 | ((unichar) & 0x3F); |
459
|
0
|
0
|
|
|
|
|
} else if (unichar <= 0x3FFFFFF) { |
460
|
0
|
|
|
|
|
|
*len = 5; |
461
|
0
|
|
|
|
|
|
utf8[0] = 0xF8 | ((unichar >> 24) & 0x3); |
462
|
0
|
|
|
|
|
|
utf8[1] = 0x80 | ((unichar >> 18) & 0x3F); |
463
|
0
|
|
|
|
|
|
utf8[2] = 0x80 | ((unichar >> 12) & 0x3F); |
464
|
0
|
|
|
|
|
|
utf8[3] = 0x80 | ((unichar >> 6) & 0x3F); |
465
|
0
|
|
|
|
|
|
utf8[4] = 0x80 | ((unichar) & 0x3F); |
466
|
0
|
0
|
|
|
|
|
} else if (unichar <= 0x7FFFFFFF) { |
467
|
0
|
|
|
|
|
|
*len = 6; |
468
|
0
|
|
|
|
|
|
utf8[0] = 0xFC | ((unichar >> 31) & 0x1); |
469
|
0
|
|
|
|
|
|
utf8[1] = 0x80 | ((unichar >> 25) & 0x3F); |
470
|
0
|
|
|
|
|
|
utf8[2] = 0x80 | ((unichar >> 19) & 0x3F); |
471
|
0
|
|
|
|
|
|
utf8[3] = 0x80 | ((unichar >> 13) & 0x3F); |
472
|
0
|
|
|
|
|
|
utf8[4] = 0x80 | ((unichar >> 7) & 0x3F); |
473
|
0
|
|
|
|
|
|
utf8[5] = 0x80 | ((unichar) & 0x1); |
474
|
|
|
|
|
|
|
} else { |
475
|
0
|
|
|
|
|
|
*len = 0; |
476
|
|
|
|
|
|
|
} |
477
|
0
|
|
|
|
|
|
} |