File Coverage

unicode.c
Criterion Covered Total %
statement 10 226 4.4
branch 1 124 0.8
condition n/a
subroutine n/a
pod n/a
total 11 350 3.1


line stmt bran cond sub pod time code
1             /* This file is a Unicode library in the programming language C which
2             deals with conversions to and from the UTF-8 format. */
3              
4             /*
5             Author:
6              
7             Ben Bullock ,
8              
9             Repository:
10            
11             https://github.com/benkasminbullock/unicode-c
12             */
13              
14             #include <stdint.h>
15             #include <string.h>
16             #include "unicode.h"
17              
                /* NOTE(review): the "#ifdef HEADER" sections presumably hold text
                which is copied into "unicode.h" by a header generator - confirm
                before changing anything other than comments here. */
18             #ifdef HEADER
19              
20             /* ======
21             Limits
22             ======
23             Size and range limits used by the routines in this file.
24             */
25            
26              
27              
28             /* The maximum number of bytes we need to contain any Unicode code
29             point as UTF-8 as a C string: up to four bytes of UTF-8 plus one
30             trailing nul byte. */
31              
32             #define UTF8_MAX_LENGTH 5
33              
34             /* The maximum possible value of a Unicode code point. See
35             http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */
36              
37             #define UNICODE_MAXIMUM 0x10ffff
38              
39             /* The maximum possible value which will fit into four bytes of
40             UTF-8 (21 bits of payload). This is larger than UNICODE_MAXIMUM. */
41              
42             #define UNICODE_UTF8_4 0x1fffff
43              
44             /* =============
45             Return values
46             =============
47             Status and error codes returned by the routines in this file.
48             */
49            
50              
51             /* The functions in this library which report a status return an
52             "int32_t". Negative values are used to indicate errors. */
53              
54             /* This return value indicates the successful completion of a routine
55             which doesn't use the return value to communicate data back to the
56             caller. */
57              
58             #define UNICODE_OK 0
59              
60             /* This return value means that the leading byte of a UTF-8 sequence
61             was not valid. */
62              
63             #define UTF8_BAD_LEADING_BYTE -1
64              
65             /* This return value means the caller attempted to turn a code point
66             in the surrogate range, 0xD800 to 0xDFFF, into UTF-8. */
67              
68             #define UNICODE_SURROGATE_PAIR -2
69              
70             /* This return value means that code points which did not form a
71             surrogate pair were tried to be converted into a code point as if
72             they were a surrogate pair. It is also returned by
                "unicode_to_surrogates" for a code point which does not need a
                surrogate pair. */
73              
74             #define UNICODE_NOT_SURROGATE_PAIR -3
75              
76             /* This return value means that input which was supposed to be UTF-8
77             encoded contained an invalid continuation byte. If the leading byte
78             of a UTF-8 sequence is not valid, UTF8_BAD_LEADING_BYTE is returned
79             instead of this. */
80              
81             #define UTF8_BAD_CONTINUATION_BYTE -4
82              
83             /* This return value indicates a zero byte was found in a string which
84             was supposed to contain UTF-8 bytes. It is returned only by the
85             functions which are documented as not allowing zero bytes. */
86              
87             #define UNICODE_EMPTY_INPUT -5
88              
89             /* This return value indicates that UTF-8 bytes were not in the
90             shortest possible form. See
91             http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8. */
92              
93             #define UTF8_NON_SHORTEST -6
94              
95             /* This return value indicates that there was an attempt to convert a
96             code point which was greater than UNICODE_MAXIMUM or UNICODE_UTF8_4
97             into UTF-8 bytes. */
98              
99             #define UNICODE_TOO_BIG -7
100              
101             /* This return value indicates that the low sixteen bits of the
102             Unicode code point were either 0xFFFE or 0xFFFF, meaning it cannot
103             be used as a character code point. */
104              
105             #define UNICODE_NOT_CHARACTER -8
106              
107             /* This return value of "valid_utf8" indicates that the UTF-8 is valid. */
108              
109             #define UTF8_VALID 1
110              
111             /* This return value of "valid_utf8" indicates that the UTF-8 is not valid. */
112              
113             #define UTF8_INVALID 0
114              
115             #endif /* def HEADER */
116              
117             /* This table contains the length in bytes of the UTF-8 sequence
118             which begins with the byte used as the index. A value of zero
                indicates that the byte can not begin a UTF-8 sequence: this is the
                case for the continuation bytes 0x80-0xBF, for 0xC0 and 0xC1, and
119             for 0xF5-0xFF. */
120              
121             /* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */
122              
123             const uint8_t utf8_sequence_len[0x100] =
124             {
125             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
126             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
127             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
128             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
129             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
130             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
131             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
132             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
133             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
134             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
135             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
136             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
137             0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
138             2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
139             3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
140             4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
141             };
142              
143             /* This function returns the number of bytes in the UTF-8 sequence
144             which starts with the byte "c": either 1 (c = 0xxxxxxx), 2 (c =
145             110xxxxx other than 0xC0 and 0xC1), 3 (c = 1110xxxx), or 4 (c =
146             111100xx or c = 11110100). The length is looked up in
                "utf8_sequence_len". If "c" is not a valid UTF-8 first byte, the
147             value UTF8_BAD_LEADING_BYTE is returned. */
148              
149 0           int32_t utf8_bytes (uint8_t c)
150             {
151             int32_t r;
152 0           r = utf8_sequence_len[c];
                /* A table entry of zero marks a byte which cannot start a
                sequence. */
153 0 0         if (r == 0) {
154 0           return UTF8_BAD_LEADING_BYTE;
155             }
156 0           return r;
157             }
158              
/* This macro converts the four bytes of UTF-8 starting at "x" into the
   corresponding code point. Only the payload bits are combined: three
   bits from the first byte and six bits from each of the other
   three. No validation of the bytes is done.

   The argument and the whole expansion are parenthesized, so that the
   macro gives the expected result when "x" is an expression such as
   "p + 1", and when the macro is used inside a larger expression such
   as "FOUR (p) & mask". "x" is evaluated four times, so it must not
   have side effects. */

#define FOUR(x)                                 \
    ((((int32_t) ((x)[0] & 0x07)) << 18)        \
     | (((int32_t) ((x)[1] & 0x3F)) << 12)      \
     | (((int32_t) ((x)[2] & 0x3F)) << 6)       \
     | (((int32_t) ((x)[3] & 0x3F))))
167              
168             /* Try to convert "input" from UTF-8 to UCS-2, and return a value even
169             if the input is partly broken. This checks the first byte of the
170             input, but it doesn't check the subsequent bytes. */
171              
172             int32_t
173 0           utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr)
174             {
175             uint8_t c;
176 0           c = input[0];
177 0           switch (utf8_sequence_len[c]) {
178             case 1:
179 0           * end_ptr = input + 1;
180 0           return c;
181              
182             case 2:
183 0           * end_ptr = input + 2;
184             return
185 0           (c & 0x1F) << 6 |
186 0           (input[1] & 0x3F);
187              
188             case 3:
189 0           * end_ptr = input + 3;
190             return
191 0           (c & 0x0F) << 12 |
192 0           (input[1] & 0x3F) << 6 |
193 0           (input[2] & 0x3F);
194              
195             case 4:
196 0           * end_ptr = input + 4;
197 0           return FOUR (input);
198              
199             case 0:
200             /* fall through */
201             default:
202 0           return UTF8_BAD_LEADING_BYTE;
203             }
204             }
205              
206             /* This function converts UTF-8 encoded bytes in "input" into the
207             equivalent Unicode code point. The return value is the Unicode code
208             point corresponding to the UTF-8 character in "input" if
209             successful, and a negative number if not successful. "*end_ptr" is
210             set to the next character after the read character on
211             success. "*end_ptr" is set to the start of input on
212             failure. "end_ptr" may not be null.
213              
214             If the first byte of "input" is zero, UNICODE_EMPTY_INPUT is
215             returned. If the first byte of "input" is not valid UTF-8,
216             UTF8_BAD_LEADING_BYTE is returned. If the second or later bytes of
217             "input" are not valid UTF-8, UTF8_BAD_CONTINUATION_BYTE is returned. If the
218             UTF-8 is not in the shortest possible form, the error
219             UTF8_NON_SHORTEST is returned. If the value extrapolated from
220             "input" is greater than UNICODE_MAXIMUM, UNICODE_TOO_BIG is
221             returned. If the value extrapolated from "input" ends in 0xFFFF or
222             0xFFFE, UNICODE_NOT_CHARACTER is returned. */
223              
224             int32_t
225 0           utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
226             {
227             uint8_t c;
228             uint8_t l;
229 0           *end_ptr = input;
230 0           c = input[0];
231 0 0         if (c == 0) {
232 0           return UNICODE_EMPTY_INPUT;
233             }
234 0           l = utf8_sequence_len[c];
235 0 0         if (l == 1) {
236 0           * end_ptr = input + 1;
237 0           return c;
238             }
239 0 0         if (l == 2) {
240             /* Two byte case. */
241 0 0         if (input[1] < 0x80 || input[1] > 0xBF) {
    0          
242 0           return UTF8_BAD_CONTINUATION_BYTE;
243             }
244 0 0         if (c <= 0xC1) {
245 0           return UTF8_NON_SHORTEST;
246             }
247 0           * end_ptr = input + 2;
248             return
249 0           ((int32_t) (c & 0x1F) << 6) |
250 0           ((int32_t) (input[1] & 0x3F));
251             }
252 0 0         if (l == 3) {
253             /* Three byte case. */
254 0 0         if (input[1] < 0x80 || input[1] > 0xBF ||
    0          
    0          
255 0 0         input[2] < 0x80 || input[2] > 0xBF) {
256 0           return UTF8_BAD_CONTINUATION_BYTE;
257             }
258 0 0         if (c == 0xe0 && input[1] < 0xa0) {
    0          
259             /* We don't need to check the value of input[2], because
260             the if statement above this one already guarantees that
261             it is 10xxxxxx. */
262 0           return UTF8_NON_SHORTEST;
263             }
264 0           * end_ptr = input + 3;
265             return
266 0           ((int32_t) (c & 0x0F)) << 12 |
267 0           ((int32_t) (input[1] & 0x3F)) << 6 |
268 0           ((int32_t) (input[2] & 0x3F));
269             }
270 0 0         if (l == 4) {
271             /* Four byte case. */
272             uint8_t d;
273             uint8_t e;
274             uint8_t f;
275             int32_t v;
276 0           d = input[1];
277 0           e = input[2];
278 0           f = input[3];
279              
280 0 0         if (/* c must be 11110xxx. */
281 0 0         c >= 0xf8 ||
282             /* d, e, f must be 10xxxxxx. */
283 0 0         d < 0x80 || d >= 0xC0 ||
    0          
284 0 0         e < 0x80 || e >= 0xC0 ||
    0          
285 0 0         f < 0x80 || f >= 0xC0) {
286 0           return UTF8_BAD_CONTINUATION_BYTE;
287             }
288              
289 0 0         if (c == 0xf0 && d < 0x90) {
    0          
290             /* We don't need to check the values of e and d, because
291             the if statement above this one already guarantees that
292             e and d are 10xxxxxx. */
293 0           return UTF8_NON_SHORTEST;
294             }
295             /* Calculate the code point. */
296 0           v = FOUR (input);
297             /* Greater than U+10FFFF */
298 0 0         if (v > UNICODE_MAXIMUM) {
299 0           return UNICODE_TOO_BIG;
300             }
301             /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
302 0 0         if ((v & 0xffff) >= 0xfffe) {
303 0           return UNICODE_NOT_CHARACTER;
304             }
305 0           * end_ptr = input + 4;
306 0           return v;
307             }
308 0           return UTF8_BAD_LEADING_BYTE;
309             }
310              
311             #define UNI_SUR_HIGH_START 0xD800
312             #define UNI_SUR_HIGH_END 0xDBFF
313             #define UNI_SUR_LOW_START 0xDC00
314             #define UNI_SUR_LOW_END 0xDFFF
315              
316             /* Input: a Unicode code point, "ucs2".
317              
318             Output: UTF-8 characters in buffer "utf8".
319              
320             Return value: the number of bytes written into "utf8", or a
321             negative number if there was an error. If the value of "ucs2" is
322             invalid because of being in the surrogate pair range from 0xD800 to
323             0xDFFF, the return value is UNICODE_SURROGATE_PAIR, else if the
324             value is too big to fit into four bytes of UTF-8, UNICODE_UTF8_4,
325             the return value is UNICODE_TOO_BIG. However, it does not insist on
326             ucs2 being less than UNICODE_MAXIMUM, so the user needs to check
327             that "ucs2" is a valid code point. It also does not check for
328             invalid characters, such as 0xFFFF.
329              
330             This adds a zero byte to the end of the string. It assumes that the
331             buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to
332             write to, without checking. */
333              
334             int32_t
335 0           ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8)
336             {
337 0 0         if (ucs2 < 0x80) {
338 0           utf8[0] = ucs2;
339 0           utf8[1] = '\0';
340 0           return 1;
341             }
342 0 0         if (ucs2 < 0x800) {
343 0           utf8[0] = (ucs2 >> 6) | 0xC0;
344 0           utf8[1] = (ucs2 & 0x3F) | 0x80;
345 0           utf8[2] = '\0';
346 0           return 2;
347             }
348 0 0         if (ucs2 < 0xFFFF) {
349 0           utf8[0] = ((ucs2 >> 12) ) | 0xE0;
350 0           utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80;
351 0           utf8[2] = ((ucs2 ) & 0x3F) | 0x80;
352 0           utf8[3] = '\0';
353 0 0         if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) {
    0          
354             /* Ill-formed. */
355 0           return UNICODE_SURROGATE_PAIR;
356             }
357 0           return 3;
358             }
359 0 0         if (ucs2 <= UNICODE_UTF8_4) {
360             /* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */
361 0           utf8[0] = 0xF0 | (ucs2 >> 18);
362 0           utf8[1] = 0x80 | ((ucs2 >> 12) & 0x3F);
363 0           utf8[2] = 0x80 | ((ucs2 >> 6) & 0x3F);
364 0           utf8[3] = 0x80 | ((ucs2 & 0x3F));
365 0           utf8[4] = '\0';
366 0           return 4;
367             }
368 0           return UNICODE_TOO_BIG;
369             }
370              
371             /* For shifting by 10 bits. */
372             #define TEN_BITS 10
373             #define HALF_BASE 0x0010000UL
374             /* 0b1111111111 */
375             #define LOW_TEN_BITS 0x3FF
376              
377             /* This converts the Unicode code point in "unicode" into a surrogate
378             pair, and returns the two parts in "* hi_ptr" and "* lo_ptr".
379              
380             Return value:
381              
382             If "unicode" does not need to be a surrogate pair, the error
383             UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr"
384             and "*lo_ptr" are undefined. If the conversion is successful,
385             UNICODE_OK is returned. */
386              
387             int32_t
388 9           unicode_to_surrogates (int32_t unicode, int32_t * hi_ptr, int32_t * lo_ptr)
389             {
390 9           int32_t hi = UNI_SUR_HIGH_START;
391 9           int32_t lo = UNI_SUR_LOW_START;
392 9 50         if (unicode < HALF_BASE) {
393             /* Doesn't need to be a surrogate pair. */
394 0           return UNICODE_NOT_SURROGATE_PAIR;
395             }
396 9           unicode -= HALF_BASE;
397 9           hi |= ((unicode >> TEN_BITS) & LOW_TEN_BITS);
398 9           lo |= ((unicode) & LOW_TEN_BITS);
399 9           * hi_ptr = hi;
400 9           * lo_ptr = lo;
401 9           return UNICODE_OK;
402             }
403              
404             /* Convert a surrogate pair in "hi" and "lo" to a single Unicode
405             value. The return value is the Unicode value. If the return value
406             is negative, an error has occurred. If "hi" is not in the range
407             0xD800 to 0xDBFF, or "lo" is not in the range 0xDC00 to 0xDFFF, the
408             error value UNICODE_NOT_SURROGATE_PAIR is returned.
409            
410             https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */
411              
412             int32_t
413 0           surrogates_to_unicode (int32_t hi, int32_t lo)
414             {
415             int32_t u;
                /* Reject anything which is not a high surrogate together with a
                low surrogate. */
416 0 0         if (hi < UNI_SUR_HIGH_START || hi > UNI_SUR_HIGH_END ||
    0          
    0          
417 0 0         lo < UNI_SUR_LOW_START || lo > UNI_SUR_LOW_END) {
418 0           return UNICODE_NOT_SURROGATE_PAIR;
419             }
                /* "hi" supplies the upper ten bits and "lo" the lower ten bits of
                the offset from HALF_BASE (0x10000). */
420 0           u = ((hi - UNI_SUR_HIGH_START) << TEN_BITS)
421 0           + (lo - UNI_SUR_LOW_START) + HALF_BASE;
422 0           return u;
423             }
424              
                /* The surrogate and bit-manipulation helper macros are undefined
                here, so they are not visible to the code which follows. */
425             #undef UNI_SUR_HIGH_START
426             #undef UNI_SUR_HIGH_END
427             #undef UNI_SUR_LOW_START
428             #undef UNI_SUR_LOW_END
429             #undef TEN_BITS
430             #undef HALF_BASE
431             #undef LOW_TEN_BITS
432              
433             /* Convert the surrogate pair in "hi" and "lo" to UTF-8 in
434             "utf8". This calls "surrogates_to_unicode" and then "ucs2_to_utf8",
435             thus it can return the same errors as them, and it has the same
436             restriction on "utf8" as "ucs2_to_utf8". On success the return
                value is the return value of "ucs2_to_utf8". */
437              
438             int32_t
439 0           surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t * utf8)
440             {
441             int32_t C;
442 0           C = surrogates_to_unicode (hi, lo);
                /* A negative value is an error code from "surrogates_to_unicode",
                which is passed back to the caller unchanged. */
443 0 0         if (C < 0) {
444 0           return C;
445             }
446 0           return ucs2_to_utf8 (C, utf8);
447             }
448              
449             /* Given a nul-terminated string "utf8" and a number of Unicode
450             characters "n_chars", return the number of bytes into "utf8" at
451             which the end of the characters occurs. A negative value indicates
452             some kind of error. If "utf8" contains a zero byte, the return
453             value is UNICODE_EMPTY_INPUT. This may also return any of the error
454             values of "utf8_to_ucs2". */
455              
456             int32_t
457 0           unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars)
458             {
459             int32_t i;
460 0           const uint8_t * p = utf8;
461 0           int32_t len = strlen ((const char *) utf8);
462 0 0         if (len == 0 && n_chars != 0) {
    0          
463 0           return UNICODE_EMPTY_INPUT;
464             }
465 0 0         for (i = 0; i < n_chars; i++) {
466 0           int32_t ucs2 = utf8_to_ucs2 (p, & p);
467 0 0         if (ucs2 < 0) {
468 0           return ucs2;
469             }
470             }
471 0           return p - utf8;
472             }
473              
474             /* Like unicode_count_chars, but without error checks or validation of
475             the input. This only checks the first byte of each UTF-8
476             sequence. It may return UTF8_BAD_LEADING_BYTE if the first byte is
477             invalid. */
478              
479             int32_t
480 0           unicode_count_chars_fast (const uint8_t * utf8)
481             {
482             int32_t chars;
483             const uint8_t * p;
484 0           chars = 0;
485 0           p = utf8;
486 0 0         while (*p) {
487             int32_t len;
488 0           len = utf8_sequence_len[*p];
489 0 0         if (len == 0) {
490             /* The first byte of a UTF-8 sequence is bad, so return
491             this, not BAD_UTF8. */
492 0           return UTF8_BAD_LEADING_BYTE;
493             }
494 0           p += len;
495 0           chars++;
496             }
497 0           return chars;
498             }
499              
/* Given a nul-terminated string "utf8", return the total number of
   Unicode characters it contains. An empty string contains zero
   characters.

   Return value

   If an error occurs, this returns the error value of "utf8_to_ucs2"
   for the first character which could not be decoded. */

int32_t
unicode_count_chars (const uint8_t * utf8)
{
    int32_t chars = 0;
    const uint8_t * p = utf8;

    /* "utf8_to_ucs2" never accepts a sequence which contains a nul
       byte, so "p" cannot move past the terminator, and the string
       does not need to be measured in advance. */
    while (*p != '\0') {
        int32_t ucs2;
        ucs2 = utf8_to_ucs2 (p, & p);
        if (ucs2 < 0) {
            /* Return the error from utf8_to_ucs2. */
            return ucs2;
        }
        chars++;
    }
    return chars;
}
533              
534             #ifdef HEADER
535              
536             /* These macros are intended for use in switch statements. Each one
537             expands to every byte value in the range given by its name, joined
                by ": case", so the user writes only the first "case" and the final
                colon. For example,
                 
538             switch (c) {
539             case BYTE_80_8F:
540             do_something;
541              
542             They originally come from the Json3 project. */
543              
544             #define BYTE_80_8F \
545             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
546             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
547             case 0x8E: case 0x8F
548             #define BYTE_80_9F \
549             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
550             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
551             case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
552             case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
553             case 0x9C: case 0x9D: case 0x9E: case 0x9F
554             #define BYTE_80_BF \
555             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
556             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
557             case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
558             case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
559             case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
560             case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
561             case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
562             case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
563             case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
564             case 0xBF
565             #define BYTE_90_BF \
566             0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \
567             case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \
568             case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \
569             case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \
570             case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \
571             case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \
572             case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
573             #define BYTE_A0_BF \
574             0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \
575             case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: \
576             case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \
577             case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: \
578             case 0xBC: case 0xBD: case 0xBE: case 0xBF
579             #define BYTE_C2_DF \
580             0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \
581             case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \
582             case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \
583             case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \
584             case 0xDE: case 0xDF
585             #define BYTE_E1_EC \
586             0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \
587             case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC
588             #define BYTE_EE_EF \
589             0xEE: case 0xEF
590             #define BYTE_F1_F3 \
591             0xF1: case 0xF2: case 0xF3
592             #endif /* def HEADER */
593              
594             #define UNICODEADDBYTE i++
595              
596             #define UNICODEFAILUTF8(want) return UTF8_INVALID
597              
598             #define UNICODENEXTBYTE c = input[i]
599              
600             /* Given "input" and "input_length", validate "input" byte by byte up
601             to "input_length". The return value may be UTF8_VALID or
602             UTF8_INVALID. */
603              
604             int32_t
605 0           valid_utf8 (const uint8_t * input, int32_t input_length)
606             {
607             int32_t i;
608             uint8_t c;
609              
610 0           i = 0;
611              
612             string_start:
613              
614 0           i++;
615 0 0         if (i >= input_length) {
616 0           return UTF8_VALID;
617             }
618             /* Set c separately here since we use a range comparison before
619             the switch statement. */
620 0           c = input[i];
621              
622             /* Admit all bytes <= 0x80. */
623 0 0         if (c <= 0x80) {
624 0           goto string_start;
625             }
626              
627 0           switch (c) {
628             case BYTE_C2_DF:
629 0           UNICODEADDBYTE;
630 0           goto byte_last_80_bf;
631            
632             case 0xE0:
633 0           UNICODEADDBYTE;
634 0           goto byte23_a0_bf;
635            
636             case BYTE_E1_EC:
637 0           UNICODEADDBYTE;
638 0           goto byte_penultimate_80_bf;
639            
640             case 0xED:
641 0           UNICODEADDBYTE;
642 0           goto byte23_80_9f;
643            
644             case BYTE_EE_EF:
645 0           UNICODEADDBYTE;
646 0           goto byte_penultimate_80_bf;
647            
648             case 0xF0:
649 0           UNICODEADDBYTE;
650 0           goto byte24_90_bf;
651            
652             case BYTE_F1_F3:
653 0           UNICODEADDBYTE;
654 0           goto byte24_80_bf;
655            
656             case 0xF4:
657 0           UNICODEADDBYTE;
658 0           goto byte24_80_8f;
659              
660             }
661              
662             byte_last_80_bf:
663              
664 0 0         switch (UNICODENEXTBYTE) {
665              
666             case BYTE_80_BF:
667 0           UNICODEADDBYTE;
668 0           goto string_start;
669             default:
670 0           UNICODEFAILUTF8 (XBYTES_80_BF);
671             }
672              
673             byte_penultimate_80_bf:
674              
675 0 0         switch (UNICODENEXTBYTE) {
676              
677             case BYTE_80_BF:
678 0           UNICODEADDBYTE;
679 0           goto byte_last_80_bf;
680             default:
681 0           UNICODEFAILUTF8 (XBYTES_80_BF);
682             }
683              
684             byte24_90_bf:
685              
686 0 0         switch (UNICODENEXTBYTE) {
687              
688             case BYTE_90_BF:
689 0           UNICODEADDBYTE;
690 0           goto byte_penultimate_80_bf;
691             default:
692 0           UNICODEFAILUTF8 (XBYTES_90_BF);
693             }
694              
695             byte23_80_9f:
696              
697 0 0         switch (UNICODENEXTBYTE) {
698              
699             case BYTE_80_9F:
700 0           UNICODEADDBYTE;
701 0           goto byte_last_80_bf;
702             default:
703 0           UNICODEFAILUTF8 (XBYTES_80_9F);
704             }
705              
706             byte23_a0_bf:
707              
708 0 0         switch (UNICODENEXTBYTE) {
709              
710             case BYTE_A0_BF:
711 0           UNICODEADDBYTE;
712 0           goto byte_last_80_bf;
713             default:
714 0           UNICODEFAILUTF8 (XBYTES_A0_BF);
715             }
716              
717             byte24_80_bf:
718              
719 0 0         switch (UNICODENEXTBYTE) {
720              
721             case BYTE_80_BF:
722 0           UNICODEADDBYTE;
723 0           goto byte_penultimate_80_bf;
724             default:
725 0           UNICODEFAILUTF8 (XBYTES_80_BF);
726             }
727              
728             byte24_80_8f:
729              
730 0 0         switch (UNICODENEXTBYTE) {
731              
732             case BYTE_80_8F:
733 0           UNICODEADDBYTE;
734 0           goto byte_penultimate_80_bf;
735             default:
736 0           UNICODEFAILUTF8 (XBYTES_80_8F);
737             }
738             }
739              
740             /* Make "* ptr" point to the start of the first UTF-8 character at or
741             after its initial value. This assumes that there are at least four
742             bytes which can be read, and that "* ptr" points to valid UTF-8.
743              
744             If "** ptr" does not have its top bit set, 0xxx_xxxx, this does not
745             change the value of "* ptr", and it returns UNICODE_OK. If "** ptr"
746             has its top two bits set, 11xx_xxxx, this does not change the value
747             of "* ptr" and it returns UNICODE_OK. If "**ptr" has its top bit
748             set but its second-to-top bit unset, 10xx_xxxx, so it is the
749             second, third, or fourth byte of a multibyte sequence, "* ptr" is
750             incremented until either "** ptr" is a valid first byte of a UTF-8
751             sequence, or too many bytes have passed for it to be valid
752             UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is returned
753             and "*ptr" is left unchanged. If a valid UTF-8 first byte was
754             found, either 11xx_xxxx or 0xxx_xxxx, UNICODE_OK is returned, and
755             "*ptr" is set to the address of the valid byte. Nul bytes (bytes
756             containing zero) are considered valid. This does not check for
757             invalid UTF-8 bytes such as 0xFE and 0xFF. */
758              
759             int32_t
760 0           trim_to_utf8_start (uint8_t ** ptr)
761             {
762 0           uint8_t * p = *ptr;
763             uint8_t c;
764             int32_t i;
765             /* 0xC0 = 1100_0000. */
766 0           c = *p & 0xC0;
                /* Quick exit for 11xx_xxxx and 00xx_xxxx. A byte of the form
                01xx_xxxx is accepted by the first pass of the loop below, which
                also leaves "* ptr" unchanged. */
767 0 0         if (c == 0xC0 || c == 0x00) {
    0          
768 0           return UNICODE_OK;
769             }
                /* Examine at most UTF8_MAX_LENGTH - 1 bytes, starting with the
                byte at "* ptr" itself. */
770 0 0         for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) {
771 0           c = p[i];
                /* Stop at the first byte which is not of the form 10xx_xxxx. */
772 0 0         if ((c & 0x80) != 0x80 || (c & 0x40) != 0) {
    0          
773 0           * ptr = p + i;
774 0           return UNICODE_OK;
775             }
776             }
777 0           return UTF8_BAD_CONTINUATION_BYTE;
778             }
779              
780             /* Given a return value "code" which is negative or zero, return a
781             string which describes what the return value means. Positive
782             non-zero return values never indicate errors or statuses in this
783             library. */
784              
785             const char *
786 0           unicode_code_to_error (int32_t code)
787             {
788 0           switch (code) {
789             case UTF8_BAD_LEADING_BYTE:
790 0           return "The leading byte of a UTF-8 sequence was invalid";
791             case UTF8_BAD_CONTINUATION_BYTE:
792 0           return "A continuation byte of a UTF-8 sequence was invalid";
793             case UNICODE_SURROGATE_PAIR:
794 0           return "A surrogate pair code point could not be converted to UTF-8";
795             case UNICODE_NOT_SURROGATE_PAIR:
796 0           return "Input code points did not form a surrogate pair";
797             case UNICODE_OK:
798 0           return "Successful completion";
799             case UNICODE_TOO_BIG:
800 0           return "A code point was beyond limits";
801             case UNICODE_NOT_CHARACTER:
802 0           return "A number ending in hex FFFF or FFFE is not valid Unicode";
803             case UTF8_NON_SHORTEST:
804 0           return "A UTF-8 input was not in the shortest form";
805             case UNICODE_EMPTY_INPUT:
806 0           return "A byte with value zero was found in UTF-8 input";
807             default:
808 0           return "Unknown/invalid error code";
809             }
810             }
811              
812             /* _____ _
813             |_ _|__ ___| |_ ___
814             | |/ _ \/ __| __/ __|
815             | | __/\__ \ |_\__ \
816             |_|\___||___/\__|___/
817             */
818            
819             /* Below this is code for testing which is not normally compiled. Use
820             "make test" to compile the testing version. */
821              
822             #ifdef TEST
823              
824             #include
825             #include
826             #include "c-tap-test.h"
827              
/* Sample text shared by the tests: three kanji (three bytes each in
   UTF-8), three accented Latin capitals (two bytes each) and one
   ASCII letter, which makes seven characters in sixteen bytes. The
   cast keeps the "const" of the string literal. */

static const uint8_t * utf8 = (const uint8_t *) "漢数字ÔÕÖX";

/* Size in bytes of the scratch buffer which the round-trip test
   writes its re-encoded UTF-8 into. */

#define BUFFSIZE 0x100
831              
832             static void test_ucs2_to_utf8 ()
833             {
834             /* Buffer to print utf8 out into. */
835             uint8_t buffer[BUFFSIZE];
836             /* Offset into buffer. */
837             uint8_t * offset;
838             const uint8_t * start = utf8;
839              
840             offset = buffer;
841             while (1) {
842             int32_t unicode;
843             int32_t bytes;
844             const uint8_t * end;
845             unicode = utf8_to_ucs2 (start, & end);
846             if (unicode == UNICODE_EMPTY_INPUT) {
847             break;
848             }
849             if (unicode < 0) {
850             fprintf (stderr,
851             "%s:%d: unexpected error %s converting unicode.\n",
852             __FILE__, __LINE__, unicode_code_to_error (unicode));
853             // exit ok in test
854             exit (EXIT_FAILURE);
855             }
856             bytes = ucs2_to_utf8 (unicode, offset);
857             TAP_TEST_MSG (bytes > 0, "no bad conversion");
858             TAP_TEST_MSG (strncmp ((const char *) offset,
859             (const char *) start, bytes) == 0,
860             "round trip OK for %X (%d bytes)", unicode, bytes);
861             start = end;
862             offset += bytes;
863             if (offset - buffer >= BUFFSIZE) {
864             fprintf (stderr, "%s:%d: out of space in buffer.\n",
865             __FILE__, __LINE__);
866             // exit ok
867             exit (EXIT_FAILURE);
868             }
869             }
870             * offset = '\0';
871             TAP_TEST_MSG (strcmp ((const char *) buffer, (const char *) utf8) == 0,
872             "input %s resulted in identical output %s",
873             utf8, buffer);
874             }
875              
876             static void
877             test_invalid_utf8 ()
878             {
879             uint8_t invalid_utf8[UTF8_MAX_LENGTH];
880             int32_t unicode;
881             int32_t valid;
882             const uint8_t * end;
883             snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1,
884             "%c%c%c", 0xe8, 0xe4, 0xe5);
885             unicode = utf8_to_ucs2 (invalid_utf8, & end);
886             TAP_TEST_MSG (unicode == UTF8_BAD_CONTINUATION_BYTE,
887             "invalid UTF-8 gives incorrect result");
888             valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8));
889             TAP_TEST_MSG (valid == UTF8_INVALID, "Invalid UTF-8 fails valid_utf8");
890             }
891              
892             static void
893             test_surrogate_pairs ()
894             {
895             int32_t status;
896             int32_t hi;
897             int32_t lo;
898             int32_t rt;
899             /* This is the wide character space, which does not require
900             representation as a surrogate pair. */
901             int32_t nogood = 0x3000;
902             /*
903             Two examples from the Wikipedia article on UTF-16
904             https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. */
905             int32_t wikipedia_1 = 0x10437;
906             int32_t wikipedia_2 = 0x24b62;
907             /*
908             An example from the JSON RFC
909             http://rfc7159.net/rfc7159#rfc.section.7
910             */
911             int32_t json_spec = 0x1D11E;
912              
913             status = unicode_to_surrogates (nogood, & hi, & lo);
914              
915             TAP_TEST_MSG (status == UNICODE_NOT_SURROGATE_PAIR,
916             "low value to surrogate pair breaker returns error");
917              
918             status = unicode_to_surrogates (wikipedia_1, & hi, & lo);
919             TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1);
920             TAP_TEST_MSG (hi == 0xD801, "Got expected %X == 0xD801", hi);
921             TAP_TEST_MSG (lo == 0xDC37, "Got expected %X == 0xDC37", lo);
922             rt = surrogates_to_unicode (hi, lo);
923             TAP_TEST_MSG (rt == wikipedia_1, "Round trip %X == initial %X",
924             rt, wikipedia_1);
925              
926             status = unicode_to_surrogates (wikipedia_2, & hi, & lo);
927             TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1);
928             TAP_TEST_MSG (hi == 0xD852, "Got expected %X == 0xD852", hi);
929             TAP_TEST_MSG (lo == 0xDF62, "Got expected %X == 0xDF62", lo);
930             rt = surrogates_to_unicode (hi, lo);
931             TAP_TEST_MSG (rt == wikipedia_2, "Round trip %X == initial %X",
932             rt, wikipedia_2);
933              
934             status = unicode_to_surrogates (json_spec, & hi, & lo);
935             TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", json_spec);
936             TAP_TEST_MSG (hi == 0xD834, "Got expected %X == 0xD834", hi);
937             TAP_TEST_MSG (lo == 0xDd1e, "Got expected %X == 0xDD1e", lo);
938             rt = surrogates_to_unicode (hi, lo);
939             TAP_TEST_MSG (rt == json_spec, "Round trip %X == initial %X",
940             rt, json_spec);
941             }
942              
943             /* Test sending various bytes into "utf8_bytes" and seeing whether the
944             return value is what we expected. */
945              
946             static void
947             test_utf8_bytes ()
948             {
949             struct tub {
950             int32_t first;
951             int32_t expect;
952             } tests[] = {
953             {'a', 1},
954             {0xb0, UTF8_BAD_LEADING_BYTE},
955             {0xc2, 2},
956             {0xff, UTF8_BAD_LEADING_BYTE},
957             };
958             int32_t n_tests = sizeof (tests) / sizeof (struct tub);
959             int32_t i;
960             for (i = 0; i < n_tests; i++) {
961             /* Expected bytes. */
962             int32_t xbytes;
963             int32_t firstbyte;
964             firstbyte = tests[i].first;
965             xbytes = utf8_bytes (firstbyte);
966             TAP_TEST_MSG (xbytes == tests[i].expect, "Got %d (%d) with input %d",
967             xbytes, tests[i].expect, firstbyte);
968             }
969             }
970              
971             /* Test the conversion from utf-8 to ucs-2 (UTF-16). */
972              
973             static void
974             test_utf8_to_ucs2 ()
975             {
976             const uint8_t * start = utf8;
977             while (*start) {
978             int32_t unicode;
979             const uint8_t * end;
980             unicode = utf8_to_ucs2 (start, & end);
981             TAP_TEST_MSG (unicode > 0, "no bad value at %s", start);
982             printf ("# %s is %04X, length is %d\n", start, unicode, end - start);
983             start = end;
984             }
985             }
986              
987             /* Test counting of unicode characters. */
988              
989             static void
990             test_unicode_count_chars ()
991             {
992             int32_t cc;
993             cc = unicode_count_chars (utf8);
994             TAP_TEST_MSG (cc == 7, "unicode_count_chars gets seven characters for utf8");
995             cc = unicode_count_chars_fast (utf8);
996             TAP_TEST_MSG (cc == 7, "unicode_count_chars_fast gets seven characters for utf8");
997             }
998              
999             static void
1000             test_valid_utf8 ()
1001             {
1002             int32_t valid;
1003             valid = valid_utf8 (utf8, strlen ((const char *) utf8));
1004             TAP_TEST_MSG (valid == UTF8_VALID, "Valid UTF-8 passes valid_utf8");
1005             }
1006              
1007             static void
1008             test_trim_to_utf8_start ()
1009             {
1010             int32_t status;
1011             uint8_t * p;
1012             /* Invalid UTF-8. */
1013             uint8_t bad[] = {0x99, 0x99, 0x99, 0x99, 0x99, 0x99};
1014             /* Valid UTF-8. */
1015             uint8_t good[] = "化苦";
1016             uint8_t good2[] = "化abc";
1017             p = bad;
1018             status = trim_to_utf8_start (& p);
1019             TAP_TEST_MSG (status == UTF8_BAD_CONTINUATION_BYTE,
1020             "Non-UTF-8 causes error");
1021             TAP_TEST_MSG (p == bad, "Did not change pointer");
1022             p = good + 1;
1023             status = trim_to_utf8_start (& p);
1024             TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result");
1025             TAP_TEST_MSG (p != good + 1, "Moved p");
1026             TAP_TEST_MSG (p == good + 3, "Moved p to the right position");
1027             p = good2 + 1;
1028             status = trim_to_utf8_start (& p);
1029             TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result");
1030             TAP_TEST_MSG (p != good2 + 1, "Moved p");
1031             TAP_TEST_MSG (p == good2 + 3, "Moved p to the right position");
1032             }
1033              
1034             static void
1035             test_constants ()
1036             {
1037             TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM);
1038             }
1039              
1040             int main ()
1041             {
1042             test_utf8_to_ucs2 ();
1043             test_ucs2_to_utf8 ();
1044             test_invalid_utf8 ();
1045             test_unicode_count_chars ();
1046             test_surrogate_pairs ();
1047             test_utf8_bytes ();
1048             test_valid_utf8 ();
1049             test_trim_to_utf8_start ();
1050             test_constants ();
1051             TAP_PLAN;
1052             }
1053              
1054             #endif /* def TEST */