File Coverage

unicode.c
Criterion Covered Total %
statement 32 271 11.8
branch 16 164 9.7
condition n/a
subroutine n/a
pod n/a
total 48 435 11.0


line stmt bran cond sub pod time code
1             /* This is a Unicode library in the programming language C which deals
2             with conversions to and from the UTF-8 format. */
3              
4             /*
5             Author:
6              
7             Ben Bullock ,
8              
9             Repository:
10            
11             https://github.com/benkasminbullock/unicode-c
12             */
13              
14             #include <stdint.h>
15             #include <string.h>
16             #include "unicode.h"
17              
18             #ifdef HEADER
19              
20             /* _ _ _ _
21             | | (_)_ __ ___ (_) |_ ___
22             | | | | '_ ` _ \| | __/ __|
23             | |___| | | | | | | | |_\__ \
24             |_____|_|_| |_| |_|_|\__|___/ */
25            
26              
27              
28             /* The maximum number of bytes we need to contain any Unicode code
29             point as UTF-8 as a C string. This length includes one trailing nul
30             byte. */
31              
32             #define UTF8_MAX_LENGTH 5
33              
34             /* The maximum possible value of a Unicode code point. See
35             http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */
36              
37             #define UNICODE_MAXIMUM 0x10ffff
38              
39             /* The maximum possible value which will fit into four bytes of
40             UTF-8. This is larger than UNICODE_MAXIMUM. */
41              
42             #define UNICODE_UTF8_4 0x1fffff
43              
44             /* ____ _ _
45             | _ \ ___| |_ _ _ _ __ _ __ __ ____ _| |_ _ ___ ___
46             | |_) / _ \ __| | | | '__| '_ \ \ \ / / _` | | | | |/ _ \/ __|
47             | _ < __/ |_| |_| | | | | | | \ V / (_| | | |_| | __/\__ \
48             |_| \_\___|\__|\__,_|_| |_| |_| \_/ \__,_|_|\__,_|\___||___/ */
49            
50              
51             /* All of the functions in this library return an "int32_t". Negative
52             values are used to indicate errors. */
53              
54             /* This return value indicates the successful completion of a routine
55             which doesn't use the return value to communicate data back to the
56             caller. */
57              
58             #define UNICODE_OK 0
59              
60             /* This return value means that the leading byte of a UTF-8 sequence
61             was not valid. */
62              
63             #define UTF8_BAD_LEADING_BYTE -1
64              
65             /* This return value means the caller attempted to turn a code point
66             for a surrogate pair to or from UTF-8. */
67              
68             #define UNICODE_SURROGATE_PAIR -2
69              
70             /* This return value means that an attempt was made to convert code
71             points which did not form a surrogate pair into a code point as if
72             they were a surrogate pair. */
73              
74             #define UNICODE_NOT_SURROGATE_PAIR -3
75              
76             /* This return value means that input which was supposed to be UTF-8
77             encoded contained an invalid continuation byte. If the leading byte
78             of a UTF-8 sequence is not valid, UTF8_BAD_LEADING_BYTE is returned
79             instead of this. */
80              
81             #define UTF8_BAD_CONTINUATION_BYTE -4
82              
83             /* This return value indicates a zero byte was found in a string which
84             was supposed to contain UTF-8 bytes. It is returned only by the
85             functions which are documented as not allowing zero bytes. */
86              
87             #define UNICODE_EMPTY_INPUT -5
88              
89             /* This return value indicates that UTF-8 bytes were not in the
90             shortest possible form. See
91             http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8.
92              
93             This return value is currently unused. If a character is not in the
94             shortest form, the error UTF8_BAD_CONTINUATION_BYTE is returned. */
95              
96             #define UTF8_NON_SHORTEST -6
97              
98             /* This return value indicates that there was an attempt to convert a
99             code point which was greater than UNICODE_MAXIMUM or UNICODE_UTF8_4
100             into UTF-8 bytes. */
101              
102             #define UNICODE_TOO_BIG -7
103              
104             /* This return value indicates that the Unicode code-point ended with
105             either 0xFFFF or 0xFFFE, meaning it cannot be used as a character
106             code point, or it was in the disallowed range FDD0 to FDEF. */
107              
108             #define UNICODE_NOT_CHARACTER -8
109              
110             /* This return value indicates that the UTF-8 is valid. It is only
111             used by "valid_utf8". */
112              
113             #define UTF8_VALID 1
114              
115             /* This return value indicates that the UTF-8 is not valid. It is only
116             used by "valid_utf8". */
117              
118             #define UTF8_INVALID 0
119              
120             #endif /* def HEADER */
121              
122             /* This table contains the length of a sequence which begins with the
123             byte given. A value of zero indicates that the byte can not begin a
124             UTF-8 sequence. */
125              
126             /* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */
127              
128             const uint8_t utf8_sequence_len[0x100] =
129             {
130             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
131             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
132             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
133             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
134             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
135             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
136             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
137             1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
138             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
139             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
140             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
141             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
142             0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
143             2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
144             3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
145             4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
146             };
147              
148             /* This function returns the number of bytes of UTF-8 a sequence
149             starting with byte "c" will become, either 1 (c = 0000xxxx), 2 (c =
150             110xxxxx), 3 (c = 1110xxxx), or 4 (c = 111100xx or c =
151             11110100). If "c" is not a valid UTF-8 first byte, the value
152             UTF8_BAD_LEADING_BYTE is returned. */
153              
154 0           int32_t utf8_bytes (uint8_t c)
155             {
156             int32_t r;
157 0           r = utf8_sequence_len[c];
158 0 0         if (r == 0) {
159 0           return UTF8_BAD_LEADING_BYTE;
160             }
161 0           return r;
162             }
163              
/* This macro converts four bytes of UTF-8 into the corresponding code
   point. "x" must be something which can be indexed with [0] to [3]
   to give the four bytes. The argument and the whole expansion are
   parenthesized so that the macro also behaves as a single value when
   it is used inside a larger expression. */

#define FOUR(x)                                         \
    ((((int32_t) ((x)[0] & 0x07)) << 18)                \
     | (((int32_t) ((x)[1] & 0x3F)) << 12)              \
     | (((int32_t) ((x)[2] & 0x3F)) << 6)               \
     | (((int32_t) ((x)[3] & 0x3F))))

/* The three macros below return from the enclosing function with an
   error code if the value given is not acceptable. Each is wrapped
   in "do { } while (0)" so that it acts as one statement, and so it
   can be followed by a semicolon and used safely as the body of an
   "if" or "else". */

/* Reject code points which end in either FFFE or FFFF. */

#define REJECT_FFFF(x)                                  \
    do {                                                \
        if (((x) & 0xFFFF) >= 0xFFFE) {                 \
            return UNICODE_NOT_CHARACTER;               \
        }                                               \
    } while (0)

/* Reject code points in the range UNI_NOT_CHAR_MIN to
   UNI_NOT_CHAR_MAX inclusive. */

#define REJECT_NOT_CHAR(r)                                      \
    do {                                                        \
        if ((r) >= UNI_NOT_CHAR_MIN && (r) <= UNI_NOT_CHAR_MAX) { \
            return UNICODE_NOT_CHARACTER;                       \
        }                                                       \
    } while (0)

/* Reject surrogates, that is, values from UNI_SUR_HIGH_START to
   UNI_SUR_LOW_END inclusive. */

#define REJECT_SURROGATE(ucs2)                                          \
    do {                                                                \
        if ((ucs2) >= UNI_SUR_HIGH_START && (ucs2) <= UNI_SUR_LOW_END) { \
            /* Ill-formed. */                                           \
            return UNICODE_SURROGATE_PAIR;                              \
        }                                                               \
    } while (0)
194              
195             /* Try to convert "input" from UTF-8 to UCS-2, and return a value even
196             if the input is partly broken. This checks the first byte of the
197             input, but it doesn't check the subsequent bytes. */
198              
199             int32_t
200 0           utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr)
201             {
202             uint8_t c;
203 0           c = input[0];
204 0           switch (utf8_sequence_len[c]) {
205             case 1:
206 0           * end_ptr = input + 1;
207 0           return c;
208              
209             case 2:
210 0           * end_ptr = input + 2;
211             return
212 0           (c & 0x1F) << 6 |
213 0           (input[1] & 0x3F);
214              
215             case 3:
216 0           * end_ptr = input + 3;
217             return
218 0           (c & 0x0F) << 12 |
219 0           (input[1] & 0x3F) << 6 |
220 0           (input[2] & 0x3F);
221              
222             case 4:
223 0           * end_ptr = input + 4;
224 0           return FOUR (input);
225              
226             case 0:
227             /* fall through */
228             default:
229 0           return UTF8_BAD_LEADING_BYTE;
230             }
231             }
232              
233             /* Surrogate pair zone. */
234              
235             #define UNI_SUR_HIGH_START 0xD800
236             #define UNI_SUR_HIGH_END 0xDBFF
237             #define UNI_SUR_LOW_START 0xDC00
238             #define UNI_SUR_LOW_END 0xDFFF
239              
240             /* Start of the "not character" range. */
241              
242             #define UNI_NOT_CHAR_MIN 0xFDD0
243              
244             /* End of the "not character" range. */
245              
246             #define UNI_NOT_CHAR_MAX 0xFDEF
247              
248             /* This function converts UTF-8 encoded bytes in "input" into the
249             equivalent Unicode code point. The return value is the Unicode
250             code point corresponding to the UTF-8 character in "input" if
251             successful, and a negative number if not successful. Nul bytes are
252             rejected.
253              
254             "*end_ptr" is set to the next character after the read character on
255             success. "*end_ptr" is set to the start of input on all failures.
256             "end_ptr" may not be NULL.
257              
258             If the first byte of "input" is zero, in other words a NUL or '\0',
259             UNICODE_EMPTY_INPUT is returned.
260              
261             If the first byte of "input" is not valid UTF-8,
262             UTF8_BAD_LEADING_BYTE is returned.
263              
264             If the second or later bytes of "input" are not valid UTF-8,
265             including NUL, UTF8_BAD_CONTINUATION_BYTE is returned.
266              
267             If the value extrapolated from "input" is greater than
268             UNICODE_MAXIMUM, UNICODE_TOO_BIG is returned.
269              
270             If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE,
271             UNICODE_NOT_CHARACTER is returned.
272              
273             If the value extrapolated from "input" is between 0xFDD0 and 0xFDEF,
274             UNICODE_NOT_CHARACTER is returned.
275              
276             If the value is within the range of surrogate pairs, the error
277             UNICODE_SURROGATE_PAIR is returned.
278             */
279              
280             int32_t
281 0           utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
282             {
283             uint8_t c;
284             uint8_t l;
285              
286 0           *end_ptr = input;
287 0           c = input[0];
288 0 0         if (c == 0) {
289 0           return UNICODE_EMPTY_INPUT;
290             }
291 0           l = utf8_sequence_len[c];
292 0 0         if (l == 1) {
293 0           * end_ptr = input + 1;
294 0           return (int32_t) c;
295             }
296 0 0         if (l == 2) {
297             uint8_t d;
298 0           d = input[1];
299             /* Two byte case. */
300 0 0         if (d < 0x80 || d > 0xBF) {
    0          
301 0           return UTF8_BAD_CONTINUATION_BYTE;
302             }
303 0 0         if (c <= 0xC1) {
304 0           return UTF8_BAD_CONTINUATION_BYTE;
305             }
306 0           * end_ptr = input + 2;
307             return
308 0           ((int32_t) (c & 0x1F) << 6) |
309 0           ((int32_t) (d & 0x3F));
310             }
311 0 0         if (l == 3) {
312             uint8_t d;
313             uint8_t e;
314             int32_t r;
315              
316 0           d = input[1];
317 0           e = input[2];
318             /* Three byte case. */
319 0 0         if (d < 0x80 || d > 0xBF ||
    0          
    0          
320 0 0         e < 0x80 || e > 0xBF) {
321 0           return UTF8_BAD_CONTINUATION_BYTE;
322             }
323 0 0         if (c == 0xe0 && d < 0xa0) {
    0          
324             /* We don't need to check the value of input[2], because
325             the if statement above this one already guarantees that
326             it is 10xxxxxx. */
327 0           return UTF8_BAD_CONTINUATION_BYTE;
328             }
329 0           r = ((int32_t) (c & 0x0F)) << 12 |
330 0           ((int32_t) (d & 0x3F)) << 6 |
331 0           ((int32_t) (e & 0x3F));
332 0 0         REJECT_SURROGATE(r);
    0          
333 0 0         REJECT_FFFF(r);
334 0 0         REJECT_NOT_CHAR(r);
    0          
335 0           * end_ptr = input + 3;
336 0           return r;
337             }
338 0 0         else if (l == 4) {
339             /* Four byte case. */
340             uint8_t d;
341             uint8_t e;
342             uint8_t f;
343             int32_t v;
344              
345 0           d = input[1];
346 0           e = input[2];
347 0           f = input[3];
348              
349 0 0         if (/* c must be 11110xxx. */
350 0 0         c >= 0xf8 ||
351             /* d, e, f must be 10xxxxxx. */
352 0 0         d < 0x80 || d >= 0xC0 ||
    0          
353 0 0         e < 0x80 || e >= 0xC0 ||
    0          
354 0 0         f < 0x80 || f >= 0xC0) {
355 0           return UTF8_BAD_CONTINUATION_BYTE;
356             }
357              
358 0 0         if (c == 0xf0 && d < 0x90) {
    0          
359             /* We don't need to check the values of e and f, because
360             the if statement above this one already guarantees that
361             e and f are 10xxxxxx. */
362 0           return UTF8_BAD_CONTINUATION_BYTE;
363             }
364             /* Calculate the code point. */
365 0           v = FOUR (input);
366             /* Greater than U+10FFFF */
367 0 0         if (v > UNICODE_MAXIMUM) {
368 0           return UNICODE_TOO_BIG;
369             }
370             /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
371 0 0         REJECT_FFFF(v);
372             /* We don't need to check for surrogate pairs here, since the
373             minimum value of UCS2 if there are four bytes of UTF-8 is
374             0x10000. */
375 0           * end_ptr = input + 4;
376 0           return v;
377             }
378 0           return UTF8_BAD_LEADING_BYTE;
379             }
380              
381              
382             /* Input: a Unicode code point, "ucs2".
383              
384             Output: UTF-8 characters in buffer "utf8".
385              
386             Return value: the number of bytes written into "utf8", or a
387             negative number if there was an error.
388              
389             If the value of "ucs2" is invalid because of being in the surrogate
390             pair range from 0xD800 to 0xDFFF, the return value is
391             UNICODE_SURROGATE_PAIR.
392              
393             If the value of "ucs2" is in the range 0xFDD0 to 0xFDEF inclusive,
394             the return value is UNICODE_NOT_CHARACTER.
395              
396             If the lower two bytes of "ucs2" are either 0xFFFE or 0xFFFF, the
397             return value is UNICODE_NOT_CHARACTER.
398              
399             If the value is too big to fit into four bytes of UTF-8,
400             UNICODE_UTF8_4, the return value is UNICODE_TOO_BIG.
401              
402             However, it does not insist on ucs2 being less than
403             UNICODE_MAXIMUM, so the user needs to check that "ucs2" is a valid
404             code point.
405              
406             This adds a zero byte to the end of the string. It assumes that the
407             buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to
408             write to, without checking. */
409              
410             int32_t
411 29           ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8)
412             {
413 29 50         REJECT_FFFF(ucs2);
414 29 100         if (ucs2 < 0x80) {
415 4           utf8[0] = ucs2;
416 4           utf8[1] = '\0';
417 4           return 1;
418             }
419 25 50         if (ucs2 < 0x800) {
420 0           utf8[0] = (ucs2 >> 6) | 0xC0;
421 0           utf8[1] = (ucs2 & 0x3F) | 0x80;
422 0           utf8[2] = '\0';
423 0           return 2;
424             }
425 25 100         if (ucs2 < 0xFFFF) {
426 20           utf8[0] = ((ucs2 >> 12) ) | 0xE0;
427 20           utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80;
428 20           utf8[2] = ((ucs2 ) & 0x3F) | 0x80;
429 20           utf8[3] = '\0';
430 20 100         REJECT_SURROGATE(ucs2);
    50          
431 8 50         REJECT_NOT_CHAR(ucs2);
    0          
432 8           return 3;
433             }
434 5 50         if (ucs2 <= UNICODE_UTF8_4) {
435             /* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */
436 5           utf8[0] = 0xF0 | (ucs2 >> 18);
437 5           utf8[1] = 0x80 | ((ucs2 >> 12) & 0x3F);
438 5           utf8[2] = 0x80 | ((ucs2 >> 6) & 0x3F);
439 5           utf8[3] = 0x80 | ((ucs2 & 0x3F));
440 5           utf8[4] = '\0';
441 5           return 4;
442             }
443 0           return UNICODE_TOO_BIG;
444             }
445              
446             /* For shifting by 10 bits. */
447             #define TEN_BITS 10
448             #define HALF_BASE 0x0010000UL
449             /* 0b1111111111 */
450             #define LOW_TEN_BITS 0x3FF
451              
452             /* This converts the Unicode code point in "unicode" into a surrogate
453             pair, and returns the two parts in "* hi_ptr" and "* lo_ptr".
454              
455             Return value:
456              
457             If "unicode" does not need to be a surrogate pair, the error
458             UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr"
459             and "*lo_ptr" are undefined. If the conversion is successful,
460             UNICODE_OK is returned. */
461              
462             int32_t
463 0           unicode_to_surrogates (int32_t unicode, int32_t * hi_ptr, int32_t * lo_ptr)
464             {
465 0           int32_t hi = UNI_SUR_HIGH_START;
466 0           int32_t lo = UNI_SUR_LOW_START;
467 0 0         if (unicode < HALF_BASE) {
468             /* Doesn't need to be a surrogate pair. */
469 0           return UNICODE_NOT_SURROGATE_PAIR;
470             }
471 0           unicode -= HALF_BASE;
472 0           hi |= ((unicode >> TEN_BITS) & LOW_TEN_BITS);
473 0           lo |= ((unicode) & LOW_TEN_BITS);
474 0           * hi_ptr = hi;
475 0           * lo_ptr = lo;
476 0           return UNICODE_OK;
477             }
478              
479             /* Convert a surrogate pair in "hi" and "lo" to a single Unicode
480             value. The return value is the Unicode value. If the return value
481             is negative, an error has occurred. If "hi" and "lo" do not form a
482             surrogate pair, the error value UNICODE_NOT_SURROGATE_PAIR is
483             returned.
484            
485             https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */
486              
487             int32_t
488 5           surrogates_to_unicode (int32_t hi, int32_t lo)
489             {
490             int32_t u;
491 5 50         if (hi < UNI_SUR_HIGH_START || hi > UNI_SUR_HIGH_END ||
    50          
    50          
492 5 50         lo < UNI_SUR_LOW_START || lo > UNI_SUR_LOW_END) {
493 0           return UNICODE_NOT_SURROGATE_PAIR;
494             }
495 10           u = ((hi - UNI_SUR_HIGH_START) << TEN_BITS)
496 5           + (lo - UNI_SUR_LOW_START) + HALF_BASE;
497 5           return u;
498             }
499              
500             #undef UNI_SUR_HIGH_START
501             #undef UNI_SUR_HIGH_END
502             #undef UNI_SUR_LOW_START
503             #undef UNI_SUR_LOW_END
504             #undef TEN_BITS
505             #undef HALF_BASE
506             #undef LOW_TEN_BITS
507              
/* Write the UTF-8 form of the surrogate pair "hi" and "lo" into
   "utf8". The pair is first combined with "surrogates_to_unicode";
   if that reports an error (a negative value), the error is returned
   unchanged and "ucs2_to_utf8" is not called. Otherwise the return
   value is whatever "ucs2_to_utf8" returns for the combined value, so
   the errors of both functions are possible, and "utf8" is under the
   same restriction as for "ucs2_to_utf8". */

int32_t
surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t * utf8)
{
    const int32_t code_point = surrogates_to_unicode (hi, lo);

    return code_point < 0 ? code_point : ucs2_to_utf8 (code_point, utf8);
}
523              
524             /* Given a nul-terminated string "utf8" and a number of Unicode
525             characters "n_chars", return the number of bytes into "utf8" at
526             which the end of the characters occurs. A negative value indicates
527             some kind of error. If "utf8" contains a zero byte, the return
528             value is UNICODE_EMPTY_INPUT. This may also return any of the error
529             values of "utf8_to_ucs2". */
530              
531             int32_t
532 0           unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars)
533             {
534             int32_t i;
535 0           const uint8_t * p = utf8;
536 0           int32_t len = strlen ((const char *) utf8);
537 0 0         if (len == 0 && n_chars != 0) {
    0          
538 0           return UNICODE_EMPTY_INPUT;
539             }
540 0 0         for (i = 0; i < n_chars; i++) {
541 0           int32_t ucs2 = utf8_to_ucs2 (p, & p);
542 0 0         if (ucs2 < 0) {
543 0           return ucs2;
544             }
545             }
546 0           return p - utf8;
547             }
548              
549             /* Like unicode_count_chars, but without error checks or validation of
550             the input. This only checks the first byte of each UTF-8 sequence,
551             then jumps over the succeeding bytes. It may return
552             UTF8_BAD_LEADING_BYTE if the first byte is invalid. */
553              
554             int32_t
555 0           unicode_count_chars_fast (const uint8_t * utf8)
556             {
557             int32_t chars;
558             const uint8_t * p;
559 0           chars = 0;
560 0           p = utf8;
561 0 0         while (*p) {
562             int32_t len;
563 0           len = utf8_sequence_len[*p];
564 0 0         if (len == 0) {
565             /* The first byte of a UTF-8 sequence is bad, so return
566             this, not BAD_UTF8. */
567 0           return UTF8_BAD_LEADING_BYTE;
568             }
569 0           p += len;
570 0           chars++;
571             }
572 0           return chars;
573             }
574              
/* Given a nul-terminated string "utf8", return the total number of
   Unicode characters it contains. An empty string contains zero
   characters.

   Return value

   If an error occurs, the negative error value returned by
   "utf8_to_ucs2" for the offending character is returned instead of
   a count.

   The string is read in a single pass: each character is decoded with
   "utf8_to_ucs2", which never moves beyond the terminating zero byte,
   so the length of the string does not need to be found first. */

int32_t
unicode_count_chars (const uint8_t * utf8)
{
    int32_t chars = 0;
    const uint8_t * p = utf8;

    while (*p != '\0') {
        const int32_t ucs2 = utf8_to_ucs2 (p, & p);

        if (ucs2 < 0) {
            /* Return the error from utf8_to_ucs2. */
            return ucs2;
        }
        chars++;
    }
    return chars;
}
608              
609             #ifdef HEADER
610              
611             /* These are intended for use in switch statements, for example
612              
613             switch (c) {
614             case BYTE_80_8F:
615             do_something;
616              
617             They originally come from the Json3 project. */
618              
619             #define BYTE_80_8F \
620             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
621             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
622             case 0x8E: case 0x8F
623             #define BYTE_80_9F \
624             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
625             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
626             case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
627             case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
628             case 0x9C: case 0x9D: case 0x9E: case 0x9F
629             #define BYTE_80_BF \
630             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
631             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
632             case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
633             case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
634             case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
635             case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
636             case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
637             case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
638             case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
639             case 0xBF
640             #define BYTE_80_8F_B0_BF \
641             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
642             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
643             case 0x8E: case 0x8F: case 0xB0: \
644             case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
645             case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
646             case 0xBF
647             #define BYTE_80_B6_B8_BF \
648             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
649             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
650             case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
651             case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
652             case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
653             case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
654             case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
655             case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \
656             case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
657             case 0xBF
658             #define BYTE_80_BD \
659             0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
660             case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
661             case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
662             case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
663             case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
664             case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
665             case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
666             case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
667             case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD
668             #define BYTE_90_BF \
669             0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \
670             case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \
671             case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \
672             case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \
673             case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \
674             case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \
675             case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
676             #define BYTE_A0_BF \
677             0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \
678             case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: \
679             case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \
680             case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: \
681             case 0xBC: case 0xBD: case 0xBE: case 0xBF
682             #define BYTE_C2_DF \
683             0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \
684             case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \
685             case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \
686             case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \
687             case 0xDE: case 0xDF
688             #define BYTE_E1_EC \
689             0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \
690             case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC
691             #define BYTE_F1_F3 \
692             0xF1: case 0xF2: case 0xF3
693             #endif /* def HEADER */
694              
/* The following three macros are shorthand for use inside a function
   which has a byte array "input", an index "i" into it, and a
   "uint8_t" called "c". */

/* Move the index "i" on to the next byte. The expansion is
   parenthesised so that it remains a single operand wherever it is
   used. */

#define UNICODEADDBYTE (i++)

/* Leave the calling function, returning UTF8_INVALID. The argument
   "want" is accepted but does not appear in the expansion. */

#define UNICODEFAILUTF8(want) return UTF8_INVALID

/* Copy the byte at index "i" of "input" into "c". The value of the
   expression is that byte, so this can be the operand of "switch" or
   of a comparison. Without the parentheses, "UNICODENEXTBYTE == x"
   would assign the result of the comparison to "c". */

#define UNICODENEXTBYTE (c = input[i])
700              
701             /* Given "input" and "input_length", validate "input" byte by byte up
702             to "input_length". The return value may be UTF8_VALID or
703             UTF8_INVALID. */
704              
705             int32_t
706 0           valid_utf8 (const uint8_t * input, int32_t input_length)
707             {
708             int32_t error;
709             utf8_info_t info;
710 0           error = validate_utf8 (input, input_length, & info);
711 0 0         if (error < 0) {
712 0           return UTF8_INVALID;
713             }
714 0           return UTF8_VALID;
715             }
716              
/* Record the current byte offset "i" in "info->len_read", then return
   "x" from the calling function. This needs "info" and "i" to be in
   scope where it is used.

   The body is wrapped in "do { } while (0)" so that "FAIL (x);" is a
   single statement. Without the wrapper, "if (cond) FAIL (x);" would
   guard only the assignment and the "return" would always run. */

#define FAIL(x)                 \
    do {                        \
        info->len_read = i;     \
        return (x);             \
    } while (0)
720              
721             #ifdef HEADER
722              
/* Details reported by "validate_utf8" in addition to its return
   value. */

typedef struct utf8_info {
    /* The number of bytes of the input which were processed. */
    int32_t len_read;
    /* The number of Unicode code points which were read. */
    int32_t runes_read;
} utf8_info_t;
729              
730             #endif /* def HEADER */
731              
732             /* Given "input" and "len", validate "input" byte by byte up to
733             "len". The return value is "UNICODE_OK" (zero) on success or the
734             error found (a negative number) on failure.
735              
736             utf8_info_t is defined in "unicode.h".
737              
738             The value of "info.len_read" is the number of bytes processed. the
739             value of "info.runes_read" is the number of Unicode code points in
740             the input. */
741              
742             int32_t
743 0           validate_utf8 (const uint8_t * input, int32_t len, utf8_info_t * info)
744             {
745             int32_t i;
746             uint8_t c;
747              
748 0           info->len_read = 0;
749             /* We want to increment the runes after "string_start", but that
750             would give us one too many. */
751 0           info->runes_read = -1;
752 0           i = 0;
753              
754             string_start:
755              
756             /* We get here after successfully reading a "rune". */
757              
758 0           info->runes_read++;
759 0 0         if (i >= len) {
760 0           info->len_read = len;
761 0           return UNICODE_OK; /* 0 */
762             }
763              
764             /* Set c separately here since we use a range comparison before
765             the switch statement. */
766              
767 0           c = input[i];
768              
769 0 0         if (c == 0) {
770 0           FAIL (UNICODE_EMPTY_INPUT);
771             }
772             /* Admit all bytes < 0x80. */
773 0 0         if (c < 0x80) {
774 0           i++;
775 0           goto string_start;
776             }
777 0           switch (c) {
778             case BYTE_C2_DF:
779 0           UNICODEADDBYTE;
780 0           goto byte_last_80_bf;
781            
782             case 0xE0:
783 0           UNICODEADDBYTE;
784 0           goto byte23_a0_bf;
785            
786             case BYTE_E1_EC:
787 0           UNICODEADDBYTE;
788 0           goto byte_penultimate_80_bf;
789            
790             case 0xED:
791 0           UNICODEADDBYTE;
792 0           goto byte23_80_9f;
793            
794             case 0xEE:
795 0           UNICODEADDBYTE;
796 0           goto byte_penultimate_80_bf;
797            
798             case 0xEF:
799 0           UNICODEADDBYTE;
800 0           goto byte_ef_80_bf;
801            
802             case 0xF0:
803 0           UNICODEADDBYTE;
804 0           goto byte24_90_bf;
805            
806             case BYTE_F1_F3:
807 0           UNICODEADDBYTE;
808 0           goto byte24_80_bf;
809            
810             case 0xF4:
811 0           UNICODEADDBYTE;
812 0           goto byte24_80_8f;
813              
814             default:
815 0           FAIL (UTF8_BAD_LEADING_BYTE);
816             }
817              
818             byte_last_80_bf:
819              
820 0 0         switch (UNICODENEXTBYTE) {
821             case BYTE_80_BF:
822 0           UNICODEADDBYTE;
823 0           goto string_start;
824             default:
825 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
826             }
827              
828             byte_ef_b7:
829 0 0         switch (UNICODENEXTBYTE) {
830             case BYTE_80_8F_B0_BF:
831 0           UNICODEADDBYTE;
832 0           goto string_start;
833             default:
834 0 0         if (c >= 0x90 && c <= 0xAF) {
    0          
835 0           FAIL (UNICODE_NOT_CHARACTER);
836             }
837             else {
838 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
839             }
840             }
841              
842             byte_last_80_bd:
843              
844 0           switch (UNICODENEXTBYTE) {
845             case BYTE_80_BD:
846 0           UNICODEADDBYTE;
847 0           goto string_start;
848             case 0xBE:
849             case 0xBF:
850 0           FAIL (UNICODE_NOT_CHARACTER);
851             default:
852 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
853             }
854              
855             byte_penultimate_80_bf:
856              
857 0 0         switch (UNICODENEXTBYTE) {
858             case BYTE_80_BF:
859 0           UNICODEADDBYTE;
860 0           goto byte_last_80_bf;
861             default:
862 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
863             }
864              
865             byte_ef_80_bf:
866 0           switch (UNICODENEXTBYTE) {
867             case BYTE_80_B6_B8_BF:
868 0           UNICODEADDBYTE;
869 0           goto byte_last_80_bd;
870             case 0xB7:
871 0           UNICODEADDBYTE;
872             /* FDD0 - FDE7 */
873 0           goto byte_ef_b7;
874             default:
875 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
876             }
877              
878             byte24_90_bf:
879              
880 0 0         switch (UNICODENEXTBYTE) {
881             case BYTE_90_BF:
882 0           UNICODEADDBYTE;
883 0           goto byte_penultimate_80_bf;
884             default:
885 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
886             }
887              
888             byte23_80_9f:
889              
890 0 0         switch (UNICODENEXTBYTE) {
891             case BYTE_80_9F:
892 0           UNICODEADDBYTE;
893 0           goto byte_last_80_bf;
894             default:
895 0 0         if (c >= 0xA0 && c <= 0xBF) {
    0          
896 0           FAIL (UNICODE_SURROGATE_PAIR);
897             }
898             else {
899 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
900             }
901             }
902              
903             byte23_a0_bf:
904              
905 0 0         switch (UNICODENEXTBYTE) {
906             case BYTE_A0_BF:
907 0           UNICODEADDBYTE;
908 0           goto byte_last_80_bf;
909             default:
910 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
911             }
912              
913             byte24_80_bf:
914              
915 0 0         switch (UNICODENEXTBYTE) {
916             case BYTE_80_BF:
917 0           UNICODEADDBYTE;
918 0           goto byte_ef_80_bf;
919             default:
920 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
921             }
922              
923             byte24_80_8f:
924              
925 0 0         switch (UNICODENEXTBYTE) {
926             case BYTE_80_8F:
927 0           UNICODEADDBYTE;
928 0           goto byte_ef_80_bf;
929             default:
930 0 0         if (c >= 0x90) {
931 0           FAIL (UNICODE_TOO_BIG);
932             }
933             else {
934 0           FAIL (UTF8_BAD_CONTINUATION_BYTE);
935             }
936             }
937             }
938              
/* Return UNICODE_NOT_CHARACTER from the calling function if the byte
   "c" is 0xFE or 0xFF. The argument is evaluated twice, so it should
   not have side effects.

   The body is wrapped in "do { } while (0)" so that
   "REJECT_FE_FF (c);" is a single statement which can safely be
   followed by "else", and the argument is parenthesised so that an
   expression can be passed. */

#define REJECT_FE_FF(c)                         \
    do {                                        \
        if ((c) == 0xFF || (c) == 0xFE) {       \
            return UNICODE_NOT_CHARACTER;       \
        }                                       \
    } while (0)
943              
944             /* Make "* ptr" point to the start of the first UTF-8 character after
945             its initial value. This assumes that there are at least four bytes
946             which can be read, and that "* ptr" points to valid UTF-8.
947              
948             If "** ptr" does not have its top bit set, 00xx_xxxx, this does not
949             change the value of "* ptr", and it returns UNICODE_OK. If "** ptr"
950             has its top two bits set, 11xx_xxxx, this does not change the value
951             of "* ptr" and it returns UNICODE_OK. If "**ptr" has its top bit
952             set but its second-to-top bit unset, 10xx_xxxx, so it is the
953             second, third, or fourth byte of a multibyte sequence, "* ptr" is
954             incremented until either "** ptr" is a valid first byte of a UTF-8
955             sequence, or too many bytes have passed for it to be valid
956             UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is
957             returned and "*ptr" is left unchanged.
958              
959             If a valid UTF-8 first byte was found, either 11xx_xxxx or
960             00xx_xxxx, UNICODE_OK is returned, and "*ptr" is set to the address
961             of the valid byte. Nul bytes (bytes containing zero) are considered
962             valid.
963              
964             If any of the bytes read contains invalid UTF-8 bytes 0xFE and
965             0xFF, the error code UNICODE_NOT_CHARACTER is returned and "*ptr"
966             is left unchanged. */
967              
968             int32_t
969 0           trim_to_utf8_start (const uint8_t ** ptr)
970             {
971 0           const uint8_t * p = *ptr;
972             uint8_t c;
973             int32_t i;
974              
975 0           c = * p;
976 0 0         REJECT_FE_FF (c);
    0          
977             /* 0xC0 = 1100_0000. */
978 0           c &= 0xC0;
979 0 0         if (c == 0xC0 || c == 0x00) {
    0          
980 0           return UNICODE_OK;
981             }
982 0 0         for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) {
983 0           c = p[i];
984 0 0         REJECT_FE_FF (c);
    0          
985 0 0         if ((c & 0x80) != 0x80 || (c & 0x40) != 0) {
    0          
986 0           * ptr = p + i;
987 0           return UNICODE_OK;
988             }
989             }
990 0           return UTF8_BAD_CONTINUATION_BYTE;
991             }
992              
993             /* Given a return value "code" which is negative or zero, return a
994             string which describes what the return value means. Positive
995             non-zero return values never indicate errors or statuses in this
996             library. */
997              
998             const char *
999 0           unicode_code_to_error (int32_t code)
1000             {
1001 0           switch (code) {
1002             case UTF8_BAD_LEADING_BYTE:
1003 0           return "The leading byte of a UTF-8 sequence was invalid";
1004             case UTF8_BAD_CONTINUATION_BYTE:
1005 0           return "A continuation byte of a UTF-8 sequence was invalid";
1006             case UNICODE_SURROGATE_PAIR:
1007 0           return "A surrogate pair code point could not be converted to UTF-8";
1008             case UNICODE_NOT_SURROGATE_PAIR:
1009 0           return "Input code points did not form a surrogate pair";
1010             case UNICODE_OK:
1011 0           return "Successful completion";
1012             case UNICODE_TOO_BIG:
1013 0           return "A code point was beyond limits";
1014             case UNICODE_NOT_CHARACTER:
1015 0           return "A number ending in hex FFFF or FFFE is not valid Unicode";
1016             case UTF8_NON_SHORTEST:
1017 0           return "A UTF-8 input was not in the shortest form";
1018             case UNICODE_EMPTY_INPUT:
1019 0           return "A byte with value zero was found in UTF-8 input";
1020             default:
1021 0           return "Unknown/invalid error code";
1022             }
1023             }
1024              
1025             /* _____ _
1026             |_ _|__ ___| |_ ___
1027             | |/ _ \/ __| __/ __|
1028             | | __/\__ \ |_\__ \
1029             |_|\___||___/\__|___/
1030             */
1031            
1032             /* Below this is code for testing which is not normally compiled. Use
1033             "make test" to compile the testing version. */
1034              
1035             #ifdef TEST
1036              
1037             #include
1038             #include
1039             #include "c-tap-test.h"
1040              
/* Valid UTF-8 shared by the tests: three three-byte characters, three
   two-byte characters, and one ASCII character. */
static const uint8_t * utf8 = (const uint8_t *) "漢数字ÔÕÖX";

/* Six continuation bytes in a row and then a nul, which is not valid
   UTF-8. */
static const uint8_t bad[] = {
    0x99, 0x99, 0x99, 0x99, 0x99, 0x99,
    0x00,
};

/* The size in bytes of the output buffer in "test_ucs2_to_utf8". */
#define BUFFSIZE 0x100
1045              
1046             static void test_ucs2_to_utf8 ()
1047             {
1048             /* Buffer to print utf8 out into. */
1049             uint8_t buffer[BUFFSIZE];
1050             /* Offset into buffer. */
1051             uint8_t * offset;
1052             const uint8_t * start = utf8;
1053              
1054             offset = buffer;
1055             while (1) {
1056             int32_t unicode;
1057             int32_t bytes;
1058             const uint8_t * end;
1059             unicode = utf8_to_ucs2 (start, & end);
1060             if (unicode == UNICODE_EMPTY_INPUT) {
1061             break;
1062             }
1063             if (unicode < 0) {
1064             fprintf (stderr,
1065             "%s:%d: unexpected error %s converting unicode.\n",
1066             __FILE__, __LINE__, unicode_code_to_error (unicode));
1067             // exit ok in test
1068             exit (EXIT_FAILURE);
1069             }
1070             bytes = ucs2_to_utf8 (unicode, offset);
1071             TAP_TEST_MSG (bytes > 0, "no bad conversion");
1072             TAP_TEST_MSG (strncmp ((const char *) offset,
1073             (const char *) start, bytes) == 0,
1074             "round trip OK for %X (%d bytes)", unicode, bytes);
1075             start = end;
1076             offset += bytes;
1077             if (offset - buffer >= BUFFSIZE) {
1078             fprintf (stderr, "%s:%d: out of space in buffer.\n",
1079             __FILE__, __LINE__);
1080             // exit ok
1081             exit (EXIT_FAILURE);
1082             }
1083             }
1084             * offset = '\0';
1085             TAP_TEST_MSG (strcmp ((const char *) buffer, (const char *) utf8) == 0,
1086             "input %s resulted in identical output %s",
1087             utf8, buffer);
1088             }
1089              
1090             static void
1091             test_invalid_utf8 ()
1092             {
1093             uint8_t invalid_utf8[UTF8_MAX_LENGTH];
1094             int32_t unicode;
1095             int32_t valid;
1096             const uint8_t * end;
1097             snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1,
1098             "%c%c%c", 0xe8, 0xe4, 0xe5);
1099             unicode = utf8_to_ucs2 (invalid_utf8, & end);
1100             TAP_TEST_MSG (unicode == UTF8_BAD_CONTINUATION_BYTE,
1101             "invalid UTF-8 gives incorrect result");
1102             valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8));
1103             TAP_TEST_MSG (valid == UTF8_INVALID, "Invalid UTF-8 fails valid_utf8");
1104             }
1105              
1106             static void
1107             test_surrogate_pairs ()
1108             {
1109             int32_t status;
1110             int32_t hi;
1111             int32_t lo;
1112             int32_t rt;
1113             /* This is the wide character space, which does not require
1114             representation as a surrogate pair. */
1115             int32_t nogood = 0x3000;
1116             /*
1117             Two examples from the Wikipedia article on UTF-16
1118             https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. */
1119             int32_t wikipedia_1 = 0x10437;
1120             int32_t wikipedia_2 = 0x24b62;
1121             /*
1122             An example from the JSON RFC
1123             http://rfc7159.net/rfc7159#rfc.section.7
1124             */
1125             int32_t json_spec = 0x1D11E;
1126              
1127             status = unicode_to_surrogates (nogood, & hi, & lo);
1128              
1129             TAP_TEST_MSG (status == UNICODE_NOT_SURROGATE_PAIR,
1130             "low value to surrogate pair breaker returns error");
1131              
1132             status = unicode_to_surrogates (wikipedia_1, & hi, & lo);
1133             TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1);
1134             TAP_TEST_MSG (hi == 0xD801, "Got expected %X == 0xD801", hi);
1135             TAP_TEST_MSG (lo == 0xDC37, "Got expected %X == 0xDC37", lo);
1136             rt = surrogates_to_unicode (hi, lo);
1137             TAP_TEST_MSG (rt == wikipedia_1, "Round trip %X == initial %X",
1138             rt, wikipedia_1);
1139              
1140             status = unicode_to_surrogates (wikipedia_2, & hi, & lo);
1141             TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1);
1142             TAP_TEST_MSG (hi == 0xD852, "Got expected %X == 0xD852", hi);
1143             TAP_TEST_MSG (lo == 0xDF62, "Got expected %X == 0xDF62", lo);
1144             rt = surrogates_to_unicode (hi, lo);
1145             TAP_TEST_MSG (rt == wikipedia_2, "Round trip %X == initial %X",
1146             rt, wikipedia_2);
1147              
1148             status = unicode_to_surrogates (json_spec, & hi, & lo);
1149             TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", json_spec);
1150             TAP_TEST_MSG (hi == 0xD834, "Got expected %X == 0xD834", hi);
1151             TAP_TEST_MSG (lo == 0xDd1e, "Got expected %X == 0xDD1e", lo);
1152             rt = surrogates_to_unicode (hi, lo);
1153             TAP_TEST_MSG (rt == json_spec, "Round trip %X == initial %X",
1154             rt, json_spec);
1155             }
1156              
1157             /* Test sending various bytes into "utf8_bytes" and seeing whether the
1158             return value is what we expected. */
1159              
1160             static void
1161             test_utf8_bytes ()
1162             {
1163             struct tub {
1164             int32_t first;
1165             int32_t expect;
1166             } tests[] = {
1167             {'a', 1},
1168             {0xb0, UTF8_BAD_LEADING_BYTE},
1169             {0xc2, 2},
1170             {0xff, UTF8_BAD_LEADING_BYTE},
1171             };
1172             int32_t n_tests = sizeof (tests) / sizeof (struct tub);
1173             int32_t i;
1174             for (i = 0; i < n_tests; i++) {
1175             /* Expected bytes. */
1176             int32_t xbytes;
1177             int32_t firstbyte;
1178             firstbyte = tests[i].first;
1179             xbytes = utf8_bytes (firstbyte);
1180             TAP_TEST_MSG (xbytes == tests[i].expect, "Got %d (%d) with input %d",
1181             xbytes, tests[i].expect, firstbyte);
1182             }
1183             }
1184              
1185             /* Test the conversion from utf-8 to ucs-2 (UTF-16). */
1186              
1187             static void
1188             test_utf8_to_ucs2 ()
1189             {
1190             const uint8_t * start = utf8;
1191             while (*start) {
1192             int32_t unicode;
1193             const uint8_t * end;
1194             unicode = utf8_to_ucs2 (start, & end);
1195             TAP_TEST_MSG (unicode > 0, "no bad value at %s", start);
1196             printf ("# %s is %04X, length is %d\n",
1197             start, unicode, (int) (end - start));
1198             start = end;
1199             }
1200             }
1201              
1202             /* Test counting of unicode characters. */
1203              
1204             static void
1205             test_unicode_count_chars ()
1206             {
1207             int32_t cc;
1208             cc = unicode_count_chars (utf8);
1209             TAP_TEST_MSG (cc == 7, "unicode_count_chars gets seven characters for utf8");
1210             cc = unicode_count_chars_fast (utf8);
1211             TAP_TEST_MSG (cc == 7, "unicode_count_chars_fast gets seven characters for utf8");
1212             }
1213              
1214             static void
1215             test_valid_utf8 ()
1216             {
1217             int32_t valid;
1218             valid = valid_utf8 (utf8, strlen ((const char *) utf8));
1219             TAP_TEST_MSG (valid == UTF8_VALID, "Valid UTF-8 passes valid_utf8");
1220             }
1221              
1222             static void
1223             test_trim_to_utf8_start ()
1224             {
1225             int32_t status;
1226             const uint8_t * p;
1227             /* Invalid UTF-8. */
1228             /* Valid UTF-8. */
1229             uint8_t good[] = "化苦";
1230             uint8_t good2[] = "化abc";
1231             p = bad;
1232             status = trim_to_utf8_start (& p);
1233             TAP_TEST_MSG (status == UTF8_BAD_CONTINUATION_BYTE,
1234             "Non-UTF-8 causes error");
1235             TAP_TEST_MSG (p == bad, "Did not change pointer");
1236             p = good + 1;
1237             status = trim_to_utf8_start (& p);
1238             TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result");
1239             TAP_TEST_MSG (p != good + 1, "Moved p");
1240             TAP_TEST_MSG (p == good + 3, "Moved p to the right position");
1241             p = good2 + 1;
1242             status = trim_to_utf8_start (& p);
1243             TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result");
1244             TAP_TEST_MSG (p != good2 + 1, "Moved p");
1245             TAP_TEST_MSG (p == good2 + 3, "Moved p to the right position");
1246             }
1247              
1248             static void
1249             test_constants ()
1250             {
1251             TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM);
1252             }
1253              
1254             static void
1255             test_utf8_validate ()
1256             {
1257             int r;
1258             int l;
1259             utf8_info_t info;
1260              
1261             r = validate_utf8 ((const uint8_t *) "", 0, & info);
1262             TAP_TEST_EQUAL (r, UNICODE_OK);
1263             TAP_TEST_EQUAL (info.len_read, 0);
1264             TAP_TEST_EQUAL (info.runes_read, 0);
1265              
1266             l = strlen ((const char *) utf8);
1267             r = validate_utf8 (utf8, l, & info);
1268             TAP_TEST_EQUAL (r, UNICODE_OK);
1269             TAP_TEST_EQUAL (info.len_read, l);
1270             TAP_TEST_EQUAL (info.runes_read, 7);
1271              
1272             l = strlen ((const char *) bad);
1273             r = validate_utf8 (bad, l, & info);
1274             TAP_TEST (r != UNICODE_OK);
1275             }
1276              
1277             int main ()
1278             {
1279             test_utf8_to_ucs2 ();
1280             test_ucs2_to_utf8 ();
1281             test_invalid_utf8 ();
1282             test_unicode_count_chars ();
1283             test_surrogate_pairs ();
1284             test_utf8_bytes ();
1285             test_valid_utf8 ();
1286             test_trim_to_utf8_start ();
1287             test_constants ();
1288             test_utf8_validate ();
1289             TAP_PLAN;
1290             }
1291              
1292             #endif /* def TEST */