File Coverage

unicode.c

Criterion	Covered	Total	%
statement	32	271	11.8
branch	16	164	9.7
condition			n/a
subroutine			n/a
pod			n/a
total	48	435	11.0

line	stmt	bran	code
1			/* This is a Unicode library in the programming language C which deals
2			with conversions to and from the UTF-8 format. */
3
4			/*
5			Author:
6
7			Ben Bullock ,
8
9			Repository:
10
11			https://github.com/benkasminbullock/unicode-c
12			*/
13
14			#include
15			#include
16			#include "unicode.h"
17
18			#ifdef HEADER
19
20			/* _ _ _ _
21			\| \| (_)_ __ ___ (_) \|_ ___
22			\| \| \| \| '_ ` _ \\| \| __/ __\|
23			\| \|___\| \| \| \| \| \| \| \| \|_\__ \
24			\|_____\|_\|_\| \|_\| \|_\|_\|\__\|___/ */
25
26
27
28			/* The maximum number of bytes we need to contain any Unicode code
29			point as UTF-8 as a C string. This length includes one trailing nul
30			byte. */
31
32			#define UTF8_MAX_LENGTH 5
33
34			/* The maximum possible value of a Unicode code point. See
35			http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */
36
37			#define UNICODE_MAXIMUM 0x10ffff
38
39			/* The maximum possible value which will fit into four bytes of
40			UTF-8. This is larger than UNICODE_MAXIMUM. */
41
42			#define UNICODE_UTF8_4 0x1fffff
43
44			/* ____ _ _
45			\| _ \ ___\| \|_ _ _ _ __ _ __ __ ____ _\| \|_ _ ___ ___
46			\| \|_) / _ \ __\| \| \| \| '__\| '_ \ \ \ / / _` \| \| \| \| \|/ _ \/ __\|
47			\| _ < __/ \|_\| \|_\| \| \| \| \| \| \| \ V / (_\| \| \| \|_\| \| __/\__ \
48			\|_\| \_\___\|\__\|\__,_\|_\| \|_\| \|_\| \_/ \__,_\|_\|\__,_\|\___\|\|___/ */
49
50
51			/* Except for "unicode_code_to_error", the functions in this library
52			return an "int32_t". Negative values are used to indicate errors. */
53
54			/* This return value indicates the successful completion of a routine
55			which doesn't use the return value to communicate data back to the
56			caller. */
57
58			#define UNICODE_OK 0
59
60			/* This return value means that the leading byte of a UTF-8 sequence
61			was not valid. */
62
63			#define UTF8_BAD_LEADING_BYTE -1
64
65			/* This return value means the caller attempted to turn a code point
66			for a surrogate pair to or from UTF-8. */
67
68			#define UNICODE_SURROGATE_PAIR -2
69
70			/* This return value means that there was an attempt to convert code
71			points which did not form a surrogate pair into a code point as if
72			they were a surrogate pair. */
73
74			#define UNICODE_NOT_SURROGATE_PAIR -3
75
76			/* This return value means that input which was supposed to be UTF-8
77			encoded contained an invalid continuation byte. If the leading byte
78			of a UTF-8 sequence is not valid, UTF8_BAD_LEADING_BYTE is returned
79			instead of this. */
80
81			#define UTF8_BAD_CONTINUATION_BYTE -4
82
83			/* This return value indicates a zero byte was found in a string which
84			was supposed to contain UTF-8 bytes. It is returned only by the
85			functions which are documented as not allowing zero bytes. */
86
87			#define UNICODE_EMPTY_INPUT -5
88
89			/* This return value indicates that UTF-8 bytes were not in the
90			shortest possible form. See
91			http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8.
92
93			This return value is currently unused. If a character is not in the
94			shortest form, the error UTF8_BAD_CONTINUATION_BYTE is returned. */
95
96			#define UTF8_NON_SHORTEST -6
97
98			/* This return value indicates that there was an attempt to convert a
99			code point which was greater than UNICODE_MAXIMUM or UNICODE_UTF8_4
100			into UTF-8 bytes. */
101
102			#define UNICODE_TOO_BIG -7
103
104			/* This return value indicates that the Unicode code-point ended with
105			either 0xFFFF or 0xFFFE, meaning it cannot be used as a character
106			code point, or it was in the disallowed range FDD0 to FDEF. */
107
108			#define UNICODE_NOT_CHARACTER -8
109
110			/* This return value indicates that the UTF-8 is valid. It is only
111			used by "valid_utf8". */
112
113			#define UTF8_VALID 1
114
115			/* This return value indicates that the UTF-8 is not valid. It is only
116			used by "valid_utf8". */
117
118			#define UTF8_INVALID 0
119
120			#endif /* def HEADER */
121
122			/* This table contains the length of a sequence which begins with the
123			byte given. A value of zero indicates that the byte can not begin a
124			UTF-8 sequence. */
125
126			/* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */
127
128			const uint8_t utf8_sequence_len[0x100] =
129			{
130			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
131			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
132			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
133			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
134			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
135			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
136			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
137			1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
138			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
139			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
140			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
141			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
142			0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
143			2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
144			3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
145			4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
146			};
147
148			/* This function returns the number of bytes in the UTF-8 sequence
149			which starts with byte "c", either 1 (c = 0xxxxxxx), 2 (c =
150			110xxxxx, except 0xC0 and 0xC1), 3 (c = 1110xxxx), or 4 (c = 111100xx
151			or c = 11110100). If "c" is not a valid UTF-8 first byte, the value
152			UTF8_BAD_LEADING_BYTE is returned. */
153
154	0		int32_t utf8_bytes (uint8_t c)
155			{
156			int32_t r;
157	0		r = utf8_sequence_len[c];
158	0	0	if (r == 0) {
159	0		return UTF8_BAD_LEADING_BYTE;
160			}
161	0		return r;
162			}
163
164			/* This macro converts four bytes of UTF-8 into the corresponding code
165			point. */
166
167			#define FOUR(x) \
168			(((int32_t) (x[0] & 0x07)) << 18) \
169			\| (((int32_t) (x[1] & 0x3F)) << 12) \
170			\| (((int32_t) (x[2] & 0x3F)) << 6) \
171			\| (((int32_t) (x[3] & 0x3F)))
172
173			/* Reject code points which end in either FFFE or FFFF. */
174
175			#define REJECT_FFFF(x) \
176			if ((x & 0xFFFF) >= 0xFFFE) { \
177			return UNICODE_NOT_CHARACTER; \
178			}
179
180			/* Reject code points in a certain range. */
181
182			#define REJECT_NOT_CHAR(r) \
183			if (r >= UNI_NOT_CHAR_MIN && r <= UNI_NOT_CHAR_MAX) { \
184			return UNICODE_NOT_CHARACTER; \
185			}
186
187			/* Reject surrogates. */
188
189			#define REJECT_SURROGATE(ucs2) \
190			if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) { \
191			/* Ill-formed. */ \
192			return UNICODE_SURROGATE_PAIR; \
193			}
194
195			/* Try to convert "input" from UTF-8 to UCS-2, and return a value even
196			if the input is partly broken. This checks the first byte of the
197			input, but it doesn't check the subsequent bytes. */
198
199			int32_t
200	0		utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr)
201			{
202			uint8_t c;
203	0		c = input[0];
204	0		switch (utf8_sequence_len[c]) {
205			case 1:
206	0		* end_ptr = input + 1;
207	0		return c;
208
209			case 2:
210	0		* end_ptr = input + 2;
211			return
212	0		(c & 0x1F) << 6 \|
213	0		(input[1] & 0x3F);
214
215			case 3:
216	0		* end_ptr = input + 3;
217			return
218	0		(c & 0x0F) << 12 \|
219	0		(input[1] & 0x3F) << 6 \|
220	0		(input[2] & 0x3F);
221
222			case 4:
223	0		* end_ptr = input + 4;
224	0		return FOUR (input);
225
226			case 0:
227			/* fall through */
228			default:
229	0		return UTF8_BAD_LEADING_BYTE;
230			}
231			}
232
233			/* Surrogate pair zone. */
234
235			#define UNI_SUR_HIGH_START 0xD800
236			#define UNI_SUR_HIGH_END 0xDBFF
237			#define UNI_SUR_LOW_START 0xDC00
238			#define UNI_SUR_LOW_END 0xDFFF
239
240			/* Start of the "not character" range. */
241
242			#define UNI_NOT_CHAR_MIN 0xFDD0
243
244			/* End of the "not character" range. */
245
246			#define UNI_NOT_CHAR_MAX 0xFDEF
247
248			/* This function converts UTF-8 encoded bytes in "input" into the
249			equivalent Unicode code point. The return value is the Unicode
250			code point corresponding to the UTF-8 character in "input" if
251			successful, and a negative number if not successful. Nul bytes are
252			rejected.
253
254			"*end_ptr" is set to the next character after the read character on
255			success. "*end_ptr" is set to the start of input on all failures.
256			"end_ptr" may not be NULL.
257
258			If the first byte of "input" is zero, in other words a NUL or '\0',
259			UNICODE_EMPTY_INPUT is returned.
260
261			If the first byte of "input" is not valid UTF-8,
262			UTF8_BAD_LEADING_BYTE is returned.
263
264			If the second or later bytes of "input" are not valid UTF-8,
265			including NUL, UTF8_BAD_CONTINUATION_BYTE is returned.
266
267			If the value extrapolated from "input" is greater than
268			UNICODE_MAXIMUM, UNICODE_TOO_BIG is returned.
269
270			If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE,
271			UNICODE_NOT_CHARACTER is returned.
272
273			If the value extrapolated from "input" is between 0xFDD0 and 0xFDEF,
274			UNICODE_NOT_CHARACTER is returned.
275
276			If the value is within the range of surrogate pairs, the error
277			UNICODE_SURROGATE_PAIR is returned.
278			*/
279
280			int32_t
281	0		utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
282			{
283			uint8_t c;
284			uint8_t l;
285
286	0		*end_ptr = input;
287	0		c = input[0];
288	0	0	if (c == 0) {
289	0		return UNICODE_EMPTY_INPUT;
290			}
291	0		l = utf8_sequence_len[c];
292	0	0	if (l == 1) {
293	0		* end_ptr = input + 1;
294	0		return (int32_t) c;
295			}
296	0	0	if (l == 2) {
297			uint8_t d;
298	0		d = input[1];
299			/* Two byte case. */
300	0	0	if (d < 0x80 \|\| d > 0xBF) {
		0
301	0		return UTF8_BAD_CONTINUATION_BYTE;
302			}
303	0	0	if (c <= 0xC1) {
304	0		return UTF8_BAD_CONTINUATION_BYTE;
305			}
306	0		* end_ptr = input + 2;
307			return
308	0		((int32_t) (c & 0x1F) << 6) \|
309	0		((int32_t) (d & 0x3F));
310			}
311	0	0	if (l == 3) {
312			uint8_t d;
313			uint8_t e;
314			int32_t r;
315
316	0		d = input[1];
317	0		e = input[2];
318			/* Three byte case. */
319	0	0	if (d < 0x80 \|\| d > 0xBF \|\|
		0
		0
320	0	0	e < 0x80 \|\| e > 0xBF) {
321	0		return UTF8_BAD_CONTINUATION_BYTE;
322			}
323	0	0	if (c == 0xe0 && d < 0xa0) {
		0
324			/* We don't need to check the value of input[2], because
325			the if statement above this one already guarantees that
326			it is 10xxxxxx. */
327	0		return UTF8_BAD_CONTINUATION_BYTE;
328			}
329	0		r = ((int32_t) (c & 0x0F)) << 12 \|
330	0		((int32_t) (d & 0x3F)) << 6 \|
331	0		((int32_t) (e & 0x3F));
332	0	0	REJECT_SURROGATE(r);
		0
333	0	0	REJECT_FFFF(r);
334	0	0	REJECT_NOT_CHAR(r);
		0
335	0		* end_ptr = input + 3;
336	0		return r;
337			}
338	0	0	else if (l == 4) {
339			/* Four byte case. */
340			uint8_t d;
341			uint8_t e;
342			uint8_t f;
343			int32_t v;
344
345	0		d = input[1];
346	0		e = input[2];
347	0		f = input[3];
348
349	0	0	if (/* c must be 11110xxx. */
350	0	0	c >= 0xf8 \|\|
351			/* d, e, f must be 10xxxxxx. */
352	0	0	d < 0x80 \|\| d >= 0xC0 \|\|
		0
353	0	0	e < 0x80 \|\| e >= 0xC0 \|\|
		0
354	0	0	f < 0x80 \|\| f >= 0xC0) {
355	0		return UTF8_BAD_CONTINUATION_BYTE;
356			}
357
358	0	0	if (c == 0xf0 && d < 0x90) {
		0
359			/* We don't need to check the values of e and f, because
360			the if statement above this one already guarantees that
361			e and f are 10xxxxxx. */
362	0		return UTF8_BAD_CONTINUATION_BYTE;
363			}
364			/* Calculate the code point. */
365	0		v = FOUR (input);
366			/* Greater than U+10FFFF */
367	0	0	if (v > UNICODE_MAXIMUM) {
368	0		return UNICODE_TOO_BIG;
369			}
370			/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
371	0	0	REJECT_FFFF(v);
372			/* We don't need to check for surrogate pairs here, since the
373			minimum value of UCS2 if there are four bytes of UTF-8 is
374			0x10000. */
375	0		* end_ptr = input + 4;
376	0		return v;
377			}
378	0		return UTF8_BAD_LEADING_BYTE;
379			}
380
381
382			/* Input: a Unicode code point, "ucs2".
383
384			Output: UTF-8 characters in buffer "utf8".
385
386			Return value: the number of bytes written into "utf8", or a
387			negative number if there was an error.
388
389			If the value of "ucs2" is invalid because of being in the surrogate
390			pair range from 0xD800 to 0xDFFF, the return value is
391			UNICODE_SURROGATE_PAIR.
392
393			If the value of "ucs2" is in the range 0xFDD0 to 0xFDEF inclusive,
394			the return value is UNICODE_NOT_CHARACTER.
395
396			If the lower two bytes of "ucs2" are either 0xFFFE or 0xFFFF, the
397			return value is UNICODE_NOT_CHARACTER.
398
399			If the value is too big to fit into four bytes of UTF-8,
400			UNICODE_UTF8_4, the return value is UNICODE_TOO_BIG.
401
402			However, it does not insist on ucs2 being less than
403			UNICODE_MAXIMUM, so the user needs to check that "ucs2" is a valid
404			code point.
405
406			This adds a zero byte to the end of the string. It assumes that the
407			buffer "utf8" has at least UTF8_MAX_LENGTH (5) bytes of space to
408			write to, without checking. */
409
410			int32_t
411	29		ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8)
412			{
413	29	50	REJECT_FFFF(ucs2);
414	29	100	if (ucs2 < 0x80) {
415	4		utf8[0] = ucs2;
416	4		utf8[1] = '\0';
417	4		return 1;
418			}
419	25	50	if (ucs2 < 0x800) {
420	0		utf8[0] = (ucs2 >> 6) \| 0xC0;
421	0		utf8[1] = (ucs2 & 0x3F) \| 0x80;
422	0		utf8[2] = '\0';
423	0		return 2;
424			}
425	25	100	if (ucs2 < 0xFFFF) {
426	20		utf8[0] = ((ucs2 >> 12) ) \| 0xE0;
427	20		utf8[1] = ((ucs2 >> 6 ) & 0x3F) \| 0x80;
428	20		utf8[2] = ((ucs2 ) & 0x3F) \| 0x80;
429	20		utf8[3] = '\0';
430	20	100	REJECT_SURROGATE(ucs2);
		50
431	8	50	REJECT_NOT_CHAR(ucs2);
		0
432	8		return 3;
433			}
434	5	50	if (ucs2 <= UNICODE_UTF8_4) {
435			/* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */
436	5		utf8[0] = 0xF0 \| (ucs2 >> 18);
437	5		utf8[1] = 0x80 \| ((ucs2 >> 12) & 0x3F);
438	5		utf8[2] = 0x80 \| ((ucs2 >> 6) & 0x3F);
439	5		utf8[3] = 0x80 \| ((ucs2 & 0x3F));
440	5		utf8[4] = '\0';
441	5		return 4;
442			}
443	0		return UNICODE_TOO_BIG;
444			}
445
446			/* For shifting by 10 bits. */
447			#define TEN_BITS 10
448			#define HALF_BASE 0x0010000UL
449			/* 0b1111111111 */
450			#define LOW_TEN_BITS 0x3FF
451
452			/* This converts the Unicode code point in "unicode" into a surrogate
453			pair, and returns the two parts in "* hi_ptr" and "* lo_ptr".
454
455			Return value:
456
457			If "unicode" does not need to be a surrogate pair, the error
458			UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr"
459			and "*lo_ptr" are undefined. If the conversion is successful,
460			UNICODE_OK is returned. */
461
462			int32_t
463	0		unicode_to_surrogates (int32_t unicode, int32_t * hi_ptr, int32_t * lo_ptr)
464			{
465	0		int32_t hi = UNI_SUR_HIGH_START;
466	0		int32_t lo = UNI_SUR_LOW_START;
467	0	0	if (unicode < HALF_BASE) {
468			/* Doesn't need to be a surrogate pair. */
469	0		return UNICODE_NOT_SURROGATE_PAIR;
470			}
471	0		unicode -= HALF_BASE;
472	0		hi \|= ((unicode >> TEN_BITS) & LOW_TEN_BITS);
473	0		lo \|= ((unicode) & LOW_TEN_BITS);
474	0		* hi_ptr = hi;
475	0		* lo_ptr = lo;
476	0		return UNICODE_OK;
477			}
478
479			/* Convert a surrogate pair in "hi" and "lo" to a single Unicode
480			value. The return value is the Unicode value. If the return value
481			is negative, an error has occurred. If "hi" and "lo" do not form a
482			surrogate pair, the error value UNICODE_NOT_SURROGATE_PAIR is
483			returned.
484
485			https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */
486
487			int32_t
488	5		surrogates_to_unicode (int32_t hi, int32_t lo)
489			{
490			int32_t u;
491	5	50	if (hi < UNI_SUR_HIGH_START \|\| hi > UNI_SUR_HIGH_END \|\|
		50
		50
492	5	50	lo < UNI_SUR_LOW_START \|\| lo > UNI_SUR_LOW_END) {
493	0		return UNICODE_NOT_SURROGATE_PAIR;
494			}
495	10		u = ((hi - UNI_SUR_HIGH_START) << TEN_BITS)
496	5		+ (lo - UNI_SUR_LOW_START) + HALF_BASE;
497	5		return u;
498			}
499
500			#undef UNI_SUR_HIGH_START
501			#undef UNI_SUR_HIGH_END
502			#undef UNI_SUR_LOW_START
503			#undef UNI_SUR_LOW_END
504			#undef TEN_BITS
505			#undef HALF_BASE
506			#undef LOW_TEN_BITS
507
508			/* Convert the surrogate pair in "hi" and "lo" to UTF-8 in
509			"utf8". This calls "surrogates_to_unicode" and "ucs2_to_utf8", thus
510			it can return the same errors as them, and has the same restriction
511			on "utf8" as "ucs2_to_utf8". */
512
513			int32_t
514	5		surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t * utf8)
515			{
516			int32_t C;
517	5		C = surrogates_to_unicode (hi, lo);
518	5	50	if (C < 0) {
519	0		return C;
520			}
521	5		return ucs2_to_utf8 (C, utf8);
522			}
523
524			/* Given a nul-terminated string "utf8" and a number of Unicode
525			characters "n_chars", return the number of bytes into "utf8" at
526			which the end of the characters occurs. A negative value indicates
527			some kind of error. If "utf8" contains a zero byte, the return
528			value is UNICODE_EMPTY_INPUT. This may also return any of the error
529			values of "utf8_to_ucs2". */
530
531			int32_t
532	0		unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars)
533			{
534			int32_t i;
535	0		const uint8_t * p = utf8;
536	0		int32_t len = strlen ((const char *) utf8);
537	0	0	if (len == 0 && n_chars != 0) {
		0
538	0		return UNICODE_EMPTY_INPUT;
539			}
540	0	0	for (i = 0; i < n_chars; i++) {
541	0		int32_t ucs2 = utf8_to_ucs2 (p, & p);
542	0	0	if (ucs2 < 0) {
543	0		return ucs2;
544			}
545			}
546	0		return p - utf8;
547			}
548
549			/* Like unicode_count_chars, but without error checks or validation of
550			the input. This only checks the first byte of each UTF-8 sequence,
551			then jumps over the succeeding bytes. It may return
552			UTF8_BAD_LEADING_BYTE if the first byte is invalid. */
553
554			int32_t
555	0		unicode_count_chars_fast (const uint8_t * utf8)
556			{
557			int32_t chars;
558			const uint8_t * p;
559	0		chars = 0;
560	0		p = utf8;
561	0	0	while (*p) {
562			int32_t len;
563	0		len = utf8_sequence_len[*p];
564	0	0	if (len == 0) {
565			/* The first byte of a UTF-8 sequence is bad, so return
566			this, not BAD_UTF8. */
567	0		return UTF8_BAD_LEADING_BYTE;
568			}
569	0		p += len;
570	0		chars++;
571			}
572	0		return chars;
573			}
574
575			/* Given a nul-terminated string "utf8", return the total number of
576			Unicode characters it contains.
577
578			Return value
579
580			If an error occurs, this may return UTF8_BAD_LEADING_BYTE or any of the
581			errors of "utf8_to_ucs2". */
582
583			int32_t
584	0		unicode_count_chars (const uint8_t * utf8)
585			{
586	0		int32_t chars = 0;
587	0		const uint8_t * p = utf8;
588	0		int32_t len = strlen ((const char *) utf8);
589	0	0	if (len == 0) {
590	0		return 0;
591			}
592	0	0	while (p - utf8 < len) {
593			int32_t ucs2;
594	0		ucs2 = utf8_to_ucs2 (p, & p);
595	0	0	if (ucs2 < 0) {
596			/* Return the error from utf8_to_ucs2. */
597	0		return ucs2;
598			}
599	0		chars++;
600	0	0	if (*p == '\0') {
601	0		return chars;
602			}
603			}
604			/* Cannot be reached in practice, since strlen indicates the null
605			byte. */
606	0		return UTF8_BAD_LEADING_BYTE;
607			}
608
609			#ifdef HEADER
610
611			/* These are intended for use in switch statements, for example
612
613			switch (c) {
614			case BYTE_80_8F:
615			do_something;
616
617			They originally come from the Json3 project. */
618
619			#define BYTE_80_8F \
620			0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
621			case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
622			case 0x8E: case 0x8F
623			#define BYTE_80_9F \
624			0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
625			case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
626			case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
627			case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
628			case 0x9C: case 0x9D: case 0x9E: case 0x9F
629			#define BYTE_80_BF \
630			0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
631			case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
632			case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
633			case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
634			case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
635			case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
636			case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
637			case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
638			case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
639			case 0xBF
640			#define BYTE_80_8F_B0_BF \
641			0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
642			case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
643			case 0x8E: case 0x8F: case 0xB0: \
644			case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
645			case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
646			case 0xBF
647			#define BYTE_80_B6_B8_BF \
648			0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
649			case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
650			case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
651			case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
652			case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
653			case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
654			case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
655			case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \
656			case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
657			case 0xBF
658			#define BYTE_80_BD \
659			0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
660			case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
661			case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
662			case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
663			case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
664			case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
665			case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
666			case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
667			case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD
668			#define BYTE_90_BF \
669			0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \
670			case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \
671			case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \
672			case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \
673			case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \
674			case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \
675			case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
676			#define BYTE_A0_BF \
677			0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \
678			case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: \
679			case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \
680			case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: \
681			case 0xBC: case 0xBD: case 0xBE: case 0xBF
682			#define BYTE_C2_DF \
683			0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \
684			case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \
685			case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \
686			case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \
687			case 0xDE: case 0xDF
688			#define BYTE_E1_EC \
689			0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \
690			case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC
691			#define BYTE_F1_F3 \
692			0xF1: case 0xF2: case 0xF3
693			#endif /* def HEADER */
694
695			#define UNICODEADDBYTE i++
696
697			#define UNICODEFAILUTF8(want) return UTF8_INVALID
698
699			#define UNICODENEXTBYTE c = input[i]
700
701			/* Given "input" and "input_length", validate "input" byte by byte up
702			to "input_length". The return value may be UTF8_VALID or
703			UTF8_INVALID. */
704
705			int32_t
706	0		valid_utf8 (const uint8_t * input, int32_t input_length)
707			{
708			int32_t error;
709			utf8_info_t info;
710	0		error = validate_utf8 (input, input_length, & info);
711	0	0	if (error < 0) {
712	0		return UTF8_INVALID;
713			}
714	0		return UTF8_VALID;
715			}
716
717			#define FAIL(x) \
718			info->len_read = i; \
719			return x
720
721			#ifdef HEADER
722
723			typedef struct utf8_info
724			{
725			int32_t len_read;
726			int32_t runes_read;
727			}
728			utf8_info_t;
729
730			#endif /* def HEADER */
731
732			/* Given "input" and "len", validate "input" byte by byte up to
733			"len". The return value is "UNICODE_OK" (zero) on success or the
734			error found (a negative number) on failure.
735
736			utf8_info_t is defined in "unicode.h".
737
738			The value of "info.len_read" is the number of bytes processed. The
739			value of "info.runes_read" is the number of Unicode code points in
740			the input. */
741
742			int32_t
743	0		validate_utf8 (const uint8_t * input, int32_t len, utf8_info_t * info)
744			{
745			int32_t i;
746			uint8_t c;
747
748	0		info->len_read = 0;
749			/* We want to increment the runes after "string_start", but that
750			would give us one too many. */
751	0		info->runes_read = -1;
752	0		i = 0;
753
754			string_start:
755
756			/* We get here after successfully reading a "rune". */
757
758	0		info->runes_read++;
759	0	0	if (i >= len) {
760	0		info->len_read = len;
761	0		return UNICODE_OK; /* 0 */
762			}
763
764			/* Set c separately here since we use a range comparison before
765			the switch statement. */
766
767	0		c = input[i];
768
769	0	0	if (c == 0) {
770	0		FAIL (UNICODE_EMPTY_INPUT);
771			}
772			/* Admit all bytes < 0x80. */
773	0	0	if (c < 0x80) {
774	0		i++;
775	0		goto string_start;
776			}
777	0		switch (c) {
778			case BYTE_C2_DF:
779	0		UNICODEADDBYTE;
780	0		goto byte_last_80_bf;
781
782			case 0xE0:
783	0		UNICODEADDBYTE;
784	0		goto byte23_a0_bf;
785
786			case BYTE_E1_EC:
787	0		UNICODEADDBYTE;
788	0		goto byte_penultimate_80_bf;
789
790			case 0xED:
791	0		UNICODEADDBYTE;
792	0		goto byte23_80_9f;
793
794			case 0xEE:
795	0		UNICODEADDBYTE;
796	0		goto byte_penultimate_80_bf;
797
798			case 0xEF:
799	0		UNICODEADDBYTE;
800	0		goto byte_ef_80_bf;
801
802			case 0xF0:
803	0		UNICODEADDBYTE;
804	0		goto byte24_90_bf;
805
806			case BYTE_F1_F3:
807	0		UNICODEADDBYTE;
808	0		goto byte24_80_bf;
809
810			case 0xF4:
811	0		UNICODEADDBYTE;
812	0		goto byte24_80_8f;
813
814			default:
815	0		FAIL (UTF8_BAD_LEADING_BYTE);
816			}
817
818			byte_last_80_bf:
819
820	0	0	switch (UNICODENEXTBYTE) {
821			case BYTE_80_BF:
822	0		UNICODEADDBYTE;
823	0		goto string_start;
824			default:
825	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
826			}
827
828			byte_ef_b7:
829	0	0	switch (UNICODENEXTBYTE) {
830			case BYTE_80_8F_B0_BF:
831	0		UNICODEADDBYTE;
832	0		goto string_start;
833			default:
834	0	0	if (c >= 0x90 && c <= 0xAF) {
		0
835	0		FAIL (UNICODE_NOT_CHARACTER);
836			}
837			else {
838	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
839			}
840			}
841
842			byte_last_80_bd:
843
844	0		switch (UNICODENEXTBYTE) {
845			case BYTE_80_BD:
846	0		UNICODEADDBYTE;
847	0		goto string_start;
848			case 0xBE:
849			case 0xBF:
850	0		FAIL (UNICODE_NOT_CHARACTER);
851			default:
852	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
853			}
854
855			byte_penultimate_80_bf:
856
857	0	0	switch (UNICODENEXTBYTE) {
858			case BYTE_80_BF:
859	0		UNICODEADDBYTE;
860	0		goto byte_last_80_bf;
861			default:
862	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
863			}
864
865			byte_ef_80_bf:
866	0		switch (UNICODENEXTBYTE) {
867			case BYTE_80_B6_B8_BF:
868	0		UNICODEADDBYTE;
869	0		goto byte_last_80_bd;
870			case 0xB7:
871	0		UNICODEADDBYTE;
872			/* FDD0 - FDEF */
873	0		goto byte_ef_b7;
874			default:
875	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
876			}
877
878			byte24_90_bf:
879
880	0	0	switch (UNICODENEXTBYTE) {
881			case BYTE_90_BF:
882	0		UNICODEADDBYTE;
883	0		goto byte_penultimate_80_bf;
884			default:
885	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
886			}
887
888			byte23_80_9f:
889
890	0	0	switch (UNICODENEXTBYTE) {
891			case BYTE_80_9F:
892	0		UNICODEADDBYTE;
893	0		goto byte_last_80_bf;
894			default:
895	0	0	if (c >= 0xA0 && c <= 0xBF) {
		0
896	0		FAIL (UNICODE_SURROGATE_PAIR);
897			}
898			else {
899	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
900			}
901			}
902
903			byte23_a0_bf:
904
905	0	0	switch (UNICODENEXTBYTE) {
906			case BYTE_A0_BF:
907	0		UNICODEADDBYTE;
908	0		goto byte_last_80_bf;
909			default:
910	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
911			}
912
913			byte24_80_bf:
914
915	0	0	switch (UNICODENEXTBYTE) {
916			case BYTE_80_BF:
917	0		UNICODEADDBYTE;
918	0		goto byte_ef_80_bf;
919			default:
920	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
921			}
922
923			byte24_80_8f:
924
925	0	0	switch (UNICODENEXTBYTE) {
926			case BYTE_80_8F:
927	0		UNICODEADDBYTE;
928	0		goto byte_ef_80_bf;
929			default:
930	0	0	if (c >= 0x90) {
931	0		FAIL (UNICODE_TOO_BIG);
932			}
933			else {
934	0		FAIL (UTF8_BAD_CONTINUATION_BYTE);
935			}
936			}
937			}
938
939			#define REJECT_FE_FF(c) \
940			if (c == 0xFF \|\| c == 0xFE) { \
941			return UNICODE_NOT_CHARACTER; \
942			}
943
944			/* Make "* ptr" point to the start of the first UTF-8 character after
945			its initial value. This assumes that there are at least four bytes
946			which can be read, and that "* ptr" points to valid UTF-8.
947
948			If "** ptr" does not have its top bit set, 0xxx_xxxx, this does not
949			change the value of "* ptr", and it returns UNICODE_OK. If "** ptr"
950			has its top two bits set, 11xx_xxxx, this does not change the value
951			of "* ptr" and it returns UNICODE_OK. If "**ptr" has its top bit
952			set but its second-to-top bit unset, 10xx_xxxx, so it is the
953			second, third, or fourth byte of a multibyte sequence, "* ptr" is
954			incremented until either "** ptr" is a valid first byte of a UTF-8
955			sequence, or too many bytes have passed for it to be valid
956			UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is
957			returned and "*ptr" is left unchanged.
958
959			If a valid UTF-8 first byte was found, either 11xx_xxxx or
960			0xxx_xxxx, UNICODE_OK is returned, and "*ptr" is set to the address
961			of the valid byte. Nul bytes (bytes containing zero) are considered
962			valid.
963
964			If any of the bytes read contains invalid UTF-8 bytes 0xFE and
965			0xFF, the error code UNICODE_NOT_CHARACTER is returned and "*ptr"
966			is left unchanged. */
967
968			int32_t
969	0		trim_to_utf8_start (const uint8_t ** ptr)
970			{
971	0		const uint8_t * p = *ptr;
972			uint8_t c;
973			int32_t i;
974
975	0		c = * p;
976	0	0	REJECT_FE_FF (c);
		0
977			/* 0xC0 = 1100_0000. */
978	0		c &= 0xC0;
979	0	0	if (c == 0xC0 \|\| c == 0x00) {
		0
980	0		return UNICODE_OK;
981			}
982	0	0	for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) {
983	0		c = p[i];
984	0	0	REJECT_FE_FF (c);
		0
985	0	0	if ((c & 0x80) != 0x80 \|\| (c & 0x40) != 0) {
		0
986	0		* ptr = p + i;
987	0		return UNICODE_OK;
988			}
989			}
990	0		return UTF8_BAD_CONTINUATION_BYTE;
991			}
992
993			/* Given a return value "code" which is negative or zero, return a
994			string which describes what the return value means. Positive
995			non-zero return values never indicate errors or statuses in this
996			library. */
997
998			const char *
999	0		unicode_code_to_error (int32_t code)
1000			{
1001	0		switch (code) {
1002			case UTF8_BAD_LEADING_BYTE:
1003	0		return "The leading byte of a UTF-8 sequence was invalid";
1004			case UTF8_BAD_CONTINUATION_BYTE:
1005	0		return "A continuation byte of a UTF-8 sequence was invalid";
1006			case UNICODE_SURROGATE_PAIR:
1007	0		return "A surrogate pair code point could not be converted to UTF-8";
1008			case UNICODE_NOT_SURROGATE_PAIR:
1009	0		return "Input code points did not form a surrogate pair";
1010			case UNICODE_OK:
1011	0		return "Successful completion";
1012			case UNICODE_TOO_BIG:
1013	0		return "A code point was beyond limits";
1014			case UNICODE_NOT_CHARACTER:
1015	0		return "A number ending in hex FFFF or FFFE is not valid Unicode";
1016			case UTF8_NON_SHORTEST:
1017	0		return "A UTF-8 input was not in the shortest form";
1018			case UNICODE_EMPTY_INPUT:
1019	0		return "A byte with value zero was found in UTF-8 input";
1020			default:
1021	0		return "Unknown/invalid error code";
1022			}
1023			}
1024
1025			/* _____ _
1026			\|_ _\|__ ___\| \|_ ___
1027			\| \|/ _ \/ __\| __/ __\|
1028			\| \| __/\__ \ \|_\__ \
1029			\|_\|\___\|\|___/\__\|___/
1030			*/
1031
1032			/* Below this is code for testing which is not normally compiled. Use
1033			"make test" to compile the testing version. */
1034
1035			#ifdef TEST
1036
1037			#include
1038			#include
1039			#include "c-tap-test.h"
1040
/* Valid UTF-8 shared by the tests. It holds seven characters: three
   three-byte kanji, three two-byte accented Latin letters, and one
   three-byte full-width letter. The cast keeps the "const" of the
   string literal rather than discarding it. */
static const uint8_t * utf8 = (const uint8_t *) "漢数字ÔÕÖＸ";
/* Invalid UTF-8 shared by the tests: a nul-terminated run of
   continuation bytes (10xx_xxxx) with no leading byte. */
static const uint8_t bad[] = {0x99, 0x99, 0x99, 0x99, 0x99, 0x99, 0x0};

/* Size in bytes of the scratch buffer used by the round-trip test. */
#define BUFFSIZE 0x100
1045
1046			static void test_ucs2_to_utf8 ()
1047			{
1048			/* Buffer to print utf8 out into. */
1049			uint8_t buffer[BUFFSIZE];
1050			/* Offset into buffer. */
1051			uint8_t * offset;
1052			const uint8_t * start = utf8;
1053
1054			offset = buffer;
1055			while (1) {
1056			int32_t unicode;
1057			int32_t bytes;
1058			const uint8_t * end;
1059			unicode = utf8_to_ucs2 (start, & end);
1060			if (unicode == UNICODE_EMPTY_INPUT) {
1061			break;
1062			}
1063			if (unicode < 0) {
1064			fprintf (stderr,
1065			"%s:%d: unexpected error %s converting unicode.\n",
1066			__FILE__, __LINE__, unicode_code_to_error (unicode));
1067			// exit ok in test
1068			exit (EXIT_FAILURE);
1069			}
1070			bytes = ucs2_to_utf8 (unicode, offset);
1071			TAP_TEST_MSG (bytes > 0, "no bad conversion");
1072			TAP_TEST_MSG (strncmp ((const char *) offset,
1073			(const char *) start, bytes) == 0,
1074			"round trip OK for %X (%d bytes)", unicode, bytes);
1075			start = end;
1076			offset += bytes;
1077			if (offset - buffer >= BUFFSIZE) {
1078			fprintf (stderr, "%s:%d: out of space in buffer.\n",
1079			__FILE__, __LINE__);
1080			// exit ok
1081			exit (EXIT_FAILURE);
1082			}
1083			}
1084			* offset = '\0';
1085			TAP_TEST_MSG (strcmp ((const char ) buffer, (const char ) utf8) == 0,
1086			"input %s resulted in identical output %s",
1087			utf8, buffer);
1088			}
1089
1090			static void
1091			test_invalid_utf8 ()
1092			{
1093			uint8_t invalid_utf8[UTF8_MAX_LENGTH];
1094			int32_t unicode;
1095			int32_t valid;
1096			const uint8_t * end;
1097			snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1,
1098			"%c%c%c", 0xe8, 0xe4, 0xe5);
1099			unicode = utf8_to_ucs2 (invalid_utf8, & end);
1100			TAP_TEST_MSG (unicode == UTF8_BAD_CONTINUATION_BYTE,
1101			"invalid UTF-8 gives incorrect result");
1102			valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8));
1103			TAP_TEST_MSG (valid == UTF8_INVALID, "Invalid UTF-8 fails valid_utf8");
1104			}
1105
1106			static void
1107			test_surrogate_pairs ()
1108			{
1109			int32_t status;
1110			int32_t hi;
1111			int32_t lo;
1112			int32_t rt;
1113			/* This is the wide character space, which does not require
1114			representation as a surrogate pair. */
1115			int32_t nogood = 0x3000;
1116			/*
1117			Two examples from the Wikipedia article on UTF-16
1118			https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. */
1119			int32_t wikipedia_1 = 0x10437;
1120			int32_t wikipedia_2 = 0x24b62;
1121			/*
1122			An example from the JSON RFC
1123			http://rfc7159.net/rfc7159#rfc.section.7
1124			*/
1125			int32_t json_spec = 0x1D11E;
1126
1127			status = unicode_to_surrogates (nogood, & hi, & lo);
1128
1129			TAP_TEST_MSG (status == UNICODE_NOT_SURROGATE_PAIR,
1130			"low value to surrogate pair breaker returns error");
1131
1132			status = unicode_to_surrogates (wikipedia_1, & hi, & lo);
1133			TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1);
1134			TAP_TEST_MSG (hi == 0xD801, "Got expected %X == 0xD801", hi);
1135			TAP_TEST_MSG (lo == 0xDC37, "Got expected %X == 0xDC37", lo);
1136			rt = surrogates_to_unicode (hi, lo);
1137			TAP_TEST_MSG (rt == wikipedia_1, "Round trip %X == initial %X",
1138			rt, wikipedia_1);
1139
1140			status = unicode_to_surrogates (wikipedia_2, & hi, & lo);
1141			TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1);
1142			TAP_TEST_MSG (hi == 0xD852, "Got expected %X == 0xD852", hi);
1143			TAP_TEST_MSG (lo == 0xDF62, "Got expected %X == 0xDF62", lo);
1144			rt = surrogates_to_unicode (hi, lo);
1145			TAP_TEST_MSG (rt == wikipedia_2, "Round trip %X == initial %X",
1146			rt, wikipedia_2);
1147
1148			status = unicode_to_surrogates (json_spec, & hi, & lo);
1149			TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", json_spec);
1150			TAP_TEST_MSG (hi == 0xD834, "Got expected %X == 0xD834", hi);
1151			TAP_TEST_MSG (lo == 0xDd1e, "Got expected %X == 0xDD1e", lo);
1152			rt = surrogates_to_unicode (hi, lo);
1153			TAP_TEST_MSG (rt == json_spec, "Round trip %X == initial %X",
1154			rt, json_spec);
1155			}
1156
1157			/* Test sending various bytes into "utf8_bytes" and seeing whether the
1158			return value is what we expected. */
1159
1160			static void
1161			test_utf8_bytes ()
1162			{
1163			struct tub {
1164			int32_t first;
1165			int32_t expect;
1166			} tests[] = {
1167			{'a', 1},
1168			{0xb0, UTF8_BAD_LEADING_BYTE},
1169			{0xc2, 2},
1170			{0xff, UTF8_BAD_LEADING_BYTE},
1171			};
1172			int32_t n_tests = sizeof (tests) / sizeof (struct tub);
1173			int32_t i;
1174			for (i = 0; i < n_tests; i++) {
1175			/* Expected bytes. */
1176			int32_t xbytes;
1177			int32_t firstbyte;
1178			firstbyte = tests[i].first;
1179			xbytes = utf8_bytes (firstbyte);
1180			TAP_TEST_MSG (xbytes == tests[i].expect, "Got %d (%d) with input %d",
1181			xbytes, tests[i].expect, firstbyte);
1182			}
1183			}
1184
1185			/* Test the conversion from utf-8 to ucs-2 (UTF-16). */
1186
1187			static void
1188			test_utf8_to_ucs2 ()
1189			{
1190			const uint8_t * start = utf8;
1191			while (*start) {
1192			int32_t unicode;
1193			const uint8_t * end;
1194			unicode = utf8_to_ucs2 (start, & end);
1195			TAP_TEST_MSG (unicode > 0, "no bad value at %s", start);
1196			printf ("# %s is %04X, length is %d\n",
1197			start, unicode, (int) (end - start));
1198			start = end;
1199			}
1200			}
1201
1202			/* Test counting of unicode characters. */
1203
1204			static void
1205			test_unicode_count_chars ()
1206			{
1207			int32_t cc;
1208			cc = unicode_count_chars (utf8);
1209			TAP_TEST_MSG (cc == 7, "unicode_count_chars gets seven characters for utf8");
1210			cc = unicode_count_chars_fast (utf8);
1211			TAP_TEST_MSG (cc == 7, "unicode_count_chars_fast gets seven characters for utf8");
1212			}
1213
1214			static void
1215			test_valid_utf8 ()
1216			{
1217			int32_t valid;
1218			valid = valid_utf8 (utf8, strlen ((const char *) utf8));
1219			TAP_TEST_MSG (valid == UTF8_VALID, "Valid UTF-8 passes valid_utf8");
1220			}
1221
1222			static void
1223			test_trim_to_utf8_start ()
1224			{
1225			int32_t status;
1226			const uint8_t * p;
1227			/* Invalid UTF-8. */
1228			/* Valid UTF-8. */
1229			uint8_t good[] = "化苦";
1230			uint8_t good2[] = "化abc";
1231			p = bad;
1232			status = trim_to_utf8_start (& p);
1233			TAP_TEST_MSG (status == UTF8_BAD_CONTINUATION_BYTE,
1234			"Non-UTF-8 causes error");
1235			TAP_TEST_MSG (p == bad, "Did not change pointer");
1236			p = good + 1;
1237			status = trim_to_utf8_start (& p);
1238			TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result");
1239			TAP_TEST_MSG (p != good + 1, "Moved p");
1240			TAP_TEST_MSG (p == good + 3, "Moved p to the right position");
1241			p = good2 + 1;
1242			status = trim_to_utf8_start (& p);
1243			TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result");
1244			TAP_TEST_MSG (p != good2 + 1, "Moved p");
1245			TAP_TEST_MSG (p == good2 + 3, "Moved p to the right position");
1246			}
1247
1248			static void
1249			test_constants ()
1250			{
1251			TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM);
1252			}
1253
1254			static void
1255			test_utf8_validate ()
1256			{
1257			int r;
1258			int l;
1259			utf8_info_t info;
1260
1261			r = validate_utf8 ((const uint8_t *) "", 0, & info);
1262			TAP_TEST_EQUAL (r, UNICODE_OK);
1263			TAP_TEST_EQUAL (info.len_read, 0);
1264			TAP_TEST_EQUAL (info.runes_read, 0);
1265
1266			l = strlen ((const char *) utf8);
1267			r = validate_utf8 (utf8, l, & info);
1268			TAP_TEST_EQUAL (r, UNICODE_OK);
1269			TAP_TEST_EQUAL (info.len_read, l);
1270			TAP_TEST_EQUAL (info.runes_read, 7);
1271
1272			l = strlen ((const char *) bad);
1273			r = validate_utf8 (bad, l, & info);
1274			TAP_TEST (r != UNICODE_OK);
1275			}
1276
1277			int main ()
1278			{
1279			test_utf8_to_ucs2 ();
1280			test_ucs2_to_utf8 ();
1281			test_invalid_utf8 ();
1282			test_unicode_count_chars ();
1283			test_surrogate_pairs ();
1284			test_utf8_bytes ();
1285			test_valid_utf8 ();
1286			test_trim_to_utf8_start ();
1287			test_constants ();
1288			test_utf8_validate ();
1289			TAP_PLAN;
1290			}
1291
1292			#endif /* def TEST */