File Coverage

utf8_valid.h
Criterion Covered Total %
statement 48 50 96.0
branch 24 24 100.0
condition n/a
subroutine n/a
pod n/a
total 72 74 97.3


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017-2026 Christian Hansen
3             *
4             * All rights reserved.
5             *
6             * Redistribution and use in source and binary forms, with or without
7             * modification, are permitted provided that the following conditions are met:
8             *
9             * 1. Redistributions of source code must retain the above copyright notice, this
10             * list of conditions and the following disclaimer.
11             * 2. Redistributions in binary form must reproduce the above copyright notice,
12             * this list of conditions and the following disclaimer in the documentation
13             * and/or other materials provided with the distribution.
14             *
15             * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16             * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17             * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18             * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19             * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20             * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21             * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22             * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23             * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24             * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25             */
26              
27             /*
28             * Shift-based DFA for UTF-8 validation
29             *
30             * Same 9-state DFA as the 64-bit version, but state offsets are chosen
31             * by an SMT solver so all transition rows fit in a plain uint32_t.
32             *
33             * S_ERROR = 0: error transitions contribute nothing to a row value
34             * since (S_ERROR << offset) == 0 for any offset.
35             *
36             * State offsets (bit positions within each row):
37             *
38             * S_ERROR = 0 Invalid byte seen (absorbing)
39             * S_ACCEPT = 6 Start / Accept
40             * S_TAIL1 = 16 Expect 1 more tail byte (80-BF -> S_ACCEPT)
41             * S_TAIL2 = 1 Expect 2 more tail bytes (80-BF -> S_TAIL1)
42             * S_E0 = 19 After E0: next tail must be A0-BF -> S_TAIL1
43             * S_ED = 25 After ED: next tail must be 80-9F -> S_TAIL1
44             * S_F0 = 11 After F0: next tail must be 90-BF -> S_TAIL2
45             * S_F1_F3 = 18 After F1-F3: next tail 80-BF -> S_TAIL2
46             * S_F4 = 24 After F4: next tail must be 80-8F -> S_TAIL2
47             *
48             * Sequence flows:
49             * 1-byte: S_ACCEPT -> S_ACCEPT
50             * 2-byte: S_ACCEPT -> S_TAIL1 -> S_ACCEPT
51             * 3-byte: S_ACCEPT -> S_TAIL2 -> S_TAIL1 -> S_ACCEPT
52             * (S_E0 or S_ED takes the place of S_TAIL2 for the restricted leads E0 and ED)
53             * 4-byte: S_ACCEPT -> S_F1_F3 -> S_TAIL2 -> S_TAIL1 -> S_ACCEPT
54             * (S_F0 or S_F4 takes the place of S_F1_F3 for the restricted leads F0 and F4)
55             *
56             *
57             * UTF-8 Encoding Form:
58             *
59             * U+0000..U+007F 0xxxxxxx
60             * U+0080..U+07FF 110xxxxx 10xxxxxx
61             * U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
62             * U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
63             *
64             *
65             * U+0000..U+007F 00..7F
66             * N C0..C1 80..BF 1100000x 10xxxxxx
67             * U+0080..U+07FF C2..DF 80..BF
68             * N E0 80..9F 80..BF 11100000 100xxxxx
69             * U+0800..U+0FFF E0 A0..BF 80..BF
70             * U+1000..U+CFFF E1..EC 80..BF 80..BF
71             * U+D000..U+D7FF ED 80..9F 80..BF
72             * S ED A0..BF 80..BF 11101101 101xxxxx
73             * U+E000..U+FFFF EE..EF 80..BF 80..BF
74             * N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
75             * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
76             * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
77             * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
78             *
79             * Legend:
80             * N = Non-shortest form
81             * S = Surrogates
82             */
83              
84             #ifndef UTF8_VALID_H
85             #define UTF8_VALID_H
86             #include <stddef.h>
87             #include <stdbool.h>
88             #include <stdint.h>
89             #include <string.h>
90              
91             #if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
92             # define UTF8_VALID_HAS_SSE2 1
93             # include <emmintrin.h>
94             #elif defined(__aarch64__)
95             # define UTF8_VALID_HAS_NEON 1
96             # include <arm_neon.h>
97             #endif
98              
99             #ifdef __cplusplus
100             extern "C" {
101             #endif
102              
/*
 * DFA state identifiers.
 *
 * Each identifier doubles as the bit offset at which that state's
 * transition is stored inside a table row. S_ERROR is 0, so an error
 * transition contributes no bits to a row.
 */
#define S_ERROR   0
#define S_ACCEPT  6
#define S_TAIL1  16
#define S_TAIL2   1
#define S_E0     19
#define S_ED     25
#define S_F0     11
#define S_F1_F3  18
#define S_F4     24

/* clang-format off */

/*
 * Build one table row for a given input byte.
 *
 * Every parameter is named after a CURRENT state; the argument passed for
 * it is the NEXT state reached from that current state. The next state is
 * stored at the bit offset equal to the current state's identifier.
 */
#define DFA_ROW(accept,error,tail1,tail2,e0,ed,f0,f1_f3,f4) \
  ( ((uint32_t)(error)  << S_ERROR)  \
  | ((uint32_t)(tail2)  << S_TAIL2)  \
  | ((uint32_t)(accept) << S_ACCEPT) \
  | ((uint32_t)(f0)     << S_F0)     \
  | ((uint32_t)(tail1)  << S_TAIL1)  \
  | ((uint32_t)(f1_f3)  << S_F1_F3)  \
  | ((uint32_t)(e0)     << S_E0)     \
  | ((uint32_t)(f4)     << S_F4)     \
  | ((uint32_t)(ed)     << S_ED) )

#define ERR S_ERROR

/* Rows for bytes that are only legal at a sequence boundary (from S_ACCEPT). */
#define ASCII_ROW DFA_ROW(S_ACCEPT,ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define LEAD2_ROW DFA_ROW(S_TAIL1, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define LEAD3_ROW DFA_ROW(S_TAIL2, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define LEAD4_ROW DFA_ROW(S_F1_F3, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define ERROR_ROW DFA_ROW(ERR,     ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)

/*
 * Rows for continuation bytes, split by the sub-ranges that the
 * restricted leads (E0, ED, F0, F4) distinguish.
 *
 * Current: ACCEPT ERROR TAIL1    TAIL2   E0      ED      F0      F1_F3   F4
 * 80-8F:   ERR    ERR   ACCEPT   TAIL1   ERR     TAIL1   ERR     TAIL2   TAIL2
 * 90-9F:   ERR    ERR   ACCEPT   TAIL1   ERR     TAIL1   TAIL2   TAIL2   ERR
 * A0-BF:   ERR    ERR   ACCEPT   TAIL1   TAIL1   ERR     TAIL2   TAIL2   ERR
 */
#define CONT_80_8F DFA_ROW(ERR,ERR,S_ACCEPT,S_TAIL1,ERR,    S_TAIL1,ERR,    S_TAIL2,S_TAIL2)
#define CONT_90_9F DFA_ROW(ERR,ERR,S_ACCEPT,S_TAIL1,ERR,    S_TAIL1,S_TAIL2,S_TAIL2,ERR)
#define CONT_A0_BF DFA_ROW(ERR,ERR,S_ACCEPT,S_TAIL1,S_TAIL1,ERR,    S_TAIL2,S_TAIL2,ERR)

/* Repeat one row for a run of consecutive byte values. */
#define DFA_REP2(row)   row, row
#define DFA_REP4(row)   DFA_REP2(row),  DFA_REP2(row)
#define DFA_REP8(row)   DFA_REP4(row),  DFA_REP4(row)
#define DFA_REP16(row)  DFA_REP8(row),  DFA_REP8(row)
#define DFA_REP32(row)  DFA_REP16(row), DFA_REP16(row)
#define DFA_REP64(row)  DFA_REP32(row), DFA_REP32(row)
#define DFA_REP128(row) DFA_REP64(row), DFA_REP64(row)

/*
 * Transition table indexed by input byte. Entries are listed positionally
 * in byte order; the run lengths below add up to exactly 256.
 */
static const uint32_t utf8_dfa[256] = {
  /* 00-7F: ASCII (128 entries) */
  DFA_REP128(ASCII_ROW),

  /* 80-8F, 90-9F, A0-BF: continuation bytes (16 + 16 + 32 entries) */
  DFA_REP16(CONT_80_8F),
  DFA_REP16(CONT_90_9F),
  DFA_REP32(CONT_A0_BF),

  /* C0-C1: invalid (2 entries) */
  DFA_REP2(ERROR_ROW),

  /* C2-DF: 2-byte lead (30 entries) */
  DFA_REP16(LEAD2_ROW), DFA_REP8(LEAD2_ROW), DFA_REP4(LEAD2_ROW), DFA_REP2(LEAD2_ROW),

  /* E0: first continuation must be A0-BF */
  DFA_ROW(S_E0,ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR),

  /* E1-EC: 3-byte lead (12 entries) */
  DFA_REP8(LEAD3_ROW), DFA_REP4(LEAD3_ROW),

  /* ED: first continuation must be 80-9F */
  DFA_ROW(S_ED,ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR),

  /* EE-EF: 3-byte lead (2 entries) */
  DFA_REP2(LEAD3_ROW),

  /* F0: first continuation must be 90-BF */
  DFA_ROW(S_F0,ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR),

  /* F1-F3: 4-byte lead (3 entries) */
  DFA_REP2(LEAD4_ROW), LEAD4_ROW,

  /* F4: first continuation must be 80-8F */
  DFA_ROW(S_F4,ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR),

  /* F5-FF: invalid (11 entries) */
  DFA_REP8(ERROR_ROW), DFA_REP2(ERROR_ROW), ERROR_ROW,
};

/* clang-format on */

/* Only S_ACCEPT and S_ERROR stay defined for the code that follows. */
#undef DFA_REP128
#undef DFA_REP64
#undef DFA_REP32
#undef DFA_REP16
#undef DFA_REP8
#undef DFA_REP4
#undef DFA_REP2

#undef S_TAIL1
#undef S_TAIL2
#undef S_E0
#undef S_ED
#undef S_F0
#undef S_F1_F3
#undef S_F4

#undef ERR
#undef DFA_ROW
#undef ASCII_ROW
#undef CONT_80_8F
#undef CONT_90_9F
#undef CONT_A0_BF
#undef LEAD2_ROW
#undef LEAD3_ROW
#undef LEAD4_ROW
#undef ERROR_ROW
265              
266 68959           static inline uint32_t utf8_dfa_step(uint32_t state, unsigned char c) {
267 68959           return (utf8_dfa[c] >> state) & 31;
268             }
269              
/*
 * Feed len bytes starting at src through the DFA, beginning in the given
 * state, and return the state reached after the last byte. With len == 0
 * the incoming state is returned unchanged and src is never dereferenced.
 */
static inline uint32_t utf8_dfa_run(uint32_t state,
                                    const unsigned char* src,
                                    size_t len) {
  for (; len != 0; len--, src++)
    state = utf8_dfa_step(state, *src);
  return state;
}
277              
278 4913           static inline size_t utf8_maximal_subpart(const char* src, size_t len) {
279 4913           const unsigned char* s = (const unsigned char*)src;
280 4913           uint32_t state = S_ACCEPT;
281              
282 9956 100         for (size_t i = 0; i < len; i++) {
283 9075           state = utf8_dfa_step(state, s[i]);
284 9075           switch (state) {
285 0           case S_ACCEPT:
286 0           return i + 1;
287 4032           case S_ERROR:
288 4032 100         return i > 0 ? i : 1;
289             }
290             }
291 881           return len;
292             }
293              
294 8280           static inline size_t utf8_maximal_prefix(const char* src, size_t len) {
295 8280           const unsigned char* s = (const unsigned char*)src;
296 8280           uint32_t state = S_ACCEPT;
297 8280           size_t prefix = 0;
298              
299 16243 100         for (size_t i = 0; i < len; i++) {
300 15090           state = utf8_dfa_step(state, s[i]);
301 15090 100         if (state == S_ACCEPT)
302 44           prefix = i + 1;
303 15046 100         else if (state == S_ERROR)
304 7127           break;
305             }
306 8280           return prefix;
307             }
308              
/*
 * Report whether the 16 bytes starting at s are all ASCII, i.e. none of
 * them has its most significant bit set. Exactly 16 bytes are read.
 *
 * Three implementations are selected at compile time: SSE2, NEON, or a
 * portable fallback that inspects the block as two 64-bit words.
 */
static inline bool utf8_check_ascii_block16(const unsigned char *s) {
#if defined(UTF8_VALID_HAS_SSE2)
  /* movemask collects the top bit of every byte lane into an int. */
  const __m128i block = _mm_loadu_si128((const __m128i *)s);
  const int high_bits = _mm_movemask_epi8(block);
  return high_bits == 0;
#elif defined(UTF8_VALID_HAS_NEON)
  /* Reduce each lane to its top bit, then take the maximum across lanes. */
  const uint8x16_t block = vld1q_u8(s);
  const uint8x16_t top_bits = vshrq_n_u8(block, 7);
  return vmaxvq_u8(top_bits) == 0;
#else
  /* memcpy keeps the loads free of alignment and aliasing concerns. */
  uint64_t halves[2];
  memcpy(halves, s, sizeof(halves));
  return ((halves[0] | halves[1]) & UINT64_C(0x8080808080808080)) == 0;
#endif
}
325              
326 13254           static inline bool utf8_check(const char* src, size_t slen, size_t* cursor) {
327 13254           const unsigned char* s = (const unsigned char*)src;
328 13254           size_t len = slen;
329 13254           uint32_t state = S_ACCEPT;
330              
331             // Process 16-byte chunks; skip DFA when state is clean and chunk is ASCII
332 14125 100         while (len >= 16) {
333 871 100         if (state != S_ACCEPT || !utf8_check_ascii_block16(s))
    100          
334 502           state = utf8_dfa_run(state, s, 16);
335 871           s += 16;
336 871           len -= 16;
337             }
338              
339 13254           state = utf8_dfa_run(state, s, len);
340 13254 100         if (state == S_ACCEPT) {
341 2639 100         if (cursor)
342 2299           *cursor = slen;
343 2639           return true;
344             }
345              
346 10615 100         if (cursor)
347 8280           *cursor = utf8_maximal_prefix(src, slen);
348 10615           return false;
349             }
350              
/*
 * Return true when the len bytes at src validate as UTF-8.
 * Calls utf8_check() without requesting an error position.
 */
static inline bool utf8_valid(const char *src, size_t len) {
  size_t *const no_cursor = NULL;
  return utf8_check(src, len, no_cursor);
}
354              
355             static inline bool utf8_check_constant(const char* src,
356             size_t slen,
357             size_t* cursor) {
358             const unsigned char* s = (const unsigned char*)src;
359             size_t len = slen;
360             uint32_t state = S_ACCEPT;
361              
362             // Process 16-byte chunks
363             while (len >= 16) {
364             state = utf8_dfa_run(state, s, 16);
365             s += 16;
366             len -= 16;
367             }
368              
369             state = utf8_dfa_run(state, s, len);
370             if (state == S_ACCEPT) {
371             if (cursor)
372             *cursor = slen;
373             return true;
374             }
375              
376             if (cursor)
377             *cursor = utf8_maximal_prefix(src, slen);
378             return false;
379             }
380              
/*
 * Return true when the len bytes at src validate as UTF-8.
 * Calls utf8_check_constant() without requesting an error position.
 */
static inline bool utf8_valid_constant(const char* src, size_t len) {
  size_t* const no_cursor = NULL;
  return utf8_check_constant(src, len, no_cursor);
}
384              
385             /*
386             * Streaming API
387             *
388             * utf8_stream_t holds the DFA state between calls. Initialize with
389             * utf8_stream_init() before the first call to utf8_stream_check().
390             *
391             * utf8_stream_check() validates the next chunk of a UTF-8 stream and
392             * returns the number of bytes forming complete, valid sequences. Any
393             * remaining bytes at the end of the chunk (an incomplete sequence
394             * crossing a chunk boundary) must be prepended to the next chunk by
395             * the caller.
396             *
397             * If eof is true and the stream does not end on a sequence boundary,
398             * the input is treated as ill-formed.
399             *
400             * On error, (size_t)-1 is returned and *cursor, if non-NULL, is set
401             * to the byte offset of the start of the invalid or truncated sequence
402             * within src. The stream state is automatically reset to S_ACCEPT so
403             * the caller can resume from the next byte without reinitializing.
404             */
/* Validator state kept by the caller across utf8_stream_check() calls. */
typedef struct {
  uint32_t state; /* current DFA state; set by utf8_stream_init() */
} utf8_stream_t;
408              
409             static inline void
410             utf8_stream_init(utf8_stream_t *s) {
411             s->state = S_ACCEPT;
412             }
413              
414             static inline size_t utf8_stream_check(utf8_stream_t* s,
415             const char* src,
416             size_t len,
417             bool eof,
418             size_t* cursor) {
419             const unsigned char* p = (const unsigned char*)src;
420             uint32_t state = s->state;
421             size_t last_accept = 0;
422              
423             for (size_t i = 0; i < len; i++) {
424             state = utf8_dfa_step(state, p[i]);
425             if (state == S_ACCEPT)
426             last_accept = i + 1;
427             else if (state == S_ERROR) {
428             s->state = S_ACCEPT;
429             if (cursor)
430             *cursor = last_accept;
431             return (size_t)-1;
432             }
433             }
434              
435             s->state = state;
436              
437             if (state != S_ACCEPT) {
438             if (eof) {
439             s->state = S_ACCEPT;
440             if (cursor)
441             *cursor = last_accept;
442             return (size_t)-1;
443             }
444             return last_accept;
445             }
446              
447             return len;
448             }
449              
450             #undef S_ACCEPT
451             #undef S_ERROR
452              
453             #ifdef __cplusplus
454             }
455             #endif
456             #endif