File Coverage

utf8_dfa32.h
Criterion Covered Total %
statement 10 10 100.0
branch 4 4 100.0
condition n/a
subroutine n/a
pod n/a
total 14 14 100.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2026 Christian Hansen
3             *
4             *
5             * Permission is hereby granted, free of charge, to any person obtaining a copy
6             * of this software and associated documentation files (the "Software"), to deal
7             * in the Software without restriction, including without limitation the rights
8             * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9             * copies of the Software, and to permit persons to whom the Software is
10             * furnished to do so, subject to the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be included in all
13             * copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16             * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17             * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18             * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19             * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20             * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21             * SOFTWARE.
22             */
23              
24             /*
25             * utf8_dfa32.h -- Shift-based DFA for Forward UTF-8 validation
26             * =============================================================================
27             *
28             * Same 9-state DFA as utf8_dfa64.h but validation-only (no decode). State
29             * offsets differ: chosen by an SMT solver to pack rows into uint32_t rather
30             * than uniform multiples of 6. Use utf8_dfa64.h if you need codepoint decoding.
31             *
32             *
33             * CONCEPT
34             * -------
35             *
36             * Scans UTF-8 bytes left-to-right. Feed bytes one at a time starting from
37             * S_ACCEPT. Each return to S_ACCEPT marks a complete valid sequence boundary.
38             * S_ERROR is an absorbing trap: once entered, no byte can leave it.
39             *
40             * Because the lead byte arrives first, the DFA must carry forward enough
41             * context to validate the bytes that follow. Two things determine the next
42             * valid byte range:
43             *
44             * (1) How many continuation bytes remain (depth 1, 2, or 3)
45             * (2) Whether the lead was E0/ED/F0/F4, which narrows the first
46             * continuation byte range to reject non-shortest form, surrogates,
47             * and codepoints above U+10FFFF
48             *
49             *
50             * STATE DEFINITIONS
51             * -----------------
52             *
53             * State Value Meaning
54             * ------- ----- -----------------------------------------------------
55             * S_ERROR 0 Invalid byte seen (absorbing trap state)
56             * S_ACCEPT 6 Start state / valid sequence boundary
57             * S_TAIL1 16 Expect 1 more continuation (80-BF → ACCEPT)
58             * S_TAIL2 1 Expect 2 more continuations (80-BF → TAIL1)
59             * S_TAIL3 18 Expect 3 more continuations (80-BF → TAIL2)
60             * S_E0 19 After 0xE0; next must be A0-BF (no non-shortest form)
61             * S_ED 25 After 0xED; next must be 80-9F (no surrogates)
62             * S_F0 11 After 0xF0; next must be 90-BF (no non-shortest form)
63             * S_F4 24 After 0xF4; next must be 80-8F (no >U+10FFFF)
64             *
65             * State value offsets are chosen by an SMT solver so all transition
66             * rows fit in a plain uint32_t.
67             *
68             * S_ERROR = 0 is not arbitrary: any transition to S_ERROR contributes
69             * (0 << offset) = 0 to the row value, which is itself S_ERROR at every
70             * state offset. The trap is enforced for free by the encoding.
71             *
72             * If states or transitions are changed, rerun tool/smt_solver.py to
73             * find new valid offsets that still pack into uint32_t.
74             *
75             *
76             * TRANSITION TABLE
77             * ----------------
78             * Current State
79             *
80             * Input Byte ACCEPT TAIL1 TAIL2 TAIL3 E0 ED F0 F4
81             * ---------- ------ ------ ------ ------ ------ ------ ------ ------
82             * 00..7F ACCEPT - - - - - - -
83             * 80..8F - ACCEPT TAIL1 TAIL2 - TAIL1 - TAIL2
84             * 90..9F - ACCEPT TAIL1 TAIL2 - TAIL1 TAIL2 -
85             * A0..BF - ACCEPT TAIL1 TAIL2 TAIL1 - TAIL2 -
86             * C0..C1 - - - - - - - -
87             * C2..DF TAIL1 - - - - - - -
88             * E0 E0 - - - - - - -
89             * E1..EC TAIL2 - - - - - - -
90             * ED ED - - - - - - -
91             * EE..EF TAIL2 - - - - - - -
92             * F0 F0 - - - - - - -
93             * F1..F3 TAIL3 - - - - - - -
94             * F4 F4 - - - - - - -
95             * F5..FF - - - - - - - -
96             *
97             * Note: "-" means transition to S_ERROR (invalid in that context)
98             *
99             *
100             * STATE FLOW DIAGRAMS
101             * -------------------
102             *
103             * 1-byte (ASCII):
104             * ACCEPT ─[0x00–0x7F]─→ ACCEPT
105             *
106             * 2-byte (U+0080–U+07FF):
107             * ACCEPT ─[0xC2–0xDF]─→ TAIL1 ─[0x80–0xBF]─→ ACCEPT
108             *
109             * 3-byte (U+0800–U+FFFF, excluding surrogates U+D800–U+DFFF):
110             * ACCEPT ─[lead]─→ [state] ─[cont1]─→ TAIL1 ─[cont2]─→ ACCEPT
111             * │ │ │
112             * ├── 0xE0 ───────→ E0 ──────┴─ 0xA0–0xBF (no non-shortest form)
113             * ├── 0xED ───────→ ED ──────┴─ 0x80–0x9F (no surrogates)
114             * └── 0xE1–0xEC, ─→ TAIL2 ───┴─ 0x80–0xBF (unrestricted)
115             * 0xEE–0xEF
116             *
117             * 4-byte (U+10000–U+10FFFF):
118             * ACCEPT ─[lead]─→ [state] ─[cont1]─→ TAIL2 ─[cont2]─→ TAIL1 ─[cont3]─→ ACCEPT
119             * │ │ │
120             * ├── 0xF0 ───────→ F0 ──────┴─ 0x90–0xBF (no non-shortest form)
121             * ├── 0xF4 ───────→ F4 ──────┴─ 0x80–0x8F (no >U+10FFFF)
122             * └── 0xF1–0xF3 ──→ TAIL3 ───┴─ 0x80–0xBF (unrestricted)
123             *
124             *
125             * UTF-8 ENCODING FORM
126             * -------------------
127             *
128             * U+0000..U+007F 0xxxxxxx
129             * U+0080..U+07FF 110xxxxx 10xxxxxx
130             * U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
131             * U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
132             *
133             *
134             * U+0000..U+007F 00..7F
135             * N C0..C1 80..BF 1100000x 10xxxxxx
136             * U+0080..U+07FF C2..DF 80..BF
137             * N E0 80..9F 80..BF 11100000 100xxxxx
138             * U+0800..U+0FFF E0 A0..BF 80..BF
139             * U+1000..U+CFFF E1..EC 80..BF 80..BF
140             * U+D000..U+D7FF ED 80..9F 80..BF
141             * S ED A0..BF 80..BF 11101101 101xxxxx
142             * U+E000..U+FFFF EE..EF 80..BF 80..BF
143             * N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
144             * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
145             * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
146             * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
147             *
148             * Legend:
149             * N = Non-shortest form
150             * S = Surrogates
151             *
152             *
153             * PERFORMANCE
154             * -----------
155             *
156             * - 9 states total (minimal for well-formed forward UTF-8 validation)
157             * - Table-driven: 256-entry uint32_t table (1 KB, fits in L1 cache)
158             * - Branchless step: (table[byte] >> state) & 31
159             *
160             *
161             * REFERENCES
162             * ----------
163             *
164             * - Unicode Standard §3.9: Unicode Encoding Forms
165             *
166             *
167             *
168             * USAGE PATTERN
169             * -------------
170             *
171             * utf8_dfa_state_t state = UTF8_DFA_ACCEPT;
172             * for (size_t i = 0; i < len; i++) {
173             * state = utf8_dfa_step(state, buffer[i]);
174             * if (state == UTF8_DFA_REJECT) {
175             * // Invalid UTF-8 at position i
176             * break;
177             * }
178             * if (state == UTF8_DFA_ACCEPT) {
179             * // Complete valid sequence at position i
180             * }
181             * }
182             *
183             */
184             #ifndef UTF8_DFA32_H
185             #define UTF8_DFA32_H
186             #include <stddef.h>
187             #include <stdint.h>
188              
189             #ifdef __cplusplus
190             extern "C" {
191             #endif
192              
/*
 * A DFA state is stored as the bit offset of that state's 5-bit field inside
 * a 32-bit transition row, so stepping is a single shift and mask.
 */
typedef uint32_t utf8_dfa_state_t;

/* Public sentinels: the error state and the start/sequence-boundary state. */
#define UTF8_DFA_REJECT ((utf8_dfa_state_t)0)
#define UTF8_DFA_ACCEPT ((utf8_dfa_state_t)6)

/*
 * Private state offsets. They are only needed while the table below is being
 * built and are #undef'd again afterwards.
 */
#define S_ERROR   0
#define S_ACCEPT  6
#define S_TAIL1  16
#define S_TAIL2   1
#define S_TAIL3  18
#define S_E0     19
#define S_ED     25
#define S_F0     11
#define S_F4     24

/* clang-format off */

/*
 * Build one transition row for a given input byte class. Each argument is
 * the NEXT state when the CURRENT state is the one named by the parameter;
 * the value is placed at the current state's bit offset.
 */
#define DFA_ROW(accept,error,tail1,tail2,tail3,e0,ed,f0,f4) \
  ( ((utf8_dfa_state_t)(accept) << S_ACCEPT)                \
  | ((utf8_dfa_state_t)(error)  << S_ERROR)                 \
  | ((utf8_dfa_state_t)(tail1)  << S_TAIL1)                 \
  | ((utf8_dfa_state_t)(tail2)  << S_TAIL2)                 \
  | ((utf8_dfa_state_t)(tail3)  << S_TAIL3)                 \
  | ((utf8_dfa_state_t)(e0)     << S_E0)                    \
  | ((utf8_dfa_state_t)(ed)     << S_ED)                    \
  | ((utf8_dfa_state_t)(f0)     << S_F0)                    \
  | ((utf8_dfa_state_t)(f4)     << S_F4) )

#define ERR S_ERROR

/* Rows for bytes that are only legal at a sequence boundary (ACCEPT column). */
#define ASCII_ROW  DFA_ROW(S_ACCEPT,ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define LEAD2_ROW  DFA_ROW(S_TAIL1, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define LEAD3_ROW  DFA_ROW(S_TAIL2, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define LEAD4_ROW  DFA_ROW(S_TAIL3, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define ERROR_ROW  DFA_ROW(ERR,     ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)

/* Lead bytes whose first continuation byte has a restricted range. */
#define E0_ROW     DFA_ROW(S_E0,    ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define ED_ROW     DFA_ROW(S_ED,    ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define F0_ROW     DFA_ROW(S_F0,    ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)
#define F4_ROW     DFA_ROW(S_F4,    ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR)

/*
 * Continuation byte rows (next state per current state):
 *
 *   Current:  ACCEPT ERROR TAIL1   TAIL2  TAIL3  E0     ED     F0     F4
 *
 *   80-8F:    ERR    ERR   ACCEPT  TAIL1  TAIL2  ERR    TAIL1  ERR    TAIL2
 *   90-9F:    ERR    ERR   ACCEPT  TAIL1  TAIL2  ERR    TAIL1  TAIL2  ERR
 *   A0-BF:    ERR    ERR   ACCEPT  TAIL1  TAIL2  TAIL1  ERR    TAIL2  ERR
 */
#define CONT_80_8F DFA_ROW(ERR,ERR,S_ACCEPT,S_TAIL1,S_TAIL2,ERR,    S_TAIL1,ERR,    S_TAIL2)
#define CONT_90_9F DFA_ROW(ERR,ERR,S_ACCEPT,S_TAIL1,S_TAIL2,ERR,    S_TAIL1,S_TAIL2,ERR)
#define CONT_A0_BF DFA_ROW(ERR,ERR,S_ACCEPT,S_TAIL1,S_TAIL2,S_TAIL1,ERR,    S_TAIL2,ERR)

/*
 * Replication helpers: expand to N comma-separated copies of a row. They let
 * the table be initialised positionally, which is valid in both C and C++
 * (array-index designators such as [0x41]=... are C-only).
 */
#define UTF8_DFA_X2(r)   r, r
#define UTF8_DFA_X4(r)   UTF8_DFA_X2(r),  UTF8_DFA_X2(r)
#define UTF8_DFA_X8(r)   UTF8_DFA_X4(r),  UTF8_DFA_X4(r)
#define UTF8_DFA_X16(r)  UTF8_DFA_X8(r),  UTF8_DFA_X8(r)
#define UTF8_DFA_X32(r)  UTF8_DFA_X16(r), UTF8_DFA_X16(r)
#define UTF8_DFA_X64(r)  UTF8_DFA_X32(r), UTF8_DFA_X32(r)
#define UTF8_DFA_X128(r) UTF8_DFA_X64(r), UTF8_DFA_X64(r)

/*
 * Transition table indexed by input byte. Entries are listed in ascending
 * byte order; the per-line counts must add up to exactly 256.
 */
static const utf8_dfa_state_t utf8_dfa[256] = {
  /* 00-7F (128) */ UTF8_DFA_X128(ASCII_ROW),
  /* 80-8F  (16) */ UTF8_DFA_X16(CONT_80_8F),
  /* 90-9F  (16) */ UTF8_DFA_X16(CONT_90_9F),
  /* A0-BF  (32) */ UTF8_DFA_X32(CONT_A0_BF),
  /* C0-C1   (2) */ UTF8_DFA_X2(ERROR_ROW),
  /* C2-DF  (30) */ UTF8_DFA_X16(LEAD2_ROW), UTF8_DFA_X8(LEAD2_ROW),
                    UTF8_DFA_X4(LEAD2_ROW),  UTF8_DFA_X2(LEAD2_ROW),
  /* E0      (1) */ E0_ROW,
  /* E1-EC  (12) */ UTF8_DFA_X8(LEAD3_ROW), UTF8_DFA_X4(LEAD3_ROW),
  /* ED      (1) */ ED_ROW,
  /* EE-EF   (2) */ UTF8_DFA_X2(LEAD3_ROW),
  /* F0      (1) */ F0_ROW,
  /* F1-F3   (3) */ UTF8_DFA_X2(LEAD4_ROW), LEAD4_ROW,
  /* F4      (1) */ F4_ROW,
  /* F5-FF  (11) */ UTF8_DFA_X8(ERROR_ROW), UTF8_DFA_X2(ERROR_ROW), ERROR_ROW
};

/* clang-format on */

/* Drop every table-construction macro so none of them leak to includers. */
#undef UTF8_DFA_X2
#undef UTF8_DFA_X4
#undef UTF8_DFA_X8
#undef UTF8_DFA_X16
#undef UTF8_DFA_X32
#undef UTF8_DFA_X64
#undef UTF8_DFA_X128

#undef S_ERROR
#undef S_ACCEPT
#undef S_TAIL1
#undef S_TAIL2
#undef S_TAIL3
#undef S_E0
#undef S_ED
#undef S_F0
#undef S_F4

#undef ERR
#undef DFA_ROW
#undef ASCII_ROW
#undef CONT_80_8F
#undef CONT_90_9F
#undef CONT_A0_BF
#undef LEAD2_ROW
#undef LEAD3_ROW
#undef LEAD4_ROW
#undef ERROR_ROW
#undef E0_ROW
#undef ED_ROW
#undef F0_ROW
#undef F4_ROW
359              
360 24165           static inline utf8_dfa_state_t utf8_dfa_step(utf8_dfa_state_t state,
361             unsigned char c) {
362 24165           return (utf8_dfa[c] >> state) & 31;
363             }
364              
365 13254           static inline utf8_dfa_state_t utf8_dfa_run(utf8_dfa_state_t state,
366             const unsigned char* src,
367             size_t len) {
368 50016 100         for (size_t i = 0; i < len; i++)
369 36762           state = utf8_dfa[src[i]] >> (state & 31);
370 13254           return state & 31;
371             }
372              
373 502           static inline utf8_dfa_state_t utf8_dfa_run16(utf8_dfa_state_t state,
374             const unsigned char* src) {
375             #pragma GCC unroll 16
376 8534 100         for (size_t i = 0; i < 16; i++)
377 8032           state = utf8_dfa[src[i]] >> (state & 31);
378 502           return state & 31;
379             }
380              
381             #ifdef __cplusplus
382             }
383             #endif
384             #endif // UTF8_DFA32_H