File Coverage

utf8_valid.h
Criterion Covered Total %
statement 44 46 95.6
branch 22 22 100.0
condition n/a
subroutine n/a
pod n/a
total 66 68 97.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017-2026 Christian Hansen
3             *
4             *
5             * Permission is hereby granted, free of charge, to any person obtaining a copy
6             * of this software and associated documentation files (the "Software"), to deal
7             * in the Software without restriction, including without limitation the rights
8             * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9             * copies of the Software, and to permit persons to whom the Software is
10             * furnished to do so, subject to the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be included in all
13             * copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16             * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17             * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18             * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19             * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20             * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21             * SOFTWARE.
22             */
23             #ifndef UTF8_VALID_H
24             #define UTF8_VALID_H
25             #include <stdbool.h>
26             #include <stddef.h>
27             #include <stdint.h>
28             #include <string.h>
29              
30             #if defined(UTF8_DFA32_H) && defined(UTF8_DFA64_H)
31             # error "utf8_dfa32.h and utf8_dfa64.h are mutually exclusive"
32             #elif !defined(UTF8_DFA32_H) && !defined(UTF8_DFA64_H)
33             # error "include utf8_dfa32.h or utf8_dfa64.h before utf8_valid.h"
34             #endif
35              
36             #ifdef UTF8_VALID_USE_SIMD
37             # if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
38             # define UTF8_VALID_HAS_SSE2 1
39             # include <emmintrin.h>
40             # elif defined(__aarch64__)
41             # define UTF8_VALID_HAS_NEON 1
42             # include <arm_neon.h>
43             # endif
44             #endif
45              
46             #ifdef __cplusplus
47             extern "C" {
48             #endif
49              
50 4913           static inline size_t utf8_maximal_subpart(const char* src, size_t len) {
51 4913           const unsigned char* s = (const unsigned char*)src;
52 4913           utf8_dfa_state_t state = UTF8_DFA_ACCEPT;
53              
54 9956 100         for (size_t i = 0; i < len; i++) {
55 9075           state = utf8_dfa_step(state, s[i]);
56 9075           switch (state) {
57 0           case UTF8_DFA_ACCEPT:
58 0           return i + 1;
59 4032           case UTF8_DFA_REJECT:
60 4032 100         return i > 0 ? i : 1;
61             }
62             }
63 881           return len;
64             }
65              
66 8280           static inline size_t utf8_maximal_prefix(const char* src, size_t len) {
67 8280           const unsigned char* s = (const unsigned char*)src;
68 8280           utf8_dfa_state_t state = UTF8_DFA_ACCEPT;
69 8280           size_t prefix = 0;
70              
71 16243 100         for (size_t i = 0; i < len; i++) {
72 15090           state = utf8_dfa_step(state, s[i]);
73 15090 100         if (state == UTF8_DFA_ACCEPT)
74 44           prefix = i + 1;
75 15046 100         else if (state == UTF8_DFA_REJECT)
76 7127           break;
77             }
78 8280           return prefix;
79             }
80              
81             static inline bool utf8_check(const char* src,
82             size_t slen,
83             size_t* cursor) {
84             const unsigned char* s = (const unsigned char*)src;
85             size_t len = slen;
86             utf8_dfa_state_t state = UTF8_DFA_ACCEPT;
87              
88             // Process 16-byte chunks
89             while (len >= 16) {
90             state = utf8_dfa_run16(state, s);
91             s += 16;
92             len -= 16;
93             }
94              
95             state = utf8_dfa_run(state, s, len);
96             if (state == UTF8_DFA_ACCEPT) {
97             if (cursor)
98             *cursor = slen;
99             return true;
100             }
101              
102             if (cursor)
103             *cursor = utf8_maximal_prefix(src, slen);
104             return false;
105             }
106              
/*
 * Convenience wrapper around utf8_check for callers that only need the
 * boolean result: forwards src and len and passes a NULL cursor.
 */
static inline bool utf8_valid(const char* src, size_t len) {
  size_t* const no_cursor = NULL;
  return utf8_check(src, len, no_cursor);
}
110              
/*
 * Tests whether none of the 16 bytes starting at s has its top bit (0x80)
 * set. Exactly 16 bytes are read; s does not need any particular alignment.
 *
 * Three implementations are selected at compile time:
 *   - UTF8_VALID_HAS_SSE2: unaligned 128-bit load, then a per-byte sign-bit
 *     mask that must be zero;
 *   - UTF8_VALID_HAS_NEON: 128-bit load, then the largest lane value must
 *     be below 0x80;
 *   - otherwise: the block is copied into two 64-bit words which are OR-ed
 *     together and masked with 0x80 in every byte.
 *
 * Returns true when all 16 bytes are below 0x80, false otherwise.
 */
static inline bool utf8_check_ascii_block16(const unsigned char *s) {
#if defined(UTF8_VALID_HAS_SSE2)
  const int top_bits = _mm_movemask_epi8(_mm_loadu_si128((const __m128i *)s));
  return top_bits == 0;
#elif defined(UTF8_VALID_HAS_NEON)
  /* No lane has bit 7 set exactly when the maximum lane is below 0x80. */
  return vmaxvq_u8(vld1q_u8(s)) < 0x80;
#else
  uint64_t words[2];
  /* memcpy keeps the load free of alignment and aliasing concerns. */
  memcpy(words, s, sizeof(words));
  const uint64_t merged = words[0] | words[1];
  return (merged & UINT64_C(0x8080808080808080)) == 0;
#endif
}
127              
128 13254           static inline bool utf8_check_ascii(const char* src, size_t slen, size_t* cursor) {
129 13254           const unsigned char* s = (const unsigned char*)src;
130 13254           size_t len = slen;
131 13254           utf8_dfa_state_t state = UTF8_DFA_ACCEPT;
132              
133             // Process 16-byte chunks; skip DFA when state is clean and chunk is ASCII
134 14125 100         while (len >= 16) {
135 871 100         if (state != UTF8_DFA_ACCEPT || !utf8_check_ascii_block16(s))
    100          
136 502           state = utf8_dfa_run16(state, s);
137 871           s += 16;
138 871           len -= 16;
139             }
140              
141 13254           state = utf8_dfa_run(state, s, len);
142 13254 100         if (state == UTF8_DFA_ACCEPT) {
143 2639 100         if (cursor)
144 2299           *cursor = slen;
145 2639           return true;
146             }
147              
148 10615 100         if (cursor)
149 8280           *cursor = utf8_maximal_prefix(src, slen);
150 10615           return false;
151             }
152              
/*
 * Convenience wrapper around utf8_check_ascii for callers that only need
 * the boolean result: forwards src and len and passes a NULL cursor.
 */
static inline bool utf8_valid_ascii(const char *src, size_t len) {
  size_t* const no_cursor = NULL;
  return utf8_check_ascii(src, len, no_cursor);
}
156              
157             #ifdef __cplusplus
158             }
159             #endif
160             #endif