File Coverage

src/simd/mds_simd_sse2.c
Criterion Covered Total %
statement 0 69 0.0
branch 0 36 0.0
condition n/a
subroutine n/a
pod n/a
total 0 105 0.0


line stmt bran cond sub pod time code
1             /* src/simd/mds_simd_sse2.c — SSE2/SSSE3 classifier.
2             *
3             * Built when -DMDS_HAVE_SSE2 is set. Uses _mm_shuffle_epi8 (SSSE3) on
4             * 16-byte chunks. SSSE3 is present on nearly every x86_64 CPU still in
5             * use (Intel since Core 2, AMD since Bobcat/Bulldozer), so we treat it as a hard baseline for the "SSE2" backend.
6             *
7             * On the rare x86_64 host without SSSE3, the AVX2 build won't be
8             * selected either, and the runtime dispatcher falls back to scalar.
9             */
10             #include "mds_simd.h"
11             #include "mds_classifier_lut.h"
12              
13             #ifdef MDS_HAVE_SSE2
14              
15             #include <tmmintrin.h> /* SSSE3 for _mm_shuffle_epi8 */
16             #include
17              
18             #if defined(__GNUC__) || defined(__clang__)
               /* GCC/Clang: per-function target attribute, so the functions tagged
                * with MDS_SSSE3_FN may use SSSE3 instructions even when the
                * translation unit is not compiled with SSSE3 enabled globally. */
19             # define MDS_SSSE3_FN __attribute__((target("ssse3")))
20             #else
               /* Any other compiler: the tag expands to nothing. */
21             # define MDS_SSSE3_FN
22             #endif
23              
24 0           MDS_SSSE3_FN static void classify_structural_sse2(const char* in, size_t len,
25             uint64_t* out)
26             {
               /* Build a bitmap over `in`: bit (i & 63) of out[i >> 6] is set when
                * MDS_CLASSIFIER_LO[low nibble of in[i]] & MDS_CLASSIFIER_HI[high nibble]
                * is non-zero. Bits are OR-ed into `out`, never cleared, and the
                * highest word touched is out[(len - 1) >> 6].
                * NOTE(review): presumably the caller zeroes `out` first — confirm.
                * NOTE(review): the 16-byte loads below assume both LUTs hold at
                * least 16 entries — confirm in mds_classifier_lut.h. */
27 0           __m128i lo_tbl = _mm_loadu_si128((const __m128i*)MDS_CLASSIFIER_LO);
28 0           __m128i hi_tbl = _mm_loadu_si128((const __m128i*)MDS_CLASSIFIER_HI);
29 0           __m128i mask_lo = _mm_set1_epi8(0x0F);
30 0           __m128i zero = _mm_setzero_si128();
31              
32 0           size_t i = 0;
               /* Vector path: one unaligned 16-byte chunk per iteration. */
33 0 0         while (i + 16 <= len) {
34 0           __m128i v = _mm_loadu_si128((const __m128i*)(in + i));
35 0           __m128i lo = _mm_and_si128(v, mask_lo);
               /* There is no 8-bit shift, so shift 16-bit lanes and mask off the
                * bits that crossed over from the neighbouring byte. */
36 0           __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), mask_lo);
               /* Per-byte table lookups: each nibble (0..15) selects a LUT entry. */
37 0           __m128i la = _mm_shuffle_epi8(lo_tbl, lo);
38 0           __m128i ha = _mm_shuffle_epi8(hi_tbl, hi);
39 0           __m128i m = _mm_and_si128(la, ha);
40             /* See mds_simd_avx2.c: cmpgt_epi8 is signed and would misclassify
41             * bytes whose LUT product is 0x80 ('|' 0x7C, '~' 0x7E). Invert
42             * cmpeq(m,0) — and mask to 16 bits AFTER inversion. */
43 0           __m128i is_zero = _mm_cmpeq_epi8(m, zero);
44 0           uint32_t bits = (uint32_t)((unsigned)(~_mm_movemask_epi8(is_zero)) & 0xFFFFu);
45              
               /* i is always a multiple of 16 here, so off is 0, 16, 32 or 48 and
                * the 16-bit mask never straddles two 64-bit words. */
46 0           size_t word = i >> 6;
47 0           size_t off = i & 63u;
48 0           out[word] |= (uint64_t)bits << off;
49 0           i += 16;
50             }
               /* Scalar tail (fewer than 16 bytes left): same nibble-table test,
                * one byte at a time. */
51 0 0         for (; i < len; i++) {
52 0           uint8_t b = (uint8_t)in[i];
53 0 0         if (MDS_CLASSIFIER_LO[b & 0xF] & MDS_CLASSIFIER_HI[b >> 4])
54 0           out[i >> 6] |= (uint64_t)1 << (i & 63);
55             }
56 0           }
57              
58 0           static const mds_simd_ops* s_scalar_sse2(void) { return mds_simd_ops_scalar(); }
               /* Short alias used below to reach the scalar backend's ops table. */
59             #define s_scalar s_scalar_sse2
60              
61 0           MDS_SSSE3_FN static int validate_utf8_sse2(const char* in, size_t len)
62             {
               /* UTF-8 validation with an all-ASCII fast path. 16-byte chunks whose
                * bytes all have the top bit clear are skipped; any other chunk is
                * handed to the scalar backend's validate_utf8.
                * Returns 0 as soon as the scalar validator returns 0 for a segment,
                * 1 when the input ends on a chunk boundary, otherwise the scalar
                * validator's result for the final segment.
                * NOTE(review): assumes the scalar validate_utf8 returns non-zero for
                * valid input — confirm against the scalar backend. */
63 0           const unsigned char* p = (const unsigned char*)in;
64 0           const unsigned char* end = p + len;
65              
66 0 0         while ((size_t)(end - p) >= 16) {
67 0           __m128i v = _mm_loadu_si128((const __m128i*)p);
               /* movemask gathers the top bit of each byte: 0 means all 16 bytes
                * are below 0x80. */
68 0           int mask = _mm_movemask_epi8(v);
69 0 0         if (mask == 0) { p += 16; continue; }
70              
71 0           const unsigned char* tail = p + 16;
               /* NOTE(review): the loop guard ensures end - p >= 16, so this check
                * looks unreachable. */
72 0 0         if (tail > end) tail = end;
               /* Extend the segment over up to three continuation bytes (10xxxxxx)
                * so a sequence straddling the chunk boundary is validated whole. */
73 0           int extend = 3;
74 0 0         while (extend-- > 0 && tail < end && (*tail & 0xC0) == 0x80) tail++;
    0          
    0          
75 0 0         if (!s_scalar()->validate_utf8((const char*)p, (size_t)(tail - p)))
76 0           return 0;
77 0           p = tail;
78             }
               /* Fewer than 16 bytes remain: let the scalar validator finish. */
79 0 0         if (p < end) return s_scalar()->validate_utf8((const char*)p, (size_t)(end - p));
80 0           return 1;
81             }
82              
83 0           MDS_SSSE3_FN static size_t find_newlines_sse2(const char* in, size_t len,
84             uint32_t* out, size_t cap)
85             {
               /* Store the offset (relative to `in`) of every '\n' in in[0..len)
                * into `out`, in ascending order.
                * Returns the number of offsets written, or (size_t)-1 when another
                * newline is found after `cap` entries are already stored; those
                * `cap` entries stay in `out`.
                * NOTE(review): offsets are stored as uint32_t, so positions at or
                * beyond 2^32 would be truncated — confirm callers bound `len`. */
86 0           const char* p = in;
87 0           const char* end = in + len;
88 0           __m128i needle = _mm_set1_epi8('\n');
89 0           size_t k = 0;
90              
91 0 0         while ((size_t)(end - p) >= 16) {
92 0           __m128i v = _mm_loadu_si128((const __m128i*)p);
93 0           __m128i cmp = _mm_cmpeq_epi8(v, needle);
               /* Bit j of m is set when p[j] == '\n'; only the low 16 bits are kept. */
94 0           unsigned m = (unsigned)(uint16_t)_mm_movemask_epi8(cmp);
95 0 0         if (m) {
96 0           uint32_t base = (uint32_t)(p - in);
97             do {
               /* m is non-zero here, as __builtin_ctz requires.
                * NOTE(review): __builtin_ctz is a GCC/Clang builtin, but
                * MDS_SSSE3_FN also has a fallback for other compilers — confirm
                * this file is only built with GCC/Clang. */
98 0           unsigned bit = (unsigned)__builtin_ctz(m);
99 0 0         if (k >= cap) return (size_t)-1;
100 0           out[k++] = base + bit;
               /* Clear the lowest set bit and move on to the next match. */
101 0           m &= m - 1;
102 0 0         } while (m);
103             }
104 0           p += 16;
105             }
               /* Scalar tail for the last (< 16) bytes. */
106 0 0         while (p < end) {
107 0 0         if (*p == '\n') {
108 0 0         if (k >= cap) return (size_t)-1;
109 0           out[k++] = (uint32_t)(p - in);
110             }
111 0           p++;
112             }
113 0           return k;
114             }
115              
116 0           MDS_SSSE3_FN static const char* next_structural_sse2(const char* p,
117             const char* end)
               /* No SIMD version here: forwards unchanged to the scalar backend. */
118 0           { return s_scalar()->next_structural(p, end); }
119              
120 0           MDS_SSSE3_FN static const char* next_structural_bm_sse2(const char* base,
121             size_t bm_len,
122             const uint64_t* bm,
123             size_t p_off)
               /* No SIMD version here: forwards all four arguments unchanged to the
                * scalar backend's next_structural_bm. */
124 0           { return s_scalar()->next_structural_bm(base, bm_len, bm, p_off); }
125              
126             static const mds_simd_ops k_ops_sse2 = {
               /* Ops table for this backend, filled positionally.
                * NOTE(review): the order (the _bm variant is listed before
                * next_structural) must match the field order of mds_simd_ops in
                * mds_simd.h — confirm. */
127             classify_structural_sse2,
128             validate_utf8_sse2,
129             find_newlines_sse2,
130             next_structural_bm_sse2,
131             next_structural_sse2,
132             };
133              
134 0           const mds_simd_ops* mds_simd_ops_sse2(void) { return &k_ops_sse2; } /* Accessor (the only non-static symbol in this file): returns the address of the file-scope ops table. */
135              
136             #endif /* MDS_HAVE_SSE2 */