| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* src/simd/mds_simd_sse2.c — SSE2/SSSE3 classifier. |
|
2
|
|
|
|
|
|
|
* |
|
3
|
|
|
|
|
|
|
* Built when -DMDS_HAVE_SSE2 is set. Uses _mm_shuffle_epi8 (SSSE3) on |
|
4
|
|
|
|
|
|
|
 * 16-byte chunks. SSSE3 is present on nearly every x86_64 CPU made since 2007, so we |
|
5
|
|
|
|
|
|
|
* treat it as a hard baseline for the "SSE2" backend. |
|
6
|
|
|
|
|
|
|
* |
|
7
|
|
|
|
|
|
|
* On the rare x86_64 host without SSSE3, the AVX2 build won't be |
|
8
|
|
|
|
|
|
|
* selected either, and the runtime dispatcher falls back to scalar. |
|
9
|
|
|
|
|
|
|
*/ |
|
10
|
|
|
|
|
|
|
#include "mds_simd.h" |
|
11
|
|
|
|
|
|
|
#include "mds_classifier_lut.h" |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
#ifdef MDS_HAVE_SSE2 |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
#include /* SSSE3 for _mm_shuffle_epi8 */ |
|
16
|
|
|
|
|
|
|
#include |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
/* MDS_SSSE3_FN tags a function so GCC/Clang compile just that function with
 * SSSE3 enabled; on any other compiler it expands to nothing. */
#if !defined(__GNUC__) && !defined(__clang__)
#  define MDS_SSSE3_FN
#else
#  define MDS_SSSE3_FN __attribute__((target("ssse3")))
#endif
|
23
|
|
|
|
|
|
|
|
|
24
|
0
|
|
|
|
|
|
MDS_SSSE3_FN static void classify_structural_sse2(const char* in, size_t len, |
|
25
|
|
|
|
|
|
|
uint64_t* out) |
|
26
|
|
|
|
|
|
|
{ |
|
27
|
0
|
|
|
|
|
|
__m128i lo_tbl = _mm_loadu_si128((const __m128i*)MDS_CLASSIFIER_LO); |
|
28
|
0
|
|
|
|
|
|
__m128i hi_tbl = _mm_loadu_si128((const __m128i*)MDS_CLASSIFIER_HI); |
|
29
|
0
|
|
|
|
|
|
__m128i mask_lo = _mm_set1_epi8(0x0F); |
|
30
|
0
|
|
|
|
|
|
__m128i zero = _mm_setzero_si128(); |
|
31
|
|
|
|
|
|
|
|
|
32
|
0
|
|
|
|
|
|
size_t i = 0; |
|
33
|
0
|
0
|
|
|
|
|
while (i + 16 <= len) { |
|
34
|
0
|
|
|
|
|
|
__m128i v = _mm_loadu_si128((const __m128i*)(in + i)); |
|
35
|
0
|
|
|
|
|
|
__m128i lo = _mm_and_si128(v, mask_lo); |
|
36
|
0
|
|
|
|
|
|
__m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), mask_lo); |
|
37
|
0
|
|
|
|
|
|
__m128i la = _mm_shuffle_epi8(lo_tbl, lo); |
|
38
|
0
|
|
|
|
|
|
__m128i ha = _mm_shuffle_epi8(hi_tbl, hi); |
|
39
|
0
|
|
|
|
|
|
__m128i m = _mm_and_si128(la, ha); |
|
40
|
|
|
|
|
|
|
/* See mds_simd_avx2.c: cmpgt_epi8 is signed and would misclassify |
|
41
|
|
|
|
|
|
|
* bytes whose LUT product is 0x80 ('|' 0x7C, '~' 0x7E). Invert |
|
42
|
|
|
|
|
|
|
* cmpeq(m,0) — and mask to 16 bits AFTER inversion. */ |
|
43
|
0
|
|
|
|
|
|
__m128i is_zero = _mm_cmpeq_epi8(m, zero); |
|
44
|
0
|
|
|
|
|
|
uint32_t bits = (uint32_t)((unsigned)(~_mm_movemask_epi8(is_zero)) & 0xFFFFu); |
|
45
|
|
|
|
|
|
|
|
|
46
|
0
|
|
|
|
|
|
size_t word = i >> 6; |
|
47
|
0
|
|
|
|
|
|
size_t off = i & 63u; |
|
48
|
0
|
|
|
|
|
|
out[word] |= (uint64_t)bits << off; |
|
49
|
0
|
|
|
|
|
|
i += 16; |
|
50
|
|
|
|
|
|
|
} |
|
51
|
0
|
0
|
|
|
|
|
for (; i < len; i++) { |
|
52
|
0
|
|
|
|
|
|
uint8_t b = (uint8_t)in[i]; |
|
53
|
0
|
0
|
|
|
|
|
if (MDS_CLASSIFIER_LO[b & 0xF] & MDS_CLASSIFIER_HI[b >> 4]) |
|
54
|
0
|
|
|
|
|
|
out[i >> 6] |= (uint64_t)1 << (i & 63); |
|
55
|
|
|
|
|
|
|
} |
|
56
|
0
|
|
|
|
|
|
} |
|
57
|
|
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
/* Fetch the scalar backend's ops table; the functions in this file delegate
 * to it for work that has no dedicated SSE implementation. */
static const mds_simd_ops* s_scalar_sse2(void)
{
    return mds_simd_ops_scalar();
}
/* Short local alias for the accessor above. */
#define s_scalar s_scalar_sse2
|
60
|
|
|
|
|
|
|
|
|
61
|
0
|
|
|
|
|
|
MDS_SSSE3_FN static int validate_utf8_sse2(const char* in, size_t len) |
|
62
|
|
|
|
|
|
|
{ |
|
63
|
0
|
|
|
|
|
|
const unsigned char* p = (const unsigned char*)in; |
|
64
|
0
|
|
|
|
|
|
const unsigned char* end = p + len; |
|
65
|
|
|
|
|
|
|
|
|
66
|
0
|
0
|
|
|
|
|
while ((size_t)(end - p) >= 16) { |
|
67
|
0
|
|
|
|
|
|
__m128i v = _mm_loadu_si128((const __m128i*)p); |
|
68
|
0
|
|
|
|
|
|
int mask = _mm_movemask_epi8(v); |
|
69
|
0
|
0
|
|
|
|
|
if (mask == 0) { p += 16; continue; } |
|
70
|
|
|
|
|
|
|
|
|
71
|
0
|
|
|
|
|
|
const unsigned char* tail = p + 16; |
|
72
|
0
|
0
|
|
|
|
|
if (tail > end) tail = end; |
|
73
|
0
|
|
|
|
|
|
int extend = 3; |
|
74
|
0
|
0
|
|
|
|
|
while (extend-- > 0 && tail < end && (*tail & 0xC0) == 0x80) tail++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
75
|
0
|
0
|
|
|
|
|
if (!s_scalar()->validate_utf8((const char*)p, (size_t)(tail - p))) |
|
76
|
0
|
|
|
|
|
|
return 0; |
|
77
|
0
|
|
|
|
|
|
p = tail; |
|
78
|
|
|
|
|
|
|
} |
|
79
|
0
|
0
|
|
|
|
|
if (p < end) return s_scalar()->validate_utf8((const char*)p, (size_t)(end - p)); |
|
80
|
0
|
|
|
|
|
|
return 1; |
|
81
|
|
|
|
|
|
|
} |
|
82
|
|
|
|
|
|
|
|
|
83
|
0
|
|
|
|
|
|
/* Index of the lowest set bit of `m`.  `m` must be non-zero.
 * Uses the compiler builtin where available and a plain loop elsewhere, so
 * the file still builds on compilers that take the empty MDS_SSSE3_FN path. */
static unsigned lowest_set_bit_sse2(unsigned m)
{
#if defined(__GNUC__) || defined(__clang__)
    return (unsigned)__builtin_ctz(m);
#else
    unsigned bit = 0;
    while ((m & 1u) == 0) {
        m >>= 1;
        bit++;
    }
    return bit;
#endif
}

/*
 * Collect the offsets of every '\n' in in[0..len) into out[0..cap), in
 * ascending order.
 *
 * Returns the number of offsets written, or (size_t)-1 if a newline is
 * found when `out` already holds `cap` entries (the entries written so far
 * stay in `out`).
 *
 * Offsets are stored as uint32_t, so positions beyond UINT32_MAX are
 * truncated by the casts below.
 */
MDS_SSSE3_FN static size_t find_newlines_sse2(const char* in, size_t len,
                                               uint32_t* out, size_t cap)
{
    const char* p   = in;
    const char* end = in + len;
    const __m128i needle = _mm_set1_epi8('\n');
    size_t k = 0;

    /* Vector part: one compare + movemask per 16 bytes, then walk the set
     * bits of the mask from lowest to highest. */
    while ((size_t)(end - p) >= 16) {
        const __m128i v   = _mm_loadu_si128((const __m128i*)p);
        const __m128i cmp = _mm_cmpeq_epi8(v, needle);
        unsigned m = (unsigned)(uint16_t)_mm_movemask_epi8(cmp);

        if (m != 0) {
            const uint32_t base = (uint32_t)(p - in);
            do {
                const unsigned bit = lowest_set_bit_sse2(m);
                if (k >= cap) {
                    return (size_t)-1;
                }
                out[k++] = base + bit;
                m &= m - 1;             /* clear the bit just reported */
            } while (m != 0);
        }
        p += 16;
    }

    /* Scalar tail: fewer than 16 bytes left. */
    for (; p < end; p++) {
        if (*p != '\n') {
            continue;
        }
        if (k >= cap) {
            return (size_t)-1;
        }
        out[k++] = (uint32_t)(p - in);
    }
    return k;
}
|
115
|
|
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
|
MDS_SSSE3_FN static const char* next_structural_sse2(const char* p, |
|
117
|
|
|
|
|
|
|
const char* end) |
|
118
|
0
|
|
|
|
|
|
{ return s_scalar()->next_structural(p, end); } |
|
119
|
|
|
|
|
|
|
|
|
120
|
0
|
|
|
|
|
|
MDS_SSSE3_FN static const char* next_structural_bm_sse2(const char* base, |
|
121
|
|
|
|
|
|
|
size_t bm_len, |
|
122
|
|
|
|
|
|
|
const uint64_t* bm, |
|
123
|
|
|
|
|
|
|
size_t p_off) |
|
124
|
0
|
|
|
|
|
|
{ return s_scalar()->next_structural_bm(base, bm_len, bm, p_off); } |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
static const mds_simd_ops k_ops_sse2 = { |
|
127
|
|
|
|
|
|
|
classify_structural_sse2, |
|
128
|
|
|
|
|
|
|
validate_utf8_sse2, |
|
129
|
|
|
|
|
|
|
find_newlines_sse2, |
|
130
|
|
|
|
|
|
|
next_structural_bm_sse2, |
|
131
|
|
|
|
|
|
|
next_structural_sse2, |
|
132
|
|
|
|
|
|
|
}; |
|
133
|
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
/* Public accessor: returns the address of this backend's static ops table. */
const mds_simd_ops* mds_simd_ops_sse2(void)
{
    return &k_ops_sse2;
}
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
#endif /* MDS_HAVE_SSE2 */ |