File Coverage

src/mds_inline.c
Criterion Covered Total %
statement 901 947 95.1
branch 917 1202 76.2
condition n/a
subroutine n/a
pod n/a
total 1818 2149 84.6


line stmt bran cond sub pod time code
1             /* mds_inline.c — scalar CommonMark §6 inline tokenizer.
2             *
3             * Algorithm follows the cmark reference implementation and the
4             * CommonMark spec appendix ("An algorithm for parsing nested emphasis
5             * and links").
6             *
7             * Single forward pass builds a doubly-linked list of nodes:
8             * TEXT, CODE, AUTOLINK, HTMLINLINE, SOFTBREAK, LINEBREAK
9             * DELIM (* / _ runs)
10             * OPEN_BRACKET ([), OPEN_BANG_BRACKET (![)
11             *
12             * Then process_emphasis() folds DELIM nodes into EMPH/STRONG using the
13             * delimiter-run stack algorithm. process_links_and_images() is folded
14             * inline during the forward pass at ']' time (cmark does it that way too).
15             *
16             * Finally emit() walks the linked list and dispatches SAX events.
17             */
18              
19             #include "mds_inline.h"
20             #include "mds_ir.h"
21             #include "mds_linkref.h"
22             #include "mds_footnote.h"
23             #include "mds_entity.h"
24             #include "mds_arena.h"
25             #include "mds.h"
26             #if defined(__ARM_NEON) || defined(__aarch64__)
27             # include
28             # define MDS_INLINE_HAVE_NEON 1
29             #endif
30              
31             #include
32             #include
33             #include
34             #include
35              
36             /* ---------------- byte class table ---------------- */
37              
/* Bit flags stored per byte value in g_byteclass[]; a byte may carry
 * more than one flag. */
enum {
    BC_PUNCT = 0x01, /* ASCII punctuation per CommonMark §2.1 */
    BC_WS    = 0x02, /* ASCII whitespace: space tab \n \v \f \r */
    BC_ALNUM = 0x04  /* ASCII letter or digit */
};

/* Classification table indexed by byte value; filled by byteclass_init(). */
static unsigned char g_byteclass[256];
/* Non-zero once g_byteclass[] has been populated. */
static int g_byteclass_inited = 0;
46              
47 1706           static void byteclass_init(void) {
48             int c;
49 1706 100         if (g_byteclass_inited) return;
50 5654 100         for (c = 0; c < 256; c++) {
51 5632           unsigned f = 0;
52 5632 100         if (c == ' ' || c == '\t' || c == '\n' || c == '\v' ||
    100          
    100          
    100          
    100          
53 5522 100         c == '\f' || c == '\r')
54 132           f |= BC_WS;
55 5632 100         if ((c >= '0' && c <= '9') ||
    100          
    100          
56 5412 100         (c >= 'A' && c <= 'Z') ||
    100          
57 3498 100         (c >= 'a' && c <= 'z'))
58 1364           f |= BC_ALNUM;
59             /* CommonMark "ASCII punctuation": !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ */
60 5632 100         if ((c >= 33 && c <= 47) || (c >= 58 && c <= 64) ||
    100          
    100          
    100          
    100          
61 5148 100         (c >= 91 && c <= 96) || (c >= 123 && c <= 126))
    100          
    100          
62 704           f |= BC_PUNCT;
63 5632           g_byteclass[c] = (unsigned char)f;
64             }
65 22           g_byteclass_inited = 1;
66             }
67              
68 314           static inline int is_ascii_punct(unsigned char c) { return g_byteclass[c] & BC_PUNCT; }
69 848           static inline int is_unicode_ws(unsigned char c) { return g_byteclass[c] & BC_WS; }
70              
/* Decode one UTF-8 codepoint starting at s[i] (caller guarantees i < n).
 *
 * Returns the codepoint and stores the number of bytes consumed in *adv.
 * The decoder is lenient: if the lead byte is not a valid 2-, 3- or 4-byte
 * lead, if the sequence would run past n, or if any trailing byte is not a
 * continuation byte (10xxxxxx), the lead byte itself is returned with
 * *adv == 1. Overlong forms and surrogates are not rejected. */
static unsigned mds_utf8_decode(const char* s, size_t n, size_t i, int* adv) {
    const unsigned char* p = (const unsigned char*)s + i;
    const unsigned lead = p[0];
    unsigned cp;
    int trail;   /* continuation bytes expected after the lead byte */
    int k;

    if (lead < 0x80) {
        *adv = 1;
        return lead;
    }

    if ((lead & 0xE0) == 0xC0) {
        trail = 1; cp = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        trail = 2; cp = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        trail = 3; cp = lead & 0x07;
    } else {
        trail = 0; cp = lead;
    }

    /* The bounds test comes first so no byte at or beyond n is read. */
    if (trail == 0 || !(i + (size_t)trail < n)) {
        *adv = 1;
        return lead;
    }
    for (k = 1; k <= trail; ++k) {
        if ((p[k] & 0xC0) != 0x80) {
            *adv = 1;
            return lead;
        }
        cp = (cp << 6) | (unsigned)(p[k] & 0x3F);
    }

    *adv = trail + 1;
    return cp;
}
102              
/* Decode the codepoint that ends immediately before s[pos] (pos > 0 is
 * required). Steps backward from pos - 1 over at most three continuation
 * bytes, never below offset 0, stores the resulting start offset in
 * *cp_start and decodes forward from there with mds_utf8_decode(). */
static unsigned mds_utf8_decode_prev(const char* s, size_t n, size_t pos,
                                     size_t* cp_start) {
    size_t start = pos - 1;
    int steps;
    int consumed;   /* demanded by mds_utf8_decode(); not used here */

    for (steps = 0; steps < 3 && start > 0; ++steps) {
        if (((unsigned char)s[start] & 0xC0) != 0x80)
            break;
        --start;
    }

    *cp_start = start;
    return mds_utf8_decode(s, n, start, &consumed);
}
118              
/* Returns 1 if codepoint cp is treated as whitespace for flanking
 * purposes, 0 otherwise. Accepted: tab, LF, VT, FF, CR, space, NBSP
 * (U+00A0), U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F and
 * U+3000.
 * NOTE(review): VT, U+2028 and U+2029 go beyond "Zs plus tab/LF/FF/CR";
 * confirm this is intended. */
static int cp_is_ws(unsigned cp) {
    if (cp >= 0x2000 && cp <= 0x200A)
        return 1;

    switch (cp) {
    case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D:
    case 0x20:
    case 0xA0:   /* NBSP */
    case 0x1680:
    case 0x2028: case 0x2029:
    case 0x202F: case 0x205F:
    case 0x3000:
        return 1;
    default:
        return 0;
    }
}
132              
133             /* Is codepoint Unicode punctuation per CommonMark 0.31 spec
134             * (General_Category P* or S*). For non-ASCII we approximate with the
135             * ranges most likely to appear in spec examples: Latin-1 punctuation
136             * and symbols (¡¢£¤¥¦§¨©ª«¬®¯°±²³´¶·¸¹º»¼½¾¿×÷), General Punctuation
137             * (U+2000-206F), Currency Symbols (U+20A0-U+20CF), Letterlike (some),
138             * Arrows (U+2190-U+21FF), Mathematical (U+2200-U+22FF), Misc Tech
139             * (U+2300-U+23FF), Box Drawing/Block (U+2500-U+259F), Geometric
140             * (U+25A0-U+25FF), Misc Symbols (U+2600-U+26FF), Dingbats (U+2700-U+27BF),
141             * CJK Symbols (U+3000-U+303F), Halfwidth (U+FF00-U+FFEF symbols subset). */
142 3036           static int cp_is_punct(unsigned cp) {
143 3036 100         if (cp < 0x80) return g_byteclass[cp] & BC_PUNCT;
144             /* Latin-1 Supplement P/S categories */
145 54 100         if (cp >= 0xA1 && cp <= 0xBF) return 1;
    100          
146 48 50         if (cp == 0xD7 || cp == 0xF7) return 1;
    50          
147             /* General Punctuation block */
148 48 100         if (cp >= 0x2000 && cp <= 0x206F) return 1;
    50          
149             /* Superscripts/Subscripts (Sm subset) */
150 48 100         if (cp >= 0x2070 && cp <= 0x209F) return 1;
    50          
151             /* Currency Symbols */
152 48 100         if (cp >= 0x20A0 && cp <= 0x20CF) return 1;
    50          
153             /* Letterlike symbols (S subset) */
154 42 50         if (cp >= 0x2100 && cp <= 0x214F) return 1;
    0          
155             /* Arrows / Math / Misc Tech / Box / Geometric / Misc / Dingbats */
156 42 50         if (cp >= 0x2190 && cp <= 0x27BF) return 1;
    0          
157             /* CJK Symbols and Punctuation */
158 42 50         if (cp >= 0x3000 && cp <= 0x303F) return 1;
    0          
159             /* Halfwidth / Fullwidth punctuation (subset) */
160 42 50         if (cp >= 0xFF00 && cp <= 0xFF0F) return 1;
    0          
161 42 50         if (cp >= 0xFF1A && cp <= 0xFF20) return 1;
    0          
162 42 50         if (cp >= 0xFF3B && cp <= 0xFF40) return 1;
    0          
163 42 50         if (cp >= 0xFF5B && cp <= 0xFF65) return 1;
    0          
164 42           return 0;
165             }
166              
167             /* ---------------- inline node ---------------- */
168              
/* Kind tag carried by every inline node. */
typedef enum {
    N_TEXT = 0,
    N_CODE,
    N_AUTOLINK,
    N_HTMLINLINE,
    N_SOFTBREAK,
    N_LINEBREAK,
    N_DELIM,          /* run of '*' or '_' */
    N_OPEN_BRACKET,   /* '[' */
    N_OPEN_BANG,      /* '![' */
    N_EMPH,           /* produced by process_emphasis */
    N_STRONG,
    N_STRIKE,
    N_LINK,
    N_IMAGE,
    N_FOOTNOTE_REF    /* GFM §6.13 */
} ntype;
186              
187             typedef struct inode {
188             ntype type;
189             struct inode* prev;
190             struct inode* next;
191             const char* s;
192             size_t n;
193             int is_email;
194             unsigned char delim_char;
195             int count;
196             int can_open;
197             int can_close;
198             int active;
199             int bracket_after_emph;
200             struct inode* children;
201             struct inode* children_tail;
202             const char* href; size_t hlen;
203             const char* title; size_t tlen;
204             } inode;
205             /* Bitfield packing of the flag ints was attempted but
206             * produced no measurable speedup (commonmark-spec / synth-prose within
207             * +/-2%, synth-tables drifted -5%) so the original layout is kept. The
208             * 32-byte aspirational target requires splitting href/title into a side
209             * allocation keyed off type == N_LINK|N_IMAGE; deferred until the inline
210             * parser is rewritten around tagged unions. */
211              
212             /* ---------------- scanner state ---------------- */
213              
214             typedef struct {
215             mds_ctx* ctx;
216             const char* s;
217             size_t n;
218             size_t pos;
219             inode* head;
220             inode* tail;
221             } scn;
222              
223 7198           static inode* node_new(scn* z, ntype t) {
224 7198           inode* x = (inode*)mds_arena_alloc(&z->ctx->arena, sizeof(inode));
225 7198           memset(x, 0, sizeof *x);
226 7198           x->type = t;
227 7198           return x;
228             }
229              
230 6592           static void append(scn* z, inode* x) {
231 6592           x->prev = z->tail;
232 6592           x->next = NULL;
233 6592 100         if (z->tail) z->tail->next = x;
234 2100           else z->head = x;
235 6592           z->tail = x;
236 6592           }
237              
238 0           static void append_to(inode* parent, inode* x) {
239 0           x->prev = parent->children_tail;
240 0           x->next = NULL;
241 0 0         if (parent->children_tail) parent->children_tail->next = x;
242 0           else parent->children = x;
243 0           parent->children_tail = x;
244 0           }
245              
246             /* Append literal text bytes; coalesces with previous TEXT node if possible
247             * (only if contiguous in source). */
248 3311           static void append_text(scn* z, const char* p, size_t k) {
249             inode* x;
250 3311 50         if (k == 0) return;
251 3311 100         if (z->tail && z->tail->type == N_TEXT &&
    100          
252 736 100         z->tail->s + z->tail->n == p) {
253 579           z->tail->n += k;
254 579           return;
255             }
256 2732           x = node_new(z, N_TEXT);
257 2732           x->s = p; x->n = k;
258 2732           append(z, x);
259             }
260              
261             /* Allocate a fresh text node referring to arena-stored bytes (e.g. an
262             * entity expansion). */
263 333           static void append_text_dup(scn* z, const char* p, size_t k) {
264             char* d;
265             inode* x;
266 333 50         if (k == 0) return;
267 333           d = (char*)mds_arena_alloc(&z->ctx->arena, k);
268 333           memcpy(d, p, k);
269 333           x = node_new(z, N_TEXT);
270 333           x->s = d; x->n = k;
271 333           append(z, x);
272             }
273              
274             /* ---------------- flanking rules (§6.4) ---------------- */
/*
 * Classify the delimiter run s[pos, pos + runlen) made of byte `ch`.
 *
 * The codepoint just before the run and the one just after it are decoded
 * (multi-byte UTF-8 included) and tested with cp_is_ws() / cp_is_punct().
 * The start and end of the buffer count as a line feed, i.e. whitespace.
 *
 * left-flanking iff:  NOT followed by whitespace AND
 *                     (NOT followed by punct OR
 *                      preceded by whitespace or punct)
 *
 * right-flanking iff: NOT preceded by whitespace AND
 *                     (NOT preceded by punct OR
 *                      followed by whitespace or punct)
 *
 * For '_' the intra-word restrictions of §6.4 apply on top; for any other
 * delimiter byte can_open / can_close are exactly left / right flanking.
 *
 * Results (0 or 1) are stored in *can_open_out and *can_close_out.
 * Always returns 1.
 */
static int classify_run(const char* s, size_t n, size_t pos, size_t runlen,
                        int* can_open_out, int* can_close_out,
                        unsigned char ch) {
    const size_t run_end = pos + runlen;
    unsigned prev_cp = '\n';   /* buffer edges behave like a line feed */
    unsigned next_cp = '\n';
    int ws_before, ws_after;
    int punct_before, punct_after;
    int left_flank = 0;
    int right_flank = 0;

    if (pos != 0) {
        prev_cp = (unsigned char)s[pos - 1];
        if (prev_cp >= 0x80) {
            size_t unused_start;
            prev_cp = mds_utf8_decode_prev(s, n, pos, &unused_start);
        }
    }
    if (run_end < n) {
        next_cp = (unsigned char)s[run_end];
        if (next_cp >= 0x80) {
            int unused_adv;
            next_cp = mds_utf8_decode(s, n, run_end, &unused_adv);
        }
    }

    ws_before    = cp_is_ws(prev_cp);
    ws_after     = cp_is_ws(next_cp);
    punct_before = cp_is_punct(prev_cp);
    punct_after  = cp_is_punct(next_cp);

    if (!ws_after && (!punct_after || ws_before || punct_before))
        left_flank = 1;
    if (!ws_before && (!punct_before || ws_after || punct_after))
        right_flank = 1;

    if (ch == '_') {
        /* §6.4: '_' may not open/close inside a word unless punctuation
         * sits on the outer side. */
        *can_open_out  = left_flank  && (punct_before || !right_flank);
        *can_close_out = right_flank && (punct_after  || !left_flank);
    } else {
        /* '*' (and '~' for strikethrough) */
        *can_open_out  = left_flank;
        *can_close_out = right_flank;
    }
    return 1;
}
347              
348             /* ---------------- code span (§6.3) ---------------- */
349              
350             /* Try to match a code span starting at pos (first byte = '`').
351             * On success returns new pos past the closing fence; emits one node.
352             * On failure returns 0 (caller consumes one backtick as text). */
353 378           static size_t try_code_span(scn* z, size_t pos) {
354 378           const char* s = z->s;
355 378           size_t n = z->n;
356 378           size_t open_start = pos;
357             size_t open_len;
358             size_t content_start;
359             size_t scan;
360 848 100         while (pos < n && s[pos] == '`') pos++;
    100          
361 378           open_len = pos - open_start;
362 378           content_start = pos;
363 378           scan = pos;
364 432 100         while (scan < n) {
365             /* find next run of backticks */
366 366           const char* p = (const char*)memchr(s + scan, '`', n - scan);
367             size_t bs;
368             size_t be;
369 366 100         if (!p) return 0;
370 342           bs = (size_t)(p - s);
371 342           be = bs;
372 794 100         while (be < n && s[be] == '`') be++;
    100          
373 342 100         if (be - bs == open_len) {
374             /* matched */
375 288           size_t cs = content_start;
376 288           size_t ce = bs;
377 288           int has_nonspace = 0;
378 288           int needs_replace = 0;
379             inode* x;
380             size_t i;
381             /* normalisation: if first and last are space, and content is
382             * not all spaces, strip one leading and trailing space. */
383 374 100         for (i = cs; i < ce; i++) {
384 362 100         if (s[i] != ' ' && s[i] != '\n') { has_nonspace = 1; break; }
    100          
385             }
386 288 100         if (has_nonspace && ce - cs >= 2 &&
    100          
387 232 100         (s[cs] == ' ' || s[cs] == '\n') &&
    100          
388 62 100         (s[ce - 1] == ' ' || s[ce - 1] == '\n')) {
    100          
389 56           cs++; ce--;
390             }
391             /* replace newlines with spaces */
392 1524 100         for (i = cs; i < ce; i++) {
393 1260 100         if (s[i] == '\n') { needs_replace = 1; break; }
394             }
395 288           x = node_new(z, N_CODE);
396 288 100         if (needs_replace) {
397 24           char* d = (char*)mds_arena_alloc(&z->ctx->arena, ce - cs);
398 312 100         for (i = cs; i < ce; i++)
399 288 100         d[i - cs] = (s[i] == '\n') ? ' ' : s[i];
400 24           x->s = d; x->n = ce - cs;
401             } else {
402 264           x->s = s + cs; x->n = ce - cs;
403             }
404 288           append(z, x);
405 288           return be;
406             }
407 54           scan = be;
408             }
409 66           return 0;
410             }
411              
412             /* ---------------- entity (§6.2) ---------------- */
413              
414             /* Try to decode entity starting at pos (s[pos] == '&').
415             * Returns chars consumed (including & and ;) on success, 0 otherwise. */
416 112           static size_t try_entity(scn* z, size_t pos) {
417 112           const char* s = z->s; size_t n = z->n;
418             size_t q;
419             size_t name_start;
420             const mds_entity* e;
421             mds_entity ent_scratch;
422 112 50         if (pos + 1 >= n) return 0;
423 112           q = pos + 1;
424 112 100         if (s[q] == '#') {
425 53           unsigned long cp = 0;
426 53           size_t digits = 0;
427             char buf[5];
428             size_t blen;
429 53           q++;
430 53 50         if (q < n && (s[q] == 'x' || s[q] == 'X')) {
    100          
    100          
431 14           q++;
432 42 50         while (q < n && digits < 6 && isxdigit((unsigned char)s[q])) {
    50          
    100          
433 28           char c = s[q];
434 40 100         cp = cp * 16 + (c <= '9' ? c - '0' :
    100          
435 12           (c <= 'F' ? c - 'A' + 10 : c - 'a' + 10));
436 28           q++; digits++;
437             }
438             } else {
439 123 50         while (q < n && digits < 7 && s[q] >= '0' && s[q] <= '9') {
    100          
    50          
    100          
440 84           cp = cp * 10 + (unsigned long)(s[q] - '0');
441 84           q++; digits++;
442             }
443             }
444 53 100         if (!digits || q >= n || s[q] != ';') return 0;
    50          
    100          
445 41           q++;
446             /* Encode codepoint as UTF-8. NUL → U+FFFD. */
447 41 100         if (cp == 0 || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
    50          
    50          
    0          
448 3           cp = 0xFFFD;
449 41 100         if (cp < 0x80) {
450 26           buf[0] = (char)cp; blen = 1;
451 15 100         } else if (cp < 0x800) {
452 6           buf[0] = (char)(0xC0 | (cp >> 6));
453 6           buf[1] = (char)(0x80 | (cp & 0x3F)); blen = 2;
454 9 50         } else if (cp < 0x10000) {
455 9           buf[0] = (char)(0xE0 | (cp >> 12));
456 9           buf[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
457 9           buf[2] = (char)(0x80 | (cp & 0x3F)); blen = 3;
458             } else {
459 0           buf[0] = (char)(0xF0 | (cp >> 18));
460 0           buf[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
461 0           buf[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
462 0           buf[3] = (char)(0x80 | (cp & 0x3F)); blen = 4;
463             }
464 41           append_text_dup(z, buf, blen);
465 41           return q - pos;
466             }
467             /* named entity */
468 59           name_start = q;
469 446 100         while (q < n && isalnum((unsigned char)s[q])) q++;
    100          
470 59 100         if (q == name_start || q >= n || s[q] != ';') return 0;
    100          
    100          
471 47           e = mds_entity_lookup(s + name_start, q - name_start, &ent_scratch);
472 47 100         if (!e) return 0;
473 38           q++;
474 38           append_text_dup(z, e->utf8, e->ulen);
475 38           return q - pos;
476             }
477              
478             /* ---------------- autolink (§6.7) ---------------- */
479              
480             /* Returns total chars (including <>) on success, 0 otherwise. */
481 279           static size_t try_autolink(scn* z, size_t pos) {
482 279           const char* s = z->s; size_t n = z->n;
483             size_t q;
484             size_t scheme_start;
485             size_t scheme_len;
486             size_t body_start;
487             size_t r;
488             size_t e_start;
489 279           int dot_ok = 0;
490 279           int label_len = 0;
491             inode* x;
492 279 50         if (pos >= n || s[pos] != '<') return 0;
    50          
493 279           q = pos + 1;
494             /* URI autolink: scheme = [A-Za-z][A-Za-z0-9+.-]{1,31}: */
495 279           scheme_start = q;
496 279 50         if (q >= n || !isalpha((unsigned char)s[q])) goto try_email;
    100          
497 201           q++;
498 742 50         while (q < n && (isalnum((unsigned char)s[q]) || s[q] == '+' ||
    100          
    100          
499 216 100         s[q] == '.' || s[q] == '-'))
    100          
500 541           q++;
501 201           scheme_len = q - scheme_start;
502 201 100         if (scheme_len < 2 || scheme_len > 32) goto try_email;
    50          
503 135 50         if (q >= n || s[q] != ':') goto try_email;
    100          
504             /* body: any non-WS, non-< non-> */
505 53           body_start = q + 1;
506 53           r = body_start;
507 1799 100         while (r < n && s[r] != '>' && s[r] != '<' &&
508 1746 50         !is_unicode_ws((unsigned char)s[r]) &&
509 845 50         (unsigned char)s[r] >= 0x20)
510 845           r++;
511 53 50         if (r < n && s[r] == '>') {
    100          
512 50           x = node_new(z, N_AUTOLINK);
513 50           x->s = s + pos + 1; x->n = r - (pos + 1);
514 50           x->is_email = 0;
515 50           append(z, x);
516 50           return r - pos + 1;
517             }
518 3           try_email:
519             /* email autolink: simple validation */
520 229           q = pos + 1;
521 229           e_start = q;
522 953 50         while (q < n && (isalnum((unsigned char)s[q]) ||
    100          
523 337 100         strchr(".!#$%&'*+/=?^_`{|}~-", s[q])))
524 724           q++;
525 229 100         if (q == e_start || q >= n || s[q] != '@') return 0;
    50          
    100          
526 7           q++;
527 111 50         while (q < n && s[q] != '>') {
    100          
528 104           char c = s[q];
529 104 100         if (isalnum((unsigned char)c)) { label_len++; q++; }
530 16 100         else if (c == '-') { if (!label_len) return 0; label_len++; q++; }
    50          
531 13 50         else if (c == '.') { if (!label_len) return 0; dot_ok = 1; label_len = 0; q++; }
    50          
532 0           else return 0;
533 104 50         if (label_len > 63) return 0;
534             }
535             (void)dot_ok;
536 7 50         if (q >= n || s[q] != '>' || label_len == 0) return 0;
    50          
    50          
537 7           x = node_new(z, N_AUTOLINK);
538 7           x->s = s + pos + 1; x->n = q - (pos + 1);
539 7           x->is_email = 1;
540 7           append(z, x);
541 7           return q - pos + 1;
542             }
543              
544             /* ---------------- raw HTML inline (§6.8) ---------------- */
545              
/* Returns 1 if c may appear in an HTML attribute name, else 0.
 * With `first` non-zero c is tested as the first character (ASCII letter,
 * '_' or ':'); otherwise digits, '.' and '-' are accepted as well. */
static int html_attr_name_char(char c, int first) {
    /* allowed in every position */
    if (isalpha((unsigned char)c) || c == '_' || c == ':')
        return 1;
    if (first)
        return 0;
    /* allowed only after the first character */
    if (isdigit((unsigned char)c) || c == '.' || c == '-')
        return 1;
    return 0;
}
550              
551 220           static size_t try_html_inline(scn* z, size_t pos) {
552 220           const char* s = z->s; size_t n = z->n;
553             size_t q;
554             inode* x;
555             int closing;
556 220 50         if (pos >= n || s[pos] != '<') return 0;
    50          
557 220 50         if (pos + 1 >= n) return 0;
558 220           q = pos + 1;
559             /* comment (CommonMark 0.30+):
560             * | | ... --> */
561 220 100         if (q + 2 < n && s[q] == '!' && s[q+1] == '-' && s[q+2] == '-') {
    100          
    100          
    100          
562 9           size_t r = q + 3;
563             /* short forms */
564 9 50         if (r < n && s[r] == '>') {
    100          
565 3           r += 1;
566 3           x = node_new(z, N_HTMLINLINE);
567 3           x->s = s + pos; x->n = r - pos;
568 3           append(z, x);
569 3           return r - pos;
570             }
571 6 50         if (r + 1 < n && s[r] == '-' && s[r+1] == '>') {
    100          
    50          
572 3           r += 2;
573 3           x = node_new(z, N_HTMLINLINE);
574 3           x->s = s + pos; x->n = r - pos;
575 3           append(z, x);
576 3           return r - pos;
577             }
578             /* general form: scan for "-->" with no constraint on inner '--'. */
579 114 50         while (r + 2 < n) {
580 114 100         if (s[r] == '-' && s[r+1] == '-' && s[r+2] == '>') {
    100          
    100          
581 3           r += 3;
582 3           x = node_new(z, N_HTMLINLINE);
583 3           x->s = s + pos; x->n = r - pos;
584 3           append(z, x);
585 3           return r - pos;
586             }
587 111           r++;
588             }
589 0           return 0;
590             }
591             /* PI */
592 211 50         if (q < n && s[q] == '?') {
    100          
593 3           q++;
594 42 50         while (q + 1 < n) {
595 42 100         if (s[q] == '?' && s[q+1] == '>') {
    50          
596 3           q += 2;
597 3           x = node_new(z, N_HTMLINLINE);
598 3           x->s = s + pos; x->n = q - pos;
599 3           append(z, x);
600 3           return q - pos;
601             }
602 39           q++;
603             }
604 0           return 0;
605             }
606             /* CDATA */
607 208 100         if (q + 7 < n && memcmp(s + q, "![CDATA[", 8) == 0) {
    100          
608 3           q += 8;
609 12 50         while (q + 2 < n) {
610 12 100         if (s[q] == ']' && s[q+1] == ']' && s[q+2] == '>') {
    50          
    50          
611 3           q += 3;
612 3           x = node_new(z, N_HTMLINLINE);
613 3           x->s = s + pos; x->n = q - pos;
614 3           append(z, x);
615 3           return q - pos;
616             }
617 9           q++;
618             }
619 0           return 0;
620             }
621             /* declaration */
622 205 50         if (q < n && s[q] == '!' && q + 1 < n && isalpha((unsigned char)s[q+1])) {
    100          
    50          
    100          
623 3           q += 2;
624 48 50         while (q < n && s[q] != '>') q++;
    100          
625 3 50         if (q >= n) return 0;
626 3           q++;
627 3           x = node_new(z, N_HTMLINLINE);
628 3           x->s = s + pos; x->n = q - pos;
629 3           append(z, x);
630 3           return q - pos;
631             }
632             /* closing tag */
633 202           closing = 0;
634 202 50         if (q < n && s[q] == '/') { closing = 1; q++; }
    100          
635             /* tag name */
636 202 50         if (q >= n || !isalpha((unsigned char)s[q])) return 0;
    100          
637 175           q++;
638 516 50         while (q < n && (isalnum((unsigned char)s[q]) || s[q] == '-')) q++;
    100          
    100          
639 175 100         if (closing) {
640 38 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    100          
    50          
    50          
641 32 50         if (q >= n || s[q] != '>') return 0;
    100          
642 29           q++;
643 29           x = node_new(z, N_HTMLINLINE);
644 29           x->s = s + pos; x->n = q - pos;
645 29           append(z, x);
646 29           return q - pos;
647             }
648             /* attributes */
649 218 50         while (q < n) {
650 218           size_t pre_attr = q;
651 218           int saw_ws = 0;
652             size_t save;
653 314 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) {
    100          
    50          
    100          
654 96           saw_ws = 1; q++;
655             }
656 218 50         if (q >= n) return 0;
657 218 100         if (s[q] == '>' || s[q] == '/') break;
    100          
658 117 100         if (!saw_ws) return 0;
659 81 50         if (!html_attr_name_char(s[q], 1)) { q = pre_attr; break; }
660 81           q++;
661 321 50         while (q < n && html_attr_name_char(s[q], 0)) q++;
    100          
662             /* optional value */
663 81           save = q;
664 87 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    100          
    50          
    50          
665 81 50         if (q < n && s[q] == '=') {
    100          
666 69           q++;
667 72 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    100          
    50          
    50          
668 69 50         if (q < n && (s[q] == '"' || s[q] == '\'')) {
    100          
    100          
669 60           char qc = s[q]; q++;
670 366 100         while (q < n && s[q] != qc) q++;
    100          
671 60 100         if (q >= n) return 0;
672 54           q++;
673             } else {
674             /* unquoted */
675 9           size_t vs = q;
676 54 100         while (q < n && s[q] != ' ' && s[q] != '\t' && s[q] != '\n' &&
    50          
    100          
677 39 50         s[q] != '"' && s[q] != '\'' && s[q] != '=' &&
    100          
    50          
678 45 50         s[q] != '<' && s[q] != '>' && s[q] != '`') q++;
    50          
    50          
    50          
679 9 50         if (q == vs) return 0;
680             }
681             } else {
682 12           q = save;
683             }
684             }
685 101 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    50          
    50          
    50          
686 101 50         if (q < n && s[q] == '/') q++;
    100          
687 101 50         if (q >= n || s[q] != '>') return 0;
    100          
688 98           q++;
689 98           x = node_new(z, N_HTMLINLINE);
690 98           x->s = s + pos; x->n = q - pos;
691 98           append(z, x);
692 98           return q - pos;
693             }
694              
695             /* ---------------- link parsing helpers ---------------- */
696              
697             /* Parse a CommonMark link destination starting at s[*pp]. On success advances
698             * *pp past it and sets (*out_s, *out_n) to the destination bytes (still in the
699             * source buffer; the renderer normalises).
                *
                * Two forms are recognised:
                *   <...>  Pointy-bracket form. Scans to the first '>' that is not skipped
                *          by a backslash; a backslash skips the byte after it (when there
                *          is one). Fails on an unskipped '<', an unskipped newline, or end
                *          of input. The reported span excludes both brackets and *pp lands
                *          just past the '>'.
                *   bare   Scans until a control byte (< 0x20 or 0x7f), a space, a ')' with
                *          no '(' pending, or end of input. A backslash followed by ASCII
                *          punctuation (per is_ascii_punct) is skipped as a pair. Fails if
                *          nothing was consumed or a '(' is left unclosed. *pp lands on the
                *          terminating byte.
                *
                * s: source buffer; n: its length in bytes; pp: in/out byte offset.
                * Returns 1 on success. Returns 0 on failure, in which case *pp, *out_s
                * and *out_n are not written. */
700 220           static int parse_link_destination(const char* s, size_t* pp, size_t n,
701             const char** out_s, size_t* out_n) {
702 220           size_t p = *pp;
703             size_t ds;
704             int paren;
705 220 50         if (p >= n) return 0;
                /* ---- pointy-bracket form: <destination> ---- */
706 220 100         if (s[p] == '<') {
707 30           p++;
708 30           ds = p;
709 150 100         while (p < n && s[p] != '>' && s[p] != '<' && s[p] != '\n') {
    100          
    50          
    100          
                /* backslash: step over the escaped byte as well */
710 120 100         if (s[p] == '\\' && p + 1 < n) p++;
    50          
711 120           p++;
712             }
                /* the loop may have stopped on '<', '\n' or end of input: all failures */
713 30 100         if (p >= n || s[p] != '>') return 0;
    100          
714 21           *out_s = s + ds;
715 21           *out_n = p - ds;
716 21           *pp = p + 1;
717 21           return 1;
718             }
                /* ---- bare form; paren counts currently open '(' ---- */
719 190           paren = 0;
720 190           ds = p;
721 1554 100         while (p < n) {
722 1551           unsigned char c = (unsigned char)s[p];
723 1551 100         if (c < 0x20 || c == 0x7f) break;
    50          
                /* '\t' and '\n' are already stopped by the control-byte test above;
                 * only the ' ' comparison can still be true here. */
724 1545 100         if (c == ' ' || c == '\t' || c == '\n') break;
    50          
    50          
725 1498 100         if (c == '\\' && p + 1 < n && is_ascii_punct((unsigned char)s[p+1])) { p += 2; continue; }
    50          
    100          
726 1474 100         if (c == '(') { paren++; p++; continue; }
                /* a ')' with nothing open belongs to the caller, so stop in front of it */
727 1459 100         if (c == ')') { if (paren == 0) break; paren--; p++; continue; }
    100          
728 1310           p++;
729             }
                /* an empty destination or an unclosed '(' is rejected */
730 190 50         if (p == ds || paren != 0) return 0;
    50          
731 190           *out_s = s + ds;
732 190           *out_n = p - ds;
733 190           *pp = p;
734 190           return 1;
735             }
736              
                /* Parse a link title starting at s[*pp].
                 *
                 * The byte at s[*pp] selects the delimiter pair: '"' ... '"', '\'' ... '\''
                 * or '(' ... ')'. Any other first byte is a failure. Inside the title:
                 *   - a backslash followed by ASCII punctuation (per is_ascii_punct) is
                 *     skipped as a pair, so an escaped closer does not end the title;
                 *   - in the parenthesised form, an unescaped '(' is a failure;
                 *   - a newline followed only by spaces/tabs up to another newline or the
                 *     end of input (i.e. a blank line) is a failure.
                 *
                 * s: source buffer; n: its length in bytes; pp: in/out byte offset.
                 * On success sets (*out_s, *out_n) to the bytes between the delimiters,
                 * moves *pp just past the closing delimiter and returns 1. On failure
                 * returns 0 and writes none of the outputs. */
737 41           static int parse_link_title(const char* s, size_t* pp, size_t n,
738             const char** out_s, size_t* out_n) {
739 41           size_t p = *pp;
740             char open, close;
741             size_t ts;
742             int prev_blank_line;
743 41 50         if (p >= n) return 0;
744 41           open = s[p];
745 41 100         if (open == '"' || open == '\'') close = open;
    100          
746 3 50         else if (open == '(') close = ')';
747 0           else return 0;
748 41           p++;
749 41           ts = p; /* first byte of the title text */
750 41           prev_blank_line = 0;
751 339 50         while (p < n && s[p] != close) {
    100          
752 298 100         if (s[p] == '\\' && p + 1 < n && is_ascii_punct((unsigned char)s[p+1])) { p += 2; continue; }
    50          
    50          
753 292 100         if (open == '(' && s[p] == '(') return 0;
    50          
754 292 50         if (s[p] == '\n') {
755             /* check for blank line: only spaces/tabs before the next newline or end */
756 0           size_t r = p + 1;
757 0 0         while (r < n && (s[r] == ' ' || s[r] == '\t')) r++;
    0          
    0          
758 0 0         if (r >= n || s[r] == '\n') { prev_blank_line = 1; break; }
    0          
759             }
760 292           p++;
761             }
                /* fail on a blank line, or when the input ended before the closer */
762 41 50         if (prev_blank_line || p >= n || s[p] != close) return 0;
    50          
    50          
763 41           *out_s = s + ts;
764 41           *out_n = p - ts;
765 41           *pp = p + 1;
766 41           return 1;
767             }
768              
769             /* ---------------- find matching open bracket ---------------- */
770              
                /* Search backwards from z->tail along the prev links for the most recent
                 * N_OPEN_BRACKET or N_OPEN_BANG node.
                 *
                 * *is_image is always written: 1 when the node found is N_OPEN_BANG,
                 * 0 otherwise (including when nothing is found).
                 * Returns the node, or NULL when the list holds no opener. The node's
                 * active flag is not examined here. */
771 723           static inode* find_open_bracket(scn* z, int* is_image) {
772             inode* x;
773 723           *is_image = 0;
774 1851 100         for (x = z->tail; x; x = x->prev) {
775 1835 100         if (x->type == N_OPEN_BRACKET || x->type == N_OPEN_BANG) {
    100          
776 707           *is_image = (x->type == N_OPEN_BANG);
777 707           return x; /* return topmost — caller checks ->active */
778             }
779             }
780 16           return NULL;
781             }
782              
783             /* Disable any '[' opener nodes appearing before x (for nested-link rule).
                 * Walks x->prev back to the head of the list and clears the active flag
                 * of every N_OPEN_BRACKET node; N_OPEN_BANG nodes are left untouched.
                 * z is accepted but not used. */
784 343           static void deactivate_brackets(scn* z, inode* x) {
785             inode* p;
786 603 100         for (p = x->prev; p; p = p->prev) {
787 260 100         if (p->type == N_OPEN_BRACKET) p->active = 0;
788             }
789             (void)z;
790 343           }
791              
792             /* Move nodes (open->next .. z->tail) into a new container of type t,
793             * which replaces the open bracket and everything after. Returns the
794             * container.
                *
                * The top-level list is cut at open->prev and the container is appended
                * there, so `open` is no longer reachable from z->head. open->prev and
                * open->next are not modified by this function.
                *
                * NOTE(review): when open is the tail (open->next == NULL) the container
                * gets children == NULL but children_tail == open, because z->tail is read
                * before the list is cut. Confirm that nothing relies on children_tail
                * when children is NULL. */
795 422           static inode* wrap_after(scn* z, inode* open, ntype t) {
796 422           inode* c = node_new(z, t);
797 422           c->children = open->next;
798 422           c->children_tail = z->tail;
799 422 100         if (c->children) c->children->prev = NULL;
800             /* sever tail link: the list now ends at the node before the opener */
801 422           z->tail = open->prev;
802 422 100         if (z->tail) z->tail->next = NULL;
803 295           else z->head = NULL;
804             /* remove the open bracket itself */
805             /* (it's now floating; we won't re-link it) */
806 422           append(z, c);
807 422           return c;
808             }
809              
810             /* ---------------- process_emphasis (§6.4) ---------------- */
811              
812             /* The CommonMark algorithm operates on the delimiter stack. We use the
813             * doubly-linked node list directly; DELIM nodes ARE the stack entries.
814             *
815             * stack_bottom: only consider delimiters strictly after this node.
816             * NULL = whole list (or list head).
                *
                * Outline:
                *   1. Find the next N_DELIM node with can_close set ('*', '_' or '~').
                *   2. Walk backwards from it for an N_DELIM with can_open set and the
                *      same delim_char, stopping at openers_bottom / stack_bottom and
                *      skipping candidates rejected by the "rule of three".
                *   3. On a match, move the nodes between the pair into a new N_EMPH,
                *      N_STRONG or N_STRIKE container, reduce both counts, and splice
                *      the container (plus any run that still has count > 0) back into
                *      the list.
                *   4. When no closers remain, every N_DELIM still on the sibling chain
                *      after stack_bottom is retyped to N_TEXT.
                *
                * The final sweep follows only p->next; it does not descend into the
                * children of containers.
817             */
818 2128           static void process_emphasis(scn* z, inode* stack_bottom) {
819             /* openers_bottom[delim_idx][closer_count_mod3][can_open(0|1)]; delim_idx is 0 for '*', 1 for '_', 2 otherwise ('~') */
820             inode* openers_bottom[3][3][2];
821             inode* closer;
822             int a, b, c;
823             int use2;
824             ntype tt;
825             unsigned _ifl;
826             inode* container;
827             inode* first;
828             inode* last;
829             inode* before;
830             inode* after;
831             inode* new_open;
832             inode* new_close;
833             inode* prev_link;
834             inode* start;
835             inode* p;
                /* every search floor starts at stack_bottom */
836 8512 100         for (a = 0; a < 3; a++)
837 25536 100         for (b = 0; b < 3; b++)
838 57456 100         for (c = 0; c < 2; c++)
839 38304           openers_bottom[a][b][c] = stack_bottom;
840              
841 2128 100         closer = stack_bottom ? stack_bottom->next : z->head;
842             /* find first potential closer */
843 6943 100         while (closer) {
844 5333 100         if (closer->type == N_DELIM && closer->can_close &&
    100          
845 518 100         (closer->delim_char == '*' || closer->delim_char == '_' ||
    100          
846 26 50         closer->delim_char == '~'))
847             break;
848 4815           closer = closer->next;
849             }
850 2904 100         while (closer) {
851 776           unsigned char ch = closer->delim_char;
852 776 100         int didx = (ch == '*') ? 0 : (ch == '_') ? 1 : 2;
    100          
853 776           int co_mod = closer->count % 3;
854 776           int co_op = closer->can_open ? 1 : 0;
855 776           inode* bot = openers_bottom[didx][co_mod][co_op];
856              
857             /* walk back for matching opener */
858 776           inode* opener = closer->prev;
859 776           int found = 0;
860 1951 100         while (opener && opener != bot && opener != stack_bottom) {
    100          
    50          
861 1786 100         if (opener->type == N_DELIM && opener->can_open &&
    100          
862 662 100         opener->delim_char == ch) {
863             /* rule of three: odd_match != 0 means this pair is REJECTED. That
                 * happens when either run can both open and close, the two counts
                 * sum to a multiple of 3, and they are not both multiples of 3. */
864 557 100         int odd_match = (closer->can_open || opener->can_close) &&
865 1204 100         ((opener->count + closer->count) % 3 == 0) &&
    100          
866 21 100         !(opener->count % 3 == 0 && closer->count % 3 == 0);
    50          
867 626 100         if (!odd_match) { found = 1; break; }
868             }
869 1175           opener = opener->prev;
870             }
871 776 100         if (!found) {
                /* remember how far back this kind of closer has already searched */
872 165           openers_bottom[didx][co_mod][co_op] = closer->prev;
873             /* If the closer itself can't also open, mark it inert so it
874             * becomes literal text in the final sweep. Either way, advance. */
875 165 100         if (!closer->can_open) closer->can_close = 0;
876 165           closer = closer->next;
877 315 100         while (closer) {
878 210 100         if (closer->type == N_DELIM && closer->can_close &&
    100          
879 60 100         (closer->delim_char == '*' || closer->delim_char == '_' ||
    50          
880 0 0         closer->delim_char == '~'))
881             break;
882 150           closer = closer->next;
883             }
884 165           continue;
885             }
886              
                /* use2 = number of delimiter characters consumed from each side:
                 * the opener's whole count for '~', otherwise 2 (strong) when both
                 * runs have at least 2, else 1 (emphasis). */
887 611           use2 = (ch == '~')
888             ? opener->count /* matched count for tildes */
889 611 100         : ((opener->count >= 2 && closer->count >= 2) ? 2 : 1);
    100          
    100          
890              
891             /* GFM strike: counts must match and be 1 or 2 (no triple+). */
892 611 100         if (ch == '~' && (opener->count != closer->count ||
    100          
893 26 100         (opener->count != 1 && opener->count != 2))) {
    50          
894             /* skip — leave as text.
                 * NOTE(review): unlike the other two skip paths, this one does not
                 * clear closer->can_close when the closer cannot open; confirm that
                 * the difference is intended. */
895 3           openers_bottom[didx][co_mod][co_op] = opener;
896 3           closer = closer->next;
897 3 50         while (closer) {
898 0 0         if (closer->type == N_DELIM && closer->can_close &&
    0          
899 0 0         (closer->delim_char == '*' || closer->delim_char == '_' ||
    0          
900 0 0         closer->delim_char == '~'))
901             break;
902 0           closer = closer->next;
903             }
904 3           continue;
905             }
906              
907 608 100         tt = (ch == '~') ? N_STRIKE : (use2 == 2 ? N_STRONG : N_EMPH);
    100          
                /* MDS_FLAG_NO_EMPH / MDS_FLAG_NO_STRONG suppress the matching container type */
908 608           _ifl = z->ctx->flags;
909 608 100         if ((tt == N_EMPH && (_ifl & MDS_FLAG_NO_EMPH)) ||
    100          
    100          
910 238 100         (tt == N_STRONG && (_ifl & MDS_FLAG_NO_STRONG))) {
911             /* Skip: leave delim run as-is, advance closer (becomes text) */
912 2           openers_bottom[didx][co_mod][co_op] = opener;
913 2 50         if (!closer->can_open) closer->can_close = 0;
914 2           closer = closer->next;
915 2 50         while (closer) {
916 0 0         if (closer->type == N_DELIM && closer->can_close &&
    0          
917 0 0         (closer->delim_char == '*' || closer->delim_char == '_' ||
    0          
918 0 0         closer->delim_char == '~'))
919             break;
920 0           closer = closer->next;
921             }
922 2           continue;
923             }
924 606           container = node_new(z, tt);
925             /* children = (opener->next .. closer->prev); when the two runs are
                 * adjacent (opener->next == closer) the container is left without children */
926 606           first = opener->next;
927 606           last = closer->prev;
928 606 50         if (first != closer) {
929 606           container->children = first;
930 606           container->children_tail = last;
931 606           first->prev = NULL;
932 606           last->next = NULL;
933             }
934             /* shrink/remove delimiters */
935 606           opener->count -= use2;
936 606           closer->count -= use2;
937              
938             /* relink: replace [opener? closer?] block with container */
939 606           before = opener->prev;
940 606           after = closer->next;
941              
                /* a run whose count reached 0 is dropped from the list (NULL here) */
942 606 100         new_open = (opener->count > 0) ? opener : NULL;
943 606 100         new_close = (closer->count > 0) ? closer : NULL;
944              
945             /* A partially consumed run stays in the list with its reduced count.
946             * Its s pointer is left alone and n is cut down to the new count, so
947             * the node covers `count` bytes starting at s. */
948 606 100         if (new_open) {
949             /* truncate opener's literal length so leftover delim chars
950             * remain rendered if not consumed later */
951 65           new_open->n = (size_t)new_open->count;
952             }
953 606 100         if (new_close) {
954 71           new_close->n = (size_t)new_close->count;
955 71           new_close->s = new_close->s; /* self-assignment, no effect: pointer kept, only the length was adjusted */
956             }
957              
958             /* build list: before, new_open?, container, new_close?, after */
959 606           prev_link = before;
960 606 100         if (new_open) {
961 65 100         if (prev_link) prev_link->next = new_open;
962 54           else z->head = new_open;
963 65           new_open->prev = prev_link;
964 65           prev_link = new_open;
965             }
966 606 100         if (prev_link) prev_link->next = container;
967 265           else z->head = container;
968 606           container->prev = prev_link;
969 606           prev_link = container;
970 606 100         if (new_close) {
971 71           prev_link->next = new_close;
972 71           new_close->prev = prev_link;
973 71           prev_link = new_close;
974             }
975 606           prev_link->next = after;
976 606 100         if (after) after->prev = prev_link;
977 381           else z->tail = prev_link;
978              
979             /* continue: if closer still has count, use it as closer again;
980             * otherwise resume from after */
981 606 100         if (new_close) {
982 71           closer = new_close;
983             } else {
984 535           closer = after;
985 852 100         while (closer) {
986 444 100         if (closer->type == N_DELIM && closer->can_close &&
    100          
987 127 100         (closer->delim_char == '*' || closer->delim_char == '_' ||
    100          
988 6 50         closer->delim_char == '~'))
989             break;
990 317           closer = closer->next;
991             }
992             }
993             }
994             /* clear remaining DELIMs to TEXT (sibling chain after stack_bottom only) */
995 2128 100         start = stack_bottom ? stack_bottom->next : z->head;
996 6736 100         for (p = start; p; p = p->next) {
997 4608 100         if (p->type == N_DELIM) {
998 406           p->type = N_TEXT;
999             }
1000             }
1001 2128           }
1002              
1003             /* ---------------- process ']' ---------------- */
1004              
                /* Handle the ']' at offset *pos_io: try to close the most recent bracket
                 * opener as one of, in this order,
                 *   1. a footnote reference [^label]   (MDS_FLAG_FOOTNOTES set and
                 *                                       z->ctx->footnotes non-NULL),
                 *   2. an inline link/image [text](dest "title"),
                 *   3. a reference link/image: full [text][label], collapsed [text][],
                 *      or shortcut [text]              (z->ctx->refs non-NULL).
                 *
                 * Returns 1 when a construct was built; *pos_io is then set to the first
                 * byte after it. Returns 0 when the ']' is to be treated as literal text;
                 * *pos_io is not written in that case. On a 0 return with an opener
                 * present, that opener has been retyped to N_TEXT and deactivated. */
1005 723           static int try_close_bracket(scn* z, size_t* pos_io) {
1006 723           const char* s = z->s; size_t n = z->n;
1007 723           size_t pos = *pos_io;
1008 723           int is_image = 0;
1009             inode* opener;
1010             size_t p;
1011 723           int matched = 0;
1012 723           const char *href_s = NULL, *title_s = NULL;
1013 723           size_t hlen = 0, tlen = 0;
1014 723           int is_ref = 0;
1015 723           const mds_linkref* refent = NULL;
1016             ntype t;
1017             inode* container;
1018 723           opener = find_open_bracket(z, &is_image);
1019 723 100         if (!opener) {
1020 16           return 0; /* no opener — caller emits literal ']' */
1021             }
1022             /* CommonMark "look for link or image" step 3: if the opener exists
1023             * but is inactive, remove it from the stack (convert to literal `[`)
1024             * and treat this `]` as literal text. Do NOT keep searching for an
1025             * earlier active opener — the inactive opener blocks it. This is
1026             * what makes the alt text of an image with a nested link come out
1027             * as `[foo](uri2)` literally (CM example 520). */
1028 707 100         if (!opener->active) {
1029 21           opener->type = N_TEXT;
1030 21           opener->active = 0;
1031 21           return 0;
1032             }
1033 686           p = pos + 1; /* p: first byte after the ']' */
1034              
1035             /* GFM footnote reference [^label] — checked first so it wins over
1036             * inline link/ref interpretations. Requires the bracket content to
1037             * begin with `^` and the label (everything after) to be present in
1038             * ctx->footnotes. Unresolved [^label] falls through to normal
1039             * processing (becomes literal text). */
1040 686 100         if ((z->ctx->flags & MDS_FLAG_FOOTNOTES) && z->ctx->footnotes) {
    100          
                /* [txt_s0, txt_e0) = source bytes between the opener and this ']' */
1041 27           size_t txt_s0 = (size_t)((opener->s + opener->n) - s);
1042 27           size_t txt_e0 = pos;
1043 27 50         if (txt_e0 > txt_s0 && s[txt_s0] == '^') {
    50          
1044 27           const char* lab_s = s + txt_s0 + 1;
1045 27           size_t lab_n = txt_e0 - txt_s0 - 1;
1046 27           const mds_footnote* fn = mds_footnote_get(z->ctx->footnotes,
1047             lab_s, lab_n);
1048 27 100         if (fn) {
1049             /* Discard any children between opener and the `]` (the
1050             * `^label` text/delim nodes); we don't render them. */
1051 24           opener->children = NULL;
1052 24           opener->children_tail = NULL;
1053             /* Drop everything after opener up to but not including pos. */
1054 24           opener->next = NULL;
1055 24           z->tail = opener;
1056 24 100         if (opener->type == N_OPEN_BANG) {
1057             inode* fnref;
1058             /* Salvage the literal `!` byte that the bang opener
1059             * absorbed — emit it as a sibling TEXT node BEFORE
1060             * the footnote ref. Without this, inputs like
1061             * `text![^1]` lose the `!`. */
1062 3           opener->type = N_TEXT;
1063 3           opener->n = 1; /* s already points at '!' */
1064 3           fnref = node_new(z, N_FOOTNOTE_REF);
1065 3           fnref->href = fn->label;
1066 3           fnref->hlen = fn->llen;
1067 3           fnref->active = 0;
1068 3           append(z, fnref); /* updates z->tail */
1069             } else {
1070             /* Convert opener into the FOOTNOTE_REF node itself. */
1071 21           opener->type = N_FOOTNOTE_REF;
1072 21           opener->href = fn->label;
1073 21           opener->hlen = fn->llen;
1074 21           opener->active = 0;
1075             }
1076 24           *pos_io = pos + 1;
1077 24           return 1;
1078             }
1079             }
1080             }
1081              
1082             /* (a) inline link [text](url "title"); whitespace here means space, tab or newline */
1083 662 100         if (p < n && s[p] == '(') {
    100          
1084 229           size_t q = p + 1;
1085 238 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    100          
    50          
    50          
1086 229 50         if (q < n && s[q] != ')') {
    100          
1087 431 100         if (parse_link_destination(s, &q, n, &href_s, &hlen)) {
1088 211           size_t after_dest = q;
1089 270 100         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    100          
    50          
    100          
                /* NOTE(review): a title is attempted even when no whitespace
                 * separated it from the destination; check against the CommonMark
                 * inline-link rule. */
1090 211 100         if (q < n && (s[q] == '"' || s[q] == '\'' || s[q] == '(')) {
    100          
    100          
    100          
1091 41 50         if (!parse_link_title(s, &q, n, &title_s, &tlen)) {
                /* no title: rewind to just after the destination */
1092 0           q = after_dest;
1093 0           title_s = NULL; tlen = 0;
1094             } else {
1095 56 50         while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    100          
    50          
    50          
1096             }
1097             }
                /* the inline form only counts if the ')' follows */
1098 211 100         if (q < n && s[q] == ')') {
    100          
1099 187           p = q + 1;
1100 187           matched = 1;
1101             }
1102             }
1103 9 50         } else if (q < n && s[q] == ')') {
    50          
1104             /* empty destination: non-NULL href with length 0 */
1105 9           href_s = s; hlen = 0;
1106 9           p = q + 1;
1107 9           matched = 1;
1108             }
1109             }
1110              
1111             /* (b) ref link forms */
1112 662 100         if (!matched && z->ctx->refs) {
    100          
1113             /* label text bytes are between opener and current ] */
1114 256           size_t lbl_start_off = (size_t)(opener->s - s) + opener->n; /* same offset as txt_s below; never read */
1115             /* opener->s points at the opener's own bytes and opener->n is their length */
1116             /* [txt_s, txt_e) is the bracket text used as the key for collapsed/shortcut lookups */
1117 256           size_t txt_s = (size_t)((opener->s + opener->n) - s);
1118 256           size_t txt_e = pos;
1119 256           int tried_full = 0;
1120             (void)lbl_start_off;
1121              
1122             /* full ref: [text][label]; the label scan gives up after 1000 bytes */
1123 256 100         if (p < n && s[p] == '[') {
    100          
1124 105           size_t q = p + 1;
1125 105           size_t lbl_s = q;
1126 337 50         while (q < n && s[q] != ']' && s[q] != '[' && q - lbl_s < 1000) {
    100          
    50          
    50          
1127 232 100         if (s[q] == '\\' && q + 1 < n) q++;
    50          
1128 232           q++;
1129             }
1130 105 50         if (q < n && s[q] == ']' && q > lbl_s) {
    50          
    100          
                /* a non-empty label was present: even if it is undefined, the
                 * shortcut form below is not attempted */
1131 71           tried_full = 1;
1132 71           refent = mds_linkref_get(z->ctx->refs, s + lbl_s, q - lbl_s);
1133 71 100         if (refent) { p = q + 1; is_ref = 1; matched = 1; }
1134 34 50         } else if (q < n && s[q] == ']' && q == lbl_s) {
    50          
    50          
1135             /* collapsed [text][]; on a miss tried_full stays 0, so the shortcut
                 * branch below repeats the same lookup */
1136 34           refent = mds_linkref_get(z->ctx->refs, s + txt_s, txt_e - txt_s);
1137 34 50         if (refent) { p = q + 1; is_ref = 1; matched = 1; }
1138             }
1139             }
1140 256 100         if (!matched && !tried_full) {
    100          
1141             /* shortcut [text] — only when there is no [label] following */
1142 151           refent = mds_linkref_get(z->ctx->refs, s + txt_s, txt_e - txt_s);
1143 151 100         if (refent) { is_ref = 1; matched = 1; }
1144             }
                /* a reference match replaces anything a failed inline attempt left behind */
1145 256 100         if (matched && refent) {
    50          
1146 226           href_s = refent->url; hlen = refent->ulen;
1147 226           title_s = refent->title; tlen = refent->tlen;
1148             }
1149             }
1150              
1151 662 100         if (!matched) {
1152             /* no link: drop opener from stack — convert to plain TEXT so it
1153             * doesn't block outer brackets from matching this ']'. (CM spec
1154             * "look for link or image" step: remove opener on failure.) */
1155 240           opener->type = N_TEXT;
1156 240           opener->active = 0;
1157 240           return 0;
1158             }
1159              
1160             /* run process_emphasis on the children range (opener->next .. tail) */
1161 422           process_emphasis(z, opener);
1162              
1163             /* wrap into LINK or IMAGE container */
1164 422 100         t = is_image ? N_IMAGE : N_LINK;
1165 422           container = wrap_after(z, opener, t);
1166 422           container->href = href_s; container->hlen = hlen;
1167 422           container->title = title_s; container->tlen = tlen;
1168              
1169             /* unlink opener from list (it became the boundary; wrap_after kept it
1170             * outside the new container — we need to remove it now): point the
                 * opener's predecessor (or z->head) at the container instead. */
1171 422 100         if (opener->prev) opener->prev->next = container;
1172 295           else z->head = container;
1173 422           container->prev = opener->prev;
1174              
1175             /* deactivate any earlier '[' if this is a link (not image) */
1176 422 100         if (!is_image) deactivate_brackets(z, container);
1177              
1178 422           *pos_io = p;
1179             (void)is_ref; /* is_ref is set above but never read */
1180 422           return 1;
1181             }
1182              
1183             /* ---------------- forward pass ---------------- */
1184              
1185             /* SWAR / NEON fast-skip over runs of plain prose. The inline
1186             * scanner's outer switch fires on exactly these 11 bytes:
1187             * '\\' '`' '<' '&' '*' '_' '~' '[' '!' ']' '\n'
1188             * Everything else falls through to `default: pos++;`. We replace that
1189             * single-byte advance with a 16-byte (NEON) or 8-byte (SWAR) scan that
1190             * returns the offset to the next interesting byte (or the chunk size if
1191             * none). On prose corpora ~99% of bytes match the fast path. */
1192              
                /* Byte-indexed lookup table: 1 for each of the 11 bytes listed above,
                 * 0 for every other value (unlisted entries are zero-initialised). */
1193             static const unsigned char mds_inline_interest[256] = {
1194             ['\\']=1, ['`']=1, ['<']=1, ['&']=1, ['*']=1, ['_']=1,
1195             ['~']=1, ['[']=1, ['!']=1, [']']=1, ['\n']=1,
1196             };
1197              
1198             #if MDS_INLINE_HAVE_NEON
1199             static inline size_t mds_inline_skip16(const char* p) {
1200             /* Returns 0..16: bytes safe to skip before the first interesting one.
                 * Always loads 16 bytes starting at p, so the caller must make sure that
                 * many bytes are readable. The 11 comparisons below are the same bytes
                 * listed in mds_inline_interest. */
1201             uint8x16_t v = vld1q_u8((const uint8_t*)p);
                /* each vceqq_u8 yields 0xFF in every lane equal to the probe byte, else 0x00 */
1202             uint8x16_t bs = vceqq_u8(v, vdupq_n_u8('\\'));
1203             uint8x16_t bt = vceqq_u8(v, vdupq_n_u8('`'));
1204             uint8x16_t lt = vceqq_u8(v, vdupq_n_u8('<'));
1205             uint8x16_t amp = vceqq_u8(v, vdupq_n_u8('&'));
1206             uint8x16_t st = vceqq_u8(v, vdupq_n_u8('*'));
1207             uint8x16_t us = vceqq_u8(v, vdupq_n_u8('_'));
1208             uint8x16_t ti = vceqq_u8(v, vdupq_n_u8('~'));
1209             uint8x16_t lb = vceqq_u8(v, vdupq_n_u8('['));
1210             uint8x16_t bg = vceqq_u8(v, vdupq_n_u8('!'));
1211             uint8x16_t rb = vceqq_u8(v, vdupq_n_u8(']'));
1212             uint8x16_t nl = vceqq_u8(v, vdupq_n_u8('\n'));
                /* OR all eleven match masks together */
1213             uint8x16_t any = vorrq_u8(vorrq_u8(vorrq_u8(bs, bt), vorrq_u8(lt, amp)),
1214             vorrq_u8(vorrq_u8(vorrq_u8(st, us), vorrq_u8(ti, lb)),
1215             vorrq_u8(vorrq_u8(bg, rb), nl)));
1216             uint8x8_t lo, hi;
1217             uint8x8_t packed_lo;
1218             uint64_t m;
                /* NOTE(review): vmaxvq_u8 looks like an AArch64-only intrinsic, while the
                 * guard at the top of the file also accepts __ARM_NEON alone; confirm
                 * that 32-bit NEON builds are not expected. */
1219             if (vmaxvq_u8(any) == 0) return 16;
1220             /* Reduce to 64-bit then ctz to find first match. */
1221             lo = vget_low_u8(any);
1222             hi = vget_high_u8(any);
1223             /* Narrowing shift right by 4 on the 16-bit lanes: the result is a 64-bit
                 * value holding 4 bits per input byte (0xF for a match, 0x0 otherwise),
                 * so ctz(m) / 4 is the index of the first matching byte. */
1224             packed_lo = vshrn_n_u16(vreinterpretq_u16_u8(any), 4);
1225             m = vget_lane_u64(vreinterpret_u64_u8(packed_lo), 0);
1226             (void)lo; (void)hi; /* lo and hi are assigned above but otherwise unused */
1227             return (size_t)(__builtin_ctzll(m) >> 2);
1228             }
1229             #endif
1230              
                /* Scan the 8 bytes at p for any of the 11 bytes the inline scanner reacts
                 * to ('\\' '`' '<' '&' '*' '_' '~' '[' '!' ']' '\n').
                 *
                 * Returns the number of leading bytes that are none of them: 8 when the
                 * whole chunk is plain, otherwise the index (0..7) of the first hit.
                 * Always copies 8 bytes from p, so the caller must make sure that many
                 * bytes are readable. */
1231 2221           static inline size_t mds_inline_skip8(const char* p) {
1232             /* Portable SWAR fallback: 8-byte stride. */
1233             uint64_t w;
1234             uint64_t m;
1235 2221           memcpy(&w, p, 8);
                /* MDS_HASZ(x): sets the high bit of the lowest zero byte of x (higher
                 * lanes may also get flagged, which is harmless because only the lowest
                 * set bit is used). MDS_BC(b): b replicated into all eight bytes, so
                 * w ^ MDS_BC(b) has a zero byte wherever w contains b. */
1236             #define MDS_HASZ(x) (((x) - 0x0101010101010101ULL) & ~(x) & 0x8080808080808080ULL)
1237             #define MDS_BC(b) ((uint64_t)(b) * 0x0101010101010101ULL)
1238 2221           m = MDS_HASZ(w ^ MDS_BC('\\'))
1239 2221           | MDS_HASZ(w ^ MDS_BC('`'))
1240 2221           | MDS_HASZ(w ^ MDS_BC('<'))
1241 2221           | MDS_HASZ(w ^ MDS_BC('&'))
1242 2221           | MDS_HASZ(w ^ MDS_BC('*'))
1243 2221           | MDS_HASZ(w ^ MDS_BC('_'))
1244 2221           | MDS_HASZ(w ^ MDS_BC('~'))
1245 2221           | MDS_HASZ(w ^ MDS_BC('['))
1246 2221           | MDS_HASZ(w ^ MDS_BC('!'))
1247 2221           | MDS_HASZ(w ^ MDS_BC(']'))
1248 2221           | MDS_HASZ(w ^ MDS_BC('\n'));
1249             #undef MDS_HASZ
1250             #undef MDS_BC
1251 2221 100         if (!m) return 8;
1252             /* m has high bit set in each matching lane (little-endian byte order).
                 * NOTE(review): on a big-endian target the lowest set bit would belong
                 * to the LAST byte in memory, so this index would be wrong; confirm that
                 * only little-endian targets are built. */
1253 1459           return (size_t)(__builtin_ctzll(m) >> 3);
1254             }
1255              
1256 1706           static void scan_forward(scn* z) {
1257 1706           const char* s = z->s;
1258 1706           size_t n = z->n;
1259 1706           size_t pos = 0;
1260 1706           size_t text_start = 0;
1261              
1262             #define FLUSH_TEXT() do { \
1263             if (pos > text_start) append_text(z, s + text_start, pos - text_start); \
1264             text_start = pos; \
1265             } while (0)
1266              
1267 8883 100         while (pos < n) {
1268 7177           unsigned char c = (unsigned char)s[pos];
1269 7177           switch (c) {
1270 339           case '\\': {
1271             inode* x;
1272 303 100         if (pos + 1 < n && s[pos+1] == '\n') {
    100          
1273             /* hard break */
1274 13 50         FLUSH_TEXT();
1275 13           x = node_new(z, N_LINEBREAK);
1276 13           append(z, x);
1277 13           pos += 2;
1278             /* skip leading spaces on next line */
1279 19 50         while (pos < n && (s[pos] == ' ' || s[pos] == '\t')) pos++;
    100          
    50          
1280 13           text_start = pos;
1281 13           continue;
1282             }
1283 290 100         if (pos + 1 < n && is_ascii_punct((unsigned char)s[pos+1])) {
    100          
1284 254 100         FLUSH_TEXT();
1285 254           append_text_dup(z, s + pos + 1, 1);
1286 254           pos += 2;
1287 254           text_start = pos;
1288 254           continue;
1289             }
1290 36           pos++;
1291             (void)x;
1292 36           continue;
1293             }
1294 191           case '`': {
1295             size_t end;
1296 191 100         if (z->ctx->flags & MDS_FLAG_NO_CODE) {
1297             /* emit literal backtick(s) as text */
1298 2           size_t r = pos;
1299 4 50         while (r < n && s[r] == '`') r++;
    100          
1300 2 50         FLUSH_TEXT();
1301 2           append_text(z, s + pos, r - pos);
1302 2           pos = r; text_start = pos; continue;
1303             }
1304 189           end = try_code_span(z, pos);
1305             if (end) {
1306             /* flush bytes before pos */
1307             if (pos > text_start) {
1308             /* append_text already; but z->tail is the new CODE node.
1309             * We need to insert text BEFORE it. Re-do manually. */
1310             }
1311             /* Actually: try_code_span already appended a CODE node,
1312             * so the prior bytes weren't flushed. Need to flush first. */
1313             /* To keep things simple, flush BEFORE attempting span. */
1314             /* Implementation note: re-do as flush-then-attempt below. */
1315             (void)end;
1316             }
1317             /* re-attempt with flush */
1318 45           {
1319 189           size_t saved_pos = pos;
1320             size_t end2;
1321             size_t r;
1322             /* remove the CODE node just appended (we did it above) */
1323 189 100         if (end && z->tail && z->tail->type == N_CODE) {
    50          
    50          
1324 144           inode* dead = z->tail;
1325 144           z->tail = dead->prev;
1326 144 100         if (z->tail) z->tail->next = NULL;
1327 96           else z->head = NULL;
1328             }
1329             /* flush text */
1330 189 100         if (saved_pos > text_start)
1331 71           append_text(z, s + text_start, saved_pos - text_start);
1332 189           text_start = saved_pos;
1333             /* re-attempt cleanly */
1334 189           end2 = try_code_span(z, saved_pos);
1335 189 100         if (end2) {
1336 144           pos = end2; text_start = pos; continue;
1337             }
1338             /* failed: emit literal backticks */
1339 45           r = pos;
1340 99 100         while (r < n && s[r] == '`') r++;
    100          
1341 45           append_text(z, s + pos, r - pos);
1342 45           pos = r; text_start = pos; continue;
1343             }
1344             }
1345 356           case '<': {
1346             size_t end;
1347 279 100         FLUSH_TEXT();
1348 279           end = try_autolink(z, pos);
1349 279 100         if (end) { pos += end; text_start = pos; continue; }
1350 222 100         if (!(z->ctx->flags & MDS_FLAG_NO_HTML)) {
1351 220           end = try_html_inline(z, pos);
1352 220 100         if (end) { pos += end; text_start = pos; continue; }
1353             }
1354 77           append_text(z, s + pos, 1);
1355 77           pos++; text_start = pos;
1356 77           continue;
1357             }
1358 145           case '&': {
1359 112           size_t consumed = try_entity(z, pos);
1360 112 100         if (consumed) {
1361             /* flush prior text first */
1362 79           size_t before = pos;
1363             /* try_entity already appended the entity TEXT; we need to
1364             * insert prior bytes before it. */
1365 79 50         if (z->tail && before > text_start) {
    100          
1366 52           inode* added = z->tail;
1367             /* detach */
1368 52           z->tail = added->prev;
1369 52 100         if (z->tail) z->tail->next = NULL;
1370 3           else z->head = NULL;
1371 52           append_text(z, s + text_start, before - text_start);
1372             /* re-append */
1373 52           added->prev = z->tail;
1374 52           added->next = NULL;
1375 52 50         if (z->tail) z->tail->next = added;
1376 0           else z->head = added;
1377 52           z->tail = added;
1378             }
1379 79           pos += consumed; text_start = pos;
1380 79           continue;
1381             }
1382 33           pos++; continue;
1383             }
1384 1540           case '*':
1385             case '_':
1386 1518           case '~': {
1387             size_t start;
1388             size_t runlen;
1389             int co, cc;
1390             inode* x;
1391 1540 100         FLUSH_TEXT();
1392 1540           start = pos;
1393 3984 100         while (pos < n && (unsigned char)s[pos] == c) pos++;
    100          
1394 1540           runlen = pos - start;
1395 1540 100         if (c == '~' && ((runlen != 1 && runlen != 2) || !(z->ctx->flags & MDS_FLAG_STRIKE))) {
    100          
    100          
    100          
1396             /* not strike candidate (or strikethrough disabled); emit as text */
1397 22           append_text(z, s + start, runlen);
1398 22           text_start = pos;
1399 22           continue;
1400             }
1401 1518           classify_run(s, n, start, runlen, &co, &cc, c);
1402 1518           x = node_new(z, N_DELIM);
1403 1518           x->delim_char = c;
1404 1518           x->count = (int)runlen;
1405 1518           x->can_open = co;
1406 1518           x->can_close = cc;
1407 1518           x->s = s + start; x->n = runlen;
1408 1518           append(z, x);
1409 1518           text_start = pos;
1410 1518           continue;
1411             }
1412 1317           case '[': {
1413             inode* x;
1414 659 100         if (z->ctx->flags & MDS_FLAG_NO_LINKS) {
1415 1 50         FLUSH_TEXT();
1416 1           append_text(z, s + pos, 1);
1417 1           pos++; text_start = pos; continue;
1418             }
1419 658 100         FLUSH_TEXT();
1420 658           x = node_new(z, N_OPEN_BRACKET);
1421 658           x->s = s + pos; x->n = 1;
1422 658           x->active = 1;
1423 658           append(z, x);
1424 658           pos++; text_start = pos;
1425 658           continue;
1426             }
1427 113           case '!': {
1428 113 100         if (pos + 1 < n && s[pos+1] == '[' &&
    100          
1429 86 100         !(z->ctx->flags & MDS_FLAG_NO_IMAGES)) {
1430             inode* x;
1431 85 100         FLUSH_TEXT();
1432 85           x = node_new(z, N_OPEN_BANG);
1433 85           x->s = s + pos; x->n = 2;
1434 85           x->active = 1;
1435 85           append(z, x);
1436 85           pos += 2; text_start = pos;
1437 85           continue;
1438             }
1439 28           pos++; continue;
1440             }
1441 1000           case ']': {
1442             size_t p2;
1443 723 100         FLUSH_TEXT();
1444 723           p2 = pos;
1445 723 100         if (try_close_bracket(z, &p2)) {
1446 446           pos = p2; text_start = pos;
1447 446           continue;
1448             }
1449             /* literal ] */
1450 277           append_text(z, s + pos, 1);
1451 277           pos++; text_start = pos;
1452 277           continue;
1453             }
1454 676           case '\n': {
1455             int hard;
1456             inode* br;
1457 338 100         FLUSH_TEXT();
1458             /* hard break iff prev TEXT ended with two-or-more spaces */
1459 338           hard = 0;
1460 338 50         if (z->tail && z->tail->type == N_TEXT) {
    100          
1461 300           inode* t = z->tail;
1462 300 100         if (t->n >= 2 && t->s[t->n - 1] == ' ' && t->s[t->n - 2] == ' ') {
    100          
    100          
1463             /* trim trailing spaces */
1464 72 50         while (t->n > 0 && t->s[t->n - 1] == ' ') t->n--;
    100          
1465 16 50         if (t->n == 0) {
1466             /* remove empty text */
1467 0           z->tail = t->prev;
1468 0 0         if (z->tail) z->tail->next = NULL;
1469 0           else z->head = NULL;
1470             }
1471 16           hard = 1;
1472 284 50         } else if (t->n >= 1 && t->s[t->n - 1] == ' ') {
    100          
1473             /* single space trailing — strip */
1474 9           t->n--;
1475 9 100         if (t->n == 0) {
1476 6           z->tail = t->prev;
1477 6 50         if (z->tail) z->tail->next = NULL;
1478 0           else z->head = NULL;
1479             }
1480             }
1481             }
1482 338 100         br = node_new(z, hard ? N_LINEBREAK : N_SOFTBREAK);
1483 338           append(z, br);
1484 338           pos++;
1485             /* skip leading spaces on next line */
1486 503 50         while (pos < n && (s[pos] == ' ' || s[pos] == '\t')) pos++;
    100          
    50          
1487 338           text_start = pos;
1488 338           continue;
1489             }
1490 2919           default:
1491             {
1492             /* Fast skip over plain prose. The chunked stride keeps the
1493             * text run intact (no FLUSH_TEXT needed) — we just advance
1494             * `pos` past bytes the outer switch would have ignored. */
1495             #if MDS_INLINE_HAVE_NEON
1496             while (pos + 16 <= n) {
1497             size_t k = mds_inline_skip16(s + pos);
1498             pos += k;
1499             if (k < 16) goto next_iter;
1500             }
1501             #endif
1502 3681 100         while (pos + 8 <= n) {
1503 2221           size_t k = mds_inline_skip8(s + pos);
1504 2221           pos += k;
1505 2221 100         if (k < 8) goto next_iter;
1506             }
1507             /* Tail: 1-byte at a time. The interest table makes the
1508             * predicate branch-free. */
1509 5694 100         while (pos < n && !mds_inline_interest[(unsigned char)s[pos]])
    100          
1510 4234           pos++;
1511 1460           next_iter:
1512 2919           continue;
1513             }
1514             }
1515             }
1516 1706 100         FLUSH_TEXT();
1517             #undef FLUSH_TEXT
1518 1706           }
1519              
1520             /* ---------------- emit pass ---------------- */
1521              
1522             static void emit_children(scn* z, inode* head);
1523              
1524             /* HTML-escape NOT done here; renderer cb_text does the escaping. */
1525 3918           static void emit_text(scn* z, const char* s, size_t n) {
1526 3918 50         if (n == 0) return;
1527 3918 50         if (z->ctx->cb.text) z->ctx->cb.text(z->ctx->ud, s, n);
1528             }
1529 139           static void emit_raw(scn* z, const char* s, size_t n) {
1530 139 50         if (n == 0) return;
1531 139 50         if (z->ctx->cb.raw) z->ctx->cb.raw(z->ctx->ud, s, n);
1532             }
1533              
1534 5517           static void emit_node(scn* z, inode* x) {
1535 5517           mds_callbacks* cb = &z->ctx->cb;
1536             mds_inline_detail d;
1537 5517           memset(&d, 0, sizeof d);
1538 5517           switch (x->type) {
1539 3702           case N_TEXT:
1540 3702           emit_text(z, x->s, x->n);
1541 3702           break;
1542 322           case N_SOFTBREAK:
1543 322 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_SOFTBREAK, &d);
1544 322 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_SOFTBREAK);
1545 322           break;
1546 29           case N_LINEBREAK:
1547 29 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_LINEBREAK, &d);
1548 29 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_LINEBREAK);
1549 29           break;
1550 144           case N_CODE:
1551 144 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_CODE, &d);
1552 144           emit_text(z, x->s, x->n);
1553 144 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_CODE);
1554 144           break;
1555 57           case N_AUTOLINK:
1556 57           d.u.autolink.uri = x->s;
1557 57           d.u.autolink.uri_len = x->n;
1558 57           d.u.autolink.is_email = x->is_email;
1559 57 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_AUTOLINK, &d);
1560 57 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_AUTOLINK);
1561 57           break;
1562 139           case N_HTMLINLINE:
1563 139 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_HTML_INLINE, &d);
1564 139           emit_raw(z, x->s, x->n);
1565 139 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_HTML_INLINE);
1566 139           break;
1567 343           case N_EMPH:
1568 343 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_EMPH, &d);
1569 343           emit_children(z, x->children);
1570 343 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_EMPH);
1571 343           break;
1572 237           case N_STRONG:
1573 237 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_STRONG, &d);
1574 237           emit_children(z, x->children);
1575 237 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_STRONG);
1576 237           break;
1577 26           case N_STRIKE:
1578 26 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_STRIKE, &d);
1579 26           emit_children(z, x->children);
1580 26 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_STRIKE);
1581 26           break;
1582 343           case N_LINK:
1583 343           d.u.link.href = x->href; d.u.link.href_len = x->hlen;
1584 343           d.u.link.title = x->title; d.u.link.title_len = x->tlen;
1585 343 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_LINK, &d);
1586 343           emit_children(z, x->children);
1587 343 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_LINK);
1588 343           break;
1589 79           case N_IMAGE:
1590 79           d.u.image.href = x->href; d.u.image.href_len = x->hlen;
1591 79           d.u.image.title = x->title; d.u.image.title_len = x->tlen;
1592 79           d.u.image.alt = NULL; d.u.image.alt_len = 0; /* renderer derives alt from children */
1593 79 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_IMAGE, &d);
1594 79           emit_children(z, x->children);
1595 79 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_IMAGE);
1596 79           break;
1597 24           case N_FOOTNOTE_REF:
1598             /* opener->href / hlen carry the raw label captured at match
1599             * time; we stuffed them there because inode has no dedicated
1600             * label slot. The renderer manages numbering. */
1601 24           d.u.footnote_ref.label = x->href;
1602 24           d.u.footnote_ref.label_len = x->hlen;
1603 24 50         if (cb->enter_inline) cb->enter_inline(z->ctx->ud, MDS_INL_FOOTNOTE_REF, &d);
1604 24 50         if (cb->leave_inline) cb->leave_inline(z->ctx->ud, MDS_INL_FOOTNOTE_REF);
1605 24           break;
1606 72           case N_DELIM:
1607             case N_OPEN_BRACKET:
1608             case N_OPEN_BANG:
1609             /* leftover unmatched delimiter/bracket — emit as literal text */
1610 72           emit_text(z, x->s, x->n);
1611 72           break;
1612             }
1613 5517           }
1614              
1615 2734           static void emit_children(scn* z, inode* head) {
1616             inode* p;
1617 8251 100         for (p = head; p; p = p->next) emit_node(z, p);
1618 2734           }
1619              
1620             /* ---------------- public entry ---------------- */
1621              
1622 3164           MDS_HOT void mds_inline_scan(mds_ctx* ctx, const char* s, size_t n) {
1623             scn z;
1624 4622 50         if (n == 0) return;
1625              
1626             /* Fast path for table cells and trivial paragraphs: if no byte in
1627             * the run can possibly trigger an inline construct, we can skip
1628             * the entire inode-list build / process_emphasis / emit_children
1629             * pipeline and just call cb.text directly. Inline triggers are:
1630             * `* _ ~ ` [ ] ! < & \\` plus the line-break candidates `\n` and
1631             * the trailing-spaces hard-break case. The classifier dispatch
1632             * table (src/simd/mds_dispatch.h) is the authoritative list; we
1633             * use a small per-call SWAR-style scalar scan rather than the SIMD
1634             * classifier so this stays cheap for short cell-sized runs.
1635             *
1636             * Tables hit this constantly (cells are typically a single word),
1637             * and ordinary prose paragraphs hit it for runs between inline
1638             * markers. The slow path is bit-identical to the original code. */
1639             {
1640 3164           const unsigned char* p = (const unsigned char*)s;
1641 3164           const unsigned char* end = p + n;
1642 117515 100         for (; p < end; p++) {
1643 116057           unsigned char c = *p;
1644             /* Bucket the trigger set with a small bitmap-style check.
1645             * The compiler turns this into a branchless OR-chain. */
1646 116057 100         if (c == '*' || c == '_' || c == '~' || c == '`' ||
    100          
    100          
    100          
    100          
1647 114895 50         c == '[' || c == ']' || c == '!' || c == '<' ||
    100          
    100          
    100          
1648 114607 100         c == '&' || c == '\\' || c == '\n') break;
    100          
1649             }
1650 3164 100         if (MDS_LIKELY(p == end)) {
1651             /* Also bail out on trailing spaces, which CommonMark would
1652             * otherwise turn into a hard-break candidate. Table cells
1653             * never have them (the splitter trims) and most paragraph
1654             * runs don't either. */
1655 1458 100         if (n < 2 || !(s[n-1] == ' ' && s[n-2] == ' ')) {
    50          
    0          
1656 1458 50         if (ctx->cb.text) ctx->cb.text(ctx->ud, s, n);
1657 1458           return;
1658             }
1659             }
1660             }
1661              
1662 1706           byteclass_init();
1663 1706           memset(&z, 0, sizeof z);
1664 1706           z.ctx = ctx;
1665 1706           z.s = s;
1666 1706           z.n = n;
1667              
1668 1706           scan_forward(&z);
1669 1706           process_emphasis(&z, NULL);
1670 1706           emit_children(&z, z.head);
1671             }
1672