| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* mds_inline.c — scalar CommonMark §6 inline tokenizer. |
|
2
|
|
|
|
|
|
|
* |
|
3
|
|
|
|
|
|
|
* Algorithm follows the cmark reference implementation and the |
|
4
|
|
|
|
|
|
|
* CommonMark spec appendix ("An algorithm for parsing nested emphasis |
|
5
|
|
|
|
|
|
|
* and links"). |
|
6
|
|
|
|
|
|
|
* |
|
7
|
|
|
|
|
|
|
* Single forward pass builds a doubly-linked list of nodes: |
|
8
|
|
|
|
|
|
|
* TEXT, CODE, AUTOLINK, HTMLINLINE, SOFTBREAK, LINEBREAK |
|
9
|
|
|
|
|
|
|
* DELIM (* / _ runs) |
|
10
|
|
|
|
|
|
|
* OPEN_BRACKET ([), OPEN_BANG_BRACKET (![) |
|
11
|
|
|
|
|
|
|
* |
|
12
|
|
|
|
|
|
|
* Then process_emphasis() folds DELIM nodes into EMPH/STRONG using the |
|
13
|
|
|
|
|
|
|
* delimiter-run stack algorithm. process_links_and_images() is folded |
|
14
|
|
|
|
|
|
|
* inline during the forward pass at ']' time (cmark does it that way too). |
|
15
|
|
|
|
|
|
|
* |
|
16
|
|
|
|
|
|
|
* Finally emit() walks the linked list and dispatches SAX events. |
|
17
|
|
|
|
|
|
|
*/ |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
#include "mds_inline.h" |
|
20
|
|
|
|
|
|
|
#include "mds_ir.h" |
|
21
|
|
|
|
|
|
|
#include "mds_linkref.h" |
|
22
|
|
|
|
|
|
|
#include "mds_footnote.h" |
|
23
|
|
|
|
|
|
|
#include "mds_entity.h" |
|
24
|
|
|
|
|
|
|
#include "mds_arena.h" |
|
25
|
|
|
|
|
|
|
#include "mds.h" |
|
26
|
|
|
|
|
|
|
#if defined(__ARM_NEON) || defined(__aarch64__) |
|
27
|
|
|
|
|
|
|
# include |
|
28
|
|
|
|
|
|
|
# define MDS_INLINE_HAVE_NEON 1 |
|
29
|
|
|
|
|
|
|
#endif |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
#include |
|
32
|
|
|
|
|
|
|
#include |
|
33
|
|
|
|
|
|
|
#include |
|
34
|
|
|
|
|
|
|
#include |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
/* ---------------- byte class table ---------------- */ |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
/* Bit flags stored per byte value in g_byteclass[]. */
enum {
    BC_PUNCT = 0x01,  /* ASCII punctuation per CommonMark §2.1 */
    BC_WS    = 0x02,  /* ASCII whitespace: space tab \n \v \f \r */
    BC_ALNUM = 0x04   /* ASCII letter or digit */
};

/* One BC_* flag byte per possible input byte; filled by byteclass_init(). */
static unsigned char g_byteclass[256];
/* Becomes 1 once g_byteclass[] has been populated. */
static int g_byteclass_inited = 0;

/* Fill g_byteclass[] the first time it is called; every later call returns
 * immediately. The guard is a plain int with no synchronisation. */
static void byteclass_init(void) {
    int b;

    if (g_byteclass_inited) return;

    for (b = 0; b < 256; b++) {
        const int is_digit = (b >= '0' && b <= '9');
        const int is_upper = (b >= 'A' && b <= 'Z');
        const int is_lower = (b >= 'a' && b <= 'z');
        /* CommonMark "ASCII punctuation": !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
         * expressed as the four contiguous code ranges 33-47, 58-64,
         * 91-96 and 123-126. */
        const int is_punct = (b >= 33 && b <= 47) || (b >= 58 && b <= 64) ||
                             (b >= 91 && b <= 96) || (b >= 123 && b <= 126);
        unsigned flags = 0;

        switch (b) {
        case ' ': case '\t': case '\n': case '\v': case '\f': case '\r':
            flags |= BC_WS;
            break;
        default:
            break;
        }
        if (is_digit || is_upper || is_lower) flags |= BC_ALNUM;
        if (is_punct) flags |= BC_PUNCT;

        g_byteclass[b] = (unsigned char)flags;
    }

    g_byteclass_inited = 1;
}
|
67
|
|
|
|
|
|
|
|
|
68
|
314
|
|
|
|
|
|
/* Returns the BC_PUNCT bit of byte c's table entry (non-zero iff c is ASCII
 * punctuation). Reads g_byteclass[], which is only filled by
 * byteclass_init(). */
static inline int is_ascii_punct(unsigned char c) {
    const unsigned flags = g_byteclass[c];
    return (int)(flags & BC_PUNCT);
}
|
69
|
848
|
|
|
|
|
|
/* Returns the BC_WS bit of byte c's table entry. Despite the name this is a
 * single-byte lookup: only the ASCII whitespace bytes flagged by
 * byteclass_init() (space, tab, \n, \v, \f, \r) yield non-zero. */
static inline int is_unicode_ws(unsigned char c) {
    const unsigned flags = g_byteclass[c];
    return (int)(flags & BC_WS);
}
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
/* Decode a single UTF-8 codepoint at s[i] (i < n). Returns codepoint; |
|
72
|
|
|
|
|
|
|
* sets *adv to bytes consumed. Lenient on invalid bytes. */ |
|
73
|
54
|
|
|
|
|
|
/* Decode the UTF-8 sequence that starts at s[i] (caller guarantees i < n).
 *
 *   s, n  input buffer and its length in bytes
 *   i     offset of the lead byte
 *   adv   out: number of bytes consumed (1..4)
 *
 * Returns the decoded code point. Decoding is lenient: when the lead byte is
 * not a valid 2/3/4-byte lead, the sequence is truncated by the end of the
 * buffer, or a continuation byte is malformed, the lead byte itself is
 * returned and *adv is 1. Overlong forms and surrogates are not rejected. */
static unsigned mds_utf8_decode(const char* s, size_t n, size_t i, int* adv) {
    const unsigned lead = (unsigned char)s[i];
    unsigned cp = 0;
    int want = 0;   /* total sequence length implied by the lead byte */
    int k;

    if (lead < 0x80) {
        *adv = 1;
        return lead;
    }

    if ((lead & 0xE0) == 0xC0) {
        want = 2; cp = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        want = 3; cp = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        want = 4; cp = lead & 0x07;
    }

    /* Every continuation byte must lie inside the buffer and look like
     * 10xxxxxx; each contributes six payload bits. */
    if (want != 0 && i + (size_t)(want - 1) < n) {
        for (k = 1; k < want; k++) {
            const unsigned b = (unsigned char)s[i + (size_t)k];
            if ((b & 0xC0) != 0x80) break;
            cp = (cp << 6) | (b & 0x3F);
        }
        if (k == want) {
            *adv = want;
            return cp;
        }
    }

    /* Fallback: hand the raw byte back and step over it. */
    *adv = 1;
    return lead;
}
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
/* Decode the codepoint ending just before s[pos]; pos > 0 required. |
|
104
|
|
|
|
|
|
|
* Walks backward over continuation bytes (max 3). Returns codepoint; |
|
105
|
|
|
|
|
|
|
* sets *cp_start to start offset. */ |
|
106
|
27
|
|
|
|
|
|
/* Decode the code point that ends immediately before s[pos] (pos > 0).
 *
 * Steps back from pos-1 over at most three continuation bytes (10xxxxxx),
 * never moving below offset 0, stores the offset it stopped at in *cp_start
 * and decodes forward from there with mds_utf8_decode(). Returns whatever
 * that forward decode yields. */
static unsigned mds_utf8_decode_prev(const char* s, size_t n, size_t pos,
                                     size_t* cp_start) {
    size_t start = pos - 1;
    int steps;
    int consumed;   /* required by mds_utf8_decode(); value not used here */

    for (steps = 0; start > 0 && steps < 3; steps++) {
        if (((unsigned char)s[start] & 0xC0) != 0x80) break;
        start--;
    }

    *cp_start = start;
    return mds_utf8_decode(s, n, start, &consumed);
}
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
/* Is codepoint Unicode whitespace per CommonMark spec (General_Category |
|
120
|
|
|
|
|
|
|
* Zs, plus tab/CR/LF/FF). */ |
|
121
|
3036
|
|
|
|
|
|
/* Returns 1 if cp is treated as whitespace for flanking purposes, else 0.
 * Accepted: U+0009..U+000D, U+0020, U+00A0 (NBSP), U+1680,
 * U+2000..U+200A, U+2028, U+2029, U+202F, U+205F and U+3000. */
static int cp_is_ws(unsigned cp) {
    switch (cp) {
    case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D:
    case 0x20:
    case 0xA0:      /* NBSP */
    case 0x1680:
    case 0x2028: case 0x2029:
    case 0x202F: case 0x205F:
    case 0x3000:
        return 1;
    default:
        break;
    }
    /* U+2000..U+200A: the run of fixed-width spaces. */
    return (cp >= 0x2000 && cp <= 0x200A) ? 1 : 0;
}
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
/* Is codepoint Unicode punctuation per CommonMark 0.31 spec |
|
134
|
|
|
|
|
|
|
* (General_Category P* or S*). For non-ASCII we approximate with the |
|
135
|
|
|
|
|
|
|
* ranges most likely to appear in spec examples: Latin-1 punctuation |
|
136
|
|
|
|
|
|
|
* and symbols (¡¢£¤¥¦§¨©ª«¬®¯°±²³´¶·¸¹º»¼½¾¿×÷), General Punctuation |
|
137
|
|
|
|
|
|
|
* (U+2000-206F), Currency Symbols (U+20A0-U+20CF), Letterlike (some), |
|
138
|
|
|
|
|
|
|
* Arrows (U+2190-U+21FF), Mathematical (U+2200-U+22FF), Misc Tech |
|
139
|
|
|
|
|
|
|
* (U+2300-U+23FF), Box Drawing/Block (U+2500-U+259F), Geometric |
|
140
|
|
|
|
|
|
|
* (U+25A0-U+25FF), Misc Symbols (U+2600-U+26FF), Dingbats (U+2700-U+27BF), |
|
141
|
|
|
|
|
|
|
* CJK Symbols (U+3000-U+303F), Halfwidth (U+FF00-U+FFEF symbols subset). */ |
|
142
|
3036
|
|
|
|
|
|
static int cp_is_punct(unsigned cp) { |
|
143
|
3036
|
100
|
|
|
|
|
if (cp < 0x80) return g_byteclass[cp] & BC_PUNCT; |
|
144
|
|
|
|
|
|
|
/* Latin-1 Supplement P/S categories */ |
|
145
|
54
|
100
|
|
|
|
|
if (cp >= 0xA1 && cp <= 0xBF) return 1; |
|
|
|
100
|
|
|
|
|
|
|
146
|
48
|
50
|
|
|
|
|
if (cp == 0xD7 || cp == 0xF7) return 1; |
|
|
|
50
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
/* General Punctuation block */ |
|
148
|
48
|
100
|
|
|
|
|
if (cp >= 0x2000 && cp <= 0x206F) return 1; |
|
|
|
50
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
/* Superscripts/Subscripts (Sm subset) */ |
|
150
|
48
|
100
|
|
|
|
|
if (cp >= 0x2070 && cp <= 0x209F) return 1; |
|
|
|
50
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
/* Currency Symbols */ |
|
152
|
48
|
100
|
|
|
|
|
if (cp >= 0x20A0 && cp <= 0x20CF) return 1; |
|
|
|
50
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
/* Letterlike symbols (S subset) */ |
|
154
|
42
|
50
|
|
|
|
|
if (cp >= 0x2100 && cp <= 0x214F) return 1; |
|
|
|
0
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
/* Arrows / Math / Misc Tech / Box / Geometric / Misc / Dingbats */ |
|
156
|
42
|
50
|
|
|
|
|
if (cp >= 0x2190 && cp <= 0x27BF) return 1; |
|
|
|
0
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
/* CJK Symbols and Punctuation */ |
|
158
|
42
|
50
|
|
|
|
|
if (cp >= 0x3000 && cp <= 0x303F) return 1; |
|
|
|
0
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
/* Halfwidth / Fullwidth punctuation (subset) */ |
|
160
|
42
|
50
|
|
|
|
|
if (cp >= 0xFF00 && cp <= 0xFF0F) return 1; |
|
|
|
0
|
|
|
|
|
|
|
161
|
42
|
50
|
|
|
|
|
if (cp >= 0xFF1A && cp <= 0xFF20) return 1; |
|
|
|
0
|
|
|
|
|
|
|
162
|
42
|
50
|
|
|
|
|
if (cp >= 0xFF3B && cp <= 0xFF40) return 1; |
|
|
|
0
|
|
|
|
|
|
|
163
|
42
|
50
|
|
|
|
|
if (cp >= 0xFF5B && cp <= 0xFF65) return 1; |
|
|
|
0
|
|
|
|
|
|
|
164
|
42
|
|
|
|
|
|
return 0; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
/* ---------------- inline node ---------------- */ |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
/* Kind tag carried by every inline node. */
typedef enum {
    /* leaf nodes */
    N_TEXT,
    N_CODE,
    N_AUTOLINK,
    N_HTMLINLINE,
    N_SOFTBREAK,
    N_LINEBREAK,

    /* provisional markers */
    N_DELIM,            /* * or _ run */
    N_OPEN_BRACKET,     /* '[' */
    N_OPEN_BANG,        /* '![' */

    /* containers */
    N_EMPH,             /* after process_emphasis */
    N_STRONG,
    N_STRIKE,
    N_LINK,
    N_IMAGE,

    N_FOOTNOTE_REF      /* GFM §6.13 */
} ntype;
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
typedef struct inode { |
|
188
|
|
|
|
|
|
|
ntype type; |
|
189
|
|
|
|
|
|
|
struct inode* prev; |
|
190
|
|
|
|
|
|
|
struct inode* next; |
|
191
|
|
|
|
|
|
|
const char* s; |
|
192
|
|
|
|
|
|
|
size_t n; |
|
193
|
|
|
|
|
|
|
int is_email; |
|
194
|
|
|
|
|
|
|
unsigned char delim_char; |
|
195
|
|
|
|
|
|
|
int count; |
|
196
|
|
|
|
|
|
|
int can_open; |
|
197
|
|
|
|
|
|
|
int can_close; |
|
198
|
|
|
|
|
|
|
int active; |
|
199
|
|
|
|
|
|
|
int bracket_after_emph; |
|
200
|
|
|
|
|
|
|
struct inode* children; |
|
201
|
|
|
|
|
|
|
struct inode* children_tail; |
|
202
|
|
|
|
|
|
|
const char* href; size_t hlen; |
|
203
|
|
|
|
|
|
|
const char* title; size_t tlen; |
|
204
|
|
|
|
|
|
|
} inode; |
|
205
|
|
|
|
|
|
|
/* Bitfield packing of the flag ints was attempted but |
|
206
|
|
|
|
|
|
|
* produced no measurable speedup (commonmark-spec / synth-prose within |
|
207
|
|
|
|
|
|
|
* +/-2%, synth-tables drifted -5%) so the original layout is kept. The |
|
208
|
|
|
|
|
|
|
* 32-byte aspirational target requires splitting href/title into a side |
|
209
|
|
|
|
|
|
|
* allocation keyed off type == N_LINK|N_IMAGE; deferred until the inline |
|
210
|
|
|
|
|
|
|
* parser is rewritten around tagged unions. */ |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
/* ---------------- scanner state ---------------- */ |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
typedef struct { |
|
215
|
|
|
|
|
|
|
mds_ctx* ctx; |
|
216
|
|
|
|
|
|
|
const char* s; |
|
217
|
|
|
|
|
|
|
size_t n; |
|
218
|
|
|
|
|
|
|
size_t pos; |
|
219
|
|
|
|
|
|
|
inode* head; |
|
220
|
|
|
|
|
|
|
inode* tail; |
|
221
|
|
|
|
|
|
|
} scn; |
|
222
|
|
|
|
|
|
|
|
|
223
|
7198
|
|
|
|
|
|
/* Allocate a zero-filled node of kind t from the context arena. The node is
 * not linked into any list. The allocation result is used unchecked --
 * NOTE(review): assumes mds_arena_alloc() never returns NULL; confirm. */
static inode* node_new(scn* z, ntype t) {
    inode* fresh = (inode*)mds_arena_alloc(&z->ctx->arena, sizeof *fresh);

    memset(fresh, 0, sizeof(inode));
    fresh->type = t;
    return fresh;
}
|
229
|
|
|
|
|
|
|
|
|
230
|
6592
|
|
|
|
|
|
/* Link x at the end of the scanner's top-level node list, updating
 * head/tail as needed. */
static void append(scn* z, inode* x) {
    inode* const last = z->tail;

    x->next = NULL;
    x->prev = last;
    if (last == NULL) {
        z->head = x;        /* list was empty */
    } else {
        last->next = x;
    }
    z->tail = x;
}
|
237
|
|
|
|
|
|
|
|
|
238
|
0
|
|
|
|
|
|
/* Link x at the end of parent's child list, updating children and
 * children_tail as needed. */
static void append_to(inode* parent, inode* x) {
    inode* const last = parent->children_tail;

    x->next = NULL;
    x->prev = last;
    if (last == NULL) {
        parent->children = x;   /* first child */
    } else {
        last->next = x;
    }
    parent->children_tail = x;
}
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
/* Append literal text bytes; coalesces with previous TEXT node if possible |
|
247
|
|
|
|
|
|
|
* (only if contiguous in source). */ |
|
248
|
3311
|
|
|
|
|
|
/* Append k literal bytes starting at p as text. Nothing happens for k == 0.
 * If the current tail is a TEXT node whose bytes end exactly at p, that node
 * is simply extended by k; otherwise a new TEXT node pointing at p is
 * appended. The bytes are referenced, not copied. */
static void append_text(scn* z, const char* p, size_t k) {
    inode* last;
    inode* fresh;

    if (k == 0) return;

    last = z->tail;
    if (last != NULL && last->type == N_TEXT && last->s + last->n == p) {
        last->n += k;       /* contiguous: grow the existing node */
        return;
    }

    fresh = node_new(z, N_TEXT);
    fresh->s = p;
    fresh->n = k;
    append(z, fresh);
}
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
/* Allocate a fresh text node referring to arena-stored bytes (e.g. an |
|
262
|
|
|
|
|
|
|
* entity expansion). */ |
|
263
|
333
|
|
|
|
|
|
/* Copy k bytes from p into the context arena and append a new TEXT node that
 * refers to the copy (used for bytes that do not live in the input buffer,
 * e.g. an entity expansion). Nothing happens for k == 0. The node is never
 * merged with a preceding TEXT node. */
static void append_text_dup(scn* z, const char* p, size_t k) {
    char*  copy;
    inode* fresh;

    if (k == 0) return;

    copy = (char*)mds_arena_alloc(&z->ctx->arena, k);
    memcpy(copy, p, k);

    fresh = node_new(z, N_TEXT);
    fresh->n = k;
    fresh->s = copy;
    append(z, fresh);
}
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
/* ---------------- flanking rules (§6.4) ---------------- */ |
|
275
|
|
|
|
|
|
|
/* |
|
276
|
|
|
|
|
|
|
* preceded_by_ws / followed_by_ws : Unicode whitespace at run boundary |
|
277
|
|
|
|
|
|
|
 *   preceded_by_punct / followed_by_punct : Unicode punctuation, decided by
 *       cp_is_punct() on the decoded neighbouring codepoint (table lookup
 *       for ASCII, range-based approximation for non-ASCII).
|
280
|
|
|
|
|
|
|
* |
|
281
|
|
|
|
|
|
|
* left-flanking iff: NOT followed by Unicode-WS AND |
|
282
|
|
|
|
|
|
|
* (NOT followed by punct OR |
|
283
|
|
|
|
|
|
|
* preceded by Unicode-WS or punct) |
|
284
|
|
|
|
|
|
|
* |
|
285
|
|
|
|
|
|
|
* right-flanking iff: NOT preceded by Unicode-WS AND |
|
286
|
|
|
|
|
|
|
* (NOT preceded by punct OR |
|
287
|
|
|
|
|
|
|
* followed by Unicode-WS or punct) |
|
288
|
|
|
|
|
|
|
*/ |
|
289
|
1518
|
|
|
|
|
|
/* Compute whether the delimiter run s[pos .. pos+runlen) may open and/or
 * close emphasis.
 *
 *   s, n            input buffer and length
 *   pos, runlen     location and length of the run
 *   can_open_out    out: 1 if the run can open, else 0
 *   can_close_out   out: 1 if the run can close, else 0
 *   ch              delimiter character; '_' gets the intra-word rules,
 *                   anything else uses plain left/right flanking
 *
 * The code points on either side of the run are decoded (multi-byte aware);
 * the start and end of the buffer count as a line feed. Always returns 1. */
static int classify_run(const char* s, size_t n, size_t pos, size_t runlen,
                        int* can_open_out, int* can_close_out,
                        unsigned char ch) {
    const size_t end = pos + runlen;
    unsigned prev_cp = '\n';    /* buffer edge behaves like whitespace */
    unsigned next_cp = '\n';
    int ws_before, ws_after, punct_before, punct_after;
    int left_flanking, right_flanking;

    /* Neighbour on the left. */
    if (pos != 0) {
        const unsigned char b = (unsigned char)s[pos - 1];
        if (b >= 0x80) {
            size_t start_ignored;
            prev_cp = mds_utf8_decode_prev(s, n, pos, &start_ignored);
        } else {
            prev_cp = b;
        }
    }

    /* Neighbour on the right. */
    if (end < n) {
        const unsigned char a = (unsigned char)s[end];
        if (a >= 0x80) {
            int len_ignored;
            next_cp = mds_utf8_decode(s, n, end, &len_ignored);
        } else {
            next_cp = a;
        }
    }

    ws_before    = cp_is_ws(prev_cp);
    ws_after     = cp_is_ws(next_cp);
    punct_before = cp_is_punct(prev_cp);
    punct_after  = cp_is_punct(next_cp);

    left_flanking  = !ws_after  && (!punct_after  || ws_before || punct_before);
    right_flanking = !ws_before && (!punct_before || ws_after  || punct_after);

    if (ch != '_') {
        /* * (and ~ for strikethrough) */
        *can_open_out  = left_flanking;
        *can_close_out = right_flanking;
        return 1;
    }

    /* §6.4: '_' may not open/close inside a word unless punctuation sits on
     * the relevant side. */
    {
        const int opens  = left_flanking  && (!right_flanking || punct_before);
        const int closes = right_flanking && (!left_flanking  || punct_after);
        *can_open_out  = opens;
        *can_close_out = closes;
    }
    return 1;
}
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
/* ---------------- code span (§6.3) ---------------- */ |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
/* Try to match a code span starting at pos (first byte = '`'). |
|
351
|
|
|
|
|
|
|
* On success returns new pos past the closing fence; emits one node. |
|
352
|
|
|
|
|
|
|
* On failure returns 0 (caller consumes one backtick as text). */ |
|
353
|
378
|
|
|
|
|
|
/* Try to match a code span whose opening backtick run starts at pos.
 *
 * The opening run length is measured, then later backtick runs are visited
 * in order until one of exactly the same length is found. On a match one
 * N_CODE node is appended and the offset just past the closing run is
 * returned. If no matching run exists, 0 is returned and nothing is
 * appended.
 *
 * Content normalisation: when the content contains something other than
 * space/'\n' and both its first and last byte are space or '\n', one byte
 * is dropped from each end. Any remaining '\n' is turned into a space; that
 * requires an arena copy, otherwise the node points into the input. */
static size_t try_code_span(scn* z, size_t pos) {
    const char* const src = z->s;
    const size_t len = z->n;
    size_t fence_len = 0;
    size_t body_begin;
    size_t cursor;

    while (pos + fence_len < len && src[pos + fence_len] == '`') fence_len++;
    body_begin = pos + fence_len;

    for (cursor = body_begin; cursor < len; ) {
        const char* tick = (const char*)memchr(src + cursor, '`', len - cursor);
        size_t run_begin;
        size_t run_end;

        if (tick == NULL) return 0;

        /* measure this backtick run */
        run_begin = (size_t)(tick - src);
        run_end = run_begin + 1;
        while (run_end < len && src[run_end] == '`') run_end++;

        if (run_end - run_begin != fence_len) {
            cursor = run_end;       /* wrong length: keep looking */
            continue;
        }

        /* closing fence found */
        {
            size_t lo = body_begin;
            size_t hi = run_begin;
            size_t k;
            int all_blank = 1;
            inode* node;

            for (k = lo; k < hi && all_blank; k++) {
                if (src[k] != ' ' && src[k] != '\n') all_blank = 0;
            }

            if (!all_blank && hi - lo >= 2) {
                const int lead_blank  = (src[lo] == ' ' || src[lo] == '\n');
                const int trail_blank = (src[hi - 1] == ' ' || src[hi - 1] == '\n');
                if (lead_blank && trail_blank) {
                    lo++;
                    hi--;
                }
            }

            node = node_new(z, N_CODE);
            node->n = hi - lo;
            if (memchr(src + lo, '\n', hi - lo) == NULL) {
                node->s = src + lo;
            } else {
                /* line endings become spaces, so a private copy is needed */
                char* copy = (char*)mds_arena_alloc(&z->ctx->arena, hi - lo);
                for (k = lo; k < hi; k++) {
                    const char c = src[k];
                    copy[k - lo] = (c == '\n') ? ' ' : c;
                }
                node->s = copy;
            }
            append(z, node);
            return run_end;
        }
    }
    return 0;
}
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
/* ---------------- entity (§6.2) ---------------- */ |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
/* Try to decode entity starting at pos (s[pos] == '&'). |
|
415
|
|
|
|
|
|
|
* Returns chars consumed (including & and ;) on success, 0 otherwise. */ |
|
416
|
112
|
|
|
|
|
|
/* Try to decode a character reference starting at pos (s[pos] == '&').
 *
 * Numeric form: "&#" + 1..7 decimal digits + ';', or "&#x"/"&#X" +
 * 1..6 hex digits + ';'. A value of 0, above 0x10FFFF, or inside the
 * surrogate range 0xD800..0xDFFF is replaced by U+FFFD. The code point is
 * encoded as UTF-8 and appended as text.
 * Named form: '&' + one or more alphanumerics + ';', resolved through
 * mds_entity_lookup(); its UTF-8 expansion is appended as text.
 *
 * Returns the number of bytes consumed (including '&' and ';') on success,
 * 0 when nothing matched (nothing is appended in that case). */
static size_t try_entity(scn* z, size_t pos) {
    const char* const src = z->s;
    const size_t len = z->n;
    size_t cur;

    if (pos + 1 >= len) return 0;
    cur = pos + 1;

    if (src[cur] != '#') {
        /* named entity */
        const size_t name_at = cur;
        mds_entity scratch;
        const mds_entity* hit;

        while (cur < len && isalnum((unsigned char)src[cur])) cur++;
        if (cur == name_at) return 0;
        if (cur >= len || src[cur] != ';') return 0;

        hit = mds_entity_lookup(src + name_at, cur - name_at, &scratch);
        if (hit == NULL) return 0;

        cur++;      /* step over ';' */
        append_text_dup(z, hit->utf8, hit->ulen);
        return cur - pos;
    }

    /* numeric reference */
    {
        unsigned long value = 0;
        size_t ndigits = 0;
        char out[5];
        size_t outlen;

        cur++;      /* step over '#' */
        if (cur < len && (src[cur] == 'x' || src[cur] == 'X')) {
            for (cur++;
                 cur < len && ndigits < 6 && isxdigit((unsigned char)src[cur]);
                 cur++, ndigits++) {
                const char d = src[cur];
                int nibble;
                if (d <= '9')      nibble = d - '0';
                else if (d <= 'F') nibble = d - 'A' + 10;
                else               nibble = d - 'a' + 10;
                value = value * 16 + (unsigned long)nibble;
            }
        } else {
            for (;
                 cur < len && ndigits < 7 && src[cur] >= '0' && src[cur] <= '9';
                 cur++, ndigits++) {
                value = value * 10 + (unsigned long)(src[cur] - '0');
            }
        }

        if (ndigits == 0) return 0;
        if (cur >= len || src[cur] != ';') return 0;
        cur++;      /* step over ';' */

        /* Encode codepoint as UTF-8. NUL → U+FFFD. */
        if (value == 0 || value > 0x10FFFF ||
            (value >= 0xD800 && value <= 0xDFFF)) {
            value = 0xFFFD;
        }

        if (value < 0x80) {
            out[0] = (char)value;
            outlen = 1;
        } else if (value < 0x800) {
            out[0] = (char)(0xC0 | (value >> 6));
            out[1] = (char)(0x80 | (value & 0x3F));
            outlen = 2;
        } else if (value < 0x10000) {
            out[0] = (char)(0xE0 | (value >> 12));
            out[1] = (char)(0x80 | ((value >> 6) & 0x3F));
            out[2] = (char)(0x80 | (value & 0x3F));
            outlen = 3;
        } else {
            out[0] = (char)(0xF0 | (value >> 18));
            out[1] = (char)(0x80 | ((value >> 12) & 0x3F));
            out[2] = (char)(0x80 | ((value >> 6) & 0x3F));
            out[3] = (char)(0x80 | (value & 0x3F));
            outlen = 4;
        }

        append_text_dup(z, out, outlen);
        return cur - pos;
    }
}
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
/* ---------------- autolink (§6.7) ---------------- */ |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
/* Returns total chars (including <>) on success, 0 otherwise. */ |
|
481
|
279
|
|
|
|
|
|
/* Try to match an autolink starting at pos (s[pos] == '<').
 *
 * URI form:    '<' scheme ':' body '>' where scheme is a letter followed by
 *              1..31 of [A-Za-z0-9+.-] and body contains no '<', '>',
 *              ASCII whitespace or byte below 0x20.
 * E-mail form: '<' local '@' domain '>' where local is one or more of
 *              [A-Za-z0-9.!#$%&'*+/=?^_`{|}~-] and domain is one or more
 *              '.'-separated labels of 1..63 alphanumerics/hyphens, each
 *              starting and ending with an alphanumeric.
 *
 * On success one N_AUTOLINK node covering the text between the angle
 * brackets is appended (is_email tells the two forms apart) and the total
 * number of bytes consumed, brackets included, is returned. Returns 0 and
 * appends nothing otherwise. If the URI form fails at any point the e-mail
 * form is attempted from the same position. */
static size_t try_autolink(scn* z, size_t pos) {
    static const char local_extra[] = ".!#$%&'*+/=?^_`{|}~-";
    const char* s = z->s;
    size_t n = z->n;
    const size_t inner = pos + 1;   /* first byte after '<' */
    size_t q;
    size_t r;
    size_t scheme_len;
    int label_len = 0;              /* bytes in the current domain label */
    int prev_hyphen = 0;            /* last domain byte seen was '-' */
    inode* x;

    if (pos >= n || s[pos] != '<') return 0;

    /* URI autolink: scheme = [A-Za-z][A-Za-z0-9+.-]{1,31}: */
    q = inner;
    if (q < n && isalpha((unsigned char)s[q])) {
        q++;
        while (q < n && (isalnum((unsigned char)s[q]) || s[q] == '+' ||
                         s[q] == '.' || s[q] == '-'))
            q++;
        scheme_len = q - inner;
        if (scheme_len >= 2 && scheme_len <= 32 && q < n && s[q] == ':') {
            /* body: any non-WS, non-< non-> */
            r = q + 1;
            while (r < n && s[r] != '>' && s[r] != '<' &&
                   !is_unicode_ws((unsigned char)s[r]) &&
                   (unsigned char)s[r] >= 0x20)
                r++;
            if (r < n && s[r] == '>') {
                x = node_new(z, N_AUTOLINK);
                x->s = s + inner;
                x->n = r - inner;
                x->is_email = 0;
                append(z, x);
                return r - pos + 1;
            }
        }
    }

    /* E-mail autolink: local part. strchr() also matches the terminating
     * NUL of its search string, so a NUL input byte is excluded explicitly;
     * it is not a legal local-part character. */
    q = inner;
    while (q < n) {
        const unsigned char c = (unsigned char)s[q];
        if (!isalnum(c) && (c == '\0' || strchr(local_extra, c) == NULL))
            break;
        q++;
    }
    if (q == inner || q >= n || s[q] != '@') return 0;
    q++;

    /* Domain: labels separated by '.', each 1..63 bytes of alnum/'-' that
     * neither start nor end with '-'. */
    for (; q < n && s[q] != '>'; q++) {
        const char c = s[q];
        if (isalnum((unsigned char)c)) {
            label_len++;
            prev_hyphen = 0;
        } else if (c == '-') {
            if (label_len == 0) return 0;       /* label may not start with '-' */
            label_len++;
            prev_hyphen = 1;
        } else if (c == '.') {
            if (label_len == 0 || prev_hyphen) return 0;    /* empty or "-." */
            label_len = 0;
        } else {
            return 0;
        }
        if (label_len > 63) return 0;
    }
    /* Must stop on '>' with a non-empty final label that does not end in '-'. */
    if (q >= n || s[q] != '>' || label_len == 0 || prev_hyphen) return 0;

    x = node_new(z, N_AUTOLINK);
    x->s = s + inner;
    x->n = q - inner;
    x->is_email = 1;
    append(z, x);
    return q - pos + 1;
}
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
/* ---------------- raw HTML inline (§6.8) ---------------- */ |
|
545
|
|
|
|
|
|
|
|
|
546
|
402
|
|
|
|
|
|
/* Is c allowed in an HTML attribute name?
 *   first != 0 : position 0 -- ASCII letter, '_' or ':'
 *   first == 0 : later position -- letter or digit, '_', ':', '.' or '-'
 * Returns 1 if allowed, 0 otherwise. */
static int html_attr_name_char(char c, int first) {
    const unsigned char uc = (unsigned char)c;

    /* '_' and ':' are legal at every position. */
    if (c == '_' || c == ':') return 1;

    if (first) return isalpha(uc) != 0;

    if (c == '.' || c == '-') return 1;
    return isalnum(uc) != 0;
}
|
550
|
|
|
|
|
|
|
|
|
551
|
220
|
|
|
|
|
|
/* Append an N_HTMLINLINE node covering source bytes [pos, end) to the node
 * list of z and return the number of bytes it spans. */
static size_t html_inline_emit_span(scn* z, size_t pos, size_t end) {
    inode* x = node_new(z, N_HTMLINLINE);
    x->s = z->s + pos;
    x->n = end - pos;
    append(z, x);
    return end - pos;
}

/* Return the index of the first byte at or after q (bounded by n) that is
 * not a space, tab or newline. */
static size_t html_inline_skip_ws(const char* s, size_t q, size_t n) {
    while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++;
    return q;
}

/* Try to recognise a raw inline HTML construct (§6.8) starting at the '<'
 * at z->s[pos].
 *
 * Recognised forms, tested in this order: comment, processing instruction,
 * CDATA section, declaration, closing tag, open tag with attributes.
 *
 * On success an N_HTMLINLINE node spanning the whole construct is appended
 * to z and the number of bytes consumed is returned.  Returns 0 (and
 * appends nothing) when the bytes at pos do not form raw HTML. */
static size_t try_html_inline(scn* z, size_t pos) {
    const char* s = z->s;
    size_t n = z->n;
    size_t q;
    int closing = 0;

    if (pos >= n || s[pos] != '<') return 0;
    if (pos + 1 >= n) return 0;
    q = pos + 1; /* q < n from here on until it is advanced */

    /* comment (CommonMark 0.30+):
     *   <!-->  |  <!--->  |  <!-- ... --> */
    if (q + 2 < n && s[q] == '!' && s[q + 1] == '-' && s[q + 2] == '-') {
        size_t r = q + 3;
        /* short forms */
        if (r < n && s[r] == '>')
            return html_inline_emit_span(z, pos, r + 1);
        if (r + 1 < n && s[r] == '-' && s[r + 1] == '>')
            return html_inline_emit_span(z, pos, r + 2);
        /* general form: scan for "-->" with no constraint on inner '--'. */
        for (; r + 2 < n; r++) {
            if (s[r] == '-' && s[r + 1] == '-' && s[r + 2] == '>')
                return html_inline_emit_span(z, pos, r + 3);
        }
        return 0;
    }

    /* processing instruction: <? ... ?> */
    if (s[q] == '?') {
        for (q++; q + 1 < n; q++) {
            if (s[q] == '?' && s[q + 1] == '>')
                return html_inline_emit_span(z, pos, q + 2);
        }
        return 0;
    }

    /* CDATA section: <![CDATA[ ... ]]> */
    if (q + 7 < n && memcmp(s + q, "![CDATA[", 8) == 0) {
        for (q += 8; q + 2 < n; q++) {
            if (s[q] == ']' && s[q + 1] == ']' && s[q + 2] == '>')
                return html_inline_emit_span(z, pos, q + 3);
        }
        return 0;
    }

    /* declaration: <! letter ... > */
    if (s[q] == '!' && q + 1 < n && isalpha((unsigned char)s[q + 1])) {
        q += 2;
        while (q < n && s[q] != '>') q++;
        if (q >= n) return 0;
        return html_inline_emit_span(z, pos, q + 1);
    }

    /* optional '/' marks a closing tag */
    if (s[q] == '/') { closing = 1; q++; }

    /* tag name: a letter followed by letters, digits or '-' */
    if (q >= n || !isalpha((unsigned char)s[q])) return 0;
    q++;
    while (q < n && (isalnum((unsigned char)s[q]) || s[q] == '-')) q++;

    if (closing) {
        /* closing tag: only whitespace may precede '>' */
        q = html_inline_skip_ws(s, q, n);
        if (q >= n || s[q] != '>') return 0;
        return html_inline_emit_span(z, pos, q + 1);
    }

    /* attributes */
    while (q < n) {
        size_t pre_attr = q;
        size_t name_end;

        q = html_inline_skip_ws(s, q, n);
        if (q >= n) return 0;
        if (s[q] == '>' || s[q] == '/') break;
        /* every attribute must be separated from what precedes it */
        if (q == pre_attr) return 0;
        if (!html_attr_name_char(s[q], 1)) { q = pre_attr; break; }
        q++;
        while (q < n && html_attr_name_char(s[q], 0)) q++;

        /* optional value */
        name_end = q;
        q = html_inline_skip_ws(s, q, n);
        if (q < n && s[q] == '=') {
            q = html_inline_skip_ws(s, q + 1, n);
            if (q < n && (s[q] == '"' || s[q] == '\'')) {
                /* quoted: runs to the matching quote character */
                char qc = s[q++];
                while (q < n && s[q] != qc) q++;
                if (q >= n) return 0;
                q++;
            } else {
                /* unquoted: must be non-empty */
                size_t vs = q;
                while (q < n && s[q] != ' ' && s[q] != '\t' && s[q] != '\n' &&
                       s[q] != '"' && s[q] != '\'' && s[q] != '=' &&
                       s[q] != '<' && s[q] != '>' && s[q] != '`')
                    q++;
                if (q == vs) return 0;
            }
        } else {
            /* no value: give back the whitespace so that it can serve as
             * the separator before the next attribute */
            q = name_end;
        }
    }

    /* end of open tag: optional whitespace, optional '/', then '>' */
    q = html_inline_skip_ws(s, q, n);
    if (q < n && s[q] == '/') q++;
    if (q >= n || s[q] != '>') return 0;
    return html_inline_emit_span(z, pos, q + 1);
}
|
694
|
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
/* ---------------- link parsing helpers ---------------- */ |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
/* Parse a CommonMark link destination starting at s[*pp].
 *
 *   s, n    source buffer and its length
 *   pp      in: offset of the first destination byte; out (on success):
 *           offset just past the destination (past the closing '>' for the
 *           <...> form)
 *   out_s   on success, pointer to the first destination byte
 *   out_n   on success, number of destination bytes
 *
 * The reported bytes still live in the source buffer, with any backslash
 * escapes intact, and exclude the surrounding '<' '>' of the pointy form.
 *
 * Returns 1 on success.  Returns 0, leaving *pp and the outputs untouched,
 * when no destination can be parsed. */
static int parse_link_destination(const char* s, size_t* pp, size_t n,
                                  const char** out_s, size_t* out_n) {
    size_t p = *pp;
    size_t ds;
    int paren = 0;

    if (p >= n) return 0;

    if (s[p] == '<') {
        /* pointy form: <...> with no line ending and no unescaped '<'/'>' */
        ds = ++p;
        while (p < n && s[p] != '>' && s[p] != '<' && s[p] != '\n') {
            /* A backslash only escapes ASCII punctuation.  Before any other
             * byte it is literal, so a following newline is still seen by
             * the loop condition and ends the scan. */
            if (s[p] == '\\' && p + 1 < n &&
                is_ascii_punct((unsigned char)s[p + 1]))
                p++;
            p++;
        }
        if (p >= n || s[p] != '>') return 0;
        *out_s = s + ds;
        *out_n = p - ds;
        *pp = p + 1;
        return 1;
    }

    /* bare form: runs up to whitespace, a control byte, or an unbalanced ')' */
    ds = p;
    while (p < n) {
        unsigned char c = (unsigned char)s[p];
        if (c < 0x20 || c == 0x7f) break;
        if (c == ' ' || c == '\t' || c == '\n') break;
        if (c == '\\' && p + 1 < n && is_ascii_punct((unsigned char)s[p + 1])) {
            p += 2;
            continue;
        }
        if (c == '(') {
            paren++;
        } else if (c == ')') {
            if (paren == 0) break; /* belongs to the enclosing construct */
            paren--;
        }
        p++;
    }
    /* must be non-empty with balanced parentheses */
    if (p == ds || paren != 0) return 0;
    *out_s = s + ds;
    *out_n = p - ds;
    *pp = p;
    return 1;
}
|
736
|
|
|
|
|
|
|
|
|
737
|
41
|
|
|
|
|
|
/* Parse a link title starting at s[*pp].
 *
 * A title is delimited by "...", '...' or (...).  Backslash escapes of
 * ASCII punctuation are skipped, an unescaped '(' inside a parenthesised
 * title is rejected, and a title may not span a blank line.
 *
 * On success returns 1, stores the bytes between the delimiters in
 * *out_s / *out_n (still pointing into the source buffer) and advances
 * *pp past the closing delimiter.  Returns 0 and leaves everything
 * untouched otherwise. */
static int parse_link_title(const char* s, size_t* pp, size_t n,
                            const char** out_s, size_t* out_n) {
    size_t cur = *pp;
    size_t begin;
    char opener, closer;

    if (cur >= n) return 0;

    opener = s[cur];
    switch (opener) {
    case '"':
    case '\'':
        closer = opener;
        break;
    case '(':
        closer = ')';
        break;
    default:
        return 0;
    }

    begin = ++cur;
    while (cur < n && s[cur] != closer) {
        char ch = s[cur];

        if (ch == '\\' && cur + 1 < n &&
            is_ascii_punct((unsigned char)s[cur + 1])) {
            cur += 2;
            continue;
        }
        if (opener == '(' && ch == '(') return 0;
        if (ch == '\n') {
            /* a following line holding only spaces/tabs is a blank line */
            size_t look = cur + 1;
            while (look < n && (s[look] == ' ' || s[look] == '\t')) look++;
            if (look >= n || s[look] == '\n') return 0;
        }
        cur++;
    }

    /* the loop stops either at the closing delimiter or at end of input */
    if (cur >= n) return 0;

    *out_s = s + begin;
    *out_n = cur - begin;
    *pp = cur + 1;
    return 1;
}
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
/* ---------------- find matching open bracket ---------------- */ |
|
770
|
|
|
|
|
|
|
|
|
771
|
723
|
|
|
|
|
|
/* Walk the node list backwards from z->tail and return the nearest
 * N_OPEN_BRACKET or N_OPEN_BANG node, or NULL if there is none.
 * *is_image is set to 1 when the node found is N_OPEN_BANG and to 0 in
 * every other case.  The node is returned whether or not it is still
 * active; the caller checks ->active. */
static inode* find_open_bracket(scn* z, int* is_image) {
    inode* cur = z->tail;

    *is_image = 0;
    while (cur) {
        if (cur->type == N_OPEN_BANG) {
            *is_image = 1;
            return cur;
        }
        if (cur->type == N_OPEN_BRACKET)
            return cur;
        cur = cur->prev;
    }
    return NULL;
}
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
/* Disable any '[' opener nodes appearing before x (for nested-link rule). */ |
|
784
|
343
|
|
|
|
|
|
static void deactivate_brackets(scn* z, inode* x) { |
|
785
|
|
|
|
|
|
|
inode* p; |
|
786
|
603
|
100
|
|
|
|
|
for (p = x->prev; p; p = p->prev) { |
|
787
|
260
|
100
|
|
|
|
|
if (p->type == N_OPEN_BRACKET) p->active = 0; |
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
(void)z; |
|
790
|
343
|
|
|
|
|
|
} |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
/* Move nodes (open_bracket->next .. end) into a new container of type t, |
|
793
|
|
|
|
|
|
|
* which replaces the open_bracket and everything after. Returns the |
|
794
|
|
|
|
|
|
|
* container. */ |
|
795
|
422
|
|
|
|
|
|
static inode* wrap_after(scn* z, inode* open, ntype t) { |
|
796
|
422
|
|
|
|
|
|
inode* c = node_new(z, t); |
|
797
|
422
|
|
|
|
|
|
c->children = open->next; |
|
798
|
422
|
|
|
|
|
|
c->children_tail = z->tail; |
|
799
|
422
|
100
|
|
|
|
|
if (c->children) c->children->prev = NULL; |
|
800
|
|
|
|
|
|
|
/* sever tail link */ |
|
801
|
422
|
|
|
|
|
|
z->tail = open->prev; |
|
802
|
422
|
100
|
|
|
|
|
if (z->tail) z->tail->next = NULL; |
|
803
|
295
|
|
|
|
|
|
else z->head = NULL; |
|
804
|
|
|
|
|
|
|
/* remove the open bracket itself */ |
|
805
|
|
|
|
|
|
|
/* (it's now floating; we won't re-link it) */ |
|
806
|
422
|
|
|
|
|
|
append(z, c); |
|
807
|
422
|
|
|
|
|
|
return c; |
|
808
|
|
|
|
|
|
|
} |
|
809
|
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
/* ---------------- process_emphasis (§6.4) ---------------- */ |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
/* The CommonMark algorithm operates on the delimiter stack. We use the |
|
813
|
|
|
|
|
|
|
* doubly-linked node list directly; DELIM nodes ARE the stack entries. |
|
814
|
|
|
|
|
|
|
* |
|
815
|
|
|
|
|
|
|
* stack_bottom: only consider delimiters strictly after this node. |
|
816
|
|
|
|
|
|
|
* NULL = whole list (or list head). |
|
817
|
|
|
|
|
|
|
*/ |
|
818
|
2128
|
|
|
|
|
|
static void process_emphasis(scn* z, inode* stack_bottom) { |
|
819
|
|
|
|
|
|
|
/* openers_bottom[delim_idx][closer_count_mod3][can_open(0|1)] */ |
|
820
|
|
|
|
|
|
|
inode* openers_bottom[3][3][2]; |
|
821
|
|
|
|
|
|
|
inode* closer; |
|
822
|
|
|
|
|
|
|
int a, b, c; |
|
823
|
|
|
|
|
|
|
int use2; |
|
824
|
|
|
|
|
|
|
ntype tt; |
|
825
|
|
|
|
|
|
|
unsigned _ifl; |
|
826
|
|
|
|
|
|
|
inode* container; |
|
827
|
|
|
|
|
|
|
inode* first; |
|
828
|
|
|
|
|
|
|
inode* last; |
|
829
|
|
|
|
|
|
|
inode* before; |
|
830
|
|
|
|
|
|
|
inode* after; |
|
831
|
|
|
|
|
|
|
inode* new_open; |
|
832
|
|
|
|
|
|
|
inode* new_close; |
|
833
|
|
|
|
|
|
|
inode* prev_link; |
|
834
|
|
|
|
|
|
|
inode* start; |
|
835
|
|
|
|
|
|
|
inode* p; |
|
836
|
8512
|
100
|
|
|
|
|
for (a = 0; a < 3; a++) |
|
837
|
25536
|
100
|
|
|
|
|
for (b = 0; b < 3; b++) |
|
838
|
57456
|
100
|
|
|
|
|
for (c = 0; c < 2; c++) |
|
839
|
38304
|
|
|
|
|
|
openers_bottom[a][b][c] = stack_bottom; |
|
840
|
|
|
|
|
|
|
|
|
841
|
2128
|
100
|
|
|
|
|
closer = stack_bottom ? stack_bottom->next : z->head; |
|
842
|
|
|
|
|
|
|
/* find first potential closer */ |
|
843
|
6943
|
100
|
|
|
|
|
while (closer) { |
|
844
|
5333
|
100
|
|
|
|
|
if (closer->type == N_DELIM && closer->can_close && |
|
|
|
100
|
|
|
|
|
|
|
845
|
518
|
100
|
|
|
|
|
(closer->delim_char == '*' || closer->delim_char == '_' || |
|
|
|
100
|
|
|
|
|
|
|
846
|
26
|
50
|
|
|
|
|
closer->delim_char == '~')) |
|
847
|
|
|
|
|
|
|
break; |
|
848
|
4815
|
|
|
|
|
|
closer = closer->next; |
|
849
|
|
|
|
|
|
|
} |
|
850
|
2904
|
100
|
|
|
|
|
while (closer) { |
|
851
|
776
|
|
|
|
|
|
unsigned char ch = closer->delim_char; |
|
852
|
776
|
100
|
|
|
|
|
int didx = (ch == '*') ? 0 : (ch == '_') ? 1 : 2; |
|
|
|
100
|
|
|
|
|
|
|
853
|
776
|
|
|
|
|
|
int co_mod = closer->count % 3; |
|
854
|
776
|
|
|
|
|
|
int co_op = closer->can_open ? 1 : 0; |
|
855
|
776
|
|
|
|
|
|
inode* bot = openers_bottom[didx][co_mod][co_op]; |
|
856
|
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
/* walk back for matching opener */ |
|
858
|
776
|
|
|
|
|
|
inode* opener = closer->prev; |
|
859
|
776
|
|
|
|
|
|
int found = 0; |
|
860
|
1951
|
100
|
|
|
|
|
while (opener && opener != bot && opener != stack_bottom) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
861
|
1786
|
100
|
|
|
|
|
if (opener->type == N_DELIM && opener->can_open && |
|
|
|
100
|
|
|
|
|
|
|
862
|
662
|
100
|
|
|
|
|
opener->delim_char == ch) { |
|
863
|
|
|
|
|
|
|
/* rule of three */ |
|
864
|
557
|
100
|
|
|
|
|
int odd_match = (closer->can_open || opener->can_close) && |
|
865
|
1204
|
100
|
|
|
|
|
((opener->count + closer->count) % 3 == 0) && |
|
|
|
100
|
|
|
|
|
|
|
866
|
21
|
100
|
|
|
|
|
!(opener->count % 3 == 0 && closer->count % 3 == 0); |
|
|
|
50
|
|
|
|
|
|
|
867
|
626
|
100
|
|
|
|
|
if (!odd_match) { found = 1; break; } |
|
868
|
|
|
|
|
|
|
} |
|
869
|
1175
|
|
|
|
|
|
opener = opener->prev; |
|
870
|
|
|
|
|
|
|
} |
|
871
|
776
|
100
|
|
|
|
|
if (!found) { |
|
872
|
165
|
|
|
|
|
|
openers_bottom[didx][co_mod][co_op] = closer->prev; |
|
873
|
|
|
|
|
|
|
/* If the closer itself can't also open, mark it inert so it |
|
874
|
|
|
|
|
|
|
* becomes literal text in the final sweep. Either way, advance. */ |
|
875
|
165
|
100
|
|
|
|
|
if (!closer->can_open) closer->can_close = 0; |
|
876
|
165
|
|
|
|
|
|
closer = closer->next; |
|
877
|
315
|
100
|
|
|
|
|
while (closer) { |
|
878
|
210
|
100
|
|
|
|
|
if (closer->type == N_DELIM && closer->can_close && |
|
|
|
100
|
|
|
|
|
|
|
879
|
60
|
100
|
|
|
|
|
(closer->delim_char == '*' || closer->delim_char == '_' || |
|
|
|
50
|
|
|
|
|
|
|
880
|
0
|
0
|
|
|
|
|
closer->delim_char == '~')) |
|
881
|
|
|
|
|
|
|
break; |
|
882
|
150
|
|
|
|
|
|
closer = closer->next; |
|
883
|
|
|
|
|
|
|
} |
|
884
|
165
|
|
|
|
|
|
continue; |
|
885
|
|
|
|
|
|
|
} |
|
886
|
|
|
|
|
|
|
|
|
887
|
611
|
|
|
|
|
|
use2 = (ch == '~') |
|
888
|
|
|
|
|
|
|
? opener->count /* matched count for tildes */ |
|
889
|
611
|
100
|
|
|
|
|
: ((opener->count >= 2 && closer->count >= 2) ? 2 : 1); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
/* GFM strike: counts must match and be 1 or 2 (no triple+). */ |
|
892
|
611
|
100
|
|
|
|
|
if (ch == '~' && (opener->count != closer->count || |
|
|
|
100
|
|
|
|
|
|
|
893
|
26
|
100
|
|
|
|
|
(opener->count != 1 && opener->count != 2))) { |
|
|
|
50
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
/* skip — leave as text */ |
|
895
|
3
|
|
|
|
|
|
openers_bottom[didx][co_mod][co_op] = opener; |
|
896
|
3
|
|
|
|
|
|
closer = closer->next; |
|
897
|
3
|
50
|
|
|
|
|
while (closer) { |
|
898
|
0
|
0
|
|
|
|
|
if (closer->type == N_DELIM && closer->can_close && |
|
|
|
0
|
|
|
|
|
|
|
899
|
0
|
0
|
|
|
|
|
(closer->delim_char == '*' || closer->delim_char == '_' || |
|
|
|
0
|
|
|
|
|
|
|
900
|
0
|
0
|
|
|
|
|
closer->delim_char == '~')) |
|
901
|
|
|
|
|
|
|
break; |
|
902
|
0
|
|
|
|
|
|
closer = closer->next; |
|
903
|
|
|
|
|
|
|
} |
|
904
|
3
|
|
|
|
|
|
continue; |
|
905
|
|
|
|
|
|
|
} |
|
906
|
|
|
|
|
|
|
|
|
907
|
608
|
100
|
|
|
|
|
tt = (ch == '~') ? N_STRIKE : (use2 == 2 ? N_STRONG : N_EMPH); |
|
|
|
100
|
|
|
|
|
|
|
908
|
608
|
|
|
|
|
|
_ifl = z->ctx->flags; |
|
909
|
608
|
100
|
|
|
|
|
if ((tt == N_EMPH && (_ifl & MDS_FLAG_NO_EMPH)) || |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
910
|
238
|
100
|
|
|
|
|
(tt == N_STRONG && (_ifl & MDS_FLAG_NO_STRONG))) { |
|
911
|
|
|
|
|
|
|
/* Skip: leave delim run as-is, advance closer (becomes text) */ |
|
912
|
2
|
|
|
|
|
|
openers_bottom[didx][co_mod][co_op] = opener; |
|
913
|
2
|
50
|
|
|
|
|
if (!closer->can_open) closer->can_close = 0; |
|
914
|
2
|
|
|
|
|
|
closer = closer->next; |
|
915
|
2
|
50
|
|
|
|
|
while (closer) { |
|
916
|
0
|
0
|
|
|
|
|
if (closer->type == N_DELIM && closer->can_close && |
|
|
|
0
|
|
|
|
|
|
|
917
|
0
|
0
|
|
|
|
|
(closer->delim_char == '*' || closer->delim_char == '_' || |
|
|
|
0
|
|
|
|
|
|
|
918
|
0
|
0
|
|
|
|
|
closer->delim_char == '~')) |
|
919
|
|
|
|
|
|
|
break; |
|
920
|
0
|
|
|
|
|
|
closer = closer->next; |
|
921
|
|
|
|
|
|
|
} |
|
922
|
2
|
|
|
|
|
|
continue; |
|
923
|
|
|
|
|
|
|
} |
|
924
|
606
|
|
|
|
|
|
container = node_new(z, tt); |
|
925
|
|
|
|
|
|
|
/* children = (opener->next .. closer->prev) */ |
|
926
|
606
|
|
|
|
|
|
first = opener->next; |
|
927
|
606
|
|
|
|
|
|
last = closer->prev; |
|
928
|
606
|
50
|
|
|
|
|
if (first != closer) { |
|
929
|
606
|
|
|
|
|
|
container->children = first; |
|
930
|
606
|
|
|
|
|
|
container->children_tail = last; |
|
931
|
606
|
|
|
|
|
|
first->prev = NULL; |
|
932
|
606
|
|
|
|
|
|
last->next = NULL; |
|
933
|
|
|
|
|
|
|
} |
|
934
|
|
|
|
|
|
|
/* shrink/remove delimiters */ |
|
935
|
606
|
|
|
|
|
|
opener->count -= use2; |
|
936
|
606
|
|
|
|
|
|
closer->count -= use2; |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
/* relink: replace [opener? closer?] block with container */ |
|
939
|
606
|
|
|
|
|
|
before = opener->prev; |
|
940
|
606
|
|
|
|
|
|
after = closer->next; |
|
941
|
|
|
|
|
|
|
|
|
942
|
606
|
100
|
|
|
|
|
new_open = (opener->count > 0) ? opener : NULL; |
|
943
|
606
|
100
|
|
|
|
|
new_close = (closer->count > 0) ? closer : NULL; |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
/* shrink opener s/n bytes for proper future text emission? Not |
|
946
|
|
|
|
|
|
|
* needed — opener bytes are never emitted directly; only the count |
|
947
|
|
|
|
|
|
|
* matters when shrunken to >0 and treated as remaining delim. */ |
|
948
|
606
|
100
|
|
|
|
|
if (new_open) { |
|
949
|
|
|
|
|
|
|
/* truncate opener's literal length so leftover delim chars |
|
950
|
|
|
|
|
|
|
* remain rendered if not consumed later */ |
|
951
|
65
|
|
|
|
|
|
new_open->n = (size_t)new_open->count; |
|
952
|
|
|
|
|
|
|
} |
|
953
|
606
|
100
|
|
|
|
|
if (new_close) { |
|
954
|
71
|
|
|
|
|
|
new_close->n = (size_t)new_close->count; |
|
955
|
71
|
|
|
|
|
|
new_close->s = new_close->s; /* keep pointer, length adjusted */ |
|
956
|
|
|
|
|
|
|
} |
|
957
|
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
/* build list: before, new_open?, container, new_close?, after */ |
|
959
|
606
|
|
|
|
|
|
prev_link = before; |
|
960
|
606
|
100
|
|
|
|
|
if (new_open) { |
|
961
|
65
|
100
|
|
|
|
|
if (prev_link) prev_link->next = new_open; |
|
962
|
54
|
|
|
|
|
|
else z->head = new_open; |
|
963
|
65
|
|
|
|
|
|
new_open->prev = prev_link; |
|
964
|
65
|
|
|
|
|
|
prev_link = new_open; |
|
965
|
|
|
|
|
|
|
} |
|
966
|
606
|
100
|
|
|
|
|
if (prev_link) prev_link->next = container; |
|
967
|
265
|
|
|
|
|
|
else z->head = container; |
|
968
|
606
|
|
|
|
|
|
container->prev = prev_link; |
|
969
|
606
|
|
|
|
|
|
prev_link = container; |
|
970
|
606
|
100
|
|
|
|
|
if (new_close) { |
|
971
|
71
|
|
|
|
|
|
prev_link->next = new_close; |
|
972
|
71
|
|
|
|
|
|
new_close->prev = prev_link; |
|
973
|
71
|
|
|
|
|
|
prev_link = new_close; |
|
974
|
|
|
|
|
|
|
} |
|
975
|
606
|
|
|
|
|
|
prev_link->next = after; |
|
976
|
606
|
100
|
|
|
|
|
if (after) after->prev = prev_link; |
|
977
|
381
|
|
|
|
|
|
else z->tail = prev_link; |
|
978
|
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
/* continue: if closer still has count, use it as closer again; |
|
980
|
|
|
|
|
|
|
* otherwise resume from after */ |
|
981
|
606
|
100
|
|
|
|
|
if (new_close) { |
|
982
|
71
|
|
|
|
|
|
closer = new_close; |
|
983
|
|
|
|
|
|
|
} else { |
|
984
|
535
|
|
|
|
|
|
closer = after; |
|
985
|
852
|
100
|
|
|
|
|
while (closer) { |
|
986
|
444
|
100
|
|
|
|
|
if (closer->type == N_DELIM && closer->can_close && |
|
|
|
100
|
|
|
|
|
|
|
987
|
127
|
100
|
|
|
|
|
(closer->delim_char == '*' || closer->delim_char == '_' || |
|
|
|
100
|
|
|
|
|
|
|
988
|
6
|
50
|
|
|
|
|
closer->delim_char == '~')) |
|
989
|
|
|
|
|
|
|
break; |
|
990
|
317
|
|
|
|
|
|
closer = closer->next; |
|
991
|
|
|
|
|
|
|
} |
|
992
|
|
|
|
|
|
|
} |
|
993
|
|
|
|
|
|
|
} |
|
994
|
|
|
|
|
|
|
/* clear remaining DELIMs to TEXT */ |
|
995
|
2128
|
100
|
|
|
|
|
start = stack_bottom ? stack_bottom->next : z->head; |
|
996
|
6736
|
100
|
|
|
|
|
for (p = start; p; p = p->next) { |
|
997
|
4608
|
100
|
|
|
|
|
if (p->type == N_DELIM) { |
|
998
|
406
|
|
|
|
|
|
p->type = N_TEXT; |
|
999
|
|
|
|
|
|
|
} |
|
1000
|
|
|
|
|
|
|
} |
|
1001
|
2128
|
|
|
|
|
|
} |
|
1002
|
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
/* ---------------- process ']' ---------------- */ |
|
1004
|
|
|
|
|
|
|
|
|
1005
|
723
|
|
|
|
|
|
static int try_close_bracket(scn* z, size_t* pos_io) { |
|
1006
|
723
|
|
|
|
|
|
const char* s = z->s; size_t n = z->n; |
|
1007
|
723
|
|
|
|
|
|
size_t pos = *pos_io; |
|
1008
|
723
|
|
|
|
|
|
int is_image = 0; |
|
1009
|
|
|
|
|
|
|
inode* opener; |
|
1010
|
|
|
|
|
|
|
size_t p; |
|
1011
|
723
|
|
|
|
|
|
int matched = 0; |
|
1012
|
723
|
|
|
|
|
|
const char *href_s = NULL, *title_s = NULL; |
|
1013
|
723
|
|
|
|
|
|
size_t hlen = 0, tlen = 0; |
|
1014
|
723
|
|
|
|
|
|
int is_ref = 0; |
|
1015
|
723
|
|
|
|
|
|
const mds_linkref* refent = NULL; |
|
1016
|
|
|
|
|
|
|
ntype t; |
|
1017
|
|
|
|
|
|
|
inode* container; |
|
1018
|
723
|
|
|
|
|
|
opener = find_open_bracket(z, &is_image); |
|
1019
|
723
|
100
|
|
|
|
|
if (!opener) { |
|
1020
|
16
|
|
|
|
|
|
return 0; /* no opener — caller emits literal ']' */ |
|
1021
|
|
|
|
|
|
|
} |
|
1022
|
|
|
|
|
|
|
/* CommonMark "look for link or image" step 3: if the opener exists |
|
1023
|
|
|
|
|
|
|
* but is inactive, remove it from the stack (convert to literal `[`) |
|
1024
|
|
|
|
|
|
|
* and treat this `]` as literal text. Do NOT keep searching for an |
|
1025
|
|
|
|
|
|
|
* earlier active opener — the inactive opener blocks it. This is |
|
1026
|
|
|
|
|
|
|
* what makes the alt text of an image with a nested link come out |
|
1027
|
|
|
|
|
|
|
* as `[foo](uri2)` literally (CM example 520). */ |
|
1028
|
707
|
100
|
|
|
|
|
if (!opener->active) { |
|
1029
|
21
|
|
|
|
|
|
opener->type = N_TEXT; |
|
1030
|
21
|
|
|
|
|
|
opener->active = 0; |
|
1031
|
21
|
|
|
|
|
|
return 0; |
|
1032
|
|
|
|
|
|
|
} |
|
1033
|
686
|
|
|
|
|
|
p = pos + 1; |
|
1034
|
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
/* GFM footnote reference [^label] — checked first so it wins over |
|
1036
|
|
|
|
|
|
|
* inline link/ref interpretations. Requires the bracket content to |
|
1037
|
|
|
|
|
|
|
* begin with `^` and the label (everything after) to be present in |
|
1038
|
|
|
|
|
|
|
* ctx->footnotes. Unresolved [^label] falls through to normal |
|
1039
|
|
|
|
|
|
|
* processing (becomes literal text). */ |
|
1040
|
686
|
100
|
|
|
|
|
if ((z->ctx->flags & MDS_FLAG_FOOTNOTES) && z->ctx->footnotes) { |
|
|
|
100
|
|
|
|
|
|
|
1041
|
27
|
|
|
|
|
|
size_t txt_s0 = (size_t)((opener->s + opener->n) - s); |
|
1042
|
27
|
|
|
|
|
|
size_t txt_e0 = pos; |
|
1043
|
27
|
50
|
|
|
|
|
if (txt_e0 > txt_s0 && s[txt_s0] == '^') { |
|
|
|
50
|
|
|
|
|
|
|
1044
|
27
|
|
|
|
|
|
const char* lab_s = s + txt_s0 + 1; |
|
1045
|
27
|
|
|
|
|
|
size_t lab_n = txt_e0 - txt_s0 - 1; |
|
1046
|
27
|
|
|
|
|
|
const mds_footnote* fn = mds_footnote_get(z->ctx->footnotes, |
|
1047
|
|
|
|
|
|
|
lab_s, lab_n); |
|
1048
|
27
|
100
|
|
|
|
|
if (fn) { |
|
1049
|
|
|
|
|
|
|
/* Discard any children between opener and the `]` (the |
|
1050
|
|
|
|
|
|
|
* `^label` text/delim nodes); we don't render them. */ |
|
1051
|
24
|
|
|
|
|
|
opener->children = NULL; |
|
1052
|
24
|
|
|
|
|
|
opener->children_tail = NULL; |
|
1053
|
|
|
|
|
|
|
/* Drop everything after opener up to but not including pos. */ |
|
1054
|
24
|
|
|
|
|
|
opener->next = NULL; |
|
1055
|
24
|
|
|
|
|
|
z->tail = opener; |
|
1056
|
24
|
100
|
|
|
|
|
if (opener->type == N_OPEN_BANG) { |
|
1057
|
|
|
|
|
|
|
inode* fnref; |
|
1058
|
|
|
|
|
|
|
/* Salvage the literal `!` byte that the bang opener |
|
1059
|
|
|
|
|
|
|
* absorbed — emit it as a sibling TEXT node BEFORE |
|
1060
|
|
|
|
|
|
|
* the footnote ref. Without this, inputs like |
|
1061
|
|
|
|
|
|
|
* `text![^1]` lose the `!`. */ |
|
1062
|
3
|
|
|
|
|
|
opener->type = N_TEXT; |
|
1063
|
3
|
|
|
|
|
|
opener->n = 1; /* s already points at '!' */ |
|
1064
|
3
|
|
|
|
|
|
fnref = node_new(z, N_FOOTNOTE_REF); |
|
1065
|
3
|
|
|
|
|
|
fnref->href = fn->label; |
|
1066
|
3
|
|
|
|
|
|
fnref->hlen = fn->llen; |
|
1067
|
3
|
|
|
|
|
|
fnref->active = 0; |
|
1068
|
3
|
|
|
|
|
|
append(z, fnref); /* updates z->tail */ |
|
1069
|
|
|
|
|
|
|
} else { |
|
1070
|
|
|
|
|
|
|
/* Convert opener into the FOOTNOTE_REF node itself. */ |
|
1071
|
21
|
|
|
|
|
|
opener->type = N_FOOTNOTE_REF; |
|
1072
|
21
|
|
|
|
|
|
opener->href = fn->label; |
|
1073
|
21
|
|
|
|
|
|
opener->hlen = fn->llen; |
|
1074
|
21
|
|
|
|
|
|
opener->active = 0; |
|
1075
|
|
|
|
|
|
|
} |
|
1076
|
24
|
|
|
|
|
|
*pos_io = pos + 1; |
|
1077
|
24
|
|
|
|
|
|
return 1; |
|
1078
|
|
|
|
|
|
|
} |
|
1079
|
|
|
|
|
|
|
} |
|
1080
|
|
|
|
|
|
|
} |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
/* (a) inline link [text](url "title") */ |
|
1083
|
662
|
100
|
|
|
|
|
if (p < n && s[p] == '(') { |
|
|
|
100
|
|
|
|
|
|
|
1084
|
229
|
|
|
|
|
|
size_t q = p + 1; |
|
1085
|
238
|
50
|
|
|
|
|
while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1086
|
229
|
50
|
|
|
|
|
if (q < n && s[q] != ')') { |
|
|
|
100
|
|
|
|
|
|
|
1087
|
431
|
100
|
|
|
|
|
if (parse_link_destination(s, &q, n, &href_s, &hlen)) { |
|
1088
|
211
|
|
|
|
|
|
size_t after_dest = q; |
|
1089
|
270
|
100
|
|
|
|
|
while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1090
|
211
|
100
|
|
|
|
|
if (q < n && (s[q] == '"' || s[q] == '\'' || s[q] == '(')) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1091
|
41
|
50
|
|
|
|
|
if (!parse_link_title(s, &q, n, &title_s, &tlen)) { |
|
1092
|
0
|
|
|
|
|
|
q = after_dest; |
|
1093
|
0
|
|
|
|
|
|
title_s = NULL; tlen = 0; |
|
1094
|
|
|
|
|
|
|
} else { |
|
1095
|
56
|
50
|
|
|
|
|
while (q < n && (s[q] == ' ' || s[q] == '\t' || s[q] == '\n')) q++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
} |
|
1097
|
|
|
|
|
|
|
} |
|
1098
|
211
|
100
|
|
|
|
|
if (q < n && s[q] == ')') { |
|
|
|
100
|
|
|
|
|
|
|
1099
|
187
|
|
|
|
|
|
p = q + 1; |
|
1100
|
187
|
|
|
|
|
|
matched = 1; |
|
1101
|
|
|
|
|
|
|
} |
|
1102
|
|
|
|
|
|
|
} |
|
1103
|
9
|
50
|
|
|
|
|
} else if (q < n && s[q] == ')') { |
|
|
|
50
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
/* empty destination */ |
|
1105
|
9
|
|
|
|
|
|
href_s = s; hlen = 0; |
|
1106
|
9
|
|
|
|
|
|
p = q + 1; |
|
1107
|
9
|
|
|
|
|
|
matched = 1; |
|
1108
|
|
|
|
|
|
|
} |
|
1109
|
|
|
|
|
|
|
} |
|
1110
|
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
/* (b) ref link forms */ |
|
1112
|
662
|
100
|
|
|
|
|
if (!matched && z->ctx->refs) { |
|
|
|
100
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
/* label text bytes are between opener and current ] */ |
|
1114
|
256
|
|
|
|
|
|
size_t lbl_start_off = (size_t)(opener->s - s) + opener->n; /* after [ */ |
|
1115
|
|
|
|
|
|
|
/* Actually opener->s points at '[' itself; opener->n == 1 (or 2 for ![) */ |
|
1116
|
|
|
|
|
|
|
/* simpler: text content range is open_text..pos */ |
|
1117
|
256
|
|
|
|
|
|
size_t txt_s = (size_t)((opener->s + opener->n) - s); |
|
1118
|
256
|
|
|
|
|
|
size_t txt_e = pos; |
|
1119
|
256
|
|
|
|
|
|
int tried_full = 0; |
|
1120
|
|
|
|
|
|
|
(void)lbl_start_off; |
|
1121
|
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
/* full ref: [text][label] */ |
|
1123
|
256
|
100
|
|
|
|
|
if (p < n && s[p] == '[') { |
|
|
|
100
|
|
|
|
|
|
|
1124
|
105
|
|
|
|
|
|
size_t q = p + 1; |
|
1125
|
105
|
|
|
|
|
|
size_t lbl_s = q; |
|
1126
|
337
|
50
|
|
|
|
|
while (q < n && s[q] != ']' && s[q] != '[' && q - lbl_s < 1000) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1127
|
232
|
100
|
|
|
|
|
if (s[q] == '\\' && q + 1 < n) q++; |
|
|
|
50
|
|
|
|
|
|
|
1128
|
232
|
|
|
|
|
|
q++; |
|
1129
|
|
|
|
|
|
|
} |
|
1130
|
105
|
50
|
|
|
|
|
if (q < n && s[q] == ']' && q > lbl_s) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1131
|
71
|
|
|
|
|
|
tried_full = 1; |
|
1132
|
71
|
|
|
|
|
|
refent = mds_linkref_get(z->ctx->refs, s + lbl_s, q - lbl_s); |
|
1133
|
71
|
100
|
|
|
|
|
if (refent) { p = q + 1; is_ref = 1; matched = 1; } |
|
1134
|
34
|
50
|
|
|
|
|
} else if (q < n && s[q] == ']' && q == lbl_s) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
/* collapsed [text][] */ |
|
1136
|
34
|
|
|
|
|
|
refent = mds_linkref_get(z->ctx->refs, s + txt_s, txt_e - txt_s); |
|
1137
|
34
|
50
|
|
|
|
|
if (refent) { p = q + 1; is_ref = 1; matched = 1; } |
|
1138
|
|
|
|
|
|
|
} |
|
1139
|
|
|
|
|
|
|
} |
|
1140
|
256
|
100
|
|
|
|
|
if (!matched && !tried_full) { |
|
|
|
100
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
/* shortcut [text] — only when there is no [label] following */ |
|
1142
|
151
|
|
|
|
|
|
refent = mds_linkref_get(z->ctx->refs, s + txt_s, txt_e - txt_s); |
|
1143
|
151
|
100
|
|
|
|
|
if (refent) { is_ref = 1; matched = 1; } |
|
1144
|
|
|
|
|
|
|
} |
|
1145
|
256
|
100
|
|
|
|
|
if (matched && refent) { |
|
|
|
50
|
|
|
|
|
|
|
1146
|
226
|
|
|
|
|
|
href_s = refent->url; hlen = refent->ulen; |
|
1147
|
226
|
|
|
|
|
|
title_s = refent->title; tlen = refent->tlen; |
|
1148
|
|
|
|
|
|
|
} |
|
1149
|
|
|
|
|
|
|
} |
|
1150
|
|
|
|
|
|
|
|
|
1151
|
662
|
100
|
|
|
|
|
if (!matched) { |
|
1152
|
|
|
|
|
|
|
/* no link: drop opener from stack — convert to plain TEXT so it |
|
1153
|
|
|
|
|
|
|
* doesn't block outer brackets from matching this ']'. (CM spec |
|
1154
|
|
|
|
|
|
|
* "look for link or image" step: remove opener on failure.) */ |
|
1155
|
240
|
|
|
|
|
|
opener->type = N_TEXT; |
|
1156
|
240
|
|
|
|
|
|
opener->active = 0; |
|
1157
|
240
|
|
|
|
|
|
return 0; |
|
1158
|
|
|
|
|
|
|
} |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
/* run process_emphasis on the children range (opener->next .. tail) */ |
|
1161
|
422
|
|
|
|
|
|
process_emphasis(z, opener); |
|
1162
|
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
/* wrap into LINK or IMAGE container */ |
|
1164
|
422
|
100
|
|
|
|
|
t = is_image ? N_IMAGE : N_LINK; |
|
1165
|
422
|
|
|
|
|
|
container = wrap_after(z, opener, t); |
|
1166
|
422
|
|
|
|
|
|
container->href = href_s; container->hlen = hlen; |
|
1167
|
422
|
|
|
|
|
|
container->title = title_s; container->tlen = tlen; |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
/* unlink opener from list (it became the boundary; wrap_after kept it |
|
1170
|
|
|
|
|
|
|
* outside the new container — we need to remove it now). */ |
|
1171
|
422
|
100
|
|
|
|
|
if (opener->prev) opener->prev->next = container; |
|
1172
|
295
|
|
|
|
|
|
else z->head = container; |
|
1173
|
422
|
|
|
|
|
|
container->prev = opener->prev; |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
/* deactivate any earlier '[' if this is a link (not image) */ |
|
1176
|
422
|
100
|
|
|
|
|
if (!is_image) deactivate_brackets(z, container); |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
422
|
|
|
|
|
|
*pos_io = p; |
|
1179
|
|
|
|
|
|
|
(void)is_ref; |
|
1180
|
422
|
|
|
|
|
|
return 1; |
|
1181
|
|
|
|
|
|
|
} |
|
1182
|
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
/* ---------------- forward pass ---------------- */ |
|
1184
|
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
/* SWAR / NEON fast-skip over runs of plain prose. The inline |
|
1186
|
|
|
|
|
|
|
* scanner's outer switch fires on exactly these 11 bytes: |
|
1187
|
|
|
|
|
|
|
* '\\' '`' '<' '&' '*' '_' '~' '[' '!' ']' '\n' |
|
1188
|
|
|
|
|
|
|
* Everything else falls through to `default: pos++;`. We replace that |
|
1189
|
|
|
|
|
|
|
* single-byte advance with a 16-byte (NEON) or 8-byte (SWAR) scan that |
|
1190
|
|
|
|
|
|
|
* returns the offset to the next interesting byte (or the chunk size if |
|
1191
|
|
|
|
|
|
|
* none). On prose corpora ~99% of bytes match the fast path. */ |
|
1192
|
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
/* Per-byte lookup used by the forward pass: an entry is 1 for each of the
 * eleven bytes the scanner's outer switch has a case for, and 0 (implicit
 * zero-initialisation) for every other byte value. Entries are listed in
 * ascending byte order. */
static const unsigned char mds_inline_interest[256] = {
    ['\n'] = 1,
    ['!']  = 1,
    ['&']  = 1,
    ['*']  = 1,
    ['<']  = 1,
    ['[']  = 1,
    ['\\'] = 1,
    [']']  = 1,
    ['_']  = 1,
    ['`']  = 1,
    ['~']  = 1,
};
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
#if MDS_INLINE_HAVE_NEON
/* NEON fast-skip over one 16-byte chunk.
 *
 * Loads the 16 bytes at `p` (the caller must guarantee all 16 are readable)
 * and compares them against the eleven bytes the inline scanner dispatches
 * on:  \  `  <  &  *  _  ~  [  !  ]  \n
 *
 * Returns the index (0..15) of the first such byte, or 16 when the chunk
 * contains none, i.e. the number of bytes that are safe to skip. */
static inline size_t mds_inline_skip16(const char* p) {
    const uint8x16_t v = vld1q_u8((const uint8_t*)p);
    uint8x16_t any;
    uint64_t m;

    /* Each compare yields 0xFF in matching lanes and 0x00 elsewhere; OR the
     * eleven results together. */
    any = vceqq_u8(v, vdupq_n_u8('\\'));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('`')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('<')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('&')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('*')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('_')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('~')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('[')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('!')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8(']')));
    any = vorrq_u8(any, vceqq_u8(v, vdupq_n_u8('\n')));

    /* "shrn" trick: viewing `any` as eight 16-bit lanes and narrowing each
     * with a right shift by 4 keeps one nibble per source byte, giving a
     * 64-bit mask with 4 bits per input byte (byte i -> bits 4i..4i+3 on a
     * little-endian target). */
    m = vget_lane_u64(
            vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(any), 4)), 0);

    /* An all-zero mask means no lane matched. Testing the mask here also
     * keeps __builtin_ctzll away from its undefined zero input and avoids
     * the separate AArch64-only vmaxvq_u8 reduction. */
    if (m == 0) return 16;
    return (size_t)(__builtin_ctzll(m) >> 2);
}
#endif
|
1230
|
|
|
|
|
|
|
|
|
1231
|
2221
|
|
|
|
|
|
/* SWAR fallback for the prose fast-skip: examines the 8 bytes at `p` (the
 * caller must guarantee all 8 are readable) and returns the index (0..7) of
 * the first byte that is one of the inline scanner's trigger bytes
 *     \  `  <  &  *  _  ~  [  !  ]  \n
 * or 8 when none of the eight bytes is a trigger. */
static inline size_t mds_inline_skip8(const char* p) {
    uint64_t w;
    uint64_t m;

    /* memcpy keeps the load free of alignment and aliasing problems. */
    memcpy(&w, p, 8);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    /* The lowest-set-bit search below assumes p[0] sits in the least
     * significant byte of w. On a big-endian host the load puts it in the
     * most significant byte, so normalise the order first. */
    w = __builtin_bswap64(w);
#endif

/* Classic "has zero byte" test: the result has the high bit set in the lowest
 * zero byte of x. Higher bytes may be flagged spuriously by borrow, but only
 * above a genuine zero byte, so the lowest flagged byte is always exact. */
#define MDS_HASZ(x) (((x) - 0x0101010101010101ULL) & ~(x) & 0x8080808080808080ULL)
/* Broadcast one byte value into all eight lanes. */
#define MDS_BC(b)   ((uint64_t)(b) * 0x0101010101010101ULL)
    /* XOR with a broadcast trigger turns matching bytes into zero bytes. */
    m = MDS_HASZ(w ^ MDS_BC('\\'))
      | MDS_HASZ(w ^ MDS_BC('`'))
      | MDS_HASZ(w ^ MDS_BC('<'))
      | MDS_HASZ(w ^ MDS_BC('&'))
      | MDS_HASZ(w ^ MDS_BC('*'))
      | MDS_HASZ(w ^ MDS_BC('_'))
      | MDS_HASZ(w ^ MDS_BC('~'))
      | MDS_HASZ(w ^ MDS_BC('['))
      | MDS_HASZ(w ^ MDS_BC('!'))
      | MDS_HASZ(w ^ MDS_BC(']'))
      | MDS_HASZ(w ^ MDS_BC('\n'));
#undef MDS_HASZ
#undef MDS_BC

    if (!m) return 8;
    /* Lowest set bit is bit 7 of the first matching lane; /8 gives its index. */
    return (size_t)(__builtin_ctzll(m) >> 3);
}
|
1255
|
|
|
|
|
|
|
|
|
1256
|
1706
|
|
|
|
|
|
/* Forward pass of the inline tokenizer.
 *
 * Walks z->s[0 .. z->n) once and appends nodes to the list z->head..z->tail.
 * Bytes that start no construct accumulate in a pending text run
 * [text_start, pos), which is turned into a TEXT node (via append_text)
 * whenever another node has to be appended, and once more at the end.
 *
 * Per trigger byte:
 *   \    hard break before a newline, or escaped ASCII punctuation
 *   `    code span, else literal backticks
 *   <    autolink, then raw inline HTML, else a literal '<'
 *   &    entity (appended by try_entity), else left in the text run
 *   * _ ~  delimiter run recorded as an N_DELIM node for process_emphasis
 *   [ ![   bracket openers; ] is resolved through try_close_bracket
 *   \n   soft or hard line break
 * All other bytes are skipped in bulk by the NEON / SWAR / table scanners. */
static void scan_forward(scn* z) {
    const char* s = z->s;
    size_t n = z->n;
    size_t pos = 0;
    size_t text_start = 0;

/* Emit the pending text run (if any) and restart it at the current position. */
#define FLUSH_TEXT() do { \
    if (pos > text_start) append_text(z, s + text_start, pos - text_start); \
    text_start = pos; \
} while (0)

    while (pos < n) {
        unsigned char c = (unsigned char)s[pos];
        switch (c) {
        case '\\': {
            if (pos + 1 < n && s[pos+1] == '\n') {
                /* backslash + newline: hard break */
                FLUSH_TEXT();
                append(z, node_new(z, N_LINEBREAK));
                pos += 2;
                /* skip leading spaces on next line */
                while (pos < n && (s[pos] == ' ' || s[pos] == '\t')) pos++;
                text_start = pos;
                continue;
            }
            if (pos + 1 < n && is_ascii_punct((unsigned char)s[pos+1])) {
                /* escaped punctuation: keep only the escaped byte */
                FLUSH_TEXT();
                append_text_dup(z, s + pos + 1, 1);
                pos += 2;
                text_start = pos;
                continue;
            }
            /* any other backslash stays in the pending text run */
            pos++;
            continue;
        }
        case '`': {
            size_t run_end;
            /* Flush first: whatever is appended below (CODE node or literal
             * backticks) must come after the text that precedes it. */
            FLUSH_TEXT();
            if (!(z->ctx->flags & MDS_FLAG_NO_CODE)) {
                /* On success try_code_span appends the CODE node itself; its
                 * non-zero result is used as the position to resume from. */
                size_t span_end = try_code_span(z, pos);
                if (span_end) {
                    pos = span_end; text_start = pos; continue;
                }
            }
            /* code spans disabled, or no span here: emit the whole backtick
             * run as literal text */
            run_end = pos;
            while (run_end < n && s[run_end] == '`') run_end++;
            append_text(z, s + pos, run_end - pos);
            pos = run_end; text_start = pos; continue;
        }
        case '<': {
            size_t end;
            FLUSH_TEXT();
            /* both helpers report a consumed length, 0 meaning no match */
            end = try_autolink(z, pos);
            if (end) { pos += end; text_start = pos; continue; }
            if (!(z->ctx->flags & MDS_FLAG_NO_HTML)) {
                end = try_html_inline(z, pos);
                if (end) { pos += end; text_start = pos; continue; }
            }
            /* literal '<' */
            append_text(z, s + pos, 1);
            pos++; text_start = pos;
            continue;
        }
        case '&': {
            size_t consumed = try_entity(z, pos);
            if (consumed) {
                size_t before = pos;
                /* try_entity has already appended the entity's TEXT node, but
                 * the pending run before it has not been flushed (flushing up
                 * front would split plain text around every non-entity '&').
                 * Detach the new node, append the pending run, then re-attach
                 * the node behind it. */
                if (z->tail && before > text_start) {
                    inode* added = z->tail;
                    /* detach */
                    z->tail = added->prev;
                    if (z->tail) z->tail->next = NULL;
                    else z->head = NULL;
                    append_text(z, s + text_start, before - text_start);
                    /* re-append */
                    added->prev = z->tail;
                    added->next = NULL;
                    if (z->tail) z->tail->next = added;
                    else z->head = added;
                    z->tail = added;
                }
                pos += consumed; text_start = pos;
                continue;
            }
            /* not an entity: '&' stays in the pending text run */
            pos++; continue;
        }
        case '*':
        case '_':
        case '~': {
            size_t start;
            size_t runlen;
            int co, cc;
            inode* x;
            FLUSH_TEXT();
            start = pos;
            /* consume the whole run of the same delimiter byte */
            while (pos < n && (unsigned char)s[pos] == c) pos++;
            runlen = pos - start;
            if (c == '~' && ((runlen != 1 && runlen != 2) || !(z->ctx->flags & MDS_FLAG_STRIKE))) {
                /* not strike candidate (or strikethrough disabled); emit as text */
                append_text(z, s + start, runlen);
                text_start = pos;
                continue;
            }
            classify_run(s, n, start, runlen, &co, &cc, c);
            x = node_new(z, N_DELIM);
            x->delim_char = c;
            x->count = (int)runlen;
            x->can_open = co;
            x->can_close = cc;
            x->s = s + start; x->n = runlen;
            append(z, x);
            text_start = pos;
            continue;
        }
        case '[': {
            inode* x;
            if (z->ctx->flags & MDS_FLAG_NO_LINKS) {
                /* links disabled: '[' is literal text */
                FLUSH_TEXT();
                append_text(z, s + pos, 1);
                pos++; text_start = pos; continue;
            }
            FLUSH_TEXT();
            x = node_new(z, N_OPEN_BRACKET);
            x->s = s + pos; x->n = 1;
            x->active = 1;
            append(z, x);
            pos++; text_start = pos;
            continue;
        }
        case '!': {
            if (pos + 1 < n && s[pos+1] == '[' &&
                !(z->ctx->flags & MDS_FLAG_NO_IMAGES)) {
                inode* x;
                FLUSH_TEXT();
                x = node_new(z, N_OPEN_BANG);
                x->s = s + pos; x->n = 2;   /* covers both bytes of "![" */
                x->active = 1;
                append(z, x);
                pos += 2; text_start = pos;
                continue;
            }
            /* lone '!' stays in the pending text run */
            pos++; continue;
        }
        case ']': {
            size_t p2;
            FLUSH_TEXT();
            p2 = pos;
            /* on success p2 has been updated to the resume position */
            if (try_close_bracket(z, &p2)) {
                pos = p2; text_start = pos;
                continue;
            }
            /* literal ] */
            append_text(z, s + pos, 1);
            pos++; text_start = pos;
            continue;
        }
        case '\n': {
            int hard;
            inode* br;
            FLUSH_TEXT();
            /* hard break iff prev TEXT ended with two-or-more spaces */
            hard = 0;
            if (z->tail && z->tail->type == N_TEXT) {
                inode* t = z->tail;
                if (t->n >= 2 && t->s[t->n - 1] == ' ' && t->s[t->n - 2] == ' ') {
                    /* trim trailing spaces */
                    while (t->n > 0 && t->s[t->n - 1] == ' ') t->n--;
                    if (t->n == 0) {
                        /* remove empty text */
                        z->tail = t->prev;
                        if (z->tail) z->tail->next = NULL;
                        else z->head = NULL;
                    }
                    hard = 1;
                } else if (t->n >= 1 && t->s[t->n - 1] == ' ') {
                    /* single space trailing: strip it, break stays soft */
                    t->n--;
                    if (t->n == 0) {
                        z->tail = t->prev;
                        if (z->tail) z->tail->next = NULL;
                        else z->head = NULL;
                    }
                }
            }
            br = node_new(z, hard ? N_LINEBREAK : N_SOFTBREAK);
            append(z, br);
            pos++;
            /* skip leading spaces on next line */
            while (pos < n && (s[pos] == ' ' || s[pos] == '\t')) pos++;
            text_start = pos;
            continue;
        }
        default:
        {
            /* Fast skip over plain prose. The chunked stride keeps the
             * text run intact (no FLUSH_TEXT needed): we just advance
             * `pos` past bytes the outer switch would have ignored. */
#if MDS_INLINE_HAVE_NEON
            while (pos + 16 <= n) {
                size_t k = mds_inline_skip16(s + pos);
                pos += k;
                if (k < 16) goto next_iter;   /* trigger byte at s[pos] */
            }
#endif
            while (pos + 8 <= n) {
                size_t k = mds_inline_skip8(s + pos);
                pos += k;
                if (k < 8) goto next_iter;    /* trigger byte at s[pos] */
            }
            /* Tail: fewer than 8 bytes left, one table lookup per byte. */
            while (pos < n && !mds_inline_interest[(unsigned char)s[pos]])
                pos++;
        next_iter:
            continue;
        }
        }
    }
    /* whatever is still pending becomes the final TEXT node */
    FLUSH_TEXT();
#undef FLUSH_TEXT
}
|
1519
|
|
|
|
|
|
|
|
|
1520
|
|
|
|
|
|
|
/* ---------------- emit pass ---------------- */ |
|
1521
|
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
static void emit_children(scn* z, inode* head); |
|
1523
|
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
/* HTML-escape NOT done here; renderer cb_text does the escaping. */ |
|
1525
|
3918
|
|
|
|
|
|
/* Hands the byte range s[0 .. n) to the context's text callback. Does nothing
 * for an empty range or when no text callback is installed. */
static void emit_text(scn* z, const char* s, size_t n) {
    if (n != 0 && z->ctx->cb.text) {
        z->ctx->cb.text(z->ctx->ud, s, n);
    }
}
|
1529
|
139
|
|
|
|
|
|
/* Hands the byte range s[0 .. n) to the context's raw callback. Does nothing
 * for an empty range or when no raw callback is installed. */
static void emit_raw(scn* z, const char* s, size_t n) {
    if (n != 0 && z->ctx->cb.raw) {
        z->ctx->cb.raw(z->ctx->ud, s, n);
    }
}
|
1533
|
|
|
|
|
|
|
|
|
1534
|
5517
|
|
|
|
|
|
/* Dispatches the callbacks for one node of the inline list.
 *
 * Literal nodes (TEXT and any leftover DELIM / bracket opener) go straight to
 * the text callback. Every other node type is bracketed by enter_inline /
 * leave_inline with the matching MDS_INL_* kind. Between the two calls CODE
 * sends its bytes through emit_text, HTMLINLINE through emit_raw, and the
 * container types (EMPH, STRONG, STRIKE, LINK, IMAGE) recurse into their
 * children. The detail struct `d` is zeroed up front and only the fields of
 * the node's own kind are filled in. */
static void emit_node(scn* z, inode* x) {
    mds_callbacks* cb = &z->ctx->cb;
    mds_inline_detail d;

/* Both callbacks are optional; these wrap the NULL checks. */
#define MDS_EMIT_ENTER(kind) \
    do { if (cb->enter_inline) cb->enter_inline(z->ctx->ud, (kind), &d); } while (0)
#define MDS_EMIT_LEAVE(kind) \
    do { if (cb->leave_inline) cb->leave_inline(z->ctx->ud, (kind)); } while (0)

    memset(&d, 0, sizeof d);

    switch (x->type) {
    /* Plain text, plus delimiters / bracket openers that never matched:
     * all are emitted as their literal source bytes. */
    case N_TEXT:
    case N_DELIM:
    case N_OPEN_BRACKET:
    case N_OPEN_BANG:
        emit_text(z, x->s, x->n);
        break;

    case N_SOFTBREAK:
        MDS_EMIT_ENTER(MDS_INL_SOFTBREAK);
        MDS_EMIT_LEAVE(MDS_INL_SOFTBREAK);
        break;

    case N_LINEBREAK:
        MDS_EMIT_ENTER(MDS_INL_LINEBREAK);
        MDS_EMIT_LEAVE(MDS_INL_LINEBREAK);
        break;

    case N_CODE:
        MDS_EMIT_ENTER(MDS_INL_CODE);
        emit_text(z, x->s, x->n);
        MDS_EMIT_LEAVE(MDS_INL_CODE);
        break;

    case N_AUTOLINK:
        d.u.autolink.uri = x->s;
        d.u.autolink.uri_len = x->n;
        d.u.autolink.is_email = x->is_email;
        MDS_EMIT_ENTER(MDS_INL_AUTOLINK);
        MDS_EMIT_LEAVE(MDS_INL_AUTOLINK);
        break;

    case N_HTMLINLINE:
        MDS_EMIT_ENTER(MDS_INL_HTML_INLINE);
        emit_raw(z, x->s, x->n);
        MDS_EMIT_LEAVE(MDS_INL_HTML_INLINE);
        break;

    case N_EMPH:
        MDS_EMIT_ENTER(MDS_INL_EMPH);
        emit_children(z, x->children);
        MDS_EMIT_LEAVE(MDS_INL_EMPH);
        break;

    case N_STRONG:
        MDS_EMIT_ENTER(MDS_INL_STRONG);
        emit_children(z, x->children);
        MDS_EMIT_LEAVE(MDS_INL_STRONG);
        break;

    case N_STRIKE:
        MDS_EMIT_ENTER(MDS_INL_STRIKE);
        emit_children(z, x->children);
        MDS_EMIT_LEAVE(MDS_INL_STRIKE);
        break;

    case N_LINK:
        d.u.link.href = x->href;
        d.u.link.href_len = x->hlen;
        d.u.link.title = x->title;
        d.u.link.title_len = x->tlen;
        MDS_EMIT_ENTER(MDS_INL_LINK);
        emit_children(z, x->children);
        MDS_EMIT_LEAVE(MDS_INL_LINK);
        break;

    case N_IMAGE:
        d.u.image.href = x->href;
        d.u.image.href_len = x->hlen;
        d.u.image.title = x->title;
        d.u.image.title_len = x->tlen;
        /* no alt text is supplied here; the renderer derives it from the
         * children emitted below */
        d.u.image.alt = NULL;
        d.u.image.alt_len = 0;
        MDS_EMIT_ENTER(MDS_INL_IMAGE);
        emit_children(z, x->children);
        MDS_EMIT_LEAVE(MDS_INL_IMAGE);
        break;

    case N_FOOTNOTE_REF:
        /* inode has no dedicated label slot, so the raw footnote label
         * captured at match time travels in href / hlen. Numbering is left
         * to the renderer. */
        d.u.footnote_ref.label = x->href;
        d.u.footnote_ref.label_len = x->hlen;
        MDS_EMIT_ENTER(MDS_INL_FOOTNOTE_REF);
        MDS_EMIT_LEAVE(MDS_INL_FOOTNOTE_REF);
        break;
    }

#undef MDS_EMIT_ENTER
#undef MDS_EMIT_LEAVE
}
|
1614
|
|
|
|
|
|
|
|
|
1615
|
2734
|
|
|
|
|
|
/* Emits every node of a sibling list, from `head` along the `next` links
 * until the list ends. A NULL head emits nothing. */
static void emit_children(scn* z, inode* head) {
    inode* cur = head;
    while (cur) {
        emit_node(z, cur);
        cur = cur->next;
    }
}
|
1619
|
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
/* ---------------- public entry ---------------- */ |
|
1621
|
|
|
|
|
|
|
|
|
1622
|
3164
|
|
|
|
|
|
/* Public entry point: tokenizes the inline content s[0 .. n) and reports it
 * through the callbacks in ctx. Empty input produces no callbacks.
 *
 * Fast path: when the run contains none of the bytes that can start an
 * inline construct or a line break
 *     *  _  ~  `  [  ]  !  <  &  \  \n
 * and does not end in two spaces (a hard-break candidate), the whole run is
 * handed to cb.text in one call and no node list is built.
 *
 * Slow path: scan_forward builds the node list, process_emphasis resolves
 * the delimiter runs, and emit_children walks the result. */
MDS_HOT void mds_inline_scan(mds_ctx* ctx, const char* s, size_t n) {
    scn z;
    size_t i;
    int plain = 1;

    if (n == 0) return;

    /* Stop at the first trigger byte; `plain` survives only if there is none. */
    for (i = 0; i < n && plain; i++) {
        switch ((unsigned char)s[i]) {
        case '*': case '_': case '~': case '`':
        case '[': case ']': case '!': case '<':
        case '&': case '\\': case '\n':
            plain = 0;
            break;
        default:
            break;
        }
    }

    if (MDS_LIKELY(plain)) {
        /* Runs ending in two spaces still take the full pipeline. */
        int two_trailing_spaces =
            (n >= 2 && s[n-1] == ' ' && s[n-2] == ' ');
        if (!two_trailing_spaces) {
            if (ctx->cb.text) ctx->cb.text(ctx->ud, s, n);
            return;
        }
    }

    /* Full pipeline. */
    byteclass_init();
    memset(&z, 0, sizeof z);
    z.ctx = ctx;
    z.s = s;
    z.n = n;

    scan_forward(&z);
    process_emphasis(&z, NULL);
    emit_children(&z, z.head);
}
|
1672
|
|
|
|
|
|
|
|