| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* mds_block.c — scalar CommonMark block scanner. |
|
2
|
|
|
|
|
|
|
* |
|
3
|
|
|
|
|
|
|
* Scope: |
|
4
|
|
|
|
|
|
|
* - Thematic break (§4.1) |
|
5
|
|
|
|
|
|
|
* - ATX heading (§4.2) |
|
6
|
|
|
|
|
|
|
* - Fenced code (§4.5) |
|
7
|
|
|
|
|
|
|
* - Paragraph (§4.8) |
|
8
|
|
|
|
|
|
|
* - Block quote (§5.1) with lazy continuation |
|
9
|
|
|
|
|
|
|
* - Bullet list + ordered list + list item (§5.2, §5.3) |
|
10
|
|
|
|
|
|
|
* |
|
11
|
|
|
|
|
|
|
* Inline content is emitted as raw text via cb->text; the inline |
|
12
|
|
|
|
|
|
|
* tokeniser handles it downstream. |
|
13
|
|
|
|
|
|
|
*/ |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
#include "mds_block.h" |
|
16
|
|
|
|
|
|
|
#include "mds_ir.h" |
|
17
|
|
|
|
|
|
|
#include "mds_ctx.h" |
|
18
|
|
|
|
|
|
|
#include "mds_arena.h" |
|
19
|
|
|
|
|
|
|
#include "mds_linkref.h" |
|
20
|
|
|
|
|
|
|
#include "mds_footnote.h" |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
/* Forward decl — defined in mds_render_html.c which is unity-included |
|
23
|
|
|
|
|
|
|
* AFTER this file. Used to emit the footnotes section in first-use |
|
24
|
|
|
|
|
|
|
* order. NULL-safe in the unlikely case the caller installs a non-HTML |
|
25
|
|
|
|
|
|
|
* renderer; iteration just stops at index 0. */ |
|
26
|
|
|
|
|
|
|
int mds_render_html_used_footnote(void* ud, size_t i, |
|
27
|
|
|
|
|
|
|
const char** label_out, |
|
28
|
|
|
|
|
|
|
size_t* label_len_out); |
|
29
|
|
|
|
|
|
|
#include "mds_inline.h" |
|
30
|
|
|
|
|
|
|
#include |
|
31
|
|
|
|
|
|
|
#include |
|
32
|
|
|
|
|
|
|
#include |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
#define MAX_DEPTH 1000 |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
/* Kinds of container that can live on the scanner's container stack. */
typedef enum {
    CT_DOC,       /* 0 */
    CT_QUOTE,     /* 1 */
    CT_LIST,      /* 2: counted separately in bscanner.list_depth by push() */
    CT_LIST_ITEM  /* 3 */
} ct_kind;
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
typedef struct { |
|
39
|
|
|
|
|
|
|
ct_kind kind; |
|
40
|
|
|
|
|
|
|
int ordered; |
|
41
|
|
|
|
|
|
|
int start; |
|
42
|
|
|
|
|
|
|
char marker; |
|
43
|
|
|
|
|
|
|
int tight; |
|
44
|
|
|
|
|
|
|
int had_blank_inside; |
|
45
|
|
|
|
|
|
|
int pending_blank; /* CT_LIST only: a blank line was seen, awaiting next non-blank to decide loose-ness */ |
|
46
|
|
|
|
|
|
|
int content_col; |
|
47
|
|
|
|
|
|
|
int opened; |
|
48
|
|
|
|
|
|
|
int ev_idx; /* index of the buffered enter_block event */ |
|
49
|
|
|
|
|
|
|
int is_empty; /* CT_LIST_ITEM only: opened with no content */ |
|
50
|
|
|
|
|
|
|
int blank_after_empty; /* CT_LIST_ITEM only: blank line seen while still empty */ |
|
51
|
|
|
|
|
|
|
} ctn; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
/* Kind of leaf block the scanner currently has open (bscanner.leaf). */
typedef enum {
    LF_NONE = 0,
    LF_PARAGRAPH = 1,
    LF_CODE_FENCED = 2,
    LF_CODE_INDENTED = 3,
    LF_HTML = 4
} leaf_kind;
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
/* Tag of a buffered event record (ev_rec.type); sax_flush() dispatches on it. */
typedef enum {
    EV_ENTER_BLOCK = 0, /* replayed through cb.enter_block */
    EV_LEAVE_BLOCK = 1, /* replayed through cb.leave_block */
    EV_TEXT = 2,        /* replayed through cb.text */
    EV_RAW = 3,         /* replayed through cb.raw */
    EV_INLINE = 4       /* bytes need mds_inline_scan at flush time */
} ev_type;
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
typedef struct { |
|
70
|
|
|
|
|
|
|
ev_type type; |
|
71
|
|
|
|
|
|
|
union { |
|
72
|
|
|
|
|
|
|
struct { mds_block_type t; mds_block_detail d; const char* info_ptr; size_t info_len; } enter; |
|
73
|
|
|
|
|
|
|
struct { mds_block_type t; } leave; |
|
74
|
|
|
|
|
|
|
struct { size_t off; size_t len; } bytes; |
|
75
|
|
|
|
|
|
|
} u; |
|
76
|
|
|
|
|
|
|
} ev_rec; |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
typedef struct { |
|
79
|
|
|
|
|
|
|
mds_ctx* ctx; |
|
80
|
|
|
|
|
|
|
ctn stack[MAX_DEPTH]; |
|
81
|
|
|
|
|
|
|
int depth; |
|
82
|
|
|
|
|
|
|
int list_depth; /* # CT_LIST containers currently on stack */ |
|
83
|
|
|
|
|
|
|
leaf_kind leaf; |
|
84
|
|
|
|
|
|
|
int leaf_in_container; |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
char* para; |
|
87
|
|
|
|
|
|
|
size_t para_len; |
|
88
|
|
|
|
|
|
|
size_t para_cap; |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
char fence_char; |
|
91
|
|
|
|
|
|
|
int fence_len; |
|
92
|
|
|
|
|
|
|
int fence_indent; |
|
93
|
|
|
|
|
|
|
char* fence_info; |
|
94
|
|
|
|
|
|
|
size_t fence_info_len; |
|
95
|
|
|
|
|
|
|
char* code_body; |
|
96
|
|
|
|
|
|
|
size_t code_len; |
|
97
|
|
|
|
|
|
|
size_t code_cap; |
|
98
|
|
|
|
|
|
|
int pending_blanks; |
|
99
|
|
|
|
|
|
|
int blank_pending; /* 1 = blank line seen, attribute to deepest surviving CT_LIST after next walk */ |
|
100
|
|
|
|
|
|
|
int setext_level; |
|
101
|
|
|
|
|
|
|
int html_type; /* 1..7 when leaf == LF_HTML */ |
|
102
|
|
|
|
|
|
|
char* html_body; |
|
103
|
|
|
|
|
|
|
size_t html_len; |
|
104
|
|
|
|
|
|
|
size_t html_cap; |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
/* Event buffer used while list_depth > 0 so we can patch is_tight on |
|
107
|
|
|
|
|
|
|
* the LIST enter event once we know whether the list is loose. */ |
|
108
|
|
|
|
|
|
|
ev_rec* evbuf; |
|
109
|
|
|
|
|
|
|
size_t ev_len; |
|
110
|
|
|
|
|
|
|
size_t ev_cap; |
|
111
|
|
|
|
|
|
|
char* bytepool; |
|
112
|
|
|
|
|
|
|
size_t bp_len; |
|
113
|
|
|
|
|
|
|
size_t bp_cap; |
|
114
|
|
|
|
|
|
|
char* line_scratch; /* per-line buffer for tab-expanded line copy */ |
|
115
|
|
|
|
|
|
|
size_t line_scratch_cap; |
|
116
|
|
|
|
|
|
|
} bscanner; |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
/* ---------- heap buffer helpers ---------- */ |
|
119
|
|
|
|
|
|
|
|
|
120
|
9372
|
|
|
|
|
|
/* Make sure the heap buffer *buf can hold at least `need` bytes.
 *
 *   buf   in/out: buffer pointer (may be NULL when *cap == 0)
 *   cap   in/out: current capacity in bytes
 *   need  required capacity in bytes
 *
 * Growth policy: start at 256 bytes, then grow by 1.5x + 64 until `need`
 * fits. Existing contents are preserved by realloc.
 *
 * The function cannot report failure to its callers (it returns void and
 * they write into the buffer right away), so running out of memory is
 * treated as fatal: the process is aborted instead of continuing with a
 * NULL buffer and a capacity that claims otherwise. */
static void buf_grow(char** buf, size_t* cap, size_t need) {
    size_t nc;
    char* grown;

    if (need <= *cap) return;

    nc = *cap ? *cap : 256;
    while (nc < need) {
        size_t step = (nc >> 1) + 64;
        /* If the next growth step would wrap size_t, fall back to the exact
         * request rather than looping forever on a wrapped value. */
        nc = (nc > (size_t)-1 - step) ? need : nc + step;
    }

    /* Keep the old pointer until realloc is known to have succeeded. */
    grown = (char*)realloc(*buf, nc);
    if (!grown) abort();

    *buf = grown;
    *cap = nc;
}
|
128
|
3907
|
|
|
|
|
|
/* Append the n bytes at `s` to the growable buffer (*buf, *len, *cap) and
 * keep it NUL-terminated.
 *
 *   buf/len/cap  in/out: buffer pointer, bytes used, bytes allocated
 *   s            source bytes; may be NULL only when n == 0
 *   n            number of bytes to append
 *
 * One extra byte is always reserved for the terminator, so after the call
 * (*buf)[*len] == '\0' even when nothing was appended. */
static void buf_append(char** buf, size_t* len, size_t* cap,
                       const char* s, size_t n) {
    buf_grow(buf, cap, *len + n + 1);
    /* memcpy requires valid pointers even for a zero-length copy, so skip
     * it entirely for empty appends (same guard as pool_intern). */
    if (n) memcpy(*buf + *len, s, n);
    *len += n;
    (*buf)[*len] = '\0';
}
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
/* ---------- SAX dispatch (buffered while inside a list) ---------- */ |
|
137
|
|
|
|
|
|
|
|
|
138
|
3848
|
|
|
|
|
|
/* Copy n bytes from `s` to the end of the scanner's bytepool and return the
 * offset at which they were stored. Offsets (not pointers) are handed out
 * because the pool may be reallocated by later calls. Room for one extra
 * byte is requested from buf_grow, but no terminator is written. */
static size_t pool_intern(bscanner* b, const char* s, size_t n) {
    const size_t start = b->bp_len;

    buf_grow(&b->bytepool, &b->bp_cap, start + n + 1);
    if (n != 0) {
        memcpy(b->bytepool + start, s, n);
    }
    b->bp_len = start + n;
    return start;
}
|
146
|
14944
|
|
|
|
|
|
/* Reserve the next slot of the event buffer and return a pointer to it.
 *
 * The buffer starts at 64 records and doubles when full. The returned
 * pointer is only valid until the next ev_alloc() call, because growing the
 * buffer may move it. The slot is NOT initialised; callers fill it in.
 *
 * Callers dereference the result unconditionally, so an allocation failure
 * (or a capacity that would overflow size_t) aborts the process instead of
 * returning a pointer into a NULL buffer. */
static ev_rec* ev_alloc(bscanner* b) {
    if (b->ev_len == b->ev_cap) {
        size_t nc = b->ev_cap ? b->ev_cap * 2 : 64;
        ev_rec* grown;

        /* Reject a wrapped doubling and a byte size that does not fit. */
        if (nc <= b->ev_cap || nc > (size_t)-1 / sizeof(ev_rec)) abort();

        /* Keep the old buffer reachable until realloc has succeeded. */
        grown = (ev_rec*)realloc(b->evbuf, nc * sizeof(ev_rec));
        if (!grown) abort();

        b->evbuf = grown;
        b->ev_cap = nc;
    }
    return &b->evbuf[b->ev_len++];
}
|
154
|
|
|
|
|
|
|
|
|
155
|
5534
|
|
|
|
|
|
/* Buffer an "enter block" event and return its index in the event buffer.
 *
 * Events are always buffered rather than delivered immediately, which
 * defers inline scanning until the link-reference table is fully populated.
 *
 *   t  block type being entered
 *   d  optional detail record; copied by value, zeroed when NULL
 *
 * For fenced code with a non-empty info string, the info bytes are copied
 * into the bytepool and their OFFSET is parked in info_ptr (cast through
 * uintptr_t); sax_flush() turns it back into a real pointer. */
static int sax_enter(bscanner* b, mds_block_type t, const mds_block_detail* d) {
    ev_rec* rec = ev_alloc(b);

    rec->type = EV_ENTER_BLOCK;
    rec->u.enter.t = t;
    if (d) {
        rec->u.enter.d = *d;
    } else {
        rec->u.enter.d = (mds_block_detail){0};
    }
    rec->u.enter.info_ptr = NULL;
    rec->u.enter.info_len = 0;

    if (t == MDS_BLK_CODE_FENCED && d && d->u.code_fenced.info_len) {
        const size_t where = pool_intern(b, d->u.code_fenced.info,
                                         d->u.code_fenced.info_len);
        rec->u.enter.info_ptr = (const char*)(uintptr_t)where;
        rec->u.enter.info_len = d->u.code_fenced.info_len;
    }

    return (int)(b->ev_len - 1);
}
|
173
|
5534
|
|
|
|
|
|
/* Buffer a "leave block" event for block type t. */
static void sax_leave(bscanner* b, mds_block_type t) {
    ev_rec* const rec = ev_alloc(b);

    rec->u.leave.t = t;
    rec->type = EV_LEAVE_BLOCK;
}
|
178
|
512
|
|
|
|
|
|
/* Buffer a text event: the n bytes at `s` are copied into the bytepool and
 * later delivered through cb.text by sax_flush(). */
static void sax_text(bscanner* b, const char* s, size_t n) {
    const size_t where = pool_intern(b, s, n);
    ev_rec* const rec = ev_alloc(b);

    rec->type = EV_TEXT;
    rec->u.bytes.len = n;
    rec->u.bytes.off = where;
}
|
185
|
200
|
|
|
|
|
|
/* Buffer a raw event: the n bytes at `s` are copied into the bytepool and
 * later delivered through cb.raw by sax_flush(). */
static void sax_raw(bscanner* b, const char* s, size_t n) {
    const size_t where = pool_intern(b, s, n);
    ev_rec* const rec = ev_alloc(b);

    rec->type = EV_RAW;
    rec->u.bytes.len = n;
    rec->u.bytes.off = where;
}
|
192
|
|
|
|
|
|
|
|
|
193
|
3117
|
|
|
|
|
|
/* Buffer an inline event: the n bytes at `s` are copied into the bytepool
 * and handed to mds_inline_scan when sax_flush() replays the buffer. */
static void sax_inline_text(bscanner* b, const char* s, size_t n) {
    const size_t where = pool_intern(b, s, n);
    ev_rec* const rec = ev_alloc(b);

    rec->type = EV_INLINE;
    rec->u.bytes.len = n;
    rec->u.bytes.off = where;
}
|
200
|
|
|
|
|
|
|
|
|
201
|
2460
|
|
|
|
|
|
/* Replay every buffered event, in order, to the consumer, then reset the
 * event buffer and the bytepool to empty (capacity is kept).
 *
 * The callback table, user data, pool base, event array and event count are
 * all snapshotted before the loop. Each callback is null-checked so the
 * scanner stays usable with partial callback tables; MDS_LIKELY marks the
 * non-NULL case as the expected one. EV_INLINE events are not delivered
 * directly but passed to mds_inline_scan. */
static void sax_flush(bscanner* b) {
    const mds_callbacks cb = b->ctx->cb;
    void* const ud = b->ctx->ud;
    const char* const base = b->bytepool;
    ev_rec* const events = b->evbuf;
    const size_t count = b->ev_len;
    size_t idx = 0;

    while (idx < count) {
        ev_rec* const rec = &events[idx++];

        switch (rec->type) {
        case EV_INLINE:
            mds_inline_scan(b->ctx, base + rec->u.bytes.off, rec->u.bytes.len);
            break;

        case EV_RAW:
            if (MDS_LIKELY(cb.raw != NULL)) {
                cb.raw(ud, base + rec->u.bytes.off, rec->u.bytes.len);
            }
            break;

        case EV_TEXT:
            if (MDS_LIKELY(cb.text != NULL)) {
                cb.text(ud, base + rec->u.bytes.off, rec->u.bytes.len);
            }
            break;

        case EV_LEAVE_BLOCK:
            if (MDS_LIKELY(cb.leave_block != NULL)) {
                cb.leave_block(ud, rec->u.leave.t);
            }
            break;

        case EV_ENTER_BLOCK:
            if (rec->u.enter.t == MDS_BLK_CODE_FENCED && rec->u.enter.info_len) {
                /* info_ptr carries a bytepool offset (see sax_enter);
                 * resolve it against the pool base now. */
                const size_t where = (size_t)(uintptr_t)rec->u.enter.info_ptr;
                rec->u.enter.d.u.code_fenced.info = base + where;
                rec->u.enter.d.u.code_fenced.info_len = rec->u.enter.info_len;
            }
            if (MDS_LIKELY(cb.enter_block != NULL)) {
                cb.enter_block(ud, rec->u.enter.t, &rec->u.enter.d);
            }
            break;
        }
    }

    b->ev_len = 0;
    b->bp_len = 0;
}
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
/* ---------- container helpers ---------- */ |
|
247
|
|
|
|
|
|
|
|
|
248
|
2422
|
|
|
|
|
|
/* Innermost (most recently pushed) container. There is no check for an
 * empty stack, so b->depth must be at least 1 when this is called. */
static ctn* top(bscanner* b) {
    return b->stack + (b->depth - 1);
}
|
249
|
|
|
|
|
|
|
|
|
250
|
3604
|
|
|
|
|
|
/* Push a fresh container of kind k onto the stack.
 *
 * Returns 1 on success, 0 (leaving the stack untouched) when MAX_DEPTH
 * containers are already open. The new record is zero-filled, then marked
 * tight with no buffered enter event (ev_idx = -1). Pushing a CT_LIST also
 * bumps list_depth. */
static int push(bscanner* b, ct_kind k) {
    ctn* slot;

    if (b->depth >= MAX_DEPTH) {
        return 0;
    }

    slot = &b->stack[b->depth];
    b->depth += 1;

    memset(slot, 0, sizeof *slot);
    slot->ev_idx = -1;
    slot->tight = 1;
    slot->kind = k;

    if (k == CT_LIST) {
        b->list_depth += 1;
    }
    return 1;
}
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
/* ---------- forward decls ---------- */ |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
static void emit_open(bscanner* b, int idx); |
|
265
|
|
|
|
|
|
|
static void emit_close(bscanner* b, ctn* c); |
|
266
|
|
|
|
|
|
|
static void finalize_leaf(bscanner* b); |
|
267
|
|
|
|
|
|
|
static void close_containers_to(bscanner* b, int target_depth); |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
/* ---------- link reference definition extractor ---------- |
|
270
|
|
|
|
|
|
|
* |
|
271
|
|
|
|
|
|
|
* Tries to consume one definition starting at p. Returns 1 if one was |
|
272
|
|
|
|
|
|
|
* fully parsed, 0 otherwise. On success sets *p_out past the end of the |
|
273
|
|
|
|
|
|
|
* definition. |
|
274
|
|
|
|
|
|
|
* |
|
275
|
|
|
|
|
|
|
* Grammar (simplified subset): |
|
276
|
|
|
|
|
|
|
* ^ {0,3} '[' label ']' ':' ws+ url (ws+ title)? ws* \n? |
|
277
|
|
|
|
|
|
|
* - label may contain anything but an unescaped ']' or '[' (no nesting); it may span lines but not contain a blank line. |
|
278
|
|
|
|
|
|
|
* - url either <...> with no '<' or '>' inside, or a bareword with no |
|
279
|
|
|
|
|
|
|
* spaces (parens balanced not enforced here). |
|
280
|
|
|
|
|
|
|
* - title in "...", '...', or (...), possibly on next line. |
|
281
|
|
|
|
|
|
|
*/ |
|
282
|
3367
|
|
|
|
|
|
/* Parse one link reference definition from [p, end).
 *
 * Returns 1 and fills every out-parameter when a complete definition was
 * recognised; returns 0 and leaves them untouched otherwise.
 *
 *   p_out         first byte after the definition (its newline is consumed)
 *   lbl_s..lbl_e  label bytes between '[' and ']'
 *   url_s..url_e  destination bytes (without the '<' '>' if bracketed)
 *   tit_s..tit_e  title bytes between the delimiters, or NULL/NULL when the
 *                 definition has no (valid) title
 *
 * All returned pointers point into the caller's buffer; nothing is copied
 * or unescaped here. */
static int parse_linkref(const char* p, const char* end, const char** p_out,
                         const char** lbl_s, const char** lbl_e,
                         const char** url_s, const char** url_e,
                         const char** tit_s, const char** tit_e) {
    const char* q = p;
    const char* r;
    const char* ls;
    const char* le;
    const char* us;
    const char* ue;
    const char* ts = NULL;
    const char* te = NULL;
    const char* save_after_url;
    int lead = 0;
    int has_nonws = 0;
    int nl = 0;
    int saw_nl = 0;
    int saw_ws = 0;

    /* Up to three spaces of indentation, then '['. */
    while (q < end && *q == ' ' && lead < 3) { q++; lead++; }
    if (q >= end || *q != '[') return 0;
    q++;
    ls = q;

    /* Label: may span lines (but not a blank line) and may contain
     * backslash-escaped brackets; an unescaped '[' is not allowed. */
    while (q < end && *q != ']') {
        if (*q == '\\' && q + 1 < end) { q += 2; continue; }
        if (*q == '[') return 0;
        if (*q == '\n') {
            /* A newline followed by an empty line (or end of input)
             * terminates the paragraph, so this is not a definition. */
            r = q + 1;
            while (r < end && (*r == ' ' || *r == '\t')) r++;
            if (r >= end || *r == '\n') return 0;
        }
        q++;
    }
    if (q >= end) return 0;
    le = q;
    if (le == ls) return 0; /* empty label invalid */

    /* Label must contain at least one non-whitespace char. */
    for (r = ls; r < le; r++) {
        if (*r != ' ' && *r != '\t' && *r != '\n') { has_nonws = 1; break; }
    }
    if (!has_nonws) return 0;

    q++;
    if (q >= end || *q != ':') return 0;
    q++;

    /* Whitespace before the destination, including at most one newline. */
    while (q < end && (*q == ' ' || *q == '\t' || *q == '\n')) {
        if (*q == '\n' && ++nl > 1) return 0;
        q++;
    }
    if (q >= end) return 0;

    /* Destination. */
    if (*q == '<') {
        q++;
        us = q;
        while (q < end && *q != '>' && *q != '\n' && *q != '<') q++;
        if (q >= end || *q != '>') return 0;
        ue = q;
        q++;
    } else {
        us = q;
        while (q < end && *q != ' ' && *q != '\t' && *q != '\n') {
            /* disallow ASCII control */
            if ((unsigned char)*q < 0x20) return 0;
            q++;
        }
        ue = q;
        if (ue == us) return 0;
    }

    /* Optional title. It must be separated from the destination by at least
     * one whitespace char (CM §4.7) and may start on the following line. If
     * anything about the title is invalid we rewind to just after the
     * destination and try to accept the definition without a title. */
    save_after_url = q;
    while (q < end && (*q == ' ' || *q == '\t' || *q == '\n')) {
        if (*q == '\n') saw_nl++;
        saw_ws = 1;
        q++;
    }
    if (q < end && (*q == '"' || *q == '\'' || *q == '(') && saw_nl <= 1 && saw_ws) {
        const char open = *q;
        const char close = (open == '(') ? ')' : open;
        int bad = 0;

        q++;
        ts = q;
        while (q < end && *q != close) {
            /* Backslash escapes are honoured for every delimiter style, so
             * "\)" and "\(" may appear inside a parenthesised title. */
            if (*q == '\\' && q + 1 < end) { q += 2; continue; }
            /* unescaped '(' invalid in paren title */
            if (open == '(' && *q == '(') { bad = 1; break; }
            if (*q == '\n') {
                /* Title may span lines but cannot contain a blank line:
                 * \n followed by (spaces|tabs)* \n, or end of input. */
                r = q + 1;
                while (r < end && (*r == ' ' || *r == '\t')) r++;
                if (r >= end || *r == '\n') { bad = 1; break; }
            }
            q++;
        }

        if (!bad && q < end) {
            /* Here *q == close. */
            te = q;
            q++;
            /* nothing but ws allowed on the remainder of the title line */
            for (r = q; r < end && *r != '\n'; r++) {
                if (*r != ' ' && *r != '\t') { bad = 1; break; }
            }
        } else {
            bad = 1; /* invalid content or unterminated title */
        }

        if (bad) {
            ts = NULL;
            te = NULL;
            q = save_after_url;
        }
    } else {
        q = save_after_url;
    }

    /* Consume trailing ws + one newline; anything else on the line means
     * this is not a definition. (When a title was accepted, the rest of its
     * line has already been verified to be whitespace.) */
    while (q < end && (*q == ' ' || *q == '\t')) q++;
    if (q < end) {
        if (*q != '\n') return 0;
        q++;
    }

    *p_out = q;
    *lbl_s = ls; *lbl_e = le;
    *url_s = us; *url_e = ue;
    *tit_s = ts; *tit_e = te;
    return 1;
}
|
431
|
|
|
|
|
|
|
|
|
432
|
262
|
|
|
|
|
|
/* Create ctx->refs on first use: the table object is taken from the
 * context's arena and initialised with mds_linkref_init. Calling this when
 * the table already exists does nothing. */
static void ensure_linkref_tab(mds_ctx* ctx) {
    if (!ctx->refs) {
        ctx->refs = (struct mds_linkref_tab*)mds_arena_alloc(
            &ctx->arena, sizeof(struct mds_linkref_tab));
        mds_linkref_init(ctx->refs, &ctx->arena);
    }
}
|
438
|
|
|
|
|
|
|
|
|
439
|
21
|
|
|
|
|
|
/* Create ctx->footnotes on first use: the table object is taken from the
 * context's arena and initialised with mds_footnote_init. Calling this when
 * the table already exists does nothing. */
static void ensure_footnote_tab(mds_ctx* ctx) {
    if (!ctx->footnotes) {
        ctx->footnotes = (struct mds_footnote_tab*)mds_arena_alloc(
            &ctx->arena, sizeof(struct mds_footnote_tab));
        mds_footnote_init(ctx->footnotes, &ctx->arena);
    }
}
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
/* Tier E.1 (extended) — line-level preprocessing pass that walks the input |
|
447
|
|
|
|
|
|
|
* once, identifies top-level `[^label]:` definitions, captures their |
|
448
|
|
|
|
|
|
|
* multi-line bodies (continuation = blank lines or 4-space-indented lines, |
|
449
|
|
|
|
|
|
|
* with 4-space dedent), registers them in the footnote table, and returns |
|
450
|
|
|
|
|
|
|
* a cleaned input with those lines elided. Only invoked when |
|
451
|
|
|
|
|
|
|
* MDS_FLAG_FOOTNOTES is set. Tracks fenced-code state to avoid matching |
|
452
|
|
|
|
|
|
|
* `[^…]:` inside fenced blocks. |
|
453
|
|
|
|
|
|
|
* |
|
454
|
|
|
|
|
|
|
* The cleaned buffer is allocated from the arena so it survives the |
|
455
|
|
|
|
|
|
|
* lifetime of the scan; the original ctx->input is left in place for |
|
456
|
|
|
|
|
|
|
* sub-scans (which re-run mds_block_scan on individual def bodies). |
|
457
|
|
|
|
|
|
|
* |
|
458
|
|
|
|
|
|
|
* NOTE: this is a coarse, document-level pass — defs nested inside list |
|
459
|
|
|
|
|
|
|
* items or blockquotes are still handled by finalize_paragraph's leading- |
|
460
|
|
|
|
|
|
|
* defs strip. */ |
|
461
|
458
|
|
|
|
|
|
/* True when the line [p, le) consists only of spaces and tabs (or is empty). */
static int fnpre_line_is_blank(const char* p, const char* le) {
    for (; p < le; p++) {
        if (*p != ' ' && *p != '\t') return 0;
    }
    return 1;
}

/* If the line [p, le) begins with a footnote continuation indent (four
 * spaces, or fewer spaces followed by one tab), return the first byte after
 * that indent; otherwise return NULL. */
static const char* fnpre_after_indent(const char* p, const char* le) {
    int sp = 0;
    while (p < le && sp < 4) {
        if (*p == ' ') { sp++; p++; }
        else if (*p == '\t') { sp = 4; p++; break; }
        else break;
    }
    return (sp >= 4) ? p : NULL;
}

/* Strip top-level footnote definitions out of ctx->input.
 *
 * Walks the input line by line. Lines inside a fenced code block are copied
 * through untouched. A line of the form ` {0,3}[^label]:` starts a
 * definition; its body is the rest of that line plus every following line
 * that is blank or indented by >=4 columns (indent removed). The body, with
 * trailing whitespace trimmed, is registered via mds_footnote_add under the
 * label bytes (which still point into the original input). All other lines
 * are copied to a new arena buffer, which then replaces ctx->input/ctx->len.
 *
 * Each body buffer is sized to the definition's own extent. Sizing it to
 * "the rest of the input" would make arena use grow with
 * (number of definitions x input size). */
static void preprocess_footnotes(mds_ctx* ctx) {
    const char* in = ctx->input;
    const size_t ilen = ctx->len;
    char* out;
    size_t olen = 0;
    const char* p;
    const char* end;
    int in_fence = 0;
    char fence_ch = 0;
    int fence_len = 0;

    if (!ilen) return;

    /* Worst case: nothing is removed, so the output equals the input. */
    out = (char*)mds_arena_alloc(&ctx->arena, ilen + 1);
    p = in;
    end = in + ilen;

    while (p < end) {
        const char* le = (const char*)memchr(p, '\n', (size_t)(end - p));
        const char* nxt;
        const char* q;
        int ind = 0;

        if (!le) le = end;
        nxt = (le < end) ? le + 1 : end;

        /* Walk up to 3 leading spaces. */
        q = p;
        while (q < le && *q == ' ' && ind < 3) { q++; ind++; }

        if (in_fence) {
            /* Close fence: a run of fence_ch at least fence_len long,
             * followed only by spaces/tabs. */
            int run = 0;
            while (q + run < le && q[run] == fence_ch) run++;
            if (run >= fence_len) {
                const char* tail = q + run;
                while (tail < le && (*tail == ' ' || *tail == '\t')) tail++;
                if (tail == le) in_fence = 0;
            }
        } else if (le - q >= 3 && (q[0] == '`' || q[0] == '~') &&
                   q[1] == q[0] && q[2] == q[0]) {
            /* Open fence. Backtick and tilde fences are treated alike; the
             * only goal is to hide definitions inside the fence body. */
            int fl = 3;
            while (q + fl < le && q[fl] == q[0]) fl++;
            fence_ch = q[0];
            fence_len = fl;
            in_fence = 1;
        } else if (le - q >= 4 && q[0] == '[' && q[1] == '^') {
            /* Candidate footnote definition `[^label]:`. The label cannot
             * contain a newline because [p, le) is a single line. */
            const char* lbs = q + 2;
            const char* lbe = lbs;
            while (lbe < le && *lbe != ']' && *lbe != '[') lbe++;

            if (lbe < le && *lbe == ']' && lbe > lbs &&
                lbe + 1 < le && lbe[1] == ':') {
                const char* bs = lbe + 2;
                const char* def_end = nxt;
                const char* cl;
                char* body;
                size_t blen = 0;

                while (bs < le && (*bs == ' ' || *bs == '\t')) bs++;

                /* Pass 1: find where the definition ends, i.e. the first
                 * following line that is neither blank nor indented. */
                while (def_end < end) {
                    const char* le2 = (const char*)memchr(
                        def_end, '\n', (size_t)(end - def_end));
                    if (!le2) le2 = end;
                    if (!fnpre_line_is_blank(def_end, le2) &&
                        !fnpre_after_indent(def_end, le2)) break;
                    def_end = (le2 < end) ? le2 + 1 : end;
                }

                /* The body never exceeds the bytes it was built from: each
                 * continuation line contributes one '\n' plus its content
                 * minus at least one byte of indent or newline. */
                body = (char*)mds_arena_alloc(&ctx->arena,
                                              (size_t)(def_end - bs) + 1);
                if (bs < le) {
                    memcpy(body, bs, (size_t)(le - bs));
                    blen = (size_t)(le - bs);
                }

                /* Pass 2: append the continuation lines. def_end is either
                 * `end` or the start of a line, so line boundaries here
                 * match the ones seen in pass 1. */
                for (cl = nxt; cl < def_end; ) {
                    const char* le2 = (const char*)memchr(
                        cl, '\n', (size_t)(def_end - cl));
                    const char* content;
                    if (!le2) le2 = def_end;
                    /* Blank lines contribute only a newline; indented lines
                     * contribute a newline plus their dedented content. */
                    content = fnpre_line_is_blank(cl, le2)
                                  ? le2
                                  : fnpre_after_indent(cl, le2);
                    body[blen++] = '\n';
                    if (le2 > content) {
                        memcpy(body + blen, content, (size_t)(le2 - content));
                        blen += (size_t)(le2 - content);
                    }
                    cl = (le2 < def_end) ? le2 + 1 : def_end;
                }

                /* Strip trailing blank/whitespace bytes for tidy rendering. */
                while (blen > 0 && (body[blen - 1] == '\n' ||
                                    body[blen - 1] == ' ' ||
                                    body[blen - 1] == '\t')) blen--;

                ensure_footnote_tab(ctx);
                mds_footnote_add(ctx->footnotes,
                                 lbs, (size_t)(lbe - lbs),
                                 body, blen);

                /* Do NOT copy the definition's lines into out. */
                p = def_end;
                continue;
            }
        }

        /* Default (also fence lines and non-matching `[^` lines): copy. */
        memcpy(out + olen, p, (size_t)(nxt - p));
        olen += (size_t)(nxt - p);
        p = nxt;
    }

    out[olen] = '\0';
    ctx->input = out;
    ctx->len = olen;
}
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
/* Recognises one GFM footnote definition at the start of [p, end).
 *
 * Accepted shape: up to three leading spaces, `[^`, a non-empty label
 * that contains no ']', '[' or newline, `]:`, then any run of spaces or
 * tabs (skipped), then the body.
 *
 * Continuation rules are collapsed in this MVP: the body extends to the
 * newline that precedes the next line shaped like ` {0,3}[^...]:`, or to
 * `end` when no such line follows. That terminating newline is not part
 * of the body; newlines inside the body are kept.
 *
 * On success returns 1 and stores
 *   *p_out              where the caller should resume (start of the
 *                       next definition line, or `end`),
 *   [*lbl_s, *lbl_e)    the label bytes,
 *   [*body_s, *body_e)  the body bytes.
 * Returns 0 without writing any out-parameter when no definition starts
 * at p. */
static int parse_footnote_def(const char* p, const char* end,
                              const char** p_out,
                              const char** lbl_s, const char** lbl_e,
                              const char** body_s, const char** body_e) {
    const char* cur;
    const char* label_begin;
    const char* label_end;
    const char* body_begin;
    const char* body_end;
    const char* resume;
    const char* look;
    int indent;

    /* Optional indentation: at most three spaces. */
    cur = p;
    for (indent = 0; indent < 3 && cur < end && *cur == ' '; indent++) cur++;

    /* Need at least four bytes left before looking at the `[^` opener. */
    if (end - cur < 4) return 0;
    if (cur[0] != '[' || cur[1] != '^') return 0;

    /* Label: runs to the closing ']' and must not be empty. */
    label_begin = cur + 2;
    label_end = label_begin;
    while (label_end < end && *label_end != ']' &&
           *label_end != '\n' && *label_end != '[') {
        label_end++;
    }
    if (label_end >= end || *label_end != ']' || label_end == label_begin) return 0;

    /* The ']' must be followed directly by ':'. */
    cur = label_end + 1;
    if (cur >= end || *cur != ':') return 0;
    cur++;

    /* Spaces/tabs after the colon are not part of the body. */
    while (cur < end && (*cur == ' ' || *cur == '\t')) cur++;

    /* Default outcome: the body runs to the end of the buffer. */
    body_begin = cur;
    body_end = end;
    resume = end;

    for (; cur < end; cur++) {
        if (*cur != '\n') continue;

        /* Peek at the following line: does it open another definition? */
        look = cur + 1;
        for (indent = 0; indent < 3 && look < end && *look == ' '; indent++) look++;
        if (end - look < 4 || look[0] != '[' || look[1] != '^') continue;

        look += 2;
        while (look < end && *look != ']' && *look != '\n' && *look != '[') look++;
        if (look < end && *look == ']' && look + 1 < end && look[1] == ':') {
            body_end = cur;     /* the newline itself is excluded */
            resume = cur + 1;   /* next definition starts here */
            break;
        }
    }

    *p_out = resume;
    *lbl_s = label_begin;
    *lbl_e = label_end;
    *body_s = body_begin;
    *body_e = body_end;
    return 1;
}
|
693
|
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
/* ---------- GFM tables ---------- */ |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
#include "mds.h" |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
static const char* next_line(const char* p, const char* end, |
|
699
|
|
|
|
|
|
|
const char** line_end_out); |
|
700
|
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
#define MDS_TBL_MAX_COLS 64 |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
/* Tables: memchr-driven pipe scan. libc memchr is SIMD-vectorised |
|
704
|
|
|
|
|
|
|
* on every platform we care about (Apple libSystem uses NEON, glibc uses |
|
705
|
|
|
|
|
|
|
* AVX2), so iterating candidate pipes is dramatically faster than a |
|
706
|
|
|
|
|
|
|
* byte-by-byte loop. We confirm each pipe is unescaped by counting the |
|
707
|
|
|
|
|
|
|
* run of preceding backslashes — odd count means escaped. */ |
|
708
|
|
|
|
|
|
|
MDS_HOT |
|
709
|
768
|
|
|
|
|
|
static const char* tbl_find_pipe(const char* p, const char* end) {
    /* Returns the first unescaped '|' in [p, end), or NULL when there is
     * none. Candidates come from memchr; a candidate is escaped when the
     * run of backslashes directly before it has odd length. The run is
     * only counted back to the current search start, which is `p` at
     * first and the byte after the last rejected pipe afterwards. */
    const char* from;
    const char* hit;
    const char* back;

    for (from = p; from < end; from = hit + 1) {
        hit = (const char*)memchr(from, '|', (size_t)(end - from));
        if (hit == NULL) break;

        /* Walk back over the backslashes that precede the candidate. */
        back = hit;
        while (back > from && back[-1] == '\\') back--;

        /* Even run length (including zero) means the pipe is live. */
        if (((hit - back) & 1u) == 0u) return hit;
    }
    return NULL;
}
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
/* Does the cell contain a backslash? Cheap check that lets the fast |
|
726
|
|
|
|
|
|
|
* path skip the per-byte unescape pass entirely. */ |
|
727
|
|
|
|
|
|
|
MDS_ALWAYS_INLINE static int tbl_cell_needs_unescape(const char* s, size_t n) { |
|
728
|
320
|
|
|
|
|
|
return memchr(s, '\\', n) != NULL; |
|
729
|
|
|
|
|
|
|
} |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
/* Cuts one table row into cells at every unescaped '|'.
 *
 * Before splitting, spaces/tabs around the whole line are dropped, then
 * one leading '|' and one trailing '|' (the trailing one only when it is
 * not backslash-escaped). Each cell is itself trimmed of surrounding
 * spaces/tabs.
 *
 * For the first `max_cells` cells, out_s[i] receives the cell start and
 * out_n[i] its length; cells point into `line`, nothing is copied.
 * The return value is the total number of cells found and may exceed
 * `max_cells`; entries beyond the limit are counted but not stored. */
static unsigned tbl_split_cells(const char* line, size_t len,
                                const char** out_s, size_t* out_n,
                                unsigned max_cells) {
    const char* cur;
    const char* stop;
    const char* sep;
    const char* lo;
    const char* hi;
    const char* scan;
    unsigned ncells;
    int escaped;

    cur = line;
    stop = line + len;

    /* Whitespace around the whole row. */
    while (cur < stop && (*cur == ' ' || *cur == '\t')) cur++;
    while (stop > cur && (stop[-1] == ' ' || stop[-1] == '\t')) stop--;

    /* One optional opening pipe. */
    if (cur < stop && *cur == '|') cur++;

    /* One optional closing pipe, kept when an odd backslash run escapes it. */
    if (stop > cur && stop[-1] == '|') {
        escaped = 0;
        for (scan = stop - 1; scan > cur && scan[-1] == '\\'; scan--) {
            escaped = !escaped;
        }
        if (!escaped) stop--;
    }

    ncells = 0;
    while (cur <= stop) {
        sep = tbl_find_pipe(cur, stop);
        lo = cur;
        hi = sep ? sep : stop;

        /* Whitespace around this cell. */
        while (lo < hi && (*lo == ' ' || *lo == '\t')) lo++;
        while (hi > lo && (hi[-1] == ' ' || hi[-1] == '\t')) hi--;

        if (ncells < max_cells) {
            out_s[ncells] = lo;
            out_n[ncells] = (size_t)(hi - lo);
        }
        ncells++;

        if (!sep) break;    /* that was the last cell */
        cur = sep + 1;
    }
    return ncells;
}
|
779
|
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
/* Parse a separator row: zero or more spaces, optional |, then per cell |
|
781
|
|
|
|
|
|
|
* `:?-+:?` separated by `|`, optional trailing |. Returns cell count and |
|
782
|
|
|
|
|
|
|
* fills aligns[]. Returns 0 if not a valid separator. */ |
|
783
|
147
|
|
|
|
|
|
static unsigned tbl_parse_separator(const char* line, size_t len, |
|
784
|
|
|
|
|
|
|
mds_align* aligns, unsigned max_cells) { |
|
785
|
|
|
|
|
|
|
const char* p; |
|
786
|
|
|
|
|
|
|
const char* end; |
|
787
|
|
|
|
|
|
|
unsigned count; |
|
788
|
|
|
|
|
|
|
const char* cs; |
|
789
|
|
|
|
|
|
|
const char* ce; |
|
790
|
|
|
|
|
|
|
int left, right; |
|
791
|
|
|
|
|
|
|
const char* q; |
|
792
|
|
|
|
|
|
|
|
|
793
|
147
|
|
|
|
|
|
p = line; |
|
794
|
147
|
|
|
|
|
|
end = line + len; |
|
795
|
147
|
50
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
796
|
147
|
50
|
|
|
|
|
while (end > p && (end[-1] == ' ' || end[-1] == '\t')) end--; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
797
|
147
|
50
|
|
|
|
|
if (p >= end) return 0; |
|
798
|
147
|
100
|
|
|
|
|
if (*p == '|') p++; |
|
799
|
|
|
|
|
|
|
/* Strip trailing |. */ |
|
800
|
147
|
50
|
|
|
|
|
if (end > p && end[-1] == '|') end--; |
|
|
|
100
|
|
|
|
|
|
|
801
|
147
|
|
|
|
|
|
count = 0; |
|
802
|
296
|
50
|
|
|
|
|
while (p <= end) { |
|
803
|
296
|
|
|
|
|
|
cs = p; |
|
804
|
1884
|
100
|
|
|
|
|
while (p < end && *p != '|') p++; |
|
|
|
100
|
|
|
|
|
|
|
805
|
296
|
|
|
|
|
|
ce = p; |
|
806
|
|
|
|
|
|
|
/* Trim spaces around the cell. */ |
|
807
|
504
|
50
|
|
|
|
|
while (cs < ce && (*cs == ' ' || *cs == '\t')) cs++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
808
|
498
|
50
|
|
|
|
|
while (ce > cs && (ce[-1] == ' ' || ce[-1] == '\t')) ce--; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
809
|
296
|
50
|
|
|
|
|
if (cs >= ce) return 0; |
|
810
|
296
|
|
|
|
|
|
left = (*cs == ':'); |
|
811
|
296
|
100
|
|
|
|
|
if (left) cs++; |
|
812
|
296
|
50
|
|
|
|
|
right = (ce > cs && ce[-1] == ':'); |
|
|
|
100
|
|
|
|
|
|
|
813
|
296
|
100
|
|
|
|
|
if (right) ce--; |
|
814
|
296
|
50
|
|
|
|
|
if (cs >= ce) return 0; |
|
815
|
1350
|
100
|
|
|
|
|
for (q = cs; q < ce; q++) |
|
816
|
1062
|
100
|
|
|
|
|
if (*q != '-') return 0; |
|
817
|
288
|
50
|
|
|
|
|
if (count < max_cells) { |
|
818
|
288
|
100
|
|
|
|
|
aligns[count] = left && right ? MDS_ALIGN_CENTER : |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
left ? MDS_ALIGN_LEFT : |
|
820
|
|
|
|
|
|
|
right ? MDS_ALIGN_RIGHT : MDS_ALIGN_NONE; |
|
821
|
|
|
|
|
|
|
} |
|
822
|
288
|
|
|
|
|
|
count++; |
|
823
|
288
|
100
|
|
|
|
|
if (p >= end) break; |
|
824
|
149
|
|
|
|
|
|
p++; /* skip '|' */ |
|
825
|
|
|
|
|
|
|
} |
|
826
|
139
|
|
|
|
|
|
return count; |
|
827
|
|
|
|
|
|
|
} |
|
828
|
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
/* Finds the boundaries of the line that starts at p inside [p, end).
 *
 * Returns a pointer to the start of the following line, i.e. one past
 * the first '\n', or `end` when the range contains no newline.
 * *le_out receives the end of the current line's content: the position
 * of that '\n' (or `end`), moved back by one byte when the byte just
 * before it is a '\r'. */
static const char* tbl_next_line(const char* p, const char* end, const char** le_out) {
    const char* newline;
    const char* content_end;
    const char* following;

    newline = (const char*)memchr(p, '\n', (size_t)(end - p));
    if (newline) {
        content_end = newline;
        following = newline + 1;
    } else {
        content_end = end;
        following = end;
    }

    /* Drop the CR of a CRLF (or a lone trailing CR). */
    if (content_end > p && content_end[-1] == '\r') content_end--;

    *le_out = content_end;
    return following;
}
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
/* Returns 1 when the cell text has no byte that could begin an inline |
|
840
|
|
|
|
|
|
|
* construct, so the inline scanner can be bypassed entirely and the |
|
841
|
|
|
|
|
|
|
* text emitted as a plain EV_TEXT event. Mirrors the trigger set used |
|
842
|
|
|
|
|
|
|
* by mds_inline_scan's fast path. Tables of single-word cells (the |
|
843
|
|
|
|
|
|
|
* common case in real-world reports) hit this on every cell. */ |
|
844
|
|
|
|
|
|
|
MDS_ALWAYS_INLINE static int tbl_cell_is_plain(const char* s, size_t n) { |
|
845
|
|
|
|
|
|
|
size_t i; |
|
846
|
273
|
50
|
|
|
|
|
if (n == 0) return 1; |
|
847
|
1328
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
848
|
1085
|
|
|
|
|
|
unsigned char c = (unsigned char)s[i]; |
|
849
|
1085
|
100
|
|
|
|
|
if (c == '*' || c == '_' || c == '~' || c == '`' || |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
850
|
1061
|
50
|
|
|
|
|
c == '[' || c == ']' || c == '!' || c == '<' || |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
851
|
1055
|
50
|
|
|
|
|
c == '&' || c == '\\' || c == '\n') |
|
|
|
50
|
|
|
|
|
|
|
852
|
30
|
|
|
|
|
|
return 0; |
|
853
|
|
|
|
|
|
|
} |
|
854
|
|
|
|
|
|
|
/* Reject trailing double-space (CommonMark hard-break candidate). */ |
|
855
|
243
|
100
|
|
|
|
|
if (n >= 2 && s[n-1] == ' ' && s[n-2] == ' ') return 0; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
856
|
243
|
|
|
|
|
|
return 1; |
|
857
|
|
|
|
|
|
|
} |
|
858
|
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
/* Emit a row whose cells are already split (cs[i], cn[i]). |
|
860
|
|
|
|
|
|
|
* `count` is the number of cells the split produced; we always emit |
|
861
|
|
|
|
|
|
|
* exactly `nhead` cells, padding with empties if needed. */ |
|
862
|
|
|
|
|
|
|
MDS_HOT |
|
863
|
164
|
|
|
|
|
|
static void tbl_emit_row_pre(bscanner* b, |
|
864
|
|
|
|
|
|
|
const char* const* cs, const size_t* cn, |
|
865
|
|
|
|
|
|
|
unsigned count, |
|
866
|
|
|
|
|
|
|
const mds_align* aligns, unsigned nhead) { |
|
867
|
|
|
|
|
|
|
mds_block_detail d; |
|
868
|
|
|
|
|
|
|
unsigned i; |
|
869
|
|
|
|
|
|
|
mds_block_detail cd; |
|
870
|
|
|
|
|
|
|
const char* s; |
|
871
|
|
|
|
|
|
|
size_t n; |
|
872
|
|
|
|
|
|
|
size_t off; |
|
873
|
|
|
|
|
|
|
char* dst; |
|
874
|
|
|
|
|
|
|
size_t bl; |
|
875
|
|
|
|
|
|
|
size_t j; |
|
876
|
|
|
|
|
|
|
ev_rec* e; |
|
877
|
|
|
|
|
|
|
|
|
878
|
164
|
|
|
|
|
|
d.u.table_cell.align = MDS_ALIGN_NONE; /* placeholder, row has no align */ |
|
879
|
164
|
|
|
|
|
|
sax_enter(b, MDS_BLK_TABLE_ROW, &d); |
|
880
|
514
|
100
|
|
|
|
|
for (i = 0; i < nhead; i++) { |
|
881
|
350
|
|
|
|
|
|
cd.u.table_cell.align = aligns[i]; |
|
882
|
350
|
|
|
|
|
|
sax_enter(b, MDS_BLK_TABLE_CELL, &cd); |
|
883
|
350
|
100
|
|
|
|
|
if (i < count && cn[i]) { |
|
|
|
100
|
|
|
|
|
|
|
884
|
320
|
|
|
|
|
|
s = cs[i]; |
|
885
|
320
|
|
|
|
|
|
n = cn[i]; |
|
886
|
320
|
100
|
|
|
|
|
if (MDS_LIKELY(!tbl_cell_needs_unescape(s, n))) { |
|
887
|
|
|
|
|
|
|
/* Fast path: no '\\' anywhere in the cell. If the cell |
|
888
|
|
|
|
|
|
|
* is also free of inline triggers (the common single- |
|
889
|
|
|
|
|
|
|
* word case), emit EV_TEXT directly so sax_flush calls |
|
890
|
|
|
|
|
|
|
* cb.text without going through mds_inline_scan. Else |
|
891
|
|
|
|
|
|
|
* fall back to EV_INLINE. Both branches use the same |
|
892
|
|
|
|
|
|
|
* pool_intern, so input bytes are never double-copied. */ |
|
893
|
273
|
100
|
|
|
|
|
if (MDS_LIKELY(tbl_cell_is_plain(s, n))) { |
|
894
|
243
|
|
|
|
|
|
sax_text(b, s, n); |
|
895
|
|
|
|
|
|
|
} else { |
|
896
|
30
|
|
|
|
|
|
sax_inline_text(b, s, n); |
|
897
|
|
|
|
|
|
|
} |
|
898
|
|
|
|
|
|
|
} else { |
|
899
|
|
|
|
|
|
|
/* Slow path: copy-and-unescape '\\|' → '|' in one pass |
|
900
|
|
|
|
|
|
|
* directly into the bytepool, then record the event. */ |
|
901
|
47
|
|
|
|
|
|
buf_grow(&b->bytepool, &b->bp_cap, b->bp_len + n + 1); |
|
902
|
47
|
|
|
|
|
|
off = b->bp_len; |
|
903
|
47
|
|
|
|
|
|
dst = b->bytepool + off; |
|
904
|
47
|
|
|
|
|
|
bl = 0; |
|
905
|
471
|
100
|
|
|
|
|
for (j = 0; j < n; j++) { |
|
906
|
424
|
100
|
|
|
|
|
if (s[j] == '\\' && j + 1 < n && s[j+1] == '|') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
907
|
29
|
|
|
|
|
|
dst[bl++] = '|'; j++; |
|
908
|
|
|
|
|
|
|
} else { |
|
909
|
395
|
|
|
|
|
|
dst[bl++] = s[j]; |
|
910
|
|
|
|
|
|
|
} |
|
911
|
|
|
|
|
|
|
} |
|
912
|
47
|
|
|
|
|
|
b->bp_len += bl; |
|
913
|
47
|
|
|
|
|
|
e = ev_alloc(b); |
|
914
|
47
|
|
|
|
|
|
e->type = EV_INLINE; |
|
915
|
47
|
|
|
|
|
|
e->u.bytes.off = off; |
|
916
|
47
|
|
|
|
|
|
e->u.bytes.len = bl; |
|
917
|
|
|
|
|
|
|
} |
|
918
|
|
|
|
|
|
|
} |
|
919
|
350
|
|
|
|
|
|
sax_leave(b, MDS_BLK_TABLE_CELL); |
|
920
|
|
|
|
|
|
|
} |
|
921
|
164
|
|
|
|
|
|
sax_leave(b, MDS_BLK_TABLE_ROW); |
|
922
|
164
|
|
|
|
|
|
} |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
/* Convenience wrapper for body rows: split + emit. */ |
|
925
|
|
|
|
|
|
|
MDS_HOT |
|
926
|
96
|
|
|
|
|
|
static void tbl_emit_row(bscanner* b, const char* line, size_t len, |
|
927
|
|
|
|
|
|
|
const mds_align* aligns, unsigned nhead) { |
|
928
|
|
|
|
|
|
|
const char* cs[MDS_TBL_MAX_COLS]; |
|
929
|
|
|
|
|
|
|
size_t cn[MDS_TBL_MAX_COLS]; |
|
930
|
|
|
|
|
|
|
unsigned count; |
|
931
|
|
|
|
|
|
|
|
|
932
|
96
|
|
|
|
|
|
count = tbl_split_cells(line, len, cs, cn, MDS_TBL_MAX_COLS); |
|
933
|
96
|
|
|
|
|
|
tbl_emit_row_pre(b, cs, cn, count, aligns, nhead); |
|
934
|
96
|
|
|
|
|
|
} |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
/* Cheap detection: returns 1 if the first two lines starting at p form a |
|
937
|
|
|
|
|
|
|
* valid GFM table header + separator (matching column count). Emits nothing. */ |
|
938
|
947
|
|
|
|
|
|
static int tbl_peek_header(const char* p, const char* end) { |
|
939
|
|
|
|
|
|
|
const char* le1; |
|
940
|
|
|
|
|
|
|
const char* p2; |
|
941
|
|
|
|
|
|
|
const char* hs[MDS_TBL_MAX_COLS]; |
|
942
|
|
|
|
|
|
|
size_t hn[MDS_TBL_MAX_COLS]; |
|
943
|
|
|
|
|
|
|
unsigned nhead; |
|
944
|
|
|
|
|
|
|
const char* le2; |
|
945
|
|
|
|
|
|
|
mds_align aligns[MDS_TBL_MAX_COLS]; |
|
946
|
|
|
|
|
|
|
unsigned nsep; |
|
947
|
|
|
|
|
|
|
|
|
948
|
947
|
|
|
|
|
|
memset(aligns, 0, sizeof aligns); |
|
949
|
947
|
|
|
|
|
|
p2 = tbl_next_line(p, end, &le1); |
|
950
|
947
|
100
|
|
|
|
|
if (p2 >= end) return 0; |
|
951
|
103
|
100
|
|
|
|
|
if (!tbl_find_pipe(p, le1)) return 0; |
|
952
|
79
|
|
|
|
|
|
nhead = tbl_split_cells(p, (size_t)(le1 - p), hs, hn, MDS_TBL_MAX_COLS); |
|
953
|
79
|
50
|
|
|
|
|
if (nhead < 1 || nhead > MDS_TBL_MAX_COLS) return 0; |
|
|
|
50
|
|
|
|
|
|
|
954
|
79
|
|
|
|
|
|
(void)tbl_next_line(p2, end, &le2); |
|
955
|
79
|
|
|
|
|
|
nsep = tbl_parse_separator(p2, (size_t)(le2 - p2), aligns, MDS_TBL_MAX_COLS); |
|
956
|
79
|
|
|
|
|
|
return (nsep == nhead); |
|
957
|
|
|
|
|
|
|
} |
|
958
|
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
/* Attempt to emit a table starting at the given paragraph buffer. |
|
960
|
|
|
|
|
|
|
* Returns the number of bytes consumed, or 0 if no table starts here. */ |
|
961
|
68
|
|
|
|
|
|
static size_t try_emit_table(bscanner* b, const char* p, const char* end) { |
|
962
|
|
|
|
|
|
|
/* Read first line. */ |
|
963
|
|
|
|
|
|
|
const char* le1; |
|
964
|
|
|
|
|
|
|
const char* p2; |
|
965
|
|
|
|
|
|
|
const char* hs[MDS_TBL_MAX_COLS]; |
|
966
|
|
|
|
|
|
|
size_t hn[MDS_TBL_MAX_COLS]; |
|
967
|
|
|
|
|
|
|
unsigned nhead; |
|
968
|
|
|
|
|
|
|
const char* le2; |
|
969
|
|
|
|
|
|
|
const char* p3; |
|
970
|
|
|
|
|
|
|
mds_align aligns[MDS_TBL_MAX_COLS]; |
|
971
|
|
|
|
|
|
|
unsigned nsep; |
|
972
|
|
|
|
|
|
|
mds_block_detail d; |
|
973
|
|
|
|
|
|
|
mds_align* alstore; |
|
974
|
|
|
|
|
|
|
mds_block_detail hd; |
|
975
|
|
|
|
|
|
|
const char* row; |
|
976
|
|
|
|
|
|
|
int body_open; |
|
977
|
|
|
|
|
|
|
const char* le; |
|
978
|
|
|
|
|
|
|
const char* nx; |
|
979
|
|
|
|
|
|
|
mds_block_detail bd; |
|
980
|
|
|
|
|
|
|
|
|
981
|
68
|
|
|
|
|
|
memset(aligns, 0, sizeof aligns); |
|
982
|
68
|
|
|
|
|
|
p2 = tbl_next_line(p, end, &le1); |
|
983
|
68
|
50
|
|
|
|
|
if (p2 >= end) return 0; /* need a second line */ |
|
984
|
|
|
|
|
|
|
/* Header must contain at least one unescaped pipe. */ |
|
985
|
68
|
50
|
|
|
|
|
if (!tbl_find_pipe(p, le1)) return 0; |
|
986
|
|
|
|
|
|
|
/* Count header cells. */ |
|
987
|
68
|
|
|
|
|
|
nhead = tbl_split_cells(p, (size_t)(le1 - p), hs, hn, MDS_TBL_MAX_COLS); |
|
988
|
68
|
50
|
|
|
|
|
if (nhead < 1 || nhead > MDS_TBL_MAX_COLS) return 0; |
|
|
|
50
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
/* Read separator line. */ |
|
990
|
68
|
|
|
|
|
|
p3 = tbl_next_line(p2, end, &le2); |
|
991
|
68
|
|
|
|
|
|
nsep = tbl_parse_separator(p2, (size_t)(le2 - p2), aligns, MDS_TBL_MAX_COLS); |
|
992
|
68
|
50
|
|
|
|
|
if (nsep != nhead) return 0; |
|
993
|
|
|
|
|
|
|
/* Emit table. */ |
|
994
|
68
|
|
|
|
|
|
memset(&d, 0, sizeof d); |
|
995
|
68
|
|
|
|
|
|
d.u.table.ncols = nhead; |
|
996
|
|
|
|
|
|
|
/* Persist aligns in arena so the renderer can see them. */ |
|
997
|
68
|
|
|
|
|
|
alstore = (mds_align*)mds_arena_alloc(&b->ctx->arena, |
|
998
|
|
|
|
|
|
|
sizeof(mds_align) * nhead); |
|
999
|
68
|
|
|
|
|
|
memcpy(alstore, aligns, sizeof(mds_align) * nhead); |
|
1000
|
68
|
|
|
|
|
|
d.u.table.aligns = alstore; |
|
1001
|
68
|
|
|
|
|
|
sax_enter(b, MDS_BLK_TABLE, &d); |
|
1002
|
|
|
|
|
|
|
/* Head. Reuse the already-split header instead of splitting again. */ |
|
1003
|
68
|
|
|
|
|
|
memset(&hd, 0, sizeof hd); |
|
1004
|
68
|
|
|
|
|
|
sax_enter(b, MDS_BLK_TABLE_HEAD, &hd); |
|
1005
|
68
|
|
|
|
|
|
tbl_emit_row_pre(b, hs, hn, nhead, aligns, nhead); |
|
1006
|
68
|
|
|
|
|
|
sax_leave(b, MDS_BLK_TABLE_HEAD); |
|
1007
|
|
|
|
|
|
|
/* Body: keep consuming lines that contain at least one unescaped pipe. */ |
|
1008
|
68
|
|
|
|
|
|
row = p3; |
|
1009
|
68
|
|
|
|
|
|
body_open = 0; |
|
1010
|
164
|
100
|
|
|
|
|
while (row < end) { |
|
1011
|
96
|
|
|
|
|
|
nx = tbl_next_line(row, end, &le); |
|
1012
|
96
|
50
|
|
|
|
|
if (le == row) { row = nx; break; } /* blank */ |
|
1013
|
96
|
50
|
|
|
|
|
if (!tbl_find_pipe(row, le)) break; |
|
1014
|
96
|
100
|
|
|
|
|
if (!body_open) { |
|
1015
|
56
|
|
|
|
|
|
memset(&bd, 0, sizeof bd); |
|
1016
|
56
|
|
|
|
|
|
sax_enter(b, MDS_BLK_TABLE_BODY, &bd); |
|
1017
|
56
|
|
|
|
|
|
body_open = 1; |
|
1018
|
|
|
|
|
|
|
} |
|
1019
|
96
|
|
|
|
|
|
tbl_emit_row(b, row, (size_t)(le - row), aligns, nhead); |
|
1020
|
96
|
|
|
|
|
|
row = nx; |
|
1021
|
|
|
|
|
|
|
} |
|
1022
|
68
|
100
|
|
|
|
|
if (body_open) sax_leave(b, MDS_BLK_TABLE_BODY); |
|
1023
|
68
|
|
|
|
|
|
sax_leave(b, MDS_BLK_TABLE); |
|
1024
|
68
|
|
|
|
|
|
return (size_t)(row - p); |
|
1025
|
|
|
|
|
|
|
} |
|
1026
|
|
|
|
|
|
|
|
|
1027
|
|
|
|
|
|
|
/* ---------- leaf finalisation ---------- */ |
|
1028
|
|
|
|
|
|
|
|
|
1029
|
3261
|
|
|
|
|
|
static void finalize_paragraph(bscanner* b) { |
|
1030
|
|
|
|
|
|
|
/* Consume any leading link-reference and footnote definitions. */ |
|
1031
|
|
|
|
|
|
|
char* p; |
|
1032
|
|
|
|
|
|
|
char* end; |
|
1033
|
|
|
|
|
|
|
unsigned _bf; |
|
1034
|
|
|
|
|
|
|
const char* np; |
|
1035
|
|
|
|
|
|
|
const char *ls, *le_, *us, *ue, *ts, *te; |
|
1036
|
|
|
|
|
|
|
const char *fls, *fle, *bs, *be; |
|
1037
|
|
|
|
|
|
|
size_t blen; |
|
1038
|
|
|
|
|
|
|
size_t rem; |
|
1039
|
|
|
|
|
|
|
char c; |
|
1040
|
|
|
|
|
|
|
size_t lead; |
|
1041
|
|
|
|
|
|
|
int sx; |
|
1042
|
|
|
|
|
|
|
mds_block_detail d; |
|
1043
|
|
|
|
|
|
|
const char* tp; |
|
1044
|
|
|
|
|
|
|
const char* tend; |
|
1045
|
|
|
|
|
|
|
const char* run; |
|
1046
|
|
|
|
|
|
|
const char* le; |
|
1047
|
|
|
|
|
|
|
const char* nx; |
|
1048
|
|
|
|
|
|
|
const char* re; |
|
1049
|
|
|
|
|
|
|
size_t consumed; |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
3261
|
|
|
|
|
|
p = b->para; |
|
1052
|
3261
|
|
|
|
|
|
end = b->para + b->para_len; |
|
1053
|
3261
|
|
|
|
|
|
_bf = b->ctx->flags; |
|
1054
|
3523
|
100
|
|
|
|
|
while (p < end) { |
|
1055
|
|
|
|
|
|
|
/* Footnote def MUST be checked before linkref, since a label |
|
1056
|
|
|
|
|
|
|
* beginning with `^` would otherwise be eaten as a linkref. */ |
|
1057
|
3285
|
100
|
|
|
|
|
if ((_bf & MDS_FLAG_FOOTNOTES)) { |
|
1058
|
909
|
50
|
|
|
|
|
if (parse_footnote_def(p, end, &np, &fls, &fle, &bs, &be)) { |
|
1059
|
0
|
|
|
|
|
|
ensure_footnote_tab(b->ctx); |
|
1060
|
0
|
|
|
|
|
|
blen = (size_t)(be - bs); |
|
1061
|
0
|
0
|
|
|
|
|
while (blen > 0 && (bs[blen - 1] == '\n' || bs[blen - 1] == ' ' || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1062
|
0
|
0
|
|
|
|
|
bs[blen - 1] == '\t')) blen--; |
|
1063
|
0
|
|
|
|
|
|
mds_footnote_add(b->ctx->footnotes, |
|
1064
|
0
|
|
|
|
|
|
fls, (size_t)(fle - fls), |
|
1065
|
|
|
|
|
|
|
bs, blen); |
|
1066
|
0
|
|
|
|
|
|
p = (char*)np; |
|
1067
|
0
|
|
|
|
|
|
continue; |
|
1068
|
|
|
|
|
|
|
} |
|
1069
|
|
|
|
|
|
|
} |
|
1070
|
6568
|
|
|
|
|
|
if (!(_bf & MDS_FLAG_NO_REFERENCES) && |
|
1071
|
3283
|
|
|
|
|
|
parse_linkref(p, end, &np, &ls, &le_, &us, &ue, &ts, &te)) { |
|
1072
|
262
|
|
|
|
|
|
ensure_linkref_tab(b->ctx); |
|
1073
|
418
|
|
|
|
|
|
mds_linkref_add(b->ctx->refs, |
|
1074
|
262
|
|
|
|
|
|
ls, (size_t)(le_ - ls), |
|
1075
|
262
|
|
|
|
|
|
us, (size_t)(ue - us), |
|
1076
|
418
|
100
|
|
|
|
|
ts ? ts : "", ts ? (size_t)(te - ts) : 0); |
|
|
|
100
|
|
|
|
|
|
|
1077
|
262
|
|
|
|
|
|
p = (char*)np; |
|
1078
|
262
|
|
|
|
|
|
continue; |
|
1079
|
|
|
|
|
|
|
} |
|
1080
|
3023
|
|
|
|
|
|
break; |
|
1081
|
|
|
|
|
|
|
} |
|
1082
|
|
|
|
|
|
|
/* Shift remaining content to the start of the buffer. */ |
|
1083
|
3261
|
100
|
|
|
|
|
if (p > b->para) { |
|
1084
|
244
|
|
|
|
|
|
rem = (size_t)(end - p); |
|
1085
|
244
|
100
|
|
|
|
|
if (rem) memmove(b->para, p, rem); |
|
1086
|
244
|
|
|
|
|
|
b->para_len = rem; |
|
1087
|
|
|
|
|
|
|
} |
|
1088
|
|
|
|
|
|
|
/* Trim trailing whitespace. */ |
|
1089
|
3291
|
100
|
|
|
|
|
while (b->para_len > 0) { |
|
1090
|
3053
|
|
|
|
|
|
c = b->para[b->para_len - 1]; |
|
1091
|
3053
|
100
|
|
|
|
|
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') b->para_len--; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
else break; |
|
1093
|
|
|
|
|
|
|
} |
|
1094
|
|
|
|
|
|
|
/* Trim leading whitespace. */ |
|
1095
|
3261
|
|
|
|
|
|
lead = 0; |
|
1096
|
3262
|
100
|
|
|
|
|
while (lead < b->para_len && |
|
1097
|
3024
|
100
|
|
|
|
|
(b->para[lead] == ' ' || b->para[lead] == '\t' || b->para[lead] == '\n')) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1098
|
1
|
|
|
|
|
|
lead++; |
|
1099
|
3261
|
100
|
|
|
|
|
if (lead) { memmove(b->para, b->para + lead, b->para_len - lead); b->para_len -= lead; } |
|
1100
|
3261
|
100
|
|
|
|
|
if (b->para_len == 0) { |
|
1101
|
238
|
|
|
|
|
|
b->setext_level = 0; |
|
1102
|
238
|
|
|
|
|
|
return; |
|
1103
|
|
|
|
|
|
|
} |
|
1104
|
3023
|
|
|
|
|
|
sx = b->setext_level; |
|
1105
|
3023
|
|
|
|
|
|
b->setext_level = 0; |
|
1106
|
3023
|
|
|
|
|
|
memset(&d, 0, sizeof d); |
|
1107
|
3023
|
100
|
|
|
|
|
if (sx) { |
|
1108
|
78
|
|
|
|
|
|
d.u.heading.level = sx; |
|
1109
|
78
|
|
|
|
|
|
sax_enter(b, MDS_BLK_HEADING, &d); |
|
1110
|
78
|
|
|
|
|
|
sax_inline_text(b, b->para, b->para_len); |
|
1111
|
78
|
|
|
|
|
|
sax_leave(b, MDS_BLK_HEADING); |
|
1112
|
2945
|
100
|
|
|
|
|
} else if ((b->ctx->flags & MDS_FLAG_TABLES) && |
|
1113
|
913
|
100
|
|
|
|
|
b->para_len >= 3) { |
|
1114
|
|
|
|
|
|
|
/* Try GFM table detection: header line | separator. May appear |
|
1115
|
|
|
|
|
|
|
* embedded — split paragraph into pre-text, table(s), post-text. */ |
|
1116
|
912
|
|
|
|
|
|
tp = b->para; |
|
1117
|
912
|
|
|
|
|
|
tend = b->para + b->para_len; |
|
1118
|
912
|
|
|
|
|
|
run = tp; |
|
1119
|
1859
|
100
|
|
|
|
|
while (tp < tend) { |
|
1120
|
|
|
|
|
|
|
/* find current line bounds */ |
|
1121
|
947
|
|
|
|
|
|
nx = next_line(tp, tend, &le); |
|
1122
|
947
|
100
|
|
|
|
|
if (tbl_peek_header(tp, tend)) { |
|
1123
|
|
|
|
|
|
|
/* flush any prior text as paragraph BEFORE emitting table */ |
|
1124
|
68
|
100
|
|
|
|
|
if (tp > run) { |
|
1125
|
3
|
|
|
|
|
|
re = tp; |
|
1126
|
6
|
50
|
|
|
|
|
while (re > run && (re[-1] == '\n' || re[-1] == '\r' || |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1127
|
6
|
50
|
|
|
|
|
re[-1] == ' ' || re[-1] == '\t')) re--; |
|
|
|
50
|
|
|
|
|
|
|
1128
|
3
|
50
|
|
|
|
|
if (re > run) { |
|
1129
|
3
|
|
|
|
|
|
sax_enter(b, MDS_BLK_PARAGRAPH, &d); |
|
1130
|
3
|
|
|
|
|
|
sax_inline_text(b, run, (size_t)(re - run)); |
|
1131
|
3
|
|
|
|
|
|
sax_leave(b, MDS_BLK_PARAGRAPH); |
|
1132
|
|
|
|
|
|
|
} |
|
1133
|
|
|
|
|
|
|
} |
|
1134
|
68
|
|
|
|
|
|
consumed = try_emit_table(b, tp, tend); |
|
1135
|
68
|
|
|
|
|
|
tp = tp + consumed; |
|
1136
|
68
|
|
|
|
|
|
run = tp; |
|
1137
|
68
|
|
|
|
|
|
continue; |
|
1138
|
|
|
|
|
|
|
} |
|
1139
|
|
|
|
|
|
|
(void)le; |
|
1140
|
879
|
|
|
|
|
|
tp = nx; |
|
1141
|
|
|
|
|
|
|
} |
|
1142
|
912
|
100
|
|
|
|
|
if (run < tend) { |
|
1143
|
844
|
|
|
|
|
|
re = tend; |
|
1144
|
844
|
50
|
|
|
|
|
while (re > run && (re[-1] == '\n' || re[-1] == '\r' || |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1145
|
844
|
50
|
|
|
|
|
re[-1] == ' ' || re[-1] == '\t')) re--; |
|
|
|
50
|
|
|
|
|
|
|
1146
|
844
|
50
|
|
|
|
|
if (re > run) { |
|
1147
|
844
|
|
|
|
|
|
sax_enter(b, MDS_BLK_PARAGRAPH, &d); |
|
1148
|
844
|
|
|
|
|
|
sax_inline_text(b, run, (size_t)(re - run)); |
|
1149
|
844
|
|
|
|
|
|
sax_leave(b, MDS_BLK_PARAGRAPH); |
|
1150
|
|
|
|
|
|
|
} |
|
1151
|
|
|
|
|
|
|
} |
|
1152
|
|
|
|
|
|
|
} else { |
|
1153
|
2033
|
|
|
|
|
|
sax_enter(b, MDS_BLK_PARAGRAPH, &d); |
|
1154
|
2033
|
|
|
|
|
|
sax_inline_text(b, b->para, b->para_len); |
|
1155
|
2033
|
|
|
|
|
|
sax_leave(b, MDS_BLK_PARAGRAPH); |
|
1156
|
|
|
|
|
|
|
} |
|
1157
|
3023
|
|
|
|
|
|
b->para_len = 0; |
|
1158
|
|
|
|
|
|
|
} |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
170
|
|
|
|
|
|
/* Closes an indented code block: drops trailing blank lines from
 * b->code_body, emits the remainder as MDS_BLK_CODE_INDENTED, then
 * resets code_len and pending_blanks. */
static void finalize_code_indented(bscanner* b) {
    mds_block_detail detail;
    size_t len;
    size_t line_start;
    size_t k;

    /* Peel blank lines off the end, one line per iteration. */
    for (;;) {
        len = b->code_len;
        if (len == 0) break;

        /* The final byte is taken to be the last line's '\n' and is
         * skipped without being inspected; then walk back to the byte
         * after the previous '\n' (or to offset 0).
         * NOTE(review): this presumes the buffer always ends in '\n' -
         * confirm against the code that fills code_body. */
        line_start = len - 1;
        while (line_start > 0 && b->code_body[line_start - 1] != '\n') line_start--;

        /* Scan the line (terminator excluded) for a non-blank byte. */
        for (k = line_start; k + 1 < len; k++) {
            if (b->code_body[k] != ' ' && b->code_body[k] != '\t') break;
        }
        if (k + 1 < len) break;     /* stopped early: line has content */

        b->code_len = line_start;   /* blank line: cut it off */
    }

    memset(&detail, 0, sizeof detail);
    sax_enter(b, MDS_BLK_CODE_INDENTED, &detail);
    if (b->code_len != 0) {
        sax_text(b, b->code_body, b->code_len);
    }
    sax_leave(b, MDS_BLK_CODE_INDENTED);

    b->pending_blanks = 0;
    b->code_len = 0;
}
|
1187
|
|
|
|
|
|
|
|
|
1188
|
114
|
|
|
|
|
|
/* Closes a fenced code block: emits MDS_BLK_CODE_FENCED carrying the
 * fence's info string, with the accumulated code bytes as text when
 * there are any, then clears the code buffer length and the stored
 * info string. */
static void finalize_code_fenced(bscanner* b) {
    mds_block_detail detail;

    memset(&detail, 0, sizeof detail);
    detail.u.code_fenced.info_len = b->fence_info_len;
    detail.u.code_fenced.info = b->fence_info;

    sax_enter(b, MDS_BLK_CODE_FENCED, &detail);
    if (b->code_len != 0) {
        sax_text(b, b->code_body, b->code_len);
    }
    sax_leave(b, MDS_BLK_CODE_FENCED);

    /* Reset per-block state for the next fenced block. */
    b->fence_info_len = 0;
    b->fence_info = NULL;
    b->code_len = 0;
}
|
1200
|
|
|
|
|
|
|
|
|
1201
|
200
|
|
|
|
|
|
/* Closes an HTML block: drops trailing blank lines from b->html_body,
 * emits the remainder through sax_raw inside MDS_BLK_HTML, then resets
 * html_len and html_type. */
static void finalize_html(bscanner* b) {
    mds_block_detail detail;
    size_t len;
    size_t line_start;
    size_t k;

    /* Peel blank lines off the end, one line per iteration. */
    for (;;) {
        len = b->html_len;
        if (len == 0) break;

        /* The final byte is taken to be the last line's '\n' and is
         * skipped without being inspected; then walk back to the byte
         * after the previous '\n' (or to offset 0).
         * NOTE(review): this presumes the buffer always ends in '\n' -
         * confirm against the code that fills html_body. */
        line_start = len - 1;
        while (line_start > 0 && b->html_body[line_start - 1] != '\n') line_start--;

        /* Scan the line (terminator excluded) for a non-blank byte. */
        for (k = line_start; k + 1 < len; k++) {
            if (b->html_body[k] != ' ' && b->html_body[k] != '\t') break;
        }
        if (k + 1 < len) break;     /* stopped early: line has content */

        b->html_len = line_start;   /* blank line: cut it off */
    }

    memset(&detail, 0, sizeof detail);
    sax_enter(b, MDS_BLK_HTML, &detail);
    if (b->html_len != 0) {
        sax_raw(b, b->html_body, b->html_len);
    }
    sax_leave(b, MDS_BLK_HTML);

    b->html_type = 0;
    b->html_len = 0;
}
|
1227
|
|
|
|
|
|
|
|
|
1228
|
6293
|
|
|
|
|
|
static void finalize_leaf(bscanner* b) { |
|
1229
|
6293
|
|
|
|
|
|
switch (b->leaf) { |
|
1230
|
3261
|
|
|
|
|
|
case LF_PARAGRAPH: finalize_paragraph(b); break; |
|
1231
|
114
|
|
|
|
|
|
case LF_CODE_FENCED: finalize_code_fenced(b); break; |
|
1232
|
170
|
|
|
|
|
|
case LF_CODE_INDENTED: finalize_code_indented(b); break; |
|
1233
|
200
|
|
|
|
|
|
case LF_HTML: finalize_html(b); break; |
|
1234
|
2548
|
|
|
|
|
|
case LF_NONE: break; |
|
1235
|
|
|
|
|
|
|
} |
|
1236
|
6293
|
|
|
|
|
|
b->leaf = LF_NONE; |
|
1237
|
6293
|
|
|
|
|
|
} |
|
1238
|
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
/* ---------- container emit ---------- */ |
|
1240
|
|
|
|
|
|
|
|
|
1241
|
1144
|
|
|
|
|
|
static void emit_open(bscanner* b, int idx) { |
|
1242
|
1144
|
|
|
|
|
|
ctn* c = &b->stack[idx]; |
|
1243
|
|
|
|
|
|
|
mds_block_detail d; |
|
1244
|
1144
|
50
|
|
|
|
|
if (c->opened) return; |
|
1245
|
1144
|
|
|
|
|
|
c->opened = 1; |
|
1246
|
1144
|
|
|
|
|
|
memset(&d, 0, sizeof d); |
|
1247
|
1144
|
100
|
|
|
|
|
if (c->kind == CT_QUOTE) { |
|
1248
|
176
|
|
|
|
|
|
sax_enter(b, MDS_BLK_QUOTE, &d); |
|
1249
|
968
|
100
|
|
|
|
|
} else if (c->kind == CT_LIST) { |
|
1250
|
377
|
|
|
|
|
|
d.u.list.is_ordered = c->ordered; |
|
1251
|
377
|
|
|
|
|
|
d.u.list.is_tight = c->tight; |
|
1252
|
377
|
|
|
|
|
|
d.u.list.start = c->start; |
|
1253
|
377
|
|
|
|
|
|
d.u.list.marker = c->marker; |
|
1254
|
377
|
|
|
|
|
|
c->ev_idx = sax_enter(b, MDS_BLK_LIST, &d); |
|
1255
|
591
|
50
|
|
|
|
|
} else if (c->kind == CT_LIST_ITEM) { |
|
1256
|
591
|
|
|
|
|
|
sax_enter(b, MDS_BLK_LIST_ITEM, &d); |
|
1257
|
|
|
|
|
|
|
} |
|
1258
|
|
|
|
|
|
|
} |
|
1259
|
|
|
|
|
|
|
|
|
1260
|
1144
|
|
|
|
|
|
static void emit_close(bscanner* b, ctn* c) { |
|
1261
|
1144
|
50
|
|
|
|
|
if (!c->opened) return; |
|
1262
|
1144
|
100
|
|
|
|
|
if (c->kind == CT_QUOTE) { |
|
1263
|
176
|
|
|
|
|
|
sax_leave(b, MDS_BLK_QUOTE); |
|
1264
|
968
|
100
|
|
|
|
|
} else if (c->kind == CT_LIST) { |
|
1265
|
|
|
|
|
|
|
/* Patch is_tight on the buffered enter event if any. */ |
|
1266
|
377
|
50
|
|
|
|
|
if (c->ev_idx >= 0 && (size_t)c->ev_idx < b->ev_len) { |
|
|
|
50
|
|
|
|
|
|
|
1267
|
377
|
|
|
|
|
|
ev_rec* e = &b->evbuf[c->ev_idx]; |
|
1268
|
377
|
50
|
|
|
|
|
if (e->type == EV_ENTER_BLOCK && e->u.enter.t == MDS_BLK_LIST) { |
|
|
|
50
|
|
|
|
|
|
|
1269
|
377
|
|
|
|
|
|
e->u.enter.d.u.list.is_tight = c->had_blank_inside ? 0 : 1; |
|
1270
|
|
|
|
|
|
|
} |
|
1271
|
|
|
|
|
|
|
} |
|
1272
|
377
|
|
|
|
|
|
sax_leave(b, MDS_BLK_LIST); |
|
1273
|
377
|
|
|
|
|
|
b->list_depth--; |
|
1274
|
|
|
|
|
|
|
/* Doc-wide buffering: flush only at end of mds_block_scan. */ |
|
1275
|
591
|
50
|
|
|
|
|
} else if (c->kind == CT_LIST_ITEM) { |
|
1276
|
591
|
|
|
|
|
|
sax_leave(b, MDS_BLK_LIST_ITEM); |
|
1277
|
|
|
|
|
|
|
} |
|
1278
|
|
|
|
|
|
|
} |
|
1279
|
|
|
|
|
|
|
|
|
1280
|
7245
|
|
|
|
|
|
static void close_containers_to(bscanner* b, int target_depth) { |
|
1281
|
|
|
|
|
|
|
ctn* c; |
|
1282
|
8336
|
100
|
|
|
|
|
while (b->depth > target_depth) { |
|
1283
|
1091
|
|
|
|
|
|
finalize_leaf(b); |
|
1284
|
1091
|
|
|
|
|
|
c = &b->stack[b->depth - 1]; |
|
1285
|
1091
|
|
|
|
|
|
emit_close(b, c); |
|
1286
|
1091
|
|
|
|
|
|
b->depth--; |
|
1287
|
|
|
|
|
|
|
} |
|
1288
|
7245
|
|
|
|
|
|
} |
|
1289
|
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
/* ---------- line helpers ---------- */ |
|
1291
|
|
|
|
|
|
|
|
|
1292
|
7399
|
|
|
|
|
|
static const char* next_line(const char* p, const char* end, |
|
1293
|
|
|
|
|
|
|
const char** line_end_out) { |
|
1294
|
|
|
|
|
|
|
const char* nl; |
|
1295
|
|
|
|
|
|
|
const char* le; |
|
1296
|
7399
|
|
|
|
|
|
nl = (const char*)memchr(p, '\n', (size_t)(end - p)); |
|
1297
|
7399
|
100
|
|
|
|
|
if (!nl) { *line_end_out = end; return end; } |
|
1298
|
6495
|
|
|
|
|
|
le = nl; |
|
1299
|
6495
|
100
|
|
|
|
|
if (le > p && *(le - 1) == '\r') le--; |
|
|
|
50
|
|
|
|
|
|
|
1300
|
6495
|
|
|
|
|
|
*line_end_out = le; |
|
1301
|
6495
|
|
|
|
|
|
return nl + 1; |
|
1302
|
|
|
|
|
|
|
} |
|
1303
|
|
|
|
|
|
|
|
|
1304
|
532
|
|
|
|
|
|
static int consume_indent(const char** p, const char* end, int max) { |
|
1305
|
532
|
|
|
|
|
|
int col = 0; |
|
1306
|
532
|
|
|
|
|
|
const char* q = *p; |
|
1307
|
2339
|
100
|
|
|
|
|
while (q < end && col < max) { |
|
|
|
100
|
|
|
|
|
|
|
1308
|
1807
|
50
|
|
|
|
|
if (*q == ' ') { col++; q++; } |
|
1309
|
0
|
0
|
|
|
|
|
else if (*q == '\t') { |
|
1310
|
0
|
|
|
|
|
|
int adv = 4 - (col & 3); |
|
1311
|
0
|
0
|
|
|
|
|
if (col + adv > max) break; |
|
1312
|
0
|
|
|
|
|
|
col += adv; q++; |
|
1313
|
0
|
|
|
|
|
|
} else break; |
|
1314
|
|
|
|
|
|
|
} |
|
1315
|
532
|
|
|
|
|
|
*p = q; |
|
1316
|
532
|
|
|
|
|
|
return col; |
|
1317
|
|
|
|
|
|
|
} |
|
1318
|
|
|
|
|
|
|
|
|
1319
|
9474
|
|
|
|
|
|
static int count_indent(const char* p, const char* end) { |
|
1320
|
9474
|
|
|
|
|
|
int col = 0; |
|
1321
|
12757
|
100
|
|
|
|
|
while (p < end) { |
|
1322
|
12456
|
100
|
|
|
|
|
if (*p == ' ') col++; |
|
1323
|
9173
|
50
|
|
|
|
|
else if (*p == '\t') col += 4 - (col & 3); |
|
1324
|
9173
|
|
|
|
|
|
else break; |
|
1325
|
3283
|
|
|
|
|
|
p++; |
|
1326
|
|
|
|
|
|
|
} |
|
1327
|
9474
|
|
|
|
|
|
return col; |
|
1328
|
|
|
|
|
|
|
} |
|
1329
|
|
|
|
|
|
|
|
|
1330
|
12227
|
|
|
|
|
|
static int is_blank(const char* p, const char* end) { |
|
1331
|
16839
|
100
|
|
|
|
|
while (p < end) { |
|
1332
|
15651
|
100
|
|
|
|
|
if (*p != ' ' && *p != '\t') return 0; |
|
|
|
50
|
|
|
|
|
|
|
1333
|
4612
|
|
|
|
|
|
p++; |
|
1334
|
|
|
|
|
|
|
} |
|
1335
|
1188
|
|
|
|
|
|
return 1; |
|
1336
|
|
|
|
|
|
|
} |
|
1337
|
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
/* ---------- HTML block recognition (CommonMark §4.6) ---------- */ |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
2163
|
|
|
|
|
|
static int ascii_ieq(const char* a, const char* b, size_t n) { |
|
1341
|
|
|
|
|
|
|
size_t i; |
|
1342
|
|
|
|
|
|
|
char x, y; |
|
1343
|
2967
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
1344
|
2813
|
|
|
|
|
|
x = a[i]; y = b[i]; |
|
1345
|
2813
|
100
|
|
|
|
|
if (x >= 'A' && x <= 'Z') x = (char)(x + 32); |
|
|
|
100
|
|
|
|
|
|
|
1346
|
2813
|
50
|
|
|
|
|
if (y >= 'A' && y <= 'Z') y = (char)(y + 32); |
|
|
|
50
|
|
|
|
|
|
|
1347
|
2813
|
100
|
|
|
|
|
if (x != y) return 0; |
|
1348
|
|
|
|
|
|
|
} |
|
1349
|
154
|
|
|
|
|
|
return 1; |
|
1350
|
|
|
|
|
|
|
} |
|
1351
|
|
|
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
/* Type-6 block tag names (lowercased). Sorted by length then alpha for |
|
1353
|
|
|
|
|
|
|
* a simple linear scan; the set is small enough that hashing isn't worth it. */ |
|
1354
|
|
|
|
|
|
|
static const char* const HTML6_TAGS[] = { |
|
1355
|
|
|
|
|
|
|
"address","article","aside","base","basefont","blockquote","body","caption", |
|
1356
|
|
|
|
|
|
|
"center","col","colgroup","dd","details","dialog","dir","div","dl","dt", |
|
1357
|
|
|
|
|
|
|
"fieldset","figcaption","figure","footer","form","frame","frameset", |
|
1358
|
|
|
|
|
|
|
"h1","h2","h3","h4","h5","h6","head","header","hr","html","iframe","legend", |
|
1359
|
|
|
|
|
|
|
"li","link","main","menu","menuitem","nav","noframes","ol","optgroup","option", |
|
1360
|
|
|
|
|
|
|
"p","param","search","section","summary","table","tbody","td","tfoot","th", |
|
1361
|
|
|
|
|
|
|
"thead","title","tr","track","ul", NULL |
|
1362
|
|
|
|
|
|
|
}; |
|
1363
|
|
|
|
|
|
|
|
|
1364
|
255
|
|
|
|
|
|
static int is_html6_tag(const char* s, size_t n) { |
|
1365
|
|
|
|
|
|
|
int i; |
|
1366
|
12769
|
100
|
|
|
|
|
for (i = 0; HTML6_TAGS[i]; i++) { |
|
1367
|
12623
|
|
|
|
|
|
size_t tl = strlen(HTML6_TAGS[i]); |
|
1368
|
12623
|
100
|
|
|
|
|
if (tl == n && ascii_ieq(s, HTML6_TAGS[i], n)) return 1; |
|
|
|
100
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
} |
|
1370
|
146
|
|
|
|
|
|
return 0; |
|
1371
|
|
|
|
|
|
|
} |
|
1372
|
|
|
|
|
|
|
|
|
1373
|
1748
|
|
|
|
|
|
static int is_alpha(char c) { |
|
1374
|
1748
|
100
|
|
|
|
|
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
} |
|
1376
|
1551
|
|
|
|
|
|
static int is_alnum(char c) { |
|
1377
|
1551
|
100
|
|
|
|
|
return is_alpha(c) || (c >= '0' && c <= '9'); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
} |
|
1379
|
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
/* Type-7 helpers: validate a complete open or close tag on one line. |
|
1381
|
|
|
|
|
|
|
* Returns 1 if `[p, end)` (after the initial '<') is a complete tag |
|
1382
|
|
|
|
|
|
|
* followed only by whitespace. */ |
|
1383
|
134
|
|
|
|
|
|
static int is_type7_open_tag(const char* p, const char* end) { |
|
1384
|
|
|
|
|
|
|
const char* name; |
|
1385
|
|
|
|
|
|
|
size_t nlen; |
|
1386
|
|
|
|
|
|
|
int i; |
|
1387
|
|
|
|
|
|
|
const char* aws; |
|
1388
|
|
|
|
|
|
|
const char* vs; |
|
1389
|
|
|
|
|
|
|
char q; |
|
1390
|
|
|
|
|
|
|
const char* uv; |
|
1391
|
|
|
|
|
|
|
/* tag name: ASCII letter, then [A-Za-z0-9-]* */ |
|
1392
|
134
|
50
|
|
|
|
|
if (p >= end || !is_alpha(*p)) return 0; |
|
|
|
100
|
|
|
|
|
|
|
1393
|
119
|
|
|
|
|
|
name = p; |
|
1394
|
119
|
|
|
|
|
|
p++; |
|
1395
|
369
|
50
|
|
|
|
|
while (p < end && (is_alnum(*p) || *p == '-')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1396
|
119
|
|
|
|
|
|
nlen = (size_t)(p - name); |
|
1397
|
|
|
|
|
|
|
/* disallowed tag names for type 7 */ |
|
1398
|
|
|
|
|
|
|
{ |
|
1399
|
|
|
|
|
|
|
static const char* const banned[] = {"script","pre","style","textarea",NULL}; |
|
1400
|
595
|
100
|
|
|
|
|
for (i = 0; banned[i]; i++) { |
|
1401
|
476
|
|
|
|
|
|
size_t bl = strlen(banned[i]); |
|
1402
|
476
|
100
|
|
|
|
|
if (bl == nlen && ascii_ieq(name, banned[i], nlen)) return 0; |
|
|
|
50
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
} |
|
1404
|
|
|
|
|
|
|
} |
|
1405
|
|
|
|
|
|
|
/* attributes */ |
|
1406
|
149
|
100
|
|
|
|
|
while (p < end) { |
|
1407
|
|
|
|
|
|
|
/* whitespace then attr-name */ |
|
1408
|
146
|
|
|
|
|
|
aws = p; |
|
1409
|
194
|
50
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1410
|
146
|
100
|
|
|
|
|
if (p == aws) break; /* must have ws before attr */ |
|
1411
|
45
|
50
|
|
|
|
|
if (p >= end || *p == '/' || *p == '>') break; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1412
|
42
|
50
|
|
|
|
|
if (!is_alpha(*p) && *p != '_' && *p != ':') return 0; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1413
|
42
|
|
|
|
|
|
p++; |
|
1414
|
159
|
50
|
|
|
|
|
while (p < end && (is_alnum(*p) || *p == '_' || *p == ':' || *p == '.' || *p == '-')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
/* optional value */ |
|
1416
|
42
|
|
|
|
|
|
vs = p; |
|
1417
|
45
|
50
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1418
|
42
|
50
|
|
|
|
|
if (p < end && *p == '=') { |
|
|
|
100
|
|
|
|
|
|
|
1419
|
39
|
|
|
|
|
|
p++; |
|
1420
|
42
|
50
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1421
|
39
|
50
|
|
|
|
|
if (p >= end) return 0; |
|
1422
|
39
|
100
|
|
|
|
|
if (*p == '"' || *p == '\'') { |
|
|
|
50
|
|
|
|
|
|
|
1423
|
39
|
|
|
|
|
|
q = *p++; |
|
1424
|
288
|
100
|
|
|
|
|
while (p < end && *p != q) p++; |
|
|
|
100
|
|
|
|
|
|
|
1425
|
39
|
100
|
|
|
|
|
if (p >= end) return 0; |
|
1426
|
27
|
|
|
|
|
|
p++; |
|
1427
|
|
|
|
|
|
|
} else { |
|
1428
|
0
|
|
|
|
|
|
uv = p; |
|
1429
|
0
|
0
|
|
|
|
|
while (p < end && *p != ' ' && *p != '\t' && *p != '\"' |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1430
|
0
|
0
|
|
|
|
|
&& *p != '\'' && *p != '=' && *p != '<' && *p != '>' |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1431
|
0
|
0
|
|
|
|
|
&& *p != '`') p++; |
|
|
|
0
|
|
|
|
|
|
|
1432
|
0
|
0
|
|
|
|
|
if (p == uv) return 0; |
|
1433
|
|
|
|
|
|
|
} |
|
1434
|
|
|
|
|
|
|
} else { |
|
1435
|
3
|
|
|
|
|
|
p = vs; /* no value, rewind */ |
|
1436
|
|
|
|
|
|
|
} |
|
1437
|
|
|
|
|
|
|
} |
|
1438
|
107
|
100
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1439
|
107
|
100
|
|
|
|
|
if (p < end && *p == '/') p++; |
|
|
|
100
|
|
|
|
|
|
|
1440
|
107
|
100
|
|
|
|
|
if (p >= end || *p != '>') return 0; |
|
|
|
100
|
|
|
|
|
|
|
1441
|
38
|
|
|
|
|
|
p++; |
|
1442
|
38
|
100
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1443
|
38
|
|
|
|
|
|
return p == end; |
|
1444
|
|
|
|
|
|
|
} |
|
1445
|
|
|
|
|
|
|
|
|
1446
|
12
|
|
|
|
|
|
static int is_type7_close_tag(const char* p, const char* end) { |
|
1447
|
|
|
|
|
|
|
const char* name; |
|
1448
|
|
|
|
|
|
|
size_t nlen; |
|
1449
|
|
|
|
|
|
|
int i; |
|
1450
|
|
|
|
|
|
|
/* already past '' */ |
|
1451
|
12
|
50
|
|
|
|
|
if (p >= end || !is_alpha(*p)) return 0; |
|
|
|
50
|
|
|
|
|
|
|
1452
|
12
|
|
|
|
|
|
name = p; |
|
1453
|
12
|
|
|
|
|
|
p++; |
|
1454
|
24
|
50
|
|
|
|
|
while (p < end && (is_alnum(*p) || *p == '-')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1455
|
12
|
|
|
|
|
|
nlen = (size_t)(p - name); |
|
1456
|
|
|
|
|
|
|
{ |
|
1457
|
|
|
|
|
|
|
static const char* const banned[] = {"script","pre","style","textarea",NULL}; |
|
1458
|
60
|
100
|
|
|
|
|
for (i = 0; banned[i]; i++) { |
|
1459
|
48
|
|
|
|
|
|
size_t bl = strlen(banned[i]); |
|
1460
|
48
|
100
|
|
|
|
|
if (bl == nlen && ascii_ieq(name, banned[i], nlen)) return 0; |
|
|
|
50
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
} |
|
1462
|
|
|
|
|
|
|
} |
|
1463
|
15
|
50
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1464
|
12
|
50
|
|
|
|
|
if (p >= end || *p != '>') return 0; |
|
|
|
100
|
|
|
|
|
|
|
1465
|
9
|
|
|
|
|
|
p++; |
|
1466
|
9
|
100
|
|
|
|
|
while (p < end && (*p == ' ' || *p == '\t')) p++; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1467
|
9
|
|
|
|
|
|
return p == end; |
|
1468
|
|
|
|
|
|
|
} |
|
1469
|
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
/* Detect HTML block start. Returns 1..7 on match, 0 otherwise. |
|
1471
|
|
|
|
|
|
|
* Caller has already stripped container prefixes; `p` is the line start |
|
1472
|
|
|
|
|
|
|
* after container content_col indent (but possibly with up to 3 leading |
|
1473
|
|
|
|
|
|
|
* spaces left). `allow_type7` is 0 when inside a paragraph (rule 7 |
|
1474
|
|
|
|
|
|
|
* cannot interrupt). */ |
|
1475
|
4677
|
|
|
|
|
|
static int detect_html_block_start(const char* p, const char* end, int allow_type7) { |
|
1476
|
4677
|
|
|
|
|
|
int lead = 0; |
|
1477
|
|
|
|
|
|
|
const char* q; |
|
1478
|
5129
|
100
|
|
|
|
|
while (p < end && *p == ' ' && lead < 3) { p++; lead++; } |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1479
|
4677
|
100
|
|
|
|
|
if (p >= end || *p != '<') return 0; |
|
|
|
100
|
|
|
|
|
|
|
1480
|
334
|
|
|
|
|
|
q = p + 1; |
|
1481
|
|
|
|
|
|
|
/* Type 2: |