| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* mds_linkref.c — link reference definition table. |
|
2
|
|
|
|
|
|
|
* |
|
3
|
|
|
|
|
|
|
* Simple linear-scan table backed by arena memory. CommonMark §4.7 |
|
4
|
|
|
|
|
|
|
* specifies first-definition-wins; we honour that via mds_linkref_add() |
|
5
|
|
|
|
|
|
|
* returning 0 for duplicates. |
|
6
|
|
|
|
|
|
|
* |
|
7
|
|
|
|
|
|
|
* Label normalisation (per spec): trim, lowercase ASCII, collapse runs |
|
8
|
|
|
|
|
|
|
* of whitespace to a single space. Unicode case-folding deferred. |
|
9
|
|
|
|
|
|
|
*/ |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#include "mds_linkref.h" |
|
12
|
|
|
|
|
|
|
#include |
|
13
|
|
|
|
|
|
|
#include |
|
14
|
|
|
|
|
|
|
#include |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
/* Decode a single UTF-8 codepoint starting at s[i] (i < n). Returns the |
|
17
|
|
|
|
|
|
|
* codepoint, sets *adv to the byte count consumed. On invalid bytes, |
|
18
|
|
|
|
|
|
|
* returns the byte as a codepoint and adv=1 (lenient). */ |
|
19
|
2003
|
|
|
|
|
|
static unsigned cf_decode(const char* s, size_t n, size_t i, int* adv) { |
|
20
|
2003
|
|
|
|
|
|
unsigned char c = (unsigned char)s[i]; |
|
21
|
2003
|
100
|
|
|
|
|
if (c < 0x80) { *adv = 1; return c; } |
|
22
|
21
|
100
|
|
|
|
|
if ((c & 0xE0) == 0xC0 && i + 1 < n && |
|
|
|
50
|
|
|
|
|
|
|
23
|
18
|
50
|
|
|
|
|
((unsigned char)s[i+1] & 0xC0) == 0x80) { |
|
24
|
18
|
|
|
|
|
|
*adv = 2; |
|
25
|
18
|
|
|
|
|
|
return ((unsigned)(c & 0x1F) << 6) | ((unsigned char)s[i+1] & 0x3F); |
|
26
|
|
|
|
|
|
|
} |
|
27
|
3
|
50
|
|
|
|
|
if ((c & 0xF0) == 0xE0 && i + 2 < n && |
|
|
|
50
|
|
|
|
|
|
|
28
|
3
|
50
|
|
|
|
|
((unsigned char)s[i+1] & 0xC0) == 0x80 && |
|
29
|
3
|
50
|
|
|
|
|
((unsigned char)s[i+2] & 0xC0) == 0x80) { |
|
30
|
3
|
|
|
|
|
|
*adv = 3; |
|
31
|
3
|
|
|
|
|
|
return ((unsigned)(c & 0x0F) << 12) |
|
32
|
3
|
|
|
|
|
|
| (((unsigned char)s[i+1] & 0x3F) << 6) |
|
33
|
3
|
|
|
|
|
|
| ((unsigned char)s[i+2] & 0x3F); |
|
34
|
|
|
|
|
|
|
} |
|
35
|
0
|
0
|
|
|
|
|
if ((c & 0xF8) == 0xF0 && i + 3 < n && |
|
|
|
0
|
|
|
|
|
|
|
36
|
0
|
0
|
|
|
|
|
((unsigned char)s[i+1] & 0xC0) == 0x80 && |
|
37
|
0
|
0
|
|
|
|
|
((unsigned char)s[i+2] & 0xC0) == 0x80 && |
|
38
|
0
|
0
|
|
|
|
|
((unsigned char)s[i+3] & 0xC0) == 0x80) { |
|
39
|
0
|
|
|
|
|
|
*adv = 4; |
|
40
|
0
|
|
|
|
|
|
return ((unsigned)(c & 0x07) << 18) |
|
41
|
0
|
|
|
|
|
|
| (((unsigned char)s[i+1] & 0x3F) << 12) |
|
42
|
0
|
|
|
|
|
|
| (((unsigned char)s[i+2] & 0x3F) << 6) |
|
43
|
0
|
|
|
|
|
|
| ((unsigned char)s[i+3] & 0x3F); |
|
44
|
|
|
|
|
|
|
} |
|
45
|
0
|
|
|
|
|
|
*adv = 1; |
|
46
|
0
|
|
|
|
|
|
return c; |
|
47
|
|
|
|
|
|
|
} |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
/* Encode codepoint as UTF-8 into out; returns bytes written (0..4). */ |
|
50
|
2000
|
|
|
|
|
|
static int cf_encode(unsigned cp, char* out) { |
|
51
|
2000
|
100
|
|
|
|
|
if (cp < 0x80) { out[0] = (char)cp; return 1; } |
|
52
|
18
|
50
|
|
|
|
|
if (cp < 0x800) { |
|
53
|
18
|
|
|
|
|
|
out[0] = (char)(0xC0 | (cp >> 6)); |
|
54
|
18
|
|
|
|
|
|
out[1] = (char)(0x80 | (cp & 0x3F)); |
|
55
|
18
|
|
|
|
|
|
return 2; |
|
56
|
|
|
|
|
|
|
} |
|
57
|
0
|
0
|
|
|
|
|
if (cp < 0x10000) { |
|
58
|
0
|
|
|
|
|
|
out[0] = (char)(0xE0 | (cp >> 12)); |
|
59
|
0
|
|
|
|
|
|
out[1] = (char)(0x80 | ((cp >> 6) & 0x3F)); |
|
60
|
0
|
|
|
|
|
|
out[2] = (char)(0x80 | (cp & 0x3F)); |
|
61
|
0
|
|
|
|
|
|
return 3; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
0
|
|
|
|
|
|
out[0] = (char)(0xF0 | (cp >> 18)); |
|
64
|
0
|
|
|
|
|
|
out[1] = (char)(0x80 | ((cp >> 12) & 0x3F)); |
|
65
|
0
|
|
|
|
|
|
out[2] = (char)(0x80 | ((cp >> 6) & 0x3F)); |
|
66
|
0
|
|
|
|
|
|
out[3] = (char)(0x80 | (cp & 0x3F)); |
|
67
|
0
|
|
|
|
|
|
return 4; |
|
68
|
|
|
|
|
|
|
} |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
/* Case-fold a single codepoint to its lowercase form, writing 1..4 |
|
71
|
|
|
|
|
|
|
* bytes to out. Covers ASCII, Latin-1 supplement, Latin Extended-A |
|
72
|
|
|
|
|
|
|
* (selected), Greek, Cyrillic, plus a tiny exception list (ẞ → ss, |
|
73
|
|
|
|
|
|
|
* µ → μ). Returns bytes written. This is the subset needed by |
|
74
|
|
|
|
|
|
|
* CommonMark spec examples; full CaseFolding.txt is overkill. */ |
|
75
|
2003
|
|
|
|
|
|
static int cf_fold(unsigned cp, char* out) { |
|
76
|
|
|
|
|
|
|
/* Exceptions that expand (1 codepoint -> 2 codepoints). */ |
|
77
|
2003
|
100
|
|
|
|
|
if (cp == 0x1E9E) { out[0] = 's'; out[1] = 's'; return 2; } /* ẞ */ |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
/* Single-codepoint mappings. */ |
|
80
|
2000
|
100
|
|
|
|
|
if (cp >= 'A' && cp <= 'Z') cp = cp + 0x20; |
|
|
|
100
|
|
|
|
|
|
|
81
|
1892
|
50
|
|
|
|
|
else if (cp == 0xB5) cp = 0x3BC; /* µ -> μ */ |
|
82
|
1892
|
100
|
|
|
|
|
else if (cp >= 0xC0 && cp <= 0xDE && cp != 0xD7) cp += 0x20; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
83
|
1892
|
100
|
|
|
|
|
else if (cp >= 0x391 && cp <= 0x3A1) cp += 0x20; /* Α..Ρ */ |
|
|
|
100
|
|
|
|
|
|
|
84
|
1886
|
100
|
|
|
|
|
else if (cp >= 0x3A3 && cp <= 0x3AB) cp += 0x20; /* Σ..Ϋ */ |
|
|
|
100
|
|
|
|
|
|
|
85
|
1883
|
50
|
|
|
|
|
else if (cp == 0x3A2) { /* reserved, leave */ } |
|
86
|
1883
|
50
|
|
|
|
|
else if (cp >= 0x400 && cp <= 0x40F) cp += 0x50; /* Cyrillic Ѐ..Џ */ |
|
|
|
0
|
|
|
|
|
|
|
87
|
1883
|
50
|
|
|
|
|
else if (cp >= 0x410 && cp <= 0x42F) cp += 0x20; /* Cyrillic А..Я */ |
|
|
|
0
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
/* Else: leave codepoint as-is (no fold). */ |
|
89
|
|
|
|
|
|
|
|
|
90
|
2000
|
|
|
|
|
|
return cf_encode(cp, out); |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
/* Normalise label per CommonMark spec: trim outer whitespace, collapse |
|
94
|
|
|
|
|
|
|
* inner whitespace runs to a single space, and case-fold codepoints. |
|
95
|
|
|
|
|
|
|
* Writes at most 4 bytes per input byte plus 4 trailing bytes; caller |
|
96
|
|
|
|
|
|
|
* must provide an output buffer of size >= n*4 + 4. Returns output |
|
97
|
|
|
|
|
|
|
* length. */ |
|
98
|
518
|
|
|
|
|
|
static size_t cf_normalise(const char* s, size_t n, char* out) { |
|
99
|
518
|
|
|
|
|
|
size_t a = 0, b = n; |
|
100
|
518
|
|
|
|
|
|
size_t j = 0; |
|
101
|
|
|
|
|
|
|
size_t i; |
|
102
|
518
|
|
|
|
|
|
int in_ws = 0; |
|
103
|
|
|
|
|
|
|
/* Trim ends */ |
|
104
|
521
|
100
|
|
|
|
|
while (a < b && (s[a] == ' ' || s[a] == '\t' || s[a] == '\n')) a++; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
105
|
521
|
100
|
|
|
|
|
while (b > a && (s[b-1] == ' ' || s[b-1] == '\t' || s[b-1] == '\n')) b--; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
106
|
518
|
|
|
|
|
|
i = a; |
|
107
|
2602
|
100
|
|
|
|
|
while (i < b) { |
|
108
|
2084
|
|
|
|
|
|
unsigned char c = (unsigned char)s[i]; |
|
109
|
2084
|
100
|
|
|
|
|
if (c == ' ' || c == '\t' || c == '\n') { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
110
|
81
|
50
|
|
|
|
|
if (!in_ws) { out[j++] = ' '; in_ws = 1; } |
|
111
|
81
|
|
|
|
|
|
i++; |
|
112
|
|
|
|
|
|
|
} else { |
|
113
|
|
|
|
|
|
|
int adv; |
|
114
|
2003
|
|
|
|
|
|
unsigned cp = cf_decode(s, b, i, &adv); |
|
115
|
2003
|
|
|
|
|
|
j += cf_fold(cp, out + j); |
|
116
|
2003
|
|
|
|
|
|
i += adv; |
|
117
|
2003
|
|
|
|
|
|
in_ws = 0; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
} |
|
120
|
518
|
|
|
|
|
|
return j; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
262
|
|
|
|
|
|
static char* normalise_label(mds_arena* a, const char* s, size_t n, size_t* nlen) { |
|
124
|
|
|
|
|
|
|
char* out; |
|
125
|
|
|
|
|
|
|
size_t j; |
|
126
|
262
|
|
|
|
|
|
out = (char*)mds_arena_alloc(a, n * 4 + 4); |
|
127
|
262
|
|
|
|
|
|
j = cf_normalise(s, n, out); |
|
128
|
262
|
|
|
|
|
|
out[j] = '\0'; |
|
129
|
262
|
|
|
|
|
|
*nlen = j; |
|
130
|
262
|
|
|
|
|
|
return out; |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
362
|
|
|
|
|
|
static char* arena_dup(mds_arena* a, const char* s, size_t n) { |
|
134
|
|
|
|
|
|
|
char* d; |
|
135
|
362
|
|
|
|
|
|
d = (char*)mds_arena_alloc(a, n + 1); |
|
136
|
362
|
100
|
|
|
|
|
if (n) memcpy(d, s, n); |
|
137
|
362
|
|
|
|
|
|
d[n] = '\0'; |
|
138
|
362
|
|
|
|
|
|
return d; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
241
|
|
|
|
|
|
void mds_linkref_init(struct mds_linkref_tab* t, mds_arena* a) { |
|
142
|
241
|
|
|
|
|
|
t->entries = NULL; |
|
143
|
241
|
|
|
|
|
|
t->len = 0; |
|
144
|
241
|
|
|
|
|
|
t->cap = 0; |
|
145
|
241
|
|
|
|
|
|
t->arena = a; |
|
146
|
241
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
256
|
|
|
|
|
|
const mds_linkref* mds_linkref_get(const struct mds_linkref_tab* t, |
|
149
|
|
|
|
|
|
|
const char* label, size_t llen) { |
|
150
|
|
|
|
|
|
|
/* Normalise into a stack buffer. Max expansion is 4x (4-byte UTF-8 |
|
151
|
|
|
|
|
|
|
* input expanding to 4-byte fold). Cap at 4 KiB; pathologically long |
|
152
|
|
|
|
|
|
|
* labels can't match a stored entry anyway. */ |
|
153
|
|
|
|
|
|
|
char buf[4096]; |
|
154
|
|
|
|
|
|
|
size_t nlen; |
|
155
|
|
|
|
|
|
|
size_t i; |
|
156
|
256
|
50
|
|
|
|
|
if (llen > sizeof buf / 4 - 4) return NULL; |
|
157
|
256
|
|
|
|
|
|
nlen = cf_normalise(label, llen, buf); |
|
158
|
304
|
100
|
|
|
|
|
for (i = 0; i < t->len; i++) { |
|
159
|
274
|
100
|
|
|
|
|
if (t->entries[i].klen == nlen && |
|
|
|
50
|
|
|
|
|
|
|
160
|
259
|
100
|
|
|
|
|
(nlen == 0 || memcmp(t->entries[i].key, buf, nlen) == 0)) |
|
161
|
226
|
|
|
|
|
|
return &t->entries[i]; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
30
|
|
|
|
|
|
return NULL; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
262
|
|
|
|
|
|
int mds_linkref_add(struct mds_linkref_tab* t, |
|
167
|
|
|
|
|
|
|
const char* label, size_t llen, |
|
168
|
|
|
|
|
|
|
const char* url, size_t ulen, |
|
169
|
|
|
|
|
|
|
const char* title, size_t tlen) { |
|
170
|
|
|
|
|
|
|
size_t nlen; |
|
171
|
|
|
|
|
|
|
char* key; |
|
172
|
|
|
|
|
|
|
size_t i; |
|
173
|
|
|
|
|
|
|
mds_linkref* e; |
|
174
|
262
|
|
|
|
|
|
key = normalise_label(t->arena, label, llen, &nlen); |
|
175
|
|
|
|
|
|
|
/* dup check */ |
|
176
|
280
|
100
|
|
|
|
|
for (i = 0; i < t->len; i++) { |
|
177
|
24
|
50
|
|
|
|
|
if (t->entries[i].klen == nlen && |
|
178
|
24
|
50
|
|
|
|
|
(nlen == 0 || memcmp(t->entries[i].key, key, nlen) == 0)) |
|
|
|
100
|
|
|
|
|
|
|
179
|
6
|
|
|
|
|
|
return 0; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
256
|
100
|
|
|
|
|
if (t->len == t->cap) { |
|
182
|
241
|
50
|
|
|
|
|
size_t nc = t->cap ? t->cap * 2 : 8; |
|
183
|
241
|
|
|
|
|
|
t->entries = (mds_linkref*)realloc(t->entries, nc * sizeof(mds_linkref)); |
|
184
|
241
|
|
|
|
|
|
t->cap = nc; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
256
|
|
|
|
|
|
e = &t->entries[t->len++]; |
|
187
|
256
|
|
|
|
|
|
e->key = key; e->klen = nlen; |
|
188
|
256
|
|
|
|
|
|
e->url = arena_dup(t->arena, url, ulen); e->ulen = ulen; |
|
189
|
256
|
100
|
|
|
|
|
e->title = tlen ? arena_dup(t->arena, title, tlen) : NULL; |
|
190
|
256
|
|
|
|
|
|
e->tlen = tlen; |
|
191
|
256
|
|
|
|
|
|
return 1; |
|
192
|
|
|
|
|
|
|
} |