File Coverage

src/mds_linkref.c
Criterion Covered Total %
statement 89 110 80.9
branch 75 116 64.6
condition n/a
subroutine n/a
pod n/a
total 164 226 72.5


line stmt bran cond sub pod time code
1             /* mds_linkref.c — link reference definition table.
2             *
3             * Simple linear-scan table backed by arena memory. CommonMark §4.7
4             * specifies first-definition-wins; we honour that via mds_linkref_add()
5             * returning 0 for duplicates.
6             *
7             * Label normalisation (per spec): trim, lowercase ASCII, collapse runs
8             * of whitespace to a single space. Unicode case-folding deferred.
9             */
10              
11             #include "mds_linkref.h"
12             #include
13             #include
14             #include
15              
16             /* Decode a single UTF-8 codepoint starting at s[i] (i < n). Returns the
17             * codepoint, sets *adv to the byte count consumed. On invalid bytes,
18             * returns the byte as a codepoint and adv=1 (lenient). */
19 2003           static unsigned cf_decode(const char* s, size_t n, size_t i, int* adv) {
20 2003           unsigned char c = (unsigned char)s[i];
21 2003 100         if (c < 0x80) { *adv = 1; return c; }
22 21 100         if ((c & 0xE0) == 0xC0 && i + 1 < n &&
    50          
23 18 50         ((unsigned char)s[i+1] & 0xC0) == 0x80) {
24 18           *adv = 2;
25 18           return ((unsigned)(c & 0x1F) << 6) | ((unsigned char)s[i+1] & 0x3F);
26             }
27 3 50         if ((c & 0xF0) == 0xE0 && i + 2 < n &&
    50          
28 3 50         ((unsigned char)s[i+1] & 0xC0) == 0x80 &&
29 3 50         ((unsigned char)s[i+2] & 0xC0) == 0x80) {
30 3           *adv = 3;
31 3           return ((unsigned)(c & 0x0F) << 12)
32 3           | (((unsigned char)s[i+1] & 0x3F) << 6)
33 3           | ((unsigned char)s[i+2] & 0x3F);
34             }
35 0 0         if ((c & 0xF8) == 0xF0 && i + 3 < n &&
    0          
36 0 0         ((unsigned char)s[i+1] & 0xC0) == 0x80 &&
37 0 0         ((unsigned char)s[i+2] & 0xC0) == 0x80 &&
38 0 0         ((unsigned char)s[i+3] & 0xC0) == 0x80) {
39 0           *adv = 4;
40 0           return ((unsigned)(c & 0x07) << 18)
41 0           | (((unsigned char)s[i+1] & 0x3F) << 12)
42 0           | (((unsigned char)s[i+2] & 0x3F) << 6)
43 0           | ((unsigned char)s[i+3] & 0x3F);
44             }
45 0           *adv = 1;
46 0           return c;
47             }
48              
49             /* Encode codepoint as UTF-8 into out; returns bytes written (0..4). */
50 2000           static int cf_encode(unsigned cp, char* out) {
51 2000 100         if (cp < 0x80) { out[0] = (char)cp; return 1; }
52 18 50         if (cp < 0x800) {
53 18           out[0] = (char)(0xC0 | (cp >> 6));
54 18           out[1] = (char)(0x80 | (cp & 0x3F));
55 18           return 2;
56             }
57 0 0         if (cp < 0x10000) {
58 0           out[0] = (char)(0xE0 | (cp >> 12));
59 0           out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
60 0           out[2] = (char)(0x80 | (cp & 0x3F));
61 0           return 3;
62             }
63 0           out[0] = (char)(0xF0 | (cp >> 18));
64 0           out[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
65 0           out[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
66 0           out[3] = (char)(0x80 | (cp & 0x3F));
67 0           return 4;
68             }
69              
70             /* Case-fold a single codepoint to its lowercase form, writing 1..4
71             * bytes to out. Covers ASCII, Latin-1 supplement, Latin Extended-A
72             * (selected), Greek, Cyrillic, plus a tiny exception list (ẞ → ss,
73             * µ → μ). Returns bytes written. This is the subset needed by
74             * CommonMark spec examples; full CaseFolding.txt is overkill. */
75 2003           static int cf_fold(unsigned cp, char* out) {
76             /* Exceptions that expand (1 codepoint -> 2 codepoints). */
77 2003 100         if (cp == 0x1E9E) { out[0] = 's'; out[1] = 's'; return 2; } /* ẞ */
78              
79             /* Single-codepoint mappings. */
80 2000 100         if (cp >= 'A' && cp <= 'Z') cp = cp + 0x20;
    100          
81 1892 50         else if (cp == 0xB5) cp = 0x3BC; /* µ -> μ */
82 1892 100         else if (cp >= 0xC0 && cp <= 0xDE && cp != 0xD7) cp += 0x20;
    50          
    0          
83 1892 100         else if (cp >= 0x391 && cp <= 0x3A1) cp += 0x20; /* Α..Ρ */
    100          
84 1886 100         else if (cp >= 0x3A3 && cp <= 0x3AB) cp += 0x20; /* Σ..Ϋ */
    100          
85 1883 50         else if (cp == 0x3A2) { /* reserved, leave */ }
86 1883 50         else if (cp >= 0x400 && cp <= 0x40F) cp += 0x50; /* Cyrillic Ѐ..Џ */
    0          
87 1883 50         else if (cp >= 0x410 && cp <= 0x42F) cp += 0x20; /* Cyrillic А..Я */
    0          
88             /* Else: leave codepoint as-is (no fold). */
89              
90 2000           return cf_encode(cp, out);
91             }
92              
93             /* Normalise label per CommonMark spec: trim outer whitespace, collapse
94             * inner whitespace runs to a single space, and case-fold codepoints.
95             * Writes at most 4 bytes per input byte plus 4 trailing bytes; caller
96             * must provide an output buffer of size >= n*4 + 4. Returns output
97             * length. */
98 518           static size_t cf_normalise(const char* s, size_t n, char* out) {
99 518           size_t a = 0, b = n;
100 518           size_t j = 0;
101             size_t i;
102 518           int in_ws = 0;
103             /* Trim ends */
104 521 100         while (a < b && (s[a] == ' ' || s[a] == '\t' || s[a] == '\n')) a++;
    50          
    50          
    100          
105 521 100         while (b > a && (s[b-1] == ' ' || s[b-1] == '\t' || s[b-1] == '\n')) b--;
    50          
    50          
    100          
106 518           i = a;
107 2602 100         while (i < b) {
108 2084           unsigned char c = (unsigned char)s[i];
109 2084 100         if (c == ' ' || c == '\t' || c == '\n') {
    50          
    100          
110 81 50         if (!in_ws) { out[j++] = ' '; in_ws = 1; }
111 81           i++;
112             } else {
113             int adv;
114 2003           unsigned cp = cf_decode(s, b, i, &adv);
115 2003           j += cf_fold(cp, out + j);
116 2003           i += adv;
117 2003           in_ws = 0;
118             }
119             }
120 518           return j;
121             }
122              
123 262           static char* normalise_label(mds_arena* a, const char* s, size_t n, size_t* nlen) {
124             char* out;
125             size_t j;
126 262           out = (char*)mds_arena_alloc(a, n * 4 + 4);
127 262           j = cf_normalise(s, n, out);
128 262           out[j] = '\0';
129 262           *nlen = j;
130 262           return out;
131             }
132              
133 362           static char* arena_dup(mds_arena* a, const char* s, size_t n) {
134             char* d;
135 362           d = (char*)mds_arena_alloc(a, n + 1);
136 362 100         if (n) memcpy(d, s, n);
137 362           d[n] = '\0';
138 362           return d;
139             }
140              
141 241           void mds_linkref_init(struct mds_linkref_tab* t, mds_arena* a) {
142 241           t->entries = NULL;
143 241           t->len = 0;
144 241           t->cap = 0;
145 241           t->arena = a;
146 241           }
147              
148 256           const mds_linkref* mds_linkref_get(const struct mds_linkref_tab* t,
149             const char* label, size_t llen) {
150             /* Normalise into a stack buffer. Max expansion is 4x (4-byte UTF-8
151             * input expanding to 4-byte fold). Cap at 4 KiB; pathologically long
152             * labels can't match a stored entry anyway. */
153             char buf[4096];
154             size_t nlen;
155             size_t i;
156 256 50         if (llen > sizeof buf / 4 - 4) return NULL;
157 256           nlen = cf_normalise(label, llen, buf);
158 304 100         for (i = 0; i < t->len; i++) {
159 274 100         if (t->entries[i].klen == nlen &&
    50          
160 259 100         (nlen == 0 || memcmp(t->entries[i].key, buf, nlen) == 0))
161 226           return &t->entries[i];
162             }
163 30           return NULL;
164             }
165              
166 262           int mds_linkref_add(struct mds_linkref_tab* t,
167             const char* label, size_t llen,
168             const char* url, size_t ulen,
169             const char* title, size_t tlen) {
170             size_t nlen;
171             char* key;
172             size_t i;
173             mds_linkref* e;
174 262           key = normalise_label(t->arena, label, llen, &nlen);
175             /* dup check */
176 280 100         for (i = 0; i < t->len; i++) {
177 24 50         if (t->entries[i].klen == nlen &&
178 24 50         (nlen == 0 || memcmp(t->entries[i].key, key, nlen) == 0))
    100          
179 6           return 0;
180             }
181 256 100         if (t->len == t->cap) {
182 241 50         size_t nc = t->cap ? t->cap * 2 : 8;
183 241           t->entries = (mds_linkref*)realloc(t->entries, nc * sizeof(mds_linkref));
184 241           t->cap = nc;
185             }
186 256           e = &t->entries[t->len++];
187 256           e->key = key; e->klen = nlen;
188 256           e->url = arena_dup(t->arena, url, ulen); e->ulen = ulen;
189 256 100         e->title = tlen ? arena_dup(t->arena, title, tlen) : NULL;
190 256           e->tlen = tlen;
191 256           return 1;
192             }