File Coverage

types.c
Criterion Covered Total %
statement 414 436 94.9
branch 482 598 80.6
condition n/a
subroutine n/a
pod n/a
total 896 1034 86.6


line stmt bran cond sub pod time code
1             #define PERL_NO_GET_CONTEXT
2             #include "EXTERN.h"
3             #include "perl.h"
4             #include "XSUB.h"
5              
6             #include
7             #include
8             #include
9              
10             #include "types.h"
11              
12 9871           void free_typeinfo(pTHX_ TypeInfo *t) {
13 9871 50         if (!t) return;
14 9871 100         if (t->inner) free_typeinfo(aTHX_ t->inner);
15 9871 100         if (t->tuple) {
16             int i;
17 3268 100         for (i = 0; i < t->tuple_len; i++) free_typeinfo(aTHX_ t->tuple[i]);
18 1097           Safefree(t->tuple);
19             }
20 9871 100         if (t->enum_entries) {
21             int i;
22 3053 100         for (i = 0; i < t->enum_count; i++) Safefree(t->enum_entries[i].name);
23 1017           Safefree(t->enum_entries);
24             }
25 9871 100         if (t->enum_lookup) SvREFCNT_dec((SV*)t->enum_lookup);
26 9871 100         if (t->variant_decl_to_wire) Safefree(t->variant_decl_to_wire);
27 9871 100         if (t->variant_wire_to_decl) Safefree(t->variant_wire_to_decl);
28 9871 100         if (t->tuple_names) {
29             int i;
30 98 100         for (i = 0; i < t->tuple_len; i++)
31 57 50         if (t->tuple_names[i]) Safefree(t->tuple_names[i]);
32 41           Safefree(t->tuple_names);
33             }
34 9871           Safefree(t);
35             }
36              
37 1020           static void parse_enum_entries(pTHX_ TypeInfo *t, const char *s, STRLEN len, int code) {
38 1020           int cap = 0;
39 1020 100         long min_val = (code == T_ENUM8) ? -128 : -32768;
40 1020 100         long max_val = (code == T_ENUM8) ? 127 : 32767;
41 1020           t->enum_entries = NULL;
42 1020           t->enum_count = 0;
43 1020           t->enum_lookup = newHV();
44              
45 1020           STRLEN i = 0;
46 3056 100         while (i < len) {
47 4077 50         while (i < len && (s[i] == ' ' || s[i] == ',')) i++;
    100          
    100          
48 2039 50         if (i >= len) break;
49              
50 2039 50         if (s[i] != '\'')
51 0           croak("Invalid enum format: expected single quote at position %d", (int)i);
52 2039           i++;
53              
54             /* Scan the name and unescape backslash-escapes (\\, \') so the
55             * stored name matches the unescaped form ClickHouse emits in
56             * describe table output and that the user passes to encode(). */
57 2039           STRLEN name_start = i;
58 2039           STRLEN name_raw_len = 0; /* raw bytes consumed (incl. backslashes) */
59 4147 50         while (i < len && s[i] != '\'') {
    100          
60 2108 100         if (s[i] == '\\' && i + 1 < len) { i++; name_raw_len++; }
    50          
61 2108           i++; name_raw_len++;
62             }
63 2039 50         if (i >= len)
64 0           croak("Invalid enum format: unterminated quote");
65              
66             /* Build the unescaped name into a freshly-allocated buffer. The
67             * unescaped length is at most name_raw_len; allocate that bound. */
68             char *name_buf;
69 2039           Newx(name_buf, name_raw_len + 1, char);
70 2039           STRLEN name_len = 0;
71 2039           STRLEN j = name_start;
72 4147 100         while (j < i) {
73 2108 100         if (s[j] == '\\' && j + 1 < i) j++;
    50          
74 2108           name_buf[name_len++] = s[j++];
75             }
76 2039           name_buf[name_len] = 0;
77 2039 100         if (name_len == 0) {
78 1           Safefree(name_buf);
79 1           croak("Invalid enum format: empty name at position %d", (int)name_start);
80             }
81 2038           i++; /* closing quote */
82              
83 8152 50         while (i < len && (s[i] == ' ' || s[i] == '=')) i++;
    100          
    100          
84              
85 2038           int neg = 0;
86 2038 50         if (i < len && s[i] == '-') { neg = 1; i++; }
    100          
87 2038 50         if (i >= len || s[i] < '0' || s[i] > '9') {
    50          
    50          
88 0           Safefree(name_buf);
89 0           croak("Invalid enum format: expected digit at position %d", (int)i);
90             }
91 2038           long val = 0;
92 4101 100         while (i < len && s[i] >= '0' && s[i] <= '9') {
    100          
    50          
93 2063           val = val * 10 + (s[i] - '0');
94 2063           i++;
95             }
96 2038 100         if (neg) val = -val;
97 2038 50         if (val < min_val || val > max_val) {
    100          
98 2           Safefree(name_buf);
99 2 100         croak("Enum value %ld out of range for %s",
100             val, code == T_ENUM8 ? "Enum8" : "Enum16");
101             }
102              
103 2036 100         if (t->enum_count >= cap) {
104 1017 50         cap = cap ? cap * 2 : 8;
105 1017           Renew(t->enum_entries, cap, EnumEntry);
106             }
107 2036           t->enum_entries[t->enum_count].name = name_buf;
108 2036           t->enum_entries[t->enum_count].name_len = name_len;
109 2036           t->enum_entries[t->enum_count].value = (int16_t)val;
110 2036           t->enum_count++;
111              
112 2036           hv_store(t->enum_lookup, name_buf, name_len, newSViv(val), 0);
113             }
114 1017           }
115              
116             /* Heap-allocated cleanup slot for a TypeInfo*. Disarmed by setting *slot=NULL. */
117 9892           static void cleanup_typeinfo_slot(pTHX_ void *p) {
118 9892           TypeInfo **slot = (TypeInfo **)p;
119 9892 100         if (*slot) free_typeinfo(aTHX_ *slot);
120 9892           Safefree(slot);
121 9892           }
122              
123             /* Cleanup for a partially-built Tuple types array. The struct owns the array
124             * directly so the destructor never dereferences stack memory after a longjmp
125             * out of parse_tuple_types. Disarm by setting slot->types = NULL. */
126             typedef struct {
127             TypeInfo **types;
128             int count;
129             } TupleSlot;
130              
131 1097           static void cleanup_tuple_slot(pTHX_ void *p) {
132 1097           TupleSlot *s = (TupleSlot *)p;
133 1097 50         if (s->types) {
134             int i;
135 0 0         for (i = 0; i < s->count; i++)
136 0 0         if (s->types[i]) free_typeinfo(aTHX_ s->types[i]);
137 0           Safefree(s->types);
138             }
139 1097           Safefree(s);
140 1097           }
141              
142             /* Bound of one entry in a comma-separated type list, after outer-WS strip
143             * and optional "field-name" prefix removal. Both Tuple/Map/Variant
144             * parsing and Variant alphabetical sorting need these post-strip bounds. */
145             typedef struct {
146             STRLEN start; /* offset of the trimmed type expression */
147             STRLEN len; /* length of the trimmed type expression */
148             STRLEN name_start; /* offset of the field-name prefix (or 0 if absent) */
149             STRLEN name_len; /* length of the field-name prefix (0 = no name) */
150             } TypeBound;
151              
152             /* Split a comma-separated type list at depth-0 commas, trim outer
153             * whitespace, strip any leading "name" field-name prefix (Tuple
154             * named-element form). When a field-name is found, name_start/name_len
155             * record it so callers (e.g. T_TUPLE) can keep the names; when absent,
156             * name_len is 0. bounds must have at least len+1 slots. Returns the
157             * count of non-empty entries. */
158 1063           static int split_type_list(const char *s, STRLEN len, TypeBound *bounds) {
159 1063           int count = 0;
160 1063           int depth = 0;
161 1063           STRLEN start = 0, i;
162             #define IS_WS(c) ((c)==' '||(c)=='\t'||(c)=='\n'||(c)=='\r')
163 26119 100         for (i = 0; i <= len; i++) {
164 25056 100         char c = (i < len) ? s[i] : ',';
165 25056 100         if (c == '(') depth++;
166 24046 100         else if (c == ')') depth--;
167 23036 100         else if ((c == ',' && depth == 0) || i == len) {
    100          
    50          
168 2127           STRLEN tstart = start, tend = i;
169 2127           STRLEN nstart = 0, nlen = 0;
170 3191 100         while (tstart < tend && IS_WS(s[tstart])) tstart++;
    100          
    50          
    50          
    50          
171 2127 100         while (tend > tstart && IS_WS(s[tend-1])) tend--;
    50          
    50          
    50          
    50          
172 2127 100         if (tend > tstart) {
173 2126           STRLEN id_end = tstart;
174 2126 50         if (id_end < tend
175 2126 50         && ((s[id_end] >= 'A' && s[id_end] <= 'Z')
    100          
176 12 50         || (s[id_end] >= 'a' && s[id_end] <= 'z')
    50          
177 0 0         || s[id_end] == '_')) {
178 2126           id_end++;
179 2126           while (id_end < tend
180 13698 100         && ((s[id_end] >= 'A' && s[id_end] <= 'Z')
    100          
    100          
181 12561 100         || (s[id_end] >= 'a' && s[id_end] <= 'z')
    50          
182 3147 100         || (s[id_end] >= '0' && s[id_end] <= '9')
    50          
183 1019 50         || s[id_end] == '_'))
184 11572           id_end++;
185 2126 100         if (id_end < tend && IS_WS(s[id_end])) {
    100          
    50          
    50          
    50          
186 12           nstart = tstart;
187 12           nlen = id_end - tstart;
188 12           STRLEN ts = id_end;
189 24 50         while (ts < tend && IS_WS(s[ts])) ts++;
    100          
    50          
    50          
    50          
190 12 50         if (ts < tend) tstart = ts;
191             }
192             }
193 2126           bounds[count].start = tstart;
194 2126           bounds[count].len = tend - tstart;
195 2126           bounds[count].name_start = nstart;
196 2126           bounds[count].name_len = nlen;
197 2126           count++;
198             }
199 2127           start = i + 1;
200             }
201             }
202             #undef IS_WS
203 1063           return count;
204             }
205              
206             /* parse_tuple_types_with_bounds: caller already split the list and wants
207             * to reuse the bounds (e.g. Variant alphabetical sort). For convenience,
208             * parse_tuple_types is a thin wrapper that splits internally. */
209 1097           static TypeInfo** parse_tuple_types_with_bounds(pTHX_ const char *s,
210             TypeBound *bounds,
211             int n) {
212             TupleSlot *slot;
213 1097           Newxz(slot, 1, TupleSlot);
214 1097           SAVEDESTRUCTOR_X(cleanup_tuple_slot, slot);
215 1097 50         if (n > 0) Newxz(slot->types, n, TypeInfo*);
216              
217             int i;
218 3268 100         for (i = 0; i < n; i++) {
219 2171           slot->types[i] = parse_type(aTHX_ s + bounds[i].start, bounds[i].len);
220 2171           slot->count = i + 1;
221             }
222             {
223 1097           TypeInfo **result = slot->types;
224 1097           slot->types = NULL; /* Disarm: caller now owns the array. */
225 1097           return result;
226             }
227             }
228              
229 24           static TypeInfo** parse_tuple_types(pTHX_ const char *s, STRLEN len, int *count) {
230             TypeBound *bounds;
231 24 50         Newx(bounds, len + 1, TypeBound);
232 24           SAVEFREEPV(bounds);
233 24           *count = split_type_list(s, len, bounds);
234 24           return parse_tuple_types_with_bounds(aTHX_ s, bounds, *count);
235             }
236              
237             /* Return 1 if this type can be used as a JSON typed path. CH writes
238             * typed paths as a regular column; types whose serialization has a
239             * non-empty state-prefix stream (Variant: mode byte; LC: version +
240             * flags + dict; JSON/Dynamic: their own prefix) would interleave
241             * incorrectly with other paths' prefixes in the Object prefix
242             * section. Composites recursively check. */
243 63           static int type_can_be_typed_path(TypeInfo *t) {
244 63           switch (t->code) {
245 6           case T_VARIANT:
246             case T_LOWCARDINALITY:
247             case T_JSON:
248             case T_DYNAMIC:
249 6           return 0;
250 6           case T_ARRAY:
251             case T_NULLABLE:
252 6           return type_can_be_typed_path(t->inner);
253 6           case T_TUPLE:
254             case T_MAP: {
255             int i;
256 18 100         for (i = 0; i < t->tuple_len; i++)
257 12 50         if (!type_can_be_typed_path(t->tuple[i]))
258 0           return 0;
259 6           return 1;
260             }
261 45           default:
262 45           return 1;
263             }
264             }
265              
266             /* Parse "name Type, name Type, ..." inside JSON(...). Names may include
267             * dots (CH typed paths are dotted, like JSON(user.id UInt64)); type is
268             * a full type expression. Stores parsed entries on t in name-sorted
269             * order via tuple_names + tuple. Empty body (JSON()) is a no-op. */
270 41           static void parse_json_typed_paths(pTHX_ TypeInfo *t,
271             const char *body, STRLEN body_len) {
272             TypeBound *bounds;
273 41 50         Newxz(bounds, body_len + 1, TypeBound);
274 41           SAVEFREEPV(bounds);
275              
276 41           int idx = 0;
277 41           int depth = 0;
278 41           STRLEN start = 0, i;
279 811 100         for (i = 0; i <= body_len; i++) {
280 773 100         char c = (i < body_len) ? body[i] : ',';
281 773 100         if (c == '(') depth++;
282 757 100         else if (c == ')') depth--;
283 741 100         else if ((c == ',' && depth == 0) || i == body_len) {
    100          
    50          
284 52           STRLEN ts = start, te = i;
285             #define J_WS(c2) ((c2)==' '||(c2)=='\t'||(c2)=='\n'||(c2)=='\r')
286 63 100         while (ts < te && J_WS(body[ts])) ts++;
    100          
    50          
    50          
    50          
287 52 100         while (te > ts && J_WS(body[te-1])) te--;
    50          
    50          
    50          
    50          
288 52 100         if (te > ts) {
289 50           STRLEN id = ts;
290 50 50         if (body[id] == '_'
291 50 50         || (body[id] >= 'A' && body[id] <= 'Z')
    50          
292 50 50         || (body[id] >= 'a' && body[id] <= 'z')) {
    50          
293 50           id++;
294 50           while (id < te
295 186 100         && (body[id] == '_' || body[id] == '.'
    50          
    100          
296 178 100         || (body[id] >= 'A' && body[id] <= 'Z')
    50          
297 178 100         || (body[id] >= 'a' && body[id] <= 'z')
    50          
298 49 50         || (body[id] >= '0' && body[id] <= '9')))
    0          
299 136           id++;
300             }
301 50 50         if (id == ts)
302 0           croak("JSON(...): missing path name in '%.*s'",
303             (int)(te - ts), body + ts);
304             /* Reject trailing dot and consecutive dots in path names:
305             * "a.", "a..b", ".a" (the leading dot is already caught
306             * by the start-char rule). CH itself allows only well-
307             * formed dotted identifiers; mirror that. */
308 50 100         if (body[id - 1] == '.')
309 1           croak("JSON(...): path name must not end with '.' "
310             "in '%.*s'",
311             (int)(id - ts), body + ts);
312             STRLEN dk;
313 182 100         for (dk = ts + 1; dk < id; dk++) {
314 134 100         if (body[dk] == '.' && body[dk - 1] == '.')
    100          
315 1           croak("JSON(...): path name must not contain "
316             "consecutive dots in '%.*s'",
317             (int)(id - ts), body + ts);
318             }
319 48           STRLEN ws = id;
320 95 100         while (ws < te && J_WS(body[ws])) ws++;
    100          
    50          
    50          
    50          
321 48 100         if (ws == id || ws == te)
    50          
322 1           croak("JSON(...): expected 'name Type' but got '%.*s'",
323             (int)(te - ts), body + ts);
324 47           bounds[idx].name_start = ts;
325 47           bounds[idx].name_len = id - ts;
326 47           bounds[idx].start = ws;
327 47           bounds[idx].len = te - ws;
328 47           idx++;
329             }
330             #undef J_WS
331 49           start = i + 1;
332             }
333             }
334 38 100         if (idx == 0) return;
335 36           int n = idx;
336              
337             int j, ii;
338 47 100         for (ii = 1; ii < n; ii++) {
339 11           TypeBound key = bounds[ii];
340 11           j = ii - 1;
341 19 100         while (j >= 0) {
342 11           STRLEN m = bounds[j].name_len < key.name_len
343             ? bounds[j].name_len : key.name_len;
344 11           int cmp = memcmp(body + bounds[j].name_start,
345 11           body + key.name_start, m);
346 11 100         if (cmp == 0)
347 1           cmp = (int)bounds[j].name_len - (int)key.name_len;
348 11 100         if (cmp <= 0) break;
349 8           bounds[j+1] = bounds[j];
350 8           j--;
351             }
352 11           bounds[j+1] = key;
353             }
354              
355 46 100         for (ii = 1; ii < n; ii++) {
356 11 100         if (bounds[ii].name_len == bounds[ii-1].name_len
357 5           && memcmp(body + bounds[ii].name_start,
358 5           body + bounds[ii-1].name_start,
359 5 100         bounds[ii].name_len) == 0)
360 1           croak("JSON(...): duplicate typed path name '%.*s'",
361             (int)bounds[ii].name_len, body + bounds[ii].name_start);
362             }
363              
364 35           t->tuple_len = n;
365 35           Newxz(t->tuple_names, n, char*);
366 80 100         for (ii = 0; ii < n; ii++) {
367 45           Newx(t->tuple_names[ii], bounds[ii].name_len + 1, char);
368 45           memcpy(t->tuple_names[ii], body + bounds[ii].name_start,
369 45           bounds[ii].name_len);
370 45           t->tuple_names[ii][bounds[ii].name_len] = '\0';
371             }
372 35           t->tuple = parse_tuple_types_with_bounds(aTHX_ body, bounds, n);
373              
374 74 100         for (ii = 0; ii < n; ii++) {
375 45 100         if (!type_can_be_typed_path(t->tuple[ii]))
376 6           croak("JSON(%s ...): typed path inner type cannot include "
377             "Variant, LowCardinality, JSON, or Dynamic (those have "
378             "wire prefixes that would interleave incorrectly)",
379             t->tuple_names[ii]);
380             }
381             }
382              
383 9892           TypeInfo* parse_type(pTHX_ const char *type, STRLEN len) {
384             TypeInfo *t;
385             /* Slot lives on the heap so its address is stable across the XSUB lifetime. */
386             TypeInfo **slot;
387 9892           Newx(slot, 1, TypeInfo*);
388 9892           *slot = NULL;
389 9892           SAVEDESTRUCTOR_X(cleanup_typeinfo_slot, slot);
390 9892           Newxz(t, 1, TypeInfo);
391 9892           *slot = t;
392              
393 9892 100         if (len == 4 && strncmp(type, "Int8", 4) == 0) {
    100          
394 12           t->code = T_INT8;
395 9880 100         } else if (len == 5 && strncmp(type, "Int16", 5) == 0) {
    100          
396 10           t->code = T_INT16;
397 9870 100         } else if (len == 5 && strncmp(type, "Int32", 5) == 0) {
    100          
398 1484           t->code = T_INT32;
399 8386 100         } else if (len == 5 && strncmp(type, "Int64", 5) == 0) {
    100          
400 13           t->code = T_INT64;
401 8373 100         } else if (len == 5 && strncmp(type, "UInt8", 5) == 0) {
    100          
402 27           t->code = T_UINT8;
403 8346 100         } else if (len == 6 && strncmp(type, "UInt16", 6) == 0) {
    100          
404 4           t->code = T_UINT16;
405 8342 100         } else if (len == 6 && strncmp(type, "UInt32", 6) == 0) {
    100          
406 129           t->code = T_UINT32;
407 8213 100         } else if (len == 6 && strncmp(type, "UInt64", 6) == 0) {
    100          
408 40           t->code = T_UINT64;
409 8173 100         } else if (len == 7 && strncmp(type, "Float32", 7) == 0) {
    100          
410 11           t->code = T_FLOAT32;
411 8162 100         } else if (len == 7 && strncmp(type, "Float64", 7) == 0) {
    100          
412 354           t->code = T_FLOAT64;
413 7808 100         } else if (len == 8 && strncmp(type, "BFloat16", 8) == 0) {
    100          
414 9           t->code = T_BFLOAT16;
415 7799 100         } else if (len == 6 && strncmp(type, "String", 6) == 0) {
    100          
416 1593           t->code = T_STRING;
417 6206 100         } else if (len > 12 && strncmp(type, "FixedString(", 12) == 0) {
    100          
418 22           t->code = T_FIXEDSTRING;
419 22           t->param = atoi(type + 12);
420 22 100         if (t->param <= 0) croak("FixedString needs positive length");
421 6184 100         } else if (len > 6 && strncmp(type, "Array(", 6) == 0) {
    100          
422 1069           t->code = T_ARRAY;
423 1069           t->inner = parse_type(aTHX_ type + 6, len - 7);
424 6136 100         } else if (len > 6 && strncmp(type, "Tuple(", 6) == 0) {
    100          
425 1021           t->code = T_TUPLE;
426 1021           const char *body = type + 6;
427 1021           STRLEN body_len = len - 7;
428             TypeBound *bounds;
429 1021 50         Newx(bounds, body_len + 1, TypeBound);
430 1021           SAVEFREEPV(bounds);
431 1021           t->tuple_len = split_type_list(body, body_len, bounds);
432 1021           t->tuple = parse_tuple_types_with_bounds(aTHX_ body, bounds, t->tuple_len);
433             /* If at least one element carries a field-name, capture all of
434             * them so encode_column can accept hashrefs for this tuple. A
435             * mix of named and unnamed elements isn't legal in ClickHouse;
436             * we accept any element having a name as "named tuple". */
437 1021           int has_names = 0;
438             int j;
439 3051 100         for (j = 0; j < t->tuple_len; j++) {
440 2036 100         if (bounds[j].name_len > 0) { has_names = 1; break; }
441             }
442 1021 100         if (has_names) {
443 6           Newxz(t->tuple_names, t->tuple_len, char *);
444 18 100         for (j = 0; j < t->tuple_len; j++) {
445 12 50         if (bounds[j].name_len > 0) {
446 12           Newx(t->tuple_names[j], bounds[j].name_len + 1, char);
447 12           memcpy(t->tuple_names[j],
448 12           body + bounds[j].name_start, bounds[j].name_len);
449 12           t->tuple_names[j][bounds[j].name_len] = '\0';
450             }
451             /* else: leave NULL -- mixed named/unnamed not really
452             * supported; encode will croak if hashref is used. */
453             }
454             }
455 4094 100         } else if (len > 9 && strncmp(type, "Nullable(", 9) == 0) {
    100          
456 1068 100         if (len > 18 && strncmp(type + 9, "Nullable(", 9) == 0)
    100          
457 2           croak("Nullable(Nullable(...)) is not allowed");
458 1066           t->code = T_NULLABLE;
459 1066           t->inner = parse_type(aTHX_ type + 9, len - 10);
460 3026 100         } else if (len > 6 && strncmp(type, "Enum8(", 6) == 0) {
    100          
461 1015           t->code = T_ENUM8;
462 1015           parse_enum_entries(aTHX_ t, type + 6, len - 7, T_ENUM8);
463 2011 100         } else if (len > 7 && strncmp(type, "Enum16(", 7) == 0) {
    100          
464 5           t->code = T_ENUM16;
465 5           parse_enum_entries(aTHX_ t, type + 7, len - 8, T_ENUM16);
466 2006 100         } else if (len > 10 && strncmp(type, "Decimal32(", 10) == 0) {
    100          
467 9           t->code = T_DECIMAL32;
468 9           t->param = atoi(type + 10);
469 9 50         if (t->param < 0 || t->param > 9)
    100          
470 1           croak("Decimal32 scale must be 0..9, got %d", t->param);
471 1997 100         } else if (len > 10 && strncmp(type, "Decimal64(", 10) == 0) {
    100          
472 1012           t->code = T_DECIMAL64;
473 1012           t->param = atoi(type + 10);
474 1012 50         if (t->param < 0 || t->param > 18)
    50          
475 0           croak("Decimal64 scale must be 0..18, got %d", t->param);
476 985 100         } else if (len > 11 && strncmp(type, "Decimal128(", 11) == 0) {
    100          
477 16           t->code = T_DECIMAL128;
478 16           t->param = atoi(type + 11);
479 16 50         if (t->param < 0 || t->param > 38)
    50          
480 0           croak("Decimal128 scale must be 0..38, got %d", t->param);
481 969 100         } else if (len > 11 && strncmp(type, "Decimal256(", 11) == 0) {
    100          
482 11           t->code = T_DECIMAL256;
483 11           t->param = atoi(type + 11);
484 11 50         if (t->param < 0 || t->param > 76)
    50          
485 0           croak("Decimal256 scale must be 0..76, got %d", t->param);
486 959 100         } else if (len > 8 && strncmp(type, "Decimal(", 8) == 0) {
    100          
487 2           int precision = atoi(type + 8);
488 2           const char *comma = memchr(type + 8, ',', len - 8);
489 2 50         if (!comma) croak("Decimal(P, S) requires precision and scale");
490 2           int scale = atoi(comma + 1);
491 2 100         if (precision < 1 || precision > 38)
    50          
492 1           croak("Decimal(P, S) precision must be 1..38, got %d (use Decimal256(S) explicitly for P > 38)", precision);
493 1 50         if (scale < 0 || scale > precision)
    50          
494 0           croak("Decimal scale must be 0..precision, got %d", scale);
495 1           t->param = scale;
496 1 50         if (precision <= 9) t->code = T_DECIMAL32;
497 0 0         else if (precision <= 18) t->code = T_DECIMAL64;
498 0           else t->code = T_DECIMAL128;
499 956 100         } else if (len == 4 && strncmp(type, "Date", 4) == 0) {
    100          
500 20           t->code = T_DATE;
501 936 100         } else if (len == 6 && strncmp(type, "Date32", 6) == 0) {
    100          
502 7           t->code = T_DATE32;
503 929 100         } else if (len == 8 && strncmp(type, "DateTime", 8) == 0) {
    50          
504 353           t->code = T_DATETIME;
505 576 100         } else if (len > 9 && strncmp(type, "DateTime(", 9) == 0) {
    100          
506 1           t->code = T_DATETIME;
507 575 100         } else if (len > 11 && strncmp(type, "DateTime64(", 11) == 0) {
    100          
508 21           t->code = T_DATETIME64;
509 21           t->param = atoi(type + 11);
510 21 50         if (t->param < 0 || t->param > 9)
    100          
511 1           croak("DateTime64 precision must be 0..9, got %d", t->param);
512 554 100         } else if (len == 4 && strncmp(type, "Bool", 4) == 0) {
    100          
513 313           t->code = T_BOOL;
514 241 100         } else if (len == 7 && strncmp(type, "Boolean", 7) == 0) {
    50          
515 0           t->code = T_BOOL;
516 241 100         } else if (len == 4 && strncmp(type, "UUID", 4) == 0) {
    100          
517 12           t->code = T_UUID;
518 229 100         } else if (len == 4 && strncmp(type, "IPv4", 4) == 0) {
    100          
519 13           t->code = T_IPV4;
520 216 100         } else if (len == 4 && strncmp(type, "IPv6", 4) == 0) {
    100          
521 6           t->code = T_IPV6;
522 211 100         } else if (len > 24 && strncmp(type, "SimpleAggregateFunction(", 24) == 0) {
    100          
523             /* SimpleAggregateFunction(func, T) is wire-equivalent to T -- the
524             * func name only affects how readers aggregate on read, not how
525             * values are stored. Strip it and parse the rest as the inner type. */
526 1           const char *body = type + 24;
527 1           STRLEN body_len = len - 25;
528 1           const char *comma = memchr(body, ',', body_len);
529 1 50         if (!comma) croak("SimpleAggregateFunction requires (func, T)");
530 1           STRLEN inner_off = (comma - body) + 1;
531 2 50         while (inner_off < body_len && body[inner_off] == ' ') inner_off++;
    100          
532 1           TypeInfo *inner = parse_type(aTHX_ body + inner_off, body_len - inner_off);
533             /* Steal inner's contents in one shot. The outer slot still owns t; the
534             * inner's slot was already disarmed before parse_type returned, so we
535             * can free the now-redundant inner struct directly. */
536 1           *t = *inner;
537 1           Safefree(inner);
538 226 100         } else if (len > 8 && strncmp(type, "Variant(", 8) == 0) {
    100          
539             /* Variant(T1, T2, ...) - tagged union. Each input row is either
540             * undef (NULL) or [$variant_idx, $value]. ClickHouse stores
541             * Variant sub-columns and per-row discriminators in alphabetical
542             * order of variant type names, not declaration order, so build
543             * a permutation that maps the user's declaration index to the
544             * wire (alphabetical) position. */
545 18           t->code = T_VARIANT;
546 18           const char *body = type + 8;
547 18           STRLEN body_len = len - 9;
548              
549             /* Split once, then share the bounds with parse_tuple_types_with_bounds
550             * (the alphabetical sort and the parsed TypeInfo entries reference
551             * the same ranges). */
552             TypeBound *bounds;
553 18 50         Newx(bounds, body_len + 1, TypeBound);
554 18           SAVEFREEPV(bounds);
555 18           t->tuple_len = split_type_list(body, body_len, bounds);
556 18 100         if (t->tuple_len < 1)
557 1           croak("Variant requires at least one type argument");
558 17 50         if (t->tuple_len > 254)
559 0           croak("Variant supports at most 254 types (got %d)", t->tuple_len);
560 17           t->tuple = parse_tuple_types_with_bounds(aTHX_ body, bounds, t->tuple_len);
561              
562             /* Sort declaration indices alphabetically by their type bytes.
563             * Selection sort -- nvar is at most 254. */
564 17           Newx(t->variant_wire_to_decl, t->tuple_len, int);
565 17           Newx(t->variant_decl_to_wire, t->tuple_len, int);
566             int j, k;
567 53 100         for (j = 0; j < t->tuple_len; j++) t->variant_wire_to_decl[j] = j;
568 36 100         for (j = 0; j < t->tuple_len - 1; j++) {
569 19           int min_idx = j;
570 40 100         for (k = j + 1; k < t->tuple_len; k++) {
571 21           int a = t->variant_wire_to_decl[min_idx];
572 21           int b = t->variant_wire_to_decl[k];
573 21           STRLEN la = bounds[a].len, lb = bounds[b].len;
574 21           STRLEN cmp_len = la < lb ? la : lb;
575 21           int cmp = memcmp(body + bounds[a].start,
576 21           body + bounds[b].start, cmp_len);
577 21 100         if (cmp > 0 || (cmp == 0 && la > lb)) min_idx = k;
    50          
    0          
578             }
579 19 100         if (min_idx != j) {
580 4           int tmp = t->variant_wire_to_decl[j];
581 4           t->variant_wire_to_decl[j] = t->variant_wire_to_decl[min_idx];
582 4           t->variant_wire_to_decl[min_idx] = tmp;
583             }
584             }
585 53 100         for (j = 0; j < t->tuple_len; j++)
586 36           t->variant_decl_to_wire[t->variant_wire_to_decl[j]] = j;
587 191 100         } else if (len > 4 && strncmp(type, "Map(", 4) == 0) {
    100          
588             /* Map(K, V) is wire-equivalent to Array(Tuple(K, V)). Build the
589             * synthetic structure so encode_column can reuse Array+Tuple paths. */
590 14           t->code = T_MAP;
591 14           t->tuple = parse_tuple_types(aTHX_ type + 4, len - 5, &t->tuple_len);
592 14 50         if (t->tuple_len != 2)
593 0           croak("Map type requires exactly 2 type arguments, got %d", t->tuple_len);
594 177 100         } else if (len > 7 && strncmp(type, "Nested(", 7) == 0) {
    100          
595             /* On the wire, ClickHouse splits a Nested(a T1, b T2) column into
596             * flat columns "name.a Array(T1)" and "name.b Array(T2)" -- this
597             * encoder does not perform that expansion. Use the flat form
598             * directly in your column spec. */
599 2           croak("Nested(...) is not supported directly; declare flat columns "
600             "like 'name.field' Array(T) instead (CH stores Nested that way "
601             "on the wire). describe table / for_table() returns the flat form.");
602 175 100         } else if (len == 7 && strncmp(type, "Dynamic", 7) == 0) {
    100          
603             /* Standalone Dynamic column: same wire machinery as a single
604             * JSON path's Dynamic sub-column. Each row is a scalar leaf
605             * (Bool/Float64/Int64/String), an Array(T) of those, or
606             * undef (NULL). Hashrefs aren't accepted here - use JSON for
607             * object-shaped values. */
608 12           t->code = T_DYNAMIC;
609 163 100         } else if ((len == 4 && strncmp(type, "JSON", 4) == 0)
    100          
610 93 100         || (len > 5 && strncmp(type, "JSON(", 5) == 0
    100          
611 41 50         && type[len-1] == ')')
612 52 100         || (len > 7 && strncmp(type, "Object(", 7) == 0)) {
    50          
613             /* ClickHouse's stable JSON type (24.8+). Wire layout (V1 over
614             * Native, validated byte-for-byte against the server in
615             * doc/json-research/): Object structure prefix, then for each
616             * path a Dynamic prefix + Variant mode byte, then per-path
617             * Variant data, then a shared-data Array(Tuple(String,String))
618             * trailer. The per-row schema is determined at encode time by
619             * inspecting each value's Perl type. The JSON(name Type, ...)
620             * form pins specific paths to concrete inner types; those
621             * paths skip the Dynamic+Variant wrapping. */
622 111           t->code = T_JSON;
623 111 100         if (len > 5 && type[4] == '(') {
    50          
624 41           parse_json_typed_paths(aTHX_ t, type + 5, len - 6);
625             }
626 52 100         } else if (len == 5 && strncmp(type, "Point", 5) == 0) {
    50          
627             /* Point = Tuple(Float64, Float64) */
628 10           t->code = T_TUPLE;
629 10           t->tuple = parse_tuple_types(aTHX_ "Float64, Float64", 16, &t->tuple_len);
630 42 100         } else if (len == 4 && strncmp(type, "Ring", 4) == 0) {
    50          
631             /* Ring = Array(Point) */
632 3           t->code = T_ARRAY;
633 3           t->inner = parse_type(aTHX_ "Point", 5);
634 39 100         } else if (len == 10 && strncmp(type, "LineString", 10) == 0) {
    100          
635             /* LineString = Array(Point) */
636 2           t->code = T_ARRAY;
637 2           t->inner = parse_type(aTHX_ "Point", 5);
638 37 50         } else if (len == 15 && strncmp(type, "MultiLineString", 15) == 0) {
    0          
639             /* MultiLineString = Array(Array(Point)) */
640 0           t->code = T_ARRAY;
641 0           t->inner = parse_type(aTHX_ "Array(Point)", 12);
642 37 100         } else if (len == 7 && strncmp(type, "Polygon", 7) == 0) {
    50          
643             /* Polygon = Array(Ring) */
644 3           t->code = T_ARRAY;
645 3           t->inner = parse_type(aTHX_ "Ring", 4);
646 34 100         } else if (len == 12 && strncmp(type, "MultiPolygon", 12) == 0) {
    50          
647             /* MultiPolygon = Array(Polygon) */
648 2           t->code = T_ARRAY;
649 2           t->inner = parse_type(aTHX_ "Polygon", 7);
650 32 100         } else if (len > 15 && strncmp(type, "LowCardinality(", 15) == 0) {
    50          
651 30           t->code = T_LOWCARDINALITY;
652 30           t->inner = parse_type(aTHX_ type + 15, len - 16);
653 30 100         if (t->inner->code != T_STRING && t->inner->code != T_FIXEDSTRING
    100          
654 10 100         && (t->inner->code != T_NULLABLE
655 9 50         || (t->inner->inner->code != T_STRING
656 0 0         && t->inner->inner->code != T_FIXEDSTRING)))
657 1           croak("LowCardinality(T) currently supports T = String / FixedString / Nullable(String) / Nullable(FixedString)");
658             } else {
659 2           croak("Unknown type: %.*s", (int)len, type);
660             }
661              
662             /* Disarm the slot: caller now owns t. */
663 9866           *slot = NULL;
664 9866           return t;
665             }