File Coverage

separated_parser.c
Criterion Covered Total %
statement 225 247 91.0
branch 157 212 74.0
condition n/a
subroutine n/a
pod n/a
total 382 459 83.2


line stmt bran cond sub pod time code
1             /*
2             * separated_parser.c - CSV/TSV state machine for File::Raw::Separated.
3             *
4             * See include/separated_parser.h for the public contract.
5             */
6              
7             #include "separated_parser.h"
8              
9             #include <stddef.h>
10             #include <stdint.h>
11             #include <stdlib.h>
12             #include <string.h>
13              
14             /* ------------------------------------------------------------
15             * Internal types
16             * ------------------------------------------------------------ */
17              
/* States of the field-level state machine. ST_START_FIELD is 0, so a
 * zero-initialized context starts at a field boundary. */
typedef enum {
    ST_START_FIELD     = 0, /* at a field boundary; next byte opens a field */
    ST_IN_UNQUOTED     = 1, /* accumulating bytes of an unquoted field */
    ST_IN_QUOTED       = 2, /* inside quotes; separators and EOLs are data */
    ST_MAYBE_END_QUOTE = 3  /* saw a quote while quoted: closing or doubled? */
} parse_state_t;
24              
25             struct separated_ctx {
26             /* Resolved options (copied at init time). */
27             separated_options_t opts;
28              
29             /* Caller. */
30             separated_field_cb cb;
31             void *ud;
32              
33             /* Field buffer (geometric growth). */
34             char *buf;
35             size_t buf_len;
36             size_t buf_cap;
37              
38             /* State. */
39             parse_state_t state;
40             int field_was_quoted; /* 1 if current field began with a quote */
41             int bom_checked; /* 1 once we've decided about the BOM */
42             int any_field_in_row; /* 1 if at least one field started in this row */
43              
44             /* Auto-detected / pinned EOL. */
45             separated_eol_t detected_eol;
46             int pending_cr; /* 1 if last byte was CR awaiting LF/data (CRLF detect) */
47              
48             /* Diagnostics. */
49             size_t bytes_consumed;
50             size_t rows_emitted;
51             size_t err_offset;
52              
53             /* Sticky error: once non-zero, all _feed/_finish are no-ops. */
54             separated_err_t sticky_err;
55             };
56              
57             /* Effective max-field cap (resolves opts.max_field_len == 0 to default). */
58             static size_t
59 178           effective_field_cap(const separated_options_t *opts)
60             {
61 178           return opts->max_field_len ? opts->max_field_len
62 178 100         : SEPARATED_FIELD_DEFAULT_CAP;
63             }
64              
65             /* ------------------------------------------------------------
66             * Field buffer: geometric growth with hard cap
67             * ------------------------------------------------------------ */
68              
69             static separated_err_t
70 21701072           buf_putc(separated_ctx_t *ctx, char c)
71             {
72 21701072 100         if (ctx->buf_len + 1 > ctx->buf_cap) {
73 178 100         size_t new_cap = ctx->buf_cap ? ctx->buf_cap * 2 : 64;
74 178           size_t cap_max = effective_field_cap(&ctx->opts);
75             char *new_buf;
76 178 100         if (new_cap > cap_max) new_cap = cap_max;
77 178 100         if (new_cap <= ctx->buf_len) {
78 1           return SEPARATED_ERR_FIELD_TOO_LONG;
79             }
80 177           new_buf = (char *)realloc(ctx->buf, new_cap);
81 177 50         if (!new_buf) return SEPARATED_ERR_NOMEM;
82 177           ctx->buf = new_buf;
83 177           ctx->buf_cap = new_cap;
84             }
85 21701071           ctx->buf[ctx->buf_len++] = c;
86 21701071           return SEPARATED_OK;
87             }
88              
89             static void
90 921383           buf_reset(separated_ctx_t *ctx)
91             {
92 921383           ctx->buf_len = 0;
93 921383           }
94              
95             /* ------------------------------------------------------------
96             * Trim helper for unquoted fields when opts.trim is on.
97             * Strips only ASCII space and tab. Quoted fields are never trimmed.
98             * ------------------------------------------------------------ */
99              
/* Strip leading and trailing ASCII spaces/tabs from buf[0 .. *plen) in
 * place. The surviving bytes are shifted to the front of buf and *plen is
 * updated to their count. No NUL terminator is written. */
static void
trim_buf(char *buf, size_t *plen)
{
    size_t tail = *plen;   /* one past the last byte kept */
    size_t head = 0;       /* index of the first byte kept */

    for (; tail > 0; tail--) {
        const char ch = buf[tail - 1];
        if (ch != ' ' && ch != '\t') {
            break;
        }
    }
    for (; head < tail; head++) {
        const char ch = buf[head];
        if (ch != ' ' && ch != '\t') {
            break;
        }
    }

    if (head != 0) {
        memmove(buf, buf + head, tail - head);
    }
    *plen = tail - head;
}
112              
113             /* ------------------------------------------------------------
114             * Emit a field/row to the callback.
115             * end_of_row=1 means "this field is the last in its row".
116             * ------------------------------------------------------------ */
117              
118             static separated_err_t
119 921389           emit_field(separated_ctx_t *ctx, int end_of_row)
120             {
121             int call_rc;
122              
123             /* Trim only on unquoted fields. */
124 921389 100         if (ctx->opts.trim && !ctx->field_was_quoted) {
    100          
125 5           trim_buf(ctx->buf, &ctx->buf_len);
126             }
127              
128 921389 100         if (ctx->opts.empty_is_undef
129 6 100         && !ctx->field_was_quoted
130 5 100         && ctx->buf_len == 0) {
131 2           call_rc = ctx->cb(NULL, SEPARATED_FIELD_NULL_LEN,
132             end_of_row, ctx->ud);
133             } else {
134             /* Pass even an empty quoted field as a real "" field. */
135 921387 100         const char *p = ctx->buf_len ? ctx->buf : "";
136 921387           call_rc = ctx->cb(p, ctx->buf_len, end_of_row, ctx->ud);
137             }
138 921386 100         if (call_rc != 0) return SEPARATED_ERR_ABORTED;
139              
140 921383           buf_reset(ctx);
141 921383           ctx->field_was_quoted = 0;
142 921383           ctx->any_field_in_row = 1;
143 921383 100         if (end_of_row) {
144 110611           ctx->rows_emitted++;
145 110611           ctx->any_field_in_row = 0;
146             }
147 921383           return SEPARATED_OK;
148             }
149              
150             /* ------------------------------------------------------------
151             * BOM stripping (UTF-8 only, when binary=0).
152             * Called once before any byte is interpreted. Caller passes the
153             * incoming buffer pointer + length pair through bom_skip; on return
154             * any leading 3-byte BOM has been advanced past.
155             * ------------------------------------------------------------ */
156              
157             static void
158 501           bom_check(separated_ctx_t *ctx, const char **pp, size_t *plen)
159             {
160 501 100         if (ctx->bom_checked) return;
161 173           ctx->bom_checked = 1;
162              
163 173 100         if (ctx->opts.binary) return;
164              
165 170 100         if (*plen >= 3) {
166 162           const unsigned char *u = (const unsigned char *)*pp;
167 162 100         if (u[0] == 0xEF && u[1] == 0xBB && u[2] == 0xBF) {
    50          
    50          
168 1           *pp += 3;
169 1           *plen -= 3;
170 1           ctx->bytes_consumed += 3;
171             }
172             }
173             }
174              
175             /* ------------------------------------------------------------
176             * EOL helpers
177             *
178             * classify_eol returns 1 if the byte `c` ends a row, 2 if a deferred
179             * bare CR ended the row and `c` must be re-processed, 0 if `c` is a
180             * normal byte, -1 if `c` is a CR awaiting lookahead, or an error code
181             * (with sticky_err set) on a pinned mismatch under strict mode.
182             *
183             * On a successful match the function may consume the byte (we always
184             * do — the caller treats the return-1 case as "row ended here") and
185             * also flips pending_cr or detected_eol as appropriate.
186             *
187             * NOTE: CRLF handling needs lookahead-of-1. We model it with the
188             * pending_cr bit:
189             * see CR => set pending_cr=1, do NOT emit row yet.
190             * next byte:
191             * if LF => CRLF row terminator, clear pending_cr.
192             * else => emit deferred CR-row terminator (CR mode), then
193             * re-process current byte from scratch.
194             *
195             * Keeping that in a tiny helper keeps the main loop legible.
196             * ------------------------------------------------------------ */
197              
/* Return 1 if c is a line feed ('\n'), 0 otherwise. Pure byte test; the
 * EOL mode is not consulted here. */
static int
is_lf(int c)
{
    return c == '\n';
}
/* Return 1 if c is a carriage return ('\r'), 0 otherwise. */
static int
is_cr(int c)
{
    return c == '\r';
}
203              
204             /* ------------------------------------------------------------
205             * Core feed loop.
206             *
207             * Drives the state machine over [buf, buf+len). Returns OK or the
208             * first error encountered; on error err_offset is set to the byte
209             * offset within the original input (ctx->bytes_consumed at the
210             * point of failure).
211             * ------------------------------------------------------------ */
212              
/* The three macros below expect a `separated_ctx_t *ctx` in scope and may
 * return from the enclosing function. */

/* Latch `code` as the sticky error, record the current byte offset as the
 * error position, and return `code`. */
#define FAIL(code)                                                  \
    do {                                                            \
        ctx->sticky_err = (code);                                   \
        ctx->err_offset = ctx->bytes_consumed;                      \
        return (code);                                              \
    } while (0)

/* Append byte `c` to the field buffer; FAIL on any buf_putc error. */
#define PUTC(c)                                                     \
    do {                                                            \
        separated_err_t putc_rc_ = buf_putc(ctx, (char)(c));        \
        if (putc_rc_ != SEPARATED_OK) FAIL(putc_rc_);               \
    } while (0)

/* Emit the buffered field; FAIL on any emit_field error. */
#define EMIT(end_of_row)                                            \
    do {                                                            \
        separated_err_t emit_rc_ = emit_field(ctx, (end_of_row));   \
        if (emit_rc_ != SEPARATED_OK) FAIL(emit_rc_);               \
    } while (0)
228              
229             /* End-of-row from a CR or LF or CRLF. Clears pending_cr. */
230             static separated_err_t
231 110612           handle_row_end(separated_ctx_t *ctx)
232             {
233 110612           ctx->pending_cr = 0;
234 110612           return emit_field(ctx, 1);
235             }
236              
237             /* Decide whether `c` should terminate the current row, given the
238             * current EOL mode. Returns:
239             * 1 - row ended (caller must NOT process c further)
240             * 0 - byte is data; caller continues with state-machine handling
241             * -ve - error code (only in strict + EOL_PINNED mismatch)
242             *
243             * Side-effect: may toggle pending_cr / detected_eol. */
244             static int
245 22622319           classify_eol(separated_ctx_t *ctx, int c)
246             {
247             /* Resolve any deferred CR from previous byte. */
248 22622319 100         if (ctx->pending_cr) {
249 14 100         if (is_lf(c)) {
250             /* CRLF terminator. Lock detection if AUTO. */
251 12 100         if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) {
252 8           ctx->detected_eol = SEPARATED_EOL_CRLF;
253 4 100         } else if (ctx->opts.eol_mode != SEPARATED_EOL_CRLF
254 2 50         && ctx->opts.strict) {
255 2           FAIL(SEPARATED_ERR_EOL_PINNED);
256             }
257 10           return 1; /* row already ended at the CR; consume LF as well */
258             } else {
259             /* CR alone => row ended at the CR. The current byte is data,
260             * but we have a pending row-end to flush first. We do that
261             * by returning a "deferred" signal: the caller flushes the
262             * row, clears pending_cr, then re-enters with the current
263             * byte. We model that here by emitting now and reporting
264             * "row ended" — caller must remember NOT to consume c. */
265 2 100         if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) {
266 1           ctx->detected_eol = SEPARATED_EOL_CR;
267 1 50         } else if (ctx->opts.eol_mode != SEPARATED_EOL_CR
268 0 0         && ctx->opts.strict) {
269 0           FAIL(SEPARATED_ERR_EOL_PINNED);
270             }
271 2           return 2; /* row ended on previous CR; do not consume c */
272             }
273             }
274              
275             /* No pending CR. Look at this byte. */
276 22622305 100         if (is_cr(c)) {
277 16           ctx->pending_cr = 1;
278 16           return -1; /* tentative; need lookahead. byte consumed. */
279             }
280 22622289 100         if (is_lf(c)) {
281 110604 100         if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) {
282 110600           ctx->detected_eol = SEPARATED_EOL_LF;
283 4 50         } else if (ctx->opts.eol_mode != SEPARATED_EOL_LF
284 0 0         && ctx->opts.strict) {
285 0           FAIL(SEPARATED_ERR_EOL_PINNED);
286             }
287 110604           return 1;
288             }
289 22511685           return 0;
290             }
291              
292             /* Public-facing _feed implementation. */
293             separated_err_t
294 501           separated_feed(separated_ctx_t *ctx, const char *buf, size_t len)
295             {
296             size_t i;
297              
298 501 50         if (ctx->sticky_err) return ctx->sticky_err;
299              
300 501           bom_check(ctx, &buf, &len);
301              
302 501           i = 0;
303 22623062 100         while (i < len) {
304 22622575           int c = (unsigned char)buf[i];
305             int eol;
306              
307             /* ---- IN_QUOTED short-circuits EOL detection: newlines are data. */
308 22622575 100         if (ctx->state == ST_IN_QUOTED) {
309 256 100         if (ctx->opts.escape >= 0 && c == ctx->opts.escape) {
    100          
310             /* Backslash-style escape: consume next byte literally. */
311 2 50         if (i + 1 >= len) {
312             /* Defer: store nothing; the next feed sees this byte
313             * again. We do that by NOT advancing past the escape
314             * char and returning. */
315             /* Implementation: append a one-byte "escape pending"
316             * marker via a local flag. Cleanest: stuff it as the
317             * last byte of buf and remember we're mid-escape. */
318             /* Simpler model: require the next byte to be in the
319             * SAME chunk. For now we accept that limitation for
320             * v0.01 and document it: backslash escapes that
321             * straddle a chunk boundary are not supported. */
322 0 0         PUTC(c); /* fall back to literal escape char */
323 0           ctx->bytes_consumed++;
324 0           i++;
325 0           continue;
326             }
327 2 50         PUTC(buf[i + 1]);
328 2           ctx->bytes_consumed += 2;
329 2           i += 2;
330 2           continue;
331             }
332 254 100         if (c == ctx->opts.quote) {
333 52           ctx->state = ST_MAYBE_END_QUOTE;
334 52           ctx->bytes_consumed++;
335 52           i++;
336 52           continue;
337             }
338 202 50         PUTC(c);
339 202           ctx->bytes_consumed++;
340 202           i++;
341 202           continue;
342             }
343              
344             /* ---- All other states: consult EOL classifier first. */
345 22622319           eol = classify_eol(ctx, c);
346 22622319 100         if (eol < 0 && ctx->sticky_err) return ctx->sticky_err;
    100          
347              
348 22622317 100         if (eol == 1) {
349             /* Current byte (or its LF partner) is end-of-row. Consume it
350             * and emit the current field as end_of_row. */
351 110614           ctx->bytes_consumed++;
352 110614           i++;
353             /* Skip empty-trailing-newline case: only emit if any field
354             * has been started OR the buffer has content. */
355 110614 100         if (ctx->any_field_in_row || ctx->buf_len > 0
    100          
356 6 50         || ctx->state != ST_START_FIELD) {
357 110608           separated_err_t e = handle_row_end(ctx);
358 110605 100         if (e != SEPARATED_OK) FAIL(e);
359             }
360 110608           ctx->state = ST_START_FIELD;
361 110608           continue;
362             }
363 22511703 100         if (eol == 2) {
364             /* Pending CR resolved as row-end; this byte is fresh data.
365             * Flush the row but do NOT consume the current byte. */
366 2           ctx->pending_cr = 0; /* must clear unconditionally — handle_row_end
367             does so but we may skip the call below
368             when the row is empty (leading bare CR),
369             and an unset pending_cr with un-advanced
370             i would loop on the same byte forever. */
371 2 50         if (ctx->any_field_in_row || ctx->buf_len > 0
    0          
372 0 0         || ctx->state != ST_START_FIELD) {
373 2           separated_err_t e = handle_row_end(ctx);
374 2 50         if (e != SEPARATED_OK) FAIL(e);
375             }
376 2           ctx->state = ST_START_FIELD;
377             /* Do not advance i: re-enter the loop on this byte. */
378 2           continue;
379             }
380 22511701 100         if (eol == -1) {
381             /* CR consumed, awaiting decision. */
382 16           ctx->bytes_consumed++;
383 16           i++;
384 16           continue;
385             }
386             /* eol == 0: byte is regular data, fall through to state machine. */
387              
388 22511685           switch (ctx->state) {
389 921394           case ST_START_FIELD:
390 921394 100         if (c == ctx->opts.sep) {
391 6 50         EMIT(0); /* empty field, more to come on this row */
392 921388 100         } else if (ctx->opts.quote >= 0 && c == ctx->opts.quote) {
    100          
393 40           ctx->field_was_quoted = 1;
394 40           ctx->state = ST_IN_QUOTED;
395             } else {
396 921348 50         PUTC(c);
397 921348           ctx->state = ST_IN_UNQUOTED;
398             }
399 921394           break;
400              
401 21590248           case ST_IN_UNQUOTED:
402 21590248 100         if (c == ctx->opts.sep) {
403 810739 50         EMIT(0);
404 810739           ctx->state = ST_START_FIELD;
405 20779509 100         } else if (ctx->opts.quote >= 0 && c == ctx->opts.quote) {
    100          
406 6 100         if (ctx->opts.strict) FAIL(SEPARATED_ERR_BAD_QUOTE);
407             /* Lenient: keep the quote literally, stay in state. */
408 3 50         PUTC(c);
409             } else {
410 20779503 100         PUTC(c);
411             }
412 21590244           break;
413              
414 43           case ST_MAYBE_END_QUOTE:
415 43 100         if (c == ctx->opts.quote) {
416             /* RFC 4180 doubled-quote escape. */
417 12 50         PUTC(c);
418 12           ctx->state = ST_IN_QUOTED;
419 31 100         } else if (c == ctx->opts.sep) {
420 27 50         EMIT(0);
421 27           ctx->state = ST_START_FIELD;
422             } else {
423 4 100         if (ctx->opts.strict) FAIL(SEPARATED_ERR_BAD_QUOTE);
424             /* Lenient: closing quote was real, but stray data after.
425             * Append the unexpected byte and continue as unquoted. */
426 2 50         PUTC(c);
427 2           ctx->state = ST_IN_UNQUOTED;
428             }
429 41           break;
430              
431 0           case ST_IN_QUOTED:
432             /* unreachable; handled above */
433 0           break;
434             }
435              
436 22511679           ctx->bytes_consumed++;
437 22511679           i++;
438             }
439              
440 487           return SEPARATED_OK;
441             }
442              
443             /* ------------------------------------------------------------
444             * Finish: flush any half-built field/row at EOF.
445             * ------------------------------------------------------------ */
446              
447             separated_err_t
448 160           separated_finish(separated_ctx_t *ctx)
449             {
450 160 50         if (ctx->sticky_err) return ctx->sticky_err;
451              
452             /* Resolve a dangling CR (CR-only row terminator). */
453 160 100         if (ctx->pending_cr) {
454 2 100         if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) {
455 1           ctx->detected_eol = SEPARATED_EOL_CR;
456 1 50         } else if (ctx->opts.eol_mode != SEPARATED_EOL_CR
457 0 0         && ctx->opts.strict) {
458 0           FAIL(SEPARATED_ERR_EOL_PINNED);
459             }
460 2           ctx->pending_cr = 0;
461 2 50         if (ctx->any_field_in_row || ctx->buf_len > 0
    0          
462 0 0         || ctx->state != ST_START_FIELD) {
463 2           separated_err_t e = handle_row_end(ctx);
464 2 50         if (e != SEPARATED_OK) FAIL(e);
465             }
466 2           ctx->state = ST_START_FIELD;
467 2           return SEPARATED_OK;
468             }
469              
470             /* Strict: open quote at EOF is a parse error. */
471 158 50         if (ctx->state == ST_IN_QUOTED) {
472 0 0         if (ctx->opts.strict) FAIL(SEPARATED_ERR_BAD_QUOTE);
473             /* Lenient: emit whatever we have. */
474             }
475              
476             /* Emit any buffered field (and end-of-row) if there's data or we
477             * were mid-field. */
478 158 100         if (ctx->any_field_in_row || ctx->buf_len > 0
    50          
479 153 50         || ctx->state == ST_IN_UNQUOTED
480 153 50         || ctx->state == ST_IN_QUOTED
481 153 50         || ctx->state == ST_MAYBE_END_QUOTE) {
482 5           separated_err_t e = emit_field(ctx, 1);
483 5 50         if (e != SEPARATED_OK) FAIL(e);
484             }
485 158           ctx->state = ST_START_FIELD;
486 158           return SEPARATED_OK;
487             }
488              
489             /* ------------------------------------------------------------
490             * Construction / destruction
491             * ------------------------------------------------------------ */
492              
493             void
494 137           separated_options_init_csv(separated_options_t *opts)
495             {
496 137           memset(opts, 0, sizeof *opts);
497 137           opts->sep = ',';
498 137           opts->quote = '"';
499 137           opts->escape = -1;
500 137           opts->eol_mode = SEPARATED_EOL_AUTO;
501 137           }
502              
503             void
504 55           separated_options_init_tsv(separated_options_t *opts)
505             {
506 55           memset(opts, 0, sizeof *opts);
507 55           opts->sep = '\t';
508 55           opts->quote = -1;
509 55           opts->escape = -1;
510 55           opts->eol_mode = SEPARATED_EOL_AUTO;
511 55           }
512              
513             separated_ctx_t *
514 175           separated_init(const separated_options_t *opts,
515             separated_field_cb cb, void *ud)
516             {
517 175           separated_ctx_t *ctx = (separated_ctx_t *)calloc(1, sizeof *ctx);
518 175 50         if (!ctx) return NULL;
519 175           ctx->opts = *opts;
520 175           ctx->cb = cb;
521 175           ctx->ud = ud;
522 175           ctx->state = ST_START_FIELD;
523 175           return ctx;
524             }
525              
526             void
527 172           separated_free(separated_ctx_t *ctx)
528             {
529 172 50         if (!ctx) return;
530 172           free(ctx->buf);
531 172           free(ctx);
532             }
533              
534 3           size_t separated_offset(const separated_ctx_t *ctx) { return ctx->bytes_consumed; }
535 0           size_t separated_rows(const separated_ctx_t *ctx) { return ctx->rows_emitted; }
536              
537             /* ------------------------------------------------------------
538             * One-shot wrapper
539             * ------------------------------------------------------------ */
540              
541             long
542 148           separated_parse(const char *buf, size_t len,
543             const separated_options_t *opts,
544             separated_field_cb cb, void *ud,
545             size_t *err_offset)
546             {
547 148           separated_ctx_t *ctx = separated_init(opts, cb, ud);
548             separated_err_t e;
549             long ret;
550 148 50         if (!ctx) {
551 0 0         if (err_offset) *err_offset = 0;
552 0           return SEPARATED_ERR_NOMEM;
553             }
554              
555 148           e = separated_feed(ctx, buf, len);
556 145 100         if (e == SEPARATED_OK) {
557 137           e = separated_finish(ctx);
558             }
559              
560 145 100         if (e != SEPARATED_OK) {
561 8 50         if (err_offset) *err_offset = ctx->err_offset;
562 8           ret = (long)e;
563             } else {
564 137 50         if (err_offset) *err_offset = len;
565 137           ret = (long)ctx->rows_emitted;
566             }
567              
568 145           separated_free(ctx);
569 145           return ret;
570             }
571              
572             /* ------------------------------------------------------------
573             * strerror
574             * ------------------------------------------------------------ */
575              
576             const char *
577 8           separated_strerror(separated_err_t err)
578             {
579 8           switch (err) {
580 0           case SEPARATED_OK: return "ok";
581 0           case SEPARATED_ERR_NOMEM: return "out of memory";
582 1           case SEPARATED_ERR_FIELD_TOO_LONG: return "field exceeds max length";
583 5           case SEPARATED_ERR_BAD_QUOTE: return "malformed quoting";
584 2           case SEPARATED_ERR_EOL_PINNED: return "line ending does not match pinned eol mode";
585 0           case SEPARATED_ERR_ABORTED: return "parse aborted by callback";
586             }
587 0           return "unknown error";
588             }