| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* separated_parser.c - CSV/TSV state machine for File::Raw::Separated. |
|
3
|
|
|
|
|
|
|
* |
|
4
|
|
|
|
|
|
|
* See include/separated_parser.h for the public contract. |
|
5
|
|
|
|
|
|
|
*/ |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
#include "separated_parser.h" |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#include |
|
10
|
|
|
|
|
|
|
#include |
|
11
|
|
|
|
|
|
|
#include |
|
12
|
|
|
|
|
|
|
#include |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
15
|
|
|
|
|
|
|
* Internal types |
|
16
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
/* Parser state. Persists in the context between separated_feed() calls. */
typedef enum {
    ST_START_FIELD = 0,   /* at the start of a field; no byte of it seen yet */
    ST_IN_UNQUOTED,       /* inside a field that did not open with a quote */
    ST_IN_QUOTED,         /* inside a quoted field; EOL bytes are plain data */
    ST_MAYBE_END_QUOTE    /* saw a quote while in a quoted field: either the
                           * closing quote or the first half of a doubled quote */
} parse_state_t;
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
struct separated_ctx { |
|
26
|
|
|
|
|
|
|
/* Resolved options (copied at init time). */ |
|
27
|
|
|
|
|
|
|
separated_options_t opts; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
/* Caller. */ |
|
30
|
|
|
|
|
|
|
separated_field_cb cb; |
|
31
|
|
|
|
|
|
|
void *ud; |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
/* Field buffer (geometric growth). */ |
|
34
|
|
|
|
|
|
|
char *buf; |
|
35
|
|
|
|
|
|
|
size_t buf_len; |
|
36
|
|
|
|
|
|
|
size_t buf_cap; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
/* State. */ |
|
39
|
|
|
|
|
|
|
parse_state_t state; |
|
40
|
|
|
|
|
|
|
int field_was_quoted; /* 1 if current field began with a quote */ |
|
41
|
|
|
|
|
|
|
int bom_checked; /* 1 once we've decided about the BOM */ |
|
42
|
|
|
|
|
|
|
int any_field_in_row; /* 1 if at least one field started in this row */ |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
/* Auto-detected / pinned EOL. */ |
|
45
|
|
|
|
|
|
|
separated_eol_t detected_eol; |
|
46
|
|
|
|
|
|
|
int pending_cr; /* 1 if last byte was CR awaiting LF/data (CRLF detect) */ |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
/* Diagnostics. */ |
|
49
|
|
|
|
|
|
|
size_t bytes_consumed; |
|
50
|
|
|
|
|
|
|
size_t rows_emitted; |
|
51
|
|
|
|
|
|
|
size_t err_offset; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
/* Sticky error: once non-zero, all _feed/_finish are no-ops. */ |
|
54
|
|
|
|
|
|
|
separated_err_t sticky_err; |
|
55
|
|
|
|
|
|
|
}; |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
/* Effective max-field cap (resolves opts.max_field_len == 0 to default). */ |
|
58
|
|
|
|
|
|
|
static size_t |
|
59
|
178
|
|
|
|
|
|
effective_field_cap(const separated_options_t *opts) |
|
60
|
|
|
|
|
|
|
{ |
|
61
|
178
|
|
|
|
|
|
return opts->max_field_len ? opts->max_field_len |
|
62
|
178
|
100
|
|
|
|
|
: SEPARATED_FIELD_DEFAULT_CAP; |
|
63
|
|
|
|
|
|
|
} |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
66
|
|
|
|
|
|
|
* Field buffer: geometric growth with hard cap |
|
67
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
static separated_err_t |
|
70
|
21701072
|
|
|
|
|
|
buf_putc(separated_ctx_t *ctx, char c) |
|
71
|
|
|
|
|
|
|
{ |
|
72
|
21701072
|
100
|
|
|
|
|
if (ctx->buf_len + 1 > ctx->buf_cap) { |
|
73
|
178
|
100
|
|
|
|
|
size_t new_cap = ctx->buf_cap ? ctx->buf_cap * 2 : 64; |
|
74
|
178
|
|
|
|
|
|
size_t cap_max = effective_field_cap(&ctx->opts); |
|
75
|
|
|
|
|
|
|
char *new_buf; |
|
76
|
178
|
100
|
|
|
|
|
if (new_cap > cap_max) new_cap = cap_max; |
|
77
|
178
|
100
|
|
|
|
|
if (new_cap <= ctx->buf_len) { |
|
78
|
1
|
|
|
|
|
|
return SEPARATED_ERR_FIELD_TOO_LONG; |
|
79
|
|
|
|
|
|
|
} |
|
80
|
177
|
|
|
|
|
|
new_buf = (char *)realloc(ctx->buf, new_cap); |
|
81
|
177
|
50
|
|
|
|
|
if (!new_buf) return SEPARATED_ERR_NOMEM; |
|
82
|
177
|
|
|
|
|
|
ctx->buf = new_buf; |
|
83
|
177
|
|
|
|
|
|
ctx->buf_cap = new_cap; |
|
84
|
|
|
|
|
|
|
} |
|
85
|
21701071
|
|
|
|
|
|
ctx->buf[ctx->buf_len++] = c; |
|
86
|
21701071
|
|
|
|
|
|
return SEPARATED_OK; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
static void |
|
90
|
921383
|
|
|
|
|
|
buf_reset(separated_ctx_t *ctx) |
|
91
|
|
|
|
|
|
|
{ |
|
92
|
921383
|
|
|
|
|
|
ctx->buf_len = 0; |
|
93
|
921383
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
96
|
|
|
|
|
|
|
* Trim helper for unquoted fields when opts.trim is on. |
|
97
|
|
|
|
|
|
|
* Strips only ASCII space and tab. Quoted fields are never trimmed. |
|
98
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
/* Strip leading and trailing ASCII spaces and tabs from buf[0..*plen) in
 * place. The surviving bytes are moved to the front of buf and *plen is
 * updated to the trimmed length. buf is not dereferenced when *plen is 0. */
static void
trim_buf(char *buf, size_t *plen)
{
    size_t len = *plen;
    size_t start = 0;
    size_t end;

    while (start < len && (buf[start] == ' ' || buf[start] == '\t')) start++;

    end = len;
    while (end > start && (buf[end - 1] == ' ' || buf[end - 1] == '\t')) end--;

    /* Source and destination overlap, hence memmove rather than memcpy. */
    if (start > 0) memmove(buf, buf + start, end - start);
    *plen = end - start;
}
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
114
|
|
|
|
|
|
|
* Emit a field/row to the callback. |
|
115
|
|
|
|
|
|
|
* end_of_row=1 means "this field is the last in its row". |
|
116
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
static separated_err_t |
|
119
|
921389
|
|
|
|
|
|
emit_field(separated_ctx_t *ctx, int end_of_row) |
|
120
|
|
|
|
|
|
|
{ |
|
121
|
|
|
|
|
|
|
int call_rc; |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
/* Trim only on unquoted fields. */ |
|
124
|
921389
|
100
|
|
|
|
|
if (ctx->opts.trim && !ctx->field_was_quoted) { |
|
|
|
100
|
|
|
|
|
|
|
125
|
5
|
|
|
|
|
|
trim_buf(ctx->buf, &ctx->buf_len); |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
|
|
128
|
921389
|
100
|
|
|
|
|
if (ctx->opts.empty_is_undef |
|
129
|
6
|
100
|
|
|
|
|
&& !ctx->field_was_quoted |
|
130
|
5
|
100
|
|
|
|
|
&& ctx->buf_len == 0) { |
|
131
|
2
|
|
|
|
|
|
call_rc = ctx->cb(NULL, SEPARATED_FIELD_NULL_LEN, |
|
132
|
|
|
|
|
|
|
end_of_row, ctx->ud); |
|
133
|
|
|
|
|
|
|
} else { |
|
134
|
|
|
|
|
|
|
/* Pass even an empty quoted field as a real "" field. */ |
|
135
|
921387
|
100
|
|
|
|
|
const char *p = ctx->buf_len ? ctx->buf : ""; |
|
136
|
921387
|
|
|
|
|
|
call_rc = ctx->cb(p, ctx->buf_len, end_of_row, ctx->ud); |
|
137
|
|
|
|
|
|
|
} |
|
138
|
921386
|
100
|
|
|
|
|
if (call_rc != 0) return SEPARATED_ERR_ABORTED; |
|
139
|
|
|
|
|
|
|
|
|
140
|
921383
|
|
|
|
|
|
buf_reset(ctx); |
|
141
|
921383
|
|
|
|
|
|
ctx->field_was_quoted = 0; |
|
142
|
921383
|
|
|
|
|
|
ctx->any_field_in_row = 1; |
|
143
|
921383
|
100
|
|
|
|
|
if (end_of_row) { |
|
144
|
110611
|
|
|
|
|
|
ctx->rows_emitted++; |
|
145
|
110611
|
|
|
|
|
|
ctx->any_field_in_row = 0; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
921383
|
|
|
|
|
|
return SEPARATED_OK; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
151
|
|
|
|
|
|
|
* BOM stripping (UTF-8 only, when binary=0). |
|
152
|
|
|
|
|
|
|
* Called once before any byte is interpreted. Caller passes the |
|
153
|
|
|
|
|
|
|
* incoming buffer pointer + length pair through bom_skip; on return |
|
154
|
|
|
|
|
|
|
* any leading 3-byte BOM has been advanced past. |
|
155
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
static void |
|
158
|
501
|
|
|
|
|
|
bom_check(separated_ctx_t *ctx, const char **pp, size_t *plen) |
|
159
|
|
|
|
|
|
|
{ |
|
160
|
501
|
100
|
|
|
|
|
if (ctx->bom_checked) return; |
|
161
|
173
|
|
|
|
|
|
ctx->bom_checked = 1; |
|
162
|
|
|
|
|
|
|
|
|
163
|
173
|
100
|
|
|
|
|
if (ctx->opts.binary) return; |
|
164
|
|
|
|
|
|
|
|
|
165
|
170
|
100
|
|
|
|
|
if (*plen >= 3) { |
|
166
|
162
|
|
|
|
|
|
const unsigned char *u = (const unsigned char *)*pp; |
|
167
|
162
|
100
|
|
|
|
|
if (u[0] == 0xEF && u[1] == 0xBB && u[2] == 0xBF) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
168
|
1
|
|
|
|
|
|
*pp += 3; |
|
169
|
1
|
|
|
|
|
|
*plen -= 3; |
|
170
|
1
|
|
|
|
|
|
ctx->bytes_consumed += 3; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
} |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
176
|
|
|
|
|
|
|
* EOL helpers |
|
177
|
|
|
|
|
|
|
* |
|
178
|
|
|
|
|
|
|
* classify_eol returns 1 if the byte at `c` (with the parser's |
|
179
|
|
|
|
|
|
|
* pending_cr flag) is a row terminator under the active EOL mode, |
|
180
|
|
|
|
|
|
|
* 0 if it's a normal byte, or a negative error code on a pinned |
|
181
|
|
|
|
|
|
|
* mismatch under strict mode. |
|
182
|
|
|
|
|
|
|
* |
|
183
|
|
|
|
|
|
|
* On a successful match the function may consume the byte (we always |
|
184
|
|
|
|
|
|
|
* do — the caller treats the return-1 case as "row ended here") and |
|
185
|
|
|
|
|
|
|
* also flips pending_cr or detected_eol as appropriate. |
|
186
|
|
|
|
|
|
|
* |
|
187
|
|
|
|
|
|
|
* NOTE: CRLF handling needs lookahead-of-1. We model it with the |
|
188
|
|
|
|
|
|
|
* pending_cr bit: |
|
189
|
|
|
|
|
|
|
* see CR => set pending_cr=1, do NOT emit row yet. |
|
190
|
|
|
|
|
|
|
* next byte: |
|
191
|
|
|
|
|
|
|
* if LF => CRLF row terminator, clear pending_cr. |
|
192
|
|
|
|
|
|
|
* else => emit deferred CR-row terminator (CR mode), then |
|
193
|
|
|
|
|
|
|
* re-process current byte from scratch. |
|
194
|
|
|
|
|
|
|
* |
|
195
|
|
|
|
|
|
|
* Keeping that in a tiny helper keeps the main loop legible. |
|
196
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
/* Plain byte predicates: 1 if c is LF (resp. CR), else 0. They know
 * nothing about the active EOL mode. */
static int
is_lf(int c) { return c == '\n'; }
static int
is_cr(int c) { return c == '\r'; }
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
205
|
|
|
|
|
|
|
* Core feed loop. |
|
206
|
|
|
|
|
|
|
* |
|
207
|
|
|
|
|
|
|
* Drives the state machine over [buf, buf+len). Returns OK or the |
|
208
|
|
|
|
|
|
|
* first error encountered; on error err_offset is set to the byte |
|
209
|
|
|
|
|
|
|
* offset within the original input (ctx->bytes_consumed at the |
|
210
|
|
|
|
|
|
|
* point of failure). |
|
211
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
/* Control-flow helpers. Each expects a `separated_ctx_t *ctx` in scope.
 *
 * FAIL(code) records the error as sticky, stamps err_offset with the
 *            current bytes_consumed, and RETURNS `code` from the
 *            enclosing function.
 * PUTC(c)    appends a byte to the field buffer; FAILs on error.
 * EMIT(eor)  emits the buffered field; FAILs on error. */
#define FAIL(code) do {                                     \
        ctx->sticky_err = (code);                           \
        ctx->err_offset = ctx->bytes_consumed;              \
        return (code);                                      \
    } while (0)

#define PUTC(c) do {                                        \
        separated_err_t _e = buf_putc(ctx, (char)(c));      \
        if (_e != SEPARATED_OK) FAIL(_e);                   \
    } while (0)

#define EMIT(end_of_row) do {                               \
        separated_err_t _e = emit_field(ctx, (end_of_row)); \
        if (_e != SEPARATED_OK) FAIL(_e);                   \
    } while (0)
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
/* End-of-row from a CR or LF or CRLF. Clears pending_cr. */ |
|
230
|
|
|
|
|
|
|
static separated_err_t |
|
231
|
110612
|
|
|
|
|
|
handle_row_end(separated_ctx_t *ctx) |
|
232
|
|
|
|
|
|
|
{ |
|
233
|
110612
|
|
|
|
|
|
ctx->pending_cr = 0; |
|
234
|
110612
|
|
|
|
|
|
return emit_field(ctx, 1); |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
/* Decide whether `c` should terminate the current row, given the |
|
238
|
|
|
|
|
|
|
* current EOL mode. Returns: |
|
239
|
|
|
|
|
|
|
* 1 - row ended (caller must NOT process c further) |
|
240
|
|
|
|
|
|
|
* 0 - byte is data; caller continues with state-machine handling |
|
241
|
|
|
|
|
|
|
* -ve - error code (only in strict + EOL_PINNED mismatch) |
|
242
|
|
|
|
|
|
|
* |
|
243
|
|
|
|
|
|
|
* Side-effect: may toggle pending_cr / detected_eol. */ |
|
244
|
|
|
|
|
|
|
static int |
|
245
|
22622319
|
|
|
|
|
|
classify_eol(separated_ctx_t *ctx, int c) |
|
246
|
|
|
|
|
|
|
{ |
|
247
|
|
|
|
|
|
|
/* Resolve any deferred CR from previous byte. */ |
|
248
|
22622319
|
100
|
|
|
|
|
if (ctx->pending_cr) { |
|
249
|
14
|
100
|
|
|
|
|
if (is_lf(c)) { |
|
250
|
|
|
|
|
|
|
/* CRLF terminator. Lock detection if AUTO. */ |
|
251
|
12
|
100
|
|
|
|
|
if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) { |
|
252
|
8
|
|
|
|
|
|
ctx->detected_eol = SEPARATED_EOL_CRLF; |
|
253
|
4
|
100
|
|
|
|
|
} else if (ctx->opts.eol_mode != SEPARATED_EOL_CRLF |
|
254
|
2
|
50
|
|
|
|
|
&& ctx->opts.strict) { |
|
255
|
2
|
|
|
|
|
|
FAIL(SEPARATED_ERR_EOL_PINNED); |
|
256
|
|
|
|
|
|
|
} |
|
257
|
10
|
|
|
|
|
|
return 1; /* row already ended at the CR; consume LF as well */ |
|
258
|
|
|
|
|
|
|
} else { |
|
259
|
|
|
|
|
|
|
/* CR alone => row ended at the CR. The current byte is data, |
|
260
|
|
|
|
|
|
|
* but we have a pending row-end to flush first. We do that |
|
261
|
|
|
|
|
|
|
* by returning a "deferred" signal: the caller flushes the |
|
262
|
|
|
|
|
|
|
* row, clears pending_cr, then re-enters with the current |
|
263
|
|
|
|
|
|
|
* byte. We model that here by emitting now and reporting |
|
264
|
|
|
|
|
|
|
* "row ended" — caller must remember NOT to consume c. */ |
|
265
|
2
|
100
|
|
|
|
|
if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) { |
|
266
|
1
|
|
|
|
|
|
ctx->detected_eol = SEPARATED_EOL_CR; |
|
267
|
1
|
50
|
|
|
|
|
} else if (ctx->opts.eol_mode != SEPARATED_EOL_CR |
|
268
|
0
|
0
|
|
|
|
|
&& ctx->opts.strict) { |
|
269
|
0
|
|
|
|
|
|
FAIL(SEPARATED_ERR_EOL_PINNED); |
|
270
|
|
|
|
|
|
|
} |
|
271
|
2
|
|
|
|
|
|
return 2; /* row ended on previous CR; do not consume c */ |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
/* No pending CR. Look at this byte. */ |
|
276
|
22622305
|
100
|
|
|
|
|
if (is_cr(c)) { |
|
277
|
16
|
|
|
|
|
|
ctx->pending_cr = 1; |
|
278
|
16
|
|
|
|
|
|
return -1; /* tentative; need lookahead. byte consumed. */ |
|
279
|
|
|
|
|
|
|
} |
|
280
|
22622289
|
100
|
|
|
|
|
if (is_lf(c)) { |
|
281
|
110604
|
100
|
|
|
|
|
if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) { |
|
282
|
110600
|
|
|
|
|
|
ctx->detected_eol = SEPARATED_EOL_LF; |
|
283
|
4
|
50
|
|
|
|
|
} else if (ctx->opts.eol_mode != SEPARATED_EOL_LF |
|
284
|
0
|
0
|
|
|
|
|
&& ctx->opts.strict) { |
|
285
|
0
|
|
|
|
|
|
FAIL(SEPARATED_ERR_EOL_PINNED); |
|
286
|
|
|
|
|
|
|
} |
|
287
|
110604
|
|
|
|
|
|
return 1; |
|
288
|
|
|
|
|
|
|
} |
|
289
|
22511685
|
|
|
|
|
|
return 0; |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
/* Public-facing _feed implementation. */ |
|
293
|
|
|
|
|
|
|
separated_err_t |
|
294
|
501
|
|
|
|
|
|
separated_feed(separated_ctx_t *ctx, const char *buf, size_t len) |
|
295
|
|
|
|
|
|
|
{ |
|
296
|
|
|
|
|
|
|
size_t i; |
|
297
|
|
|
|
|
|
|
|
|
298
|
501
|
50
|
|
|
|
|
if (ctx->sticky_err) return ctx->sticky_err; |
|
299
|
|
|
|
|
|
|
|
|
300
|
501
|
|
|
|
|
|
bom_check(ctx, &buf, &len); |
|
301
|
|
|
|
|
|
|
|
|
302
|
501
|
|
|
|
|
|
i = 0; |
|
303
|
22623062
|
100
|
|
|
|
|
while (i < len) { |
|
304
|
22622575
|
|
|
|
|
|
int c = (unsigned char)buf[i]; |
|
305
|
|
|
|
|
|
|
int eol; |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
/* ---- IN_QUOTED short-circuits EOL detection: newlines are data. */ |
|
308
|
22622575
|
100
|
|
|
|
|
if (ctx->state == ST_IN_QUOTED) { |
|
309
|
256
|
100
|
|
|
|
|
if (ctx->opts.escape >= 0 && c == ctx->opts.escape) { |
|
|
|
100
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
/* Backslash-style escape: consume next byte literally. */ |
|
311
|
2
|
50
|
|
|
|
|
if (i + 1 >= len) { |
|
312
|
|
|
|
|
|
|
/* Defer: store nothing; the next feed sees this byte |
|
313
|
|
|
|
|
|
|
* again. We do that by NOT advancing past the escape |
|
314
|
|
|
|
|
|
|
* char and returning. */ |
|
315
|
|
|
|
|
|
|
/* Implementation: append a one-byte "escape pending" |
|
316
|
|
|
|
|
|
|
* marker via a local flag. Cleanest: stuff it as the |
|
317
|
|
|
|
|
|
|
* last byte of buf and remember we're mid-escape. */ |
|
318
|
|
|
|
|
|
|
/* Simpler model: require the next byte to be in the |
|
319
|
|
|
|
|
|
|
* SAME chunk. For now we accept that limitation for |
|
320
|
|
|
|
|
|
|
* v0.01 and document it: backslash escapes that |
|
321
|
|
|
|
|
|
|
* straddle a chunk boundary are not supported. */ |
|
322
|
0
|
0
|
|
|
|
|
PUTC(c); /* fall back to literal escape char */ |
|
323
|
0
|
|
|
|
|
|
ctx->bytes_consumed++; |
|
324
|
0
|
|
|
|
|
|
i++; |
|
325
|
0
|
|
|
|
|
|
continue; |
|
326
|
|
|
|
|
|
|
} |
|
327
|
2
|
50
|
|
|
|
|
PUTC(buf[i + 1]); |
|
328
|
2
|
|
|
|
|
|
ctx->bytes_consumed += 2; |
|
329
|
2
|
|
|
|
|
|
i += 2; |
|
330
|
2
|
|
|
|
|
|
continue; |
|
331
|
|
|
|
|
|
|
} |
|
332
|
254
|
100
|
|
|
|
|
if (c == ctx->opts.quote) { |
|
333
|
52
|
|
|
|
|
|
ctx->state = ST_MAYBE_END_QUOTE; |
|
334
|
52
|
|
|
|
|
|
ctx->bytes_consumed++; |
|
335
|
52
|
|
|
|
|
|
i++; |
|
336
|
52
|
|
|
|
|
|
continue; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
202
|
50
|
|
|
|
|
PUTC(c); |
|
339
|
202
|
|
|
|
|
|
ctx->bytes_consumed++; |
|
340
|
202
|
|
|
|
|
|
i++; |
|
341
|
202
|
|
|
|
|
|
continue; |
|
342
|
|
|
|
|
|
|
} |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
/* ---- All other states: consult EOL classifier first. */ |
|
345
|
22622319
|
|
|
|
|
|
eol = classify_eol(ctx, c); |
|
346
|
22622319
|
100
|
|
|
|
|
if (eol < 0 && ctx->sticky_err) return ctx->sticky_err; |
|
|
|
100
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
|
348
|
22622317
|
100
|
|
|
|
|
if (eol == 1) { |
|
349
|
|
|
|
|
|
|
/* Current byte (or its LF partner) is end-of-row. Consume it |
|
350
|
|
|
|
|
|
|
* and emit the current field as end_of_row. */ |
|
351
|
110614
|
|
|
|
|
|
ctx->bytes_consumed++; |
|
352
|
110614
|
|
|
|
|
|
i++; |
|
353
|
|
|
|
|
|
|
/* Skip empty-trailing-newline case: only emit if any field |
|
354
|
|
|
|
|
|
|
* has been started OR the buffer has content. */ |
|
355
|
110614
|
100
|
|
|
|
|
if (ctx->any_field_in_row || ctx->buf_len > 0 |
|
|
|
100
|
|
|
|
|
|
|
356
|
6
|
50
|
|
|
|
|
|| ctx->state != ST_START_FIELD) { |
|
357
|
110608
|
|
|
|
|
|
separated_err_t e = handle_row_end(ctx); |
|
358
|
110605
|
100
|
|
|
|
|
if (e != SEPARATED_OK) FAIL(e); |
|
359
|
|
|
|
|
|
|
} |
|
360
|
110608
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
361
|
110608
|
|
|
|
|
|
continue; |
|
362
|
|
|
|
|
|
|
} |
|
363
|
22511703
|
100
|
|
|
|
|
if (eol == 2) { |
|
364
|
|
|
|
|
|
|
/* Pending CR resolved as row-end; this byte is fresh data. |
|
365
|
|
|
|
|
|
|
* Flush the row but do NOT consume the current byte. */ |
|
366
|
2
|
|
|
|
|
|
ctx->pending_cr = 0; /* must clear unconditionally — handle_row_end |
|
367
|
|
|
|
|
|
|
does so but we may skip the call below |
|
368
|
|
|
|
|
|
|
when the row is empty (leading bare CR), |
|
369
|
|
|
|
|
|
|
and an unset pending_cr with un-advanced |
|
370
|
|
|
|
|
|
|
i would loop on the same byte forever. */ |
|
371
|
2
|
50
|
|
|
|
|
if (ctx->any_field_in_row || ctx->buf_len > 0 |
|
|
|
0
|
|
|
|
|
|
|
372
|
0
|
0
|
|
|
|
|
|| ctx->state != ST_START_FIELD) { |
|
373
|
2
|
|
|
|
|
|
separated_err_t e = handle_row_end(ctx); |
|
374
|
2
|
50
|
|
|
|
|
if (e != SEPARATED_OK) FAIL(e); |
|
375
|
|
|
|
|
|
|
} |
|
376
|
2
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
377
|
|
|
|
|
|
|
/* Do not advance i: re-enter the loop on this byte. */ |
|
378
|
2
|
|
|
|
|
|
continue; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
22511701
|
100
|
|
|
|
|
if (eol == -1) { |
|
381
|
|
|
|
|
|
|
/* CR consumed, awaiting decision. */ |
|
382
|
16
|
|
|
|
|
|
ctx->bytes_consumed++; |
|
383
|
16
|
|
|
|
|
|
i++; |
|
384
|
16
|
|
|
|
|
|
continue; |
|
385
|
|
|
|
|
|
|
} |
|
386
|
|
|
|
|
|
|
/* eol == 0: byte is regular data, fall through to state machine. */ |
|
387
|
|
|
|
|
|
|
|
|
388
|
22511685
|
|
|
|
|
|
switch (ctx->state) { |
|
389
|
921394
|
|
|
|
|
|
case ST_START_FIELD: |
|
390
|
921394
|
100
|
|
|
|
|
if (c == ctx->opts.sep) { |
|
391
|
6
|
50
|
|
|
|
|
EMIT(0); /* empty field, more to come on this row */ |
|
392
|
921388
|
100
|
|
|
|
|
} else if (ctx->opts.quote >= 0 && c == ctx->opts.quote) { |
|
|
|
100
|
|
|
|
|
|
|
393
|
40
|
|
|
|
|
|
ctx->field_was_quoted = 1; |
|
394
|
40
|
|
|
|
|
|
ctx->state = ST_IN_QUOTED; |
|
395
|
|
|
|
|
|
|
} else { |
|
396
|
921348
|
50
|
|
|
|
|
PUTC(c); |
|
397
|
921348
|
|
|
|
|
|
ctx->state = ST_IN_UNQUOTED; |
|
398
|
|
|
|
|
|
|
} |
|
399
|
921394
|
|
|
|
|
|
break; |
|
400
|
|
|
|
|
|
|
|
|
401
|
21590248
|
|
|
|
|
|
case ST_IN_UNQUOTED: |
|
402
|
21590248
|
100
|
|
|
|
|
if (c == ctx->opts.sep) { |
|
403
|
810739
|
50
|
|
|
|
|
EMIT(0); |
|
404
|
810739
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
405
|
20779509
|
100
|
|
|
|
|
} else if (ctx->opts.quote >= 0 && c == ctx->opts.quote) { |
|
|
|
100
|
|
|
|
|
|
|
406
|
6
|
100
|
|
|
|
|
if (ctx->opts.strict) FAIL(SEPARATED_ERR_BAD_QUOTE); |
|
407
|
|
|
|
|
|
|
/* Lenient: keep the quote literally, stay in state. */ |
|
408
|
3
|
50
|
|
|
|
|
PUTC(c); |
|
409
|
|
|
|
|
|
|
} else { |
|
410
|
20779503
|
100
|
|
|
|
|
PUTC(c); |
|
411
|
|
|
|
|
|
|
} |
|
412
|
21590244
|
|
|
|
|
|
break; |
|
413
|
|
|
|
|
|
|
|
|
414
|
43
|
|
|
|
|
|
case ST_MAYBE_END_QUOTE: |
|
415
|
43
|
100
|
|
|
|
|
if (c == ctx->opts.quote) { |
|
416
|
|
|
|
|
|
|
/* RFC 4180 doubled-quote escape. */ |
|
417
|
12
|
50
|
|
|
|
|
PUTC(c); |
|
418
|
12
|
|
|
|
|
|
ctx->state = ST_IN_QUOTED; |
|
419
|
31
|
100
|
|
|
|
|
} else if (c == ctx->opts.sep) { |
|
420
|
27
|
50
|
|
|
|
|
EMIT(0); |
|
421
|
27
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
422
|
|
|
|
|
|
|
} else { |
|
423
|
4
|
100
|
|
|
|
|
if (ctx->opts.strict) FAIL(SEPARATED_ERR_BAD_QUOTE); |
|
424
|
|
|
|
|
|
|
/* Lenient: closing quote was real, but stray data after. |
|
425
|
|
|
|
|
|
|
* Append the unexpected byte and continue as unquoted. */ |
|
426
|
2
|
50
|
|
|
|
|
PUTC(c); |
|
427
|
2
|
|
|
|
|
|
ctx->state = ST_IN_UNQUOTED; |
|
428
|
|
|
|
|
|
|
} |
|
429
|
41
|
|
|
|
|
|
break; |
|
430
|
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
case ST_IN_QUOTED: |
|
432
|
|
|
|
|
|
|
/* unreachable; handled above */ |
|
433
|
0
|
|
|
|
|
|
break; |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
|
|
436
|
22511679
|
|
|
|
|
|
ctx->bytes_consumed++; |
|
437
|
22511679
|
|
|
|
|
|
i++; |
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
|
|
440
|
487
|
|
|
|
|
|
return SEPARATED_OK; |
|
441
|
|
|
|
|
|
|
} |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
444
|
|
|
|
|
|
|
* Finish: flush any half-built field/row at EOF. |
|
445
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
separated_err_t |
|
448
|
160
|
|
|
|
|
|
separated_finish(separated_ctx_t *ctx) |
|
449
|
|
|
|
|
|
|
{ |
|
450
|
160
|
50
|
|
|
|
|
if (ctx->sticky_err) return ctx->sticky_err; |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
/* Resolve a dangling CR (CR-only row terminator). */ |
|
453
|
160
|
100
|
|
|
|
|
if (ctx->pending_cr) { |
|
454
|
2
|
100
|
|
|
|
|
if (ctx->opts.eol_mode == SEPARATED_EOL_AUTO) { |
|
455
|
1
|
|
|
|
|
|
ctx->detected_eol = SEPARATED_EOL_CR; |
|
456
|
1
|
50
|
|
|
|
|
} else if (ctx->opts.eol_mode != SEPARATED_EOL_CR |
|
457
|
0
|
0
|
|
|
|
|
&& ctx->opts.strict) { |
|
458
|
0
|
|
|
|
|
|
FAIL(SEPARATED_ERR_EOL_PINNED); |
|
459
|
|
|
|
|
|
|
} |
|
460
|
2
|
|
|
|
|
|
ctx->pending_cr = 0; |
|
461
|
2
|
50
|
|
|
|
|
if (ctx->any_field_in_row || ctx->buf_len > 0 |
|
|
|
0
|
|
|
|
|
|
|
462
|
0
|
0
|
|
|
|
|
|| ctx->state != ST_START_FIELD) { |
|
463
|
2
|
|
|
|
|
|
separated_err_t e = handle_row_end(ctx); |
|
464
|
2
|
50
|
|
|
|
|
if (e != SEPARATED_OK) FAIL(e); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
2
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
467
|
2
|
|
|
|
|
|
return SEPARATED_OK; |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
/* Strict: open quote at EOF is a parse error. */ |
|
471
|
158
|
50
|
|
|
|
|
if (ctx->state == ST_IN_QUOTED) { |
|
472
|
0
|
0
|
|
|
|
|
if (ctx->opts.strict) FAIL(SEPARATED_ERR_BAD_QUOTE); |
|
473
|
|
|
|
|
|
|
/* Lenient: emit whatever we have. */ |
|
474
|
|
|
|
|
|
|
} |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
/* Emit any buffered field (and end-of-row) if there's data or we |
|
477
|
|
|
|
|
|
|
* were mid-field. */ |
|
478
|
158
|
100
|
|
|
|
|
if (ctx->any_field_in_row || ctx->buf_len > 0 |
|
|
|
50
|
|
|
|
|
|
|
479
|
153
|
50
|
|
|
|
|
|| ctx->state == ST_IN_UNQUOTED |
|
480
|
153
|
50
|
|
|
|
|
|| ctx->state == ST_IN_QUOTED |
|
481
|
153
|
50
|
|
|
|
|
|| ctx->state == ST_MAYBE_END_QUOTE) { |
|
482
|
5
|
|
|
|
|
|
separated_err_t e = emit_field(ctx, 1); |
|
483
|
5
|
50
|
|
|
|
|
if (e != SEPARATED_OK) FAIL(e); |
|
484
|
|
|
|
|
|
|
} |
|
485
|
158
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
486
|
158
|
|
|
|
|
|
return SEPARATED_OK; |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
490
|
|
|
|
|
|
|
* Construction / destruction |
|
491
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
void |
|
494
|
137
|
|
|
|
|
|
separated_options_init_csv(separated_options_t *opts) |
|
495
|
|
|
|
|
|
|
{ |
|
496
|
137
|
|
|
|
|
|
memset(opts, 0, sizeof *opts); |
|
497
|
137
|
|
|
|
|
|
opts->sep = ','; |
|
498
|
137
|
|
|
|
|
|
opts->quote = '"'; |
|
499
|
137
|
|
|
|
|
|
opts->escape = -1; |
|
500
|
137
|
|
|
|
|
|
opts->eol_mode = SEPARATED_EOL_AUTO; |
|
501
|
137
|
|
|
|
|
|
} |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
void |
|
504
|
55
|
|
|
|
|
|
separated_options_init_tsv(separated_options_t *opts) |
|
505
|
|
|
|
|
|
|
{ |
|
506
|
55
|
|
|
|
|
|
memset(opts, 0, sizeof *opts); |
|
507
|
55
|
|
|
|
|
|
opts->sep = '\t'; |
|
508
|
55
|
|
|
|
|
|
opts->quote = -1; |
|
509
|
55
|
|
|
|
|
|
opts->escape = -1; |
|
510
|
55
|
|
|
|
|
|
opts->eol_mode = SEPARATED_EOL_AUTO; |
|
511
|
55
|
|
|
|
|
|
} |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
separated_ctx_t * |
|
514
|
175
|
|
|
|
|
|
separated_init(const separated_options_t *opts, |
|
515
|
|
|
|
|
|
|
separated_field_cb cb, void *ud) |
|
516
|
|
|
|
|
|
|
{ |
|
517
|
175
|
|
|
|
|
|
separated_ctx_t *ctx = (separated_ctx_t *)calloc(1, sizeof *ctx); |
|
518
|
175
|
50
|
|
|
|
|
if (!ctx) return NULL; |
|
519
|
175
|
|
|
|
|
|
ctx->opts = *opts; |
|
520
|
175
|
|
|
|
|
|
ctx->cb = cb; |
|
521
|
175
|
|
|
|
|
|
ctx->ud = ud; |
|
522
|
175
|
|
|
|
|
|
ctx->state = ST_START_FIELD; |
|
523
|
175
|
|
|
|
|
|
return ctx; |
|
524
|
|
|
|
|
|
|
} |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
void |
|
527
|
172
|
|
|
|
|
|
separated_free(separated_ctx_t *ctx) |
|
528
|
|
|
|
|
|
|
{ |
|
529
|
172
|
50
|
|
|
|
|
if (!ctx) return; |
|
530
|
172
|
|
|
|
|
|
free(ctx->buf); |
|
531
|
172
|
|
|
|
|
|
free(ctx); |
|
532
|
|
|
|
|
|
|
} |
|
533
|
|
|
|
|
|
|
|
|
534
|
3
|
|
|
|
|
|
/* Number of input bytes the parser has consumed so far. */
size_t separated_offset(const separated_ctx_t *ctx) { return ctx->bytes_consumed; }
|
535
|
0
|
|
|
|
|
|
/* Number of rows completed so far (fields delivered with end_of_row set). */
size_t separated_rows(const separated_ctx_t *ctx) { return ctx->rows_emitted; }
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
538
|
|
|
|
|
|
|
* One-shot wrapper |
|
539
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
long |
|
542
|
148
|
|
|
|
|
|
separated_parse(const char *buf, size_t len, |
|
543
|
|
|
|
|
|
|
const separated_options_t *opts, |
|
544
|
|
|
|
|
|
|
separated_field_cb cb, void *ud, |
|
545
|
|
|
|
|
|
|
size_t *err_offset) |
|
546
|
|
|
|
|
|
|
{ |
|
547
|
148
|
|
|
|
|
|
separated_ctx_t *ctx = separated_init(opts, cb, ud); |
|
548
|
|
|
|
|
|
|
separated_err_t e; |
|
549
|
|
|
|
|
|
|
long ret; |
|
550
|
148
|
50
|
|
|
|
|
if (!ctx) { |
|
551
|
0
|
0
|
|
|
|
|
if (err_offset) *err_offset = 0; |
|
552
|
0
|
|
|
|
|
|
return SEPARATED_ERR_NOMEM; |
|
553
|
|
|
|
|
|
|
} |
|
554
|
|
|
|
|
|
|
|
|
555
|
148
|
|
|
|
|
|
e = separated_feed(ctx, buf, len); |
|
556
|
145
|
100
|
|
|
|
|
if (e == SEPARATED_OK) { |
|
557
|
137
|
|
|
|
|
|
e = separated_finish(ctx); |
|
558
|
|
|
|
|
|
|
} |
|
559
|
|
|
|
|
|
|
|
|
560
|
145
|
100
|
|
|
|
|
if (e != SEPARATED_OK) { |
|
561
|
8
|
50
|
|
|
|
|
if (err_offset) *err_offset = ctx->err_offset; |
|
562
|
8
|
|
|
|
|
|
ret = (long)e; |
|
563
|
|
|
|
|
|
|
} else { |
|
564
|
137
|
50
|
|
|
|
|
if (err_offset) *err_offset = len; |
|
565
|
137
|
|
|
|
|
|
ret = (long)ctx->rows_emitted; |
|
566
|
|
|
|
|
|
|
} |
|
567
|
|
|
|
|
|
|
|
|
568
|
145
|
|
|
|
|
|
separated_free(ctx); |
|
569
|
145
|
|
|
|
|
|
return ret; |
|
570
|
|
|
|
|
|
|
} |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
/* ------------------------------------------------------------ |
|
573
|
|
|
|
|
|
|
* strerror |
|
574
|
|
|
|
|
|
|
* ------------------------------------------------------------ */ |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
const char * |
|
577
|
8
|
|
|
|
|
|
separated_strerror(separated_err_t err) |
|
578
|
|
|
|
|
|
|
{ |
|
579
|
8
|
|
|
|
|
|
switch (err) { |
|
580
|
0
|
|
|
|
|
|
case SEPARATED_OK: return "ok"; |
|
581
|
0
|
|
|
|
|
|
case SEPARATED_ERR_NOMEM: return "out of memory"; |
|
582
|
1
|
|
|
|
|
|
case SEPARATED_ERR_FIELD_TOO_LONG: return "field exceeds max length"; |
|
583
|
5
|
|
|
|
|
|
case SEPARATED_ERR_BAD_QUOTE: return "malformed quoting"; |
|
584
|
2
|
|
|
|
|
|
case SEPARATED_ERR_EOL_PINNED: return "line ending does not match pinned eol mode"; |
|
585
|
0
|
|
|
|
|
|
case SEPARATED_ERR_ABORTED: return "parse aborted by callback"; |
|
586
|
|
|
|
|
|
|
} |
|
587
|
0
|
|
|
|
|
|
return "unknown error"; |
|
588
|
|
|
|
|
|
|
} |