line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/*
** Tokenizer instance handed to FTS3.  `base` comes first so that a
** pointer to this struct can be cast to and from sqlite3_tokenizer*,
** as the tokenizer callbacks in this file do.
*/
typedef struct perl_tokenizer {
    sqlite3_tokenizer base;
    SV *coderef;    /* the perl tokenizer is a coderef that takes
                       a string and returns a cursor coderef;
                       this is a private copy made in
                       perl_tokenizer_Create() and released in
                       perl_tokenizer_Destroy() */
} perl_tokenizer;
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
/*
** Cursor over the tokens of one input string.  `base` comes first so
** that a pointer to this struct can be cast to and from
** sqlite3_tokenizer_cursor*.
*/
typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base;
    SV *coderef;                /* ref to the closure that returns terms */
    char *pToken;               /* storage for a copy of the last token */
    int nTokenAllocated;        /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8;
       pInput stays NULL otherwise, and perl_tokenizer_Next() uses
       that to decide whether char offsets must be converted to
       byte offsets */
    const char *pInput;         /* input we are tokenizing */
    const char *currentByte;    /* pointer into pInput */
    int currentChar;            /* char position corresponding to currentByte */
} perl_tokenizer_cursor;
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
/* |
20
|
|
|
|
|
|
|
** Create a new tokenizer instance. |
21
|
|
|
|
|
|
|
** Will be called whenever a FTS3 table is created with |
22
|
|
|
|
|
|
|
** CREATE .. USING fts3( ... , tokenize=perl qualified::function::name) |
23
|
|
|
|
|
|
|
** where qualified::function::name is a fully qualified perl function |
24
|
|
|
|
|
|
|
*/ |
25
|
5
|
|
|
|
|
|
static int perl_tokenizer_Create( |
26
|
|
|
|
|
|
|
int argc, const char * const *argv, |
27
|
|
|
|
|
|
|
sqlite3_tokenizer **ppTokenizer |
28
|
|
|
|
|
|
|
){ |
29
|
|
|
|
|
|
|
dTHX; |
30
|
5
|
|
|
|
|
|
dSP; |
31
|
|
|
|
|
|
|
int n_retval; |
32
|
|
|
|
|
|
|
SV *retval; |
33
|
|
|
|
|
|
|
perl_tokenizer *t; |
34
|
|
|
|
|
|
|
|
35
|
5
|
100
|
|
|
|
|
if (!argc) { |
36
|
1
|
|
|
|
|
|
return SQLITE_ERROR; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
4
|
|
|
|
|
|
t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t)); |
40
|
4
|
50
|
|
|
|
|
if( t==NULL ) return SQLITE_NOMEM; |
41
|
4
|
|
|
|
|
|
memset(t, 0, sizeof(*t)); |
42
|
|
|
|
|
|
|
|
43
|
4
|
|
|
|
|
|
ENTER; |
44
|
4
|
|
|
|
|
|
SAVETMPS; |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
/* call the qualified::function::name */ |
47
|
4
|
50
|
|
|
|
|
PUSHMARK(SP); |
48
|
4
|
|
|
|
|
|
PUTBACK; |
49
|
4
|
|
|
|
|
|
n_retval = call_pv(argv[0], G_SCALAR); |
50
|
4
|
|
|
|
|
|
SPAGAIN; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
/* store a copy of the returned coderef into the tokenizer structure */ |
53
|
4
|
50
|
|
|
|
|
if (n_retval != 1) { |
54
|
0
|
|
|
|
|
|
warn("tokenizer_Create returned %d arguments", n_retval); |
55
|
|
|
|
|
|
|
} |
56
|
4
|
|
|
|
|
|
retval = POPs; |
57
|
4
|
|
|
|
|
|
t->coderef = newSVsv(retval); |
58
|
4
|
|
|
|
|
|
*ppTokenizer = &t->base; |
59
|
|
|
|
|
|
|
|
60
|
4
|
|
|
|
|
|
PUTBACK; |
61
|
4
|
50
|
|
|
|
|
FREETMPS; |
62
|
4
|
|
|
|
|
|
LEAVE; |
63
|
|
|
|
|
|
|
|
64
|
4
|
|
|
|
|
|
return SQLITE_OK; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
/* |
68
|
|
|
|
|
|
|
** Destroy a tokenizer |
69
|
|
|
|
|
|
|
*/ |
70
|
4
|
|
|
|
|
|
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ |
71
|
|
|
|
|
|
|
dTHX; |
72
|
4
|
|
|
|
|
|
perl_tokenizer *t = (perl_tokenizer *) pTokenizer; |
73
|
4
|
|
|
|
|
|
sv_free(t->coderef); |
74
|
4
|
|
|
|
|
|
sqlite3_free(t); |
75
|
4
|
|
|
|
|
|
return SQLITE_OK; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
/* |
79
|
|
|
|
|
|
|
** Prepare to begin tokenizing a particular string. The input |
80
|
|
|
|
|
|
|
** string to be tokenized is supposed to be pInput[0..nBytes-1] .. |
81
|
|
|
|
|
|
|
** except that nBytes passed by fts3 is -1 (don't know why) ! |
82
|
|
|
|
|
|
|
** This is passed to the tokenizer instance, which then returns a |
83
|
|
|
|
|
|
|
** closure implementing the cursor (so the cursor is again a coderef). |
84
|
|
|
|
|
|
|
*/ |
85
|
104
|
|
|
|
|
|
static int perl_tokenizer_Open( |
86
|
|
|
|
|
|
|
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ |
87
|
|
|
|
|
|
|
const char *pInput, int nBytes, /* Input buffer */ |
88
|
|
|
|
|
|
|
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ |
89
|
|
|
|
|
|
|
){ |
90
|
|
|
|
|
|
|
dTHX; |
91
|
104
|
|
|
|
|
|
dSP; |
92
|
|
|
|
|
|
|
dMY_CXT; |
93
|
|
|
|
|
|
|
U32 flags; |
94
|
|
|
|
|
|
|
SV *perl_string; |
95
|
|
|
|
|
|
|
int n_retval; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
/* build a Perl copy of the input string */ |
98
|
104
|
100
|
|
|
|
|
if (nBytes < 0) { /* we get -1 from fts3. Don't know why ! */ |
99
|
24
|
|
|
|
|
|
nBytes = strlen(pInput); |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
/* SVs_TEMP will call sv_2mortal */ |
103
|
104
|
|
|
|
|
|
perl_string = newSVpvn_flags(pInput, nBytes, SVs_TEMP); |
104
|
|
|
|
|
|
|
|
105
|
104
|
|
|
|
|
|
switch (MY_CXT.last_dbh_string_mode) { |
106
|
|
|
|
|
|
|
DBD_SQLITE_STRING_MODE_UNICODE_NAIVE: |
107
|
|
|
|
|
|
|
DBD_SQLITE_UTF8_DECODE_NAIVE(perl_string); |
108
|
|
|
|
|
|
|
break; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
DBD_SQLITE_STRING_MODE_UNICODE_FALLBACK: |
111
|
|
|
|
|
|
|
DBD_SQLITE_STRING_MODE_UNICODE_STRICT: |
112
|
|
|
|
|
|
|
DBD_SQLITE_UTF8_DECODE_WITH_FALLBACK(perl_string); |
113
|
|
|
|
|
|
|
break; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
default: |
116
|
104
|
|
|
|
|
|
break; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
104
|
50
|
|
|
|
|
DBD_SQLITE_UTF8_DECODE_IF_NEEDED(perl_string, MY_CXT.last_dbh_string_mode); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
104
|
|
|
|
|
|
perl_tokenizer *t = (perl_tokenizer *)pTokenizer; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
/* allocate and initialize the cursor struct */ |
124
|
|
|
|
|
|
|
perl_tokenizer_cursor *c; |
125
|
104
|
|
|
|
|
|
c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); |
126
|
104
|
|
|
|
|
|
memset(c, 0, sizeof(*c)); |
127
|
104
|
|
|
|
|
|
*ppCursor = &c->base; |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
/* special handling if working with utf8 strings */ |
130
|
104
|
100
|
|
|
|
|
if (MY_CXT.last_dbh_string_mode & DBD_SQLITE_STRING_MODE_UNICODE_ANY) { |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
/* data to keep track of byte positions */ |
133
|
52
|
|
|
|
|
|
c->currentByte = c->pInput = pInput; |
134
|
52
|
|
|
|
|
|
c->currentChar = 0; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
104
|
|
|
|
|
|
ENTER; |
138
|
104
|
|
|
|
|
|
SAVETMPS; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
/* call the tokenizer coderef */ |
141
|
104
|
50
|
|
|
|
|
PUSHMARK(SP); |
142
|
104
|
50
|
|
|
|
|
XPUSHs(perl_string); |
143
|
104
|
|
|
|
|
|
PUTBACK; |
144
|
104
|
|
|
|
|
|
n_retval = call_sv(t->coderef, G_SCALAR); |
145
|
104
|
|
|
|
|
|
SPAGAIN; |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
/* store the cursor coderef returned by the tokenizer */ |
148
|
104
|
50
|
|
|
|
|
if (n_retval != 1) { |
149
|
0
|
|
|
|
|
|
warn("tokenizer returned %d arguments, expected 1", n_retval); |
150
|
|
|
|
|
|
|
} |
151
|
104
|
|
|
|
|
|
c->coderef = newSVsv(POPs); |
152
|
|
|
|
|
|
|
|
153
|
104
|
|
|
|
|
|
PUTBACK; |
154
|
104
|
50
|
|
|
|
|
FREETMPS; |
155
|
104
|
|
|
|
|
|
LEAVE; |
156
|
104
|
|
|
|
|
|
return SQLITE_OK; |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
/* |
160
|
|
|
|
|
|
|
** Close a tokenization cursor previously opened by a call to |
161
|
|
|
|
|
|
|
** perl_tokenizer_Open() above. |
162
|
|
|
|
|
|
|
*/ |
163
|
104
|
|
|
|
|
|
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ |
164
|
104
|
|
|
|
|
|
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
dTHX; |
167
|
104
|
|
|
|
|
|
sv_free(c->coderef); |
168
|
104
|
50
|
|
|
|
|
if (c->pToken) sqlite3_free(c->pToken); |
169
|
104
|
|
|
|
|
|
sqlite3_free(c); |
170
|
104
|
|
|
|
|
|
return SQLITE_OK; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
/* |
174
|
|
|
|
|
|
|
** Extract the next token from a tokenization cursor. The cursor must |
175
|
|
|
|
|
|
|
** have been opened by a prior call to perl_tokenizer_Open(). |
176
|
|
|
|
|
|
|
*/ |
177
|
255896
|
|
|
|
|
|
static int perl_tokenizer_Next( |
178
|
|
|
|
|
|
|
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */ |
179
|
|
|
|
|
|
|
const char **ppToken, /* OUT: Normalized text for token */ |
180
|
|
|
|
|
|
|
int *pnBytes, /* OUT: Number of bytes in normalized text */ |
181
|
|
|
|
|
|
|
int *piStartOffset, /* Starting offset of token. IN : char offset; OUT : byte offset */ |
182
|
|
|
|
|
|
|
int *piEndOffset, /* Ending offset of token. IN : char offset; OUT : byte offset */ |
183
|
|
|
|
|
|
|
int *piPosition /* OUT: Number of tokens returned before this one */ |
184
|
|
|
|
|
|
|
){ |
185
|
255896
|
|
|
|
|
|
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor; |
186
|
|
|
|
|
|
|
int result; |
187
|
|
|
|
|
|
|
int n_retval; |
188
|
|
|
|
|
|
|
char *token; |
189
|
|
|
|
|
|
|
char *nextByte; |
190
|
|
|
|
|
|
|
STRLEN n_a; /* this is required for older perls < 5.8.8 */ |
191
|
|
|
|
|
|
|
I32 hop; |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
dTHX; |
194
|
255896
|
|
|
|
|
|
dSP; |
195
|
|
|
|
|
|
|
|
196
|
255896
|
|
|
|
|
|
ENTER; |
197
|
255896
|
|
|
|
|
|
SAVETMPS; |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
/* call the cursor */ |
200
|
255896
|
50
|
|
|
|
|
PUSHMARK(SP); |
201
|
255896
|
|
|
|
|
|
PUTBACK; |
202
|
255896
|
|
|
|
|
|
n_retval = call_sv(c->coderef, G_ARRAY); |
203
|
255896
|
|
|
|
|
|
SPAGAIN; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
/* if we get back an empty list, there is no more token */ |
206
|
255896
|
100
|
|
|
|
|
if (n_retval == 0) { |
207
|
40
|
|
|
|
|
|
result = SQLITE_DONE; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
/* otherwise, get token details from the return list */ |
210
|
|
|
|
|
|
|
else { |
211
|
255856
|
50
|
|
|
|
|
if (n_retval != 5) { |
212
|
0
|
|
|
|
|
|
warn("tokenizer cursor returned %d arguments, expected 5", n_retval); |
213
|
|
|
|
|
|
|
} |
214
|
255856
|
50
|
|
|
|
|
*piPosition = POPi; |
215
|
255856
|
50
|
|
|
|
|
*piEndOffset = POPi; |
216
|
255856
|
50
|
|
|
|
|
*piStartOffset = POPi; |
217
|
255856
|
50
|
|
|
|
|
*pnBytes = POPi; |
218
|
255856
|
50
|
|
|
|
|
token = POPpx; |
219
|
|
|
|
|
|
|
|
220
|
255856
|
100
|
|
|
|
|
if (c->pInput) { /* if working with utf8 data */ |
221
|
|
|
|
|
|
|
/* compute first hop : nb of chars from last position to the start of the token */ |
222
|
127928
|
|
|
|
|
|
hop = *piStartOffset - c->currentChar; |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
/* hop: advance to the first byte in token */ |
225
|
127928
|
|
|
|
|
|
nextByte = (char*)utf8_hop((U8*)c->currentByte, hop); |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
/* compute 2nd hop : nb of chars from start of the token to end of token */ |
228
|
127928
|
|
|
|
|
|
hop = *piEndOffset - *piStartOffset; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
/* now recompute the start offset in bytes, not in chars */ |
231
|
127928
|
|
|
|
|
|
*piStartOffset = nextByte - c->pInput; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
/* 2nd hop: advance past to the last byte in token */ |
234
|
127928
|
|
|
|
|
|
nextByte = (char*)utf8_hop((U8*)nextByte, hop); |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
/* remember current position (useful for the next invocation) */ |
237
|
127928
|
|
|
|
|
|
c->currentChar = *piEndOffset; |
238
|
127928
|
|
|
|
|
|
c->currentByte = nextByte; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
/* now recompute the end offset in bytes, not in chars */ |
241
|
127928
|
|
|
|
|
|
*piEndOffset = nextByte - c->pInput; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
/* compute the size of the normalized token in bytes, not in chars */ |
244
|
127928
|
|
|
|
|
|
*pnBytes = strlen(token); |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
/* make sure we have enough storage for copying the token */ |
248
|
255856
|
100
|
|
|
|
|
if (*pnBytes > c->nTokenAllocated ){ |
249
|
|
|
|
|
|
|
char *pNew; |
250
|
108
|
|
|
|
|
|
c->nTokenAllocated = *pnBytes + 20; |
251
|
108
|
|
|
|
|
|
pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); |
252
|
108
|
50
|
|
|
|
|
if( !pNew ) return SQLITE_NOMEM; |
253
|
108
|
|
|
|
|
|
c->pToken = pNew; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
/* need to copy the token into the C cursor before perl frees that memory */ |
257
|
255856
|
|
|
|
|
|
memcpy(c->pToken, token, *pnBytes); |
258
|
255856
|
|
|
|
|
|
*ppToken = c->pToken; |
259
|
|
|
|
|
|
|
|
260
|
255856
|
|
|
|
|
|
result = SQLITE_OK; |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
255896
|
|
|
|
|
|
PUTBACK; |
264
|
255896
|
100
|
|
|
|
|
FREETMPS; |
265
|
255896
|
|
|
|
|
|
LEAVE; |
266
|
|
|
|
|
|
|
|
267
|
255896
|
|
|
|
|
|
return result; |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
/*
** The set of routines that implement the perl tokenizer.
** The initializer is positional: a leading 0 (presumably the module
** version field -- confirm against the sqlite3_tokenizer_module
** declaration) followed by the create, destroy, open, close and next
** callbacks defined above.
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,
    perl_tokenizer_Create,
    perl_tokenizer_Destroy,
    perl_tokenizer_Open,
    perl_tokenizer_Close,
    perl_tokenizer_Next
};
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
/* |
283
|
|
|
|
|
|
|
** Register the perl tokenizer with FTS3 |
284
|
|
|
|
|
|
|
*/ |
285
|
307
|
|
|
|
|
|
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh) |
286
|
|
|
|
|
|
|
{ |
287
|
307
|
|
|
|
|
|
D_imp_dbh(dbh); |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
int rc; |
290
|
|
|
|
|
|
|
sqlite3_stmt *pStmt; |
291
|
307
|
|
|
|
|
|
const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; |
292
|
307
|
|
|
|
|
|
sqlite3_tokenizer_module *p = &perl_tokenizer_Module; |
293
|
|
|
|
|
|
|
|
294
|
307
|
50
|
|
|
|
|
if (!DBIc_ACTIVE(imp_dbh)) { |
295
|
0
|
|
|
|
|
|
sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle"); |
296
|
0
|
|
|
|
|
|
return FALSE; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
#if SQLITE_VERSION_NUMBER >= 3012000 |
300
|
307
|
|
|
|
|
|
rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0); |
301
|
307
|
50
|
|
|
|
|
if( rc!=SQLITE_OK ){ |
302
|
0
|
|
|
|
|
|
return rc; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
#endif |
305
|
|
|
|
|
|
|
|
306
|
307
|
|
|
|
|
|
rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0); |
307
|
307
|
50
|
|
|
|
|
if( rc!=SQLITE_OK ){ |
308
|
0
|
|
|
|
|
|
return rc; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
307
|
|
|
|
|
|
sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC); |
312
|
307
|
|
|
|
|
|
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); |
313
|
307
|
|
|
|
|
|
sqlite3_step(pStmt); |
314
|
|
|
|
|
|
|
|
315
|
307
|
|
|
|
|
|
return sqlite3_finalize(pStmt); |
316
|
|
|
|
|
|
|
} |