line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* Copyright 2009 Peter Karman |
2
|
|
|
|
|
|
|
* |
3
|
|
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify |
4
|
|
|
|
|
|
|
* under the same terms as Perl itself. |
5
|
|
|
|
|
|
|
*/ |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
/* |
8
|
|
|
|
|
|
|
* Search::Tools C helpers |
9
|
|
|
|
|
|
|
*/ |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#include |
12
|
|
|
|
|
|
|
#include "search-tools.h" |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
/* global vars */ |
15
|
|
|
|
|
|
|
static HV* ST_ABBREVS = NULL; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
/* perl versions < 5.8.8 do not have this */ |
18
|
|
|
|
|
|
|
#ifndef is_utf8_string_loclen |
19
|
|
|
|
|
|
|
bool |
20
|
|
|
|
|
|
|
is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) |
21
|
|
|
|
|
|
|
{ |
22
|
|
|
|
|
|
|
dTHX; |
23
|
|
|
|
|
|
|
const U8* x = s; |
24
|
|
|
|
|
|
|
const U8* send; |
25
|
|
|
|
|
|
|
STRLEN c; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
if (!len) |
28
|
|
|
|
|
|
|
len = strlen((const char *)s); |
29
|
|
|
|
|
|
|
send = s + len; |
30
|
|
|
|
|
|
|
if (el) |
31
|
|
|
|
|
|
|
*el = 0; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
while (x < send) { |
34
|
|
|
|
|
|
|
/* Inline the easy bits of is_utf8_char() here for speed... */ |
35
|
|
|
|
|
|
|
if (UTF8_IS_INVARIANT(*x)) |
36
|
|
|
|
|
|
|
c = 1; |
37
|
|
|
|
|
|
|
else if (!UTF8_IS_START(*x)) |
38
|
|
|
|
|
|
|
goto out; |
39
|
|
|
|
|
|
|
else { |
40
|
|
|
|
|
|
|
/* ... and call is_utf8_char() only if really needed. */ |
41
|
|
|
|
|
|
|
#ifdef IS_UTF8_CHAR |
42
|
|
|
|
|
|
|
c = UTF8SKIP(x); |
43
|
|
|
|
|
|
|
if (IS_UTF8_CHAR_FAST(c)) { |
44
|
|
|
|
|
|
|
if (!IS_UTF8_CHAR(x, c)) |
45
|
|
|
|
|
|
|
c = 0; |
46
|
|
|
|
|
|
|
} else |
47
|
|
|
|
|
|
|
c = is_utf8_char_slow(x, c); |
48
|
|
|
|
|
|
|
#else |
49
|
|
|
|
|
|
|
c = is_utf8_char(x); |
50
|
|
|
|
|
|
|
#endif /* #ifdef IS_UTF8_CHAR */ |
51
|
|
|
|
|
|
|
if (!c) |
52
|
|
|
|
|
|
|
goto out; |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
x += c; |
55
|
|
|
|
|
|
|
if (el) |
56
|
|
|
|
|
|
|
(*el)++; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
out: |
60
|
|
|
|
|
|
|
if (ep) |
61
|
|
|
|
|
|
|
*ep = x; |
62
|
|
|
|
|
|
|
if (x != send) |
63
|
|
|
|
|
|
|
return FALSE; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
return TRUE; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
#endif |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
static SV* |
72
|
2397
|
|
|
|
|
|
st_hv_store( HV* h, const char* key, SV* val) { |
73
|
|
|
|
|
|
|
dTHX; |
74
|
|
|
|
|
|
|
SV** ok; |
75
|
2397
|
|
|
|
|
|
ok = hv_store(h, key, strlen(key), SvREFCNT_inc(val), 0); |
76
|
2397
|
50
|
|
|
|
|
if (ok == NULL) { |
77
|
0
|
|
|
|
|
|
ST_CROAK("failed to store %s in hash", key); |
78
|
|
|
|
|
|
|
} |
79
|
2397
|
|
|
|
|
|
return *ok; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
static SV* |
83
|
0
|
|
|
|
|
|
st_hv_store_char( HV* h, const char *key, char *val) { |
84
|
|
|
|
|
|
|
dTHX; |
85
|
|
|
|
|
|
|
SV *value; |
86
|
0
|
|
|
|
|
|
value = newSVpv(val, 0); |
87
|
0
|
|
|
|
|
|
st_hv_store( h, key, value ); |
88
|
0
|
|
|
|
|
|
SvREFCNT_dec(value); |
89
|
0
|
|
|
|
|
|
return value; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
static SV* |
93
|
2397
|
|
|
|
|
|
st_hv_store_int( HV* h, const char* key, int i) { |
94
|
|
|
|
|
|
|
dTHX; |
95
|
|
|
|
|
|
|
SV *value; |
96
|
2397
|
|
|
|
|
|
value = newSViv(i); |
97
|
2397
|
|
|
|
|
|
st_hv_store( h, key, value ); |
98
|
2397
|
|
|
|
|
|
SvREFCNT_dec(value); |
99
|
2397
|
|
|
|
|
|
return value; |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
/* UNUSED |
103
|
|
|
|
|
|
|
static SV* |
104
|
|
|
|
|
|
|
st_hvref_store( SV* h, const char* key, SV* val) { |
105
|
|
|
|
|
|
|
dTHX; |
106
|
|
|
|
|
|
|
return st_hv_store( (HV*)SvRV(h), key, val ); |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
*/ |
109
|
|
|
|
|
|
|
/* UNUSED |
110
|
|
|
|
|
|
|
static SV* |
111
|
|
|
|
|
|
|
st_hvref_store_char( SV* h, const char* key, char *val) { |
112
|
|
|
|
|
|
|
dTHX; |
113
|
|
|
|
|
|
|
return st_hv_store_char( (HV*)SvRV(h), key, val ); |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
*/ |
116
|
|
|
|
|
|
|
/* UNUSED |
117
|
|
|
|
|
|
|
static SV* |
118
|
|
|
|
|
|
|
st_hvref_store_int( SV* h, const char* key, int i) { |
119
|
|
|
|
|
|
|
dTHX; |
120
|
|
|
|
|
|
|
return st_hv_store_int( (HV*)SvRV(h), key, i ); |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
*/ |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
static SV* |
125
|
6629
|
|
|
|
|
|
st_av_fetch( AV* a, I32 index ) { |
126
|
|
|
|
|
|
|
dTHX; |
127
|
|
|
|
|
|
|
SV** ok; |
128
|
6629
|
|
|
|
|
|
ok = av_fetch(a, index, 0); |
129
|
6629
|
50
|
|
|
|
|
if (ok == NULL) { |
130
|
0
|
|
|
|
|
|
ST_CROAK("failed to fetch index %d", index); |
131
|
|
|
|
|
|
|
} |
132
|
6629
|
|
|
|
|
|
return *ok; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
static void * |
136
|
0
|
|
|
|
|
|
st_av_fetch_ptr( AV* a, I32 index ) { |
137
|
|
|
|
|
|
|
dTHX; |
138
|
|
|
|
|
|
|
SV** ok; |
139
|
|
|
|
|
|
|
void * ptr; |
140
|
0
|
|
|
|
|
|
ok = av_fetch(a, index, 0); |
141
|
0
|
0
|
|
|
|
|
if (ok == NULL) { |
142
|
0
|
|
|
|
|
|
ST_CROAK("failed to fetch index %d", index); |
143
|
|
|
|
|
|
|
} |
144
|
0
|
|
|
|
|
|
ptr = st_extract_ptr(*ok); |
145
|
|
|
|
|
|
|
//warn("%s refcnt == %d", SvPV_nolen(*ok), SvREFCNT(*ok)); |
146
|
0
|
|
|
|
|
|
return ptr; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
/* fetch SV* from hash */ |
150
|
|
|
|
|
|
|
static SV* |
151
|
48
|
|
|
|
|
|
st_hv_fetch( HV* h, const char* key ) { |
152
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
153
|
|
|
|
|
|
|
SV** ok; |
154
|
48
|
|
|
|
|
|
ok = hv_fetch(h, key, strlen(key), 0); |
155
|
48
|
50
|
|
|
|
|
if (ok == NULL) { |
156
|
0
|
|
|
|
|
|
ST_CROAK("failed to fetch %s", key); |
157
|
|
|
|
|
|
|
} |
158
|
48
|
|
|
|
|
|
return *ok; |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
static SV* |
162
|
48
|
|
|
|
|
|
st_hvref_fetch( SV* h, const char* key ) { |
163
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
164
|
48
|
|
|
|
|
|
return st_hv_fetch((HV*)SvRV(h), key); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
/* UNUSED |
168
|
|
|
|
|
|
|
static char* |
169
|
|
|
|
|
|
|
st_hv_fetch_as_char( HV* h, const char* key ) { |
170
|
|
|
|
|
|
|
dTHX; |
171
|
|
|
|
|
|
|
SV** ok; |
172
|
|
|
|
|
|
|
ok = hv_fetch(h, key, strlen(key), 0); |
173
|
|
|
|
|
|
|
if (ok == NULL) { |
174
|
|
|
|
|
|
|
ST_CROAK("failed to fetch %s from hash", key); |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
return SvPV_nolen((SV*)*ok); |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
*/ |
179
|
|
|
|
|
|
|
/* UNUSED |
180
|
|
|
|
|
|
|
static char* |
181
|
|
|
|
|
|
|
st_hvref_fetch_as_char( SV* h, const char* key ) { |
182
|
|
|
|
|
|
|
dTHX; |
183
|
|
|
|
|
|
|
return st_hv_fetch_as_char( (HV*)SvRV(h), key ); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
*/ |
186
|
|
|
|
|
|
|
/* UNUSED |
187
|
|
|
|
|
|
|
static IV |
188
|
|
|
|
|
|
|
st_hvref_fetch_as_int( SV* h, const char* key ) { |
189
|
|
|
|
|
|
|
dTHX; |
190
|
|
|
|
|
|
|
SV* val; |
191
|
|
|
|
|
|
|
IV i; |
192
|
|
|
|
|
|
|
val = st_hv_fetch( (HV*)SvRV(h), key ); |
193
|
|
|
|
|
|
|
i = SvIV(val); |
194
|
|
|
|
|
|
|
return i; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
*/ |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
void * |
199
|
15494
|
|
|
|
|
|
st_malloc(size_t size) { |
200
|
|
|
|
|
|
|
dTHX; |
201
|
|
|
|
|
|
|
void *ptr; |
202
|
15494
|
|
|
|
|
|
ptr = malloc(size); |
203
|
15494
|
50
|
|
|
|
|
if (ptr == NULL) { |
204
|
0
|
|
|
|
|
|
ST_CROAK("Out of memory! Can't malloc %lu bytes", |
205
|
|
|
|
|
|
|
(unsigned long)size); |
206
|
|
|
|
|
|
|
} |
207
|
15494
|
|
|
|
|
|
return ptr; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
static st_token* |
212
|
12450
|
|
|
|
|
|
st_new_token( |
213
|
|
|
|
|
|
|
I32 pos, |
214
|
|
|
|
|
|
|
I32 len, |
215
|
|
|
|
|
|
|
I32 u8len, |
216
|
|
|
|
|
|
|
const char *ptr, |
217
|
|
|
|
|
|
|
I32 is_hot, |
218
|
|
|
|
|
|
|
boolean is_match |
219
|
|
|
|
|
|
|
) { |
220
|
|
|
|
|
|
|
dTHX; |
221
|
|
|
|
|
|
|
st_token *tok; |
222
|
|
|
|
|
|
|
|
223
|
12450
|
50
|
|
|
|
|
if (!len) { |
224
|
0
|
|
|
|
|
|
ST_CROAK("cannot create token with zero length: '%s'", ptr); |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
12450
|
|
|
|
|
|
tok = st_malloc(sizeof(st_token)); |
228
|
12450
|
|
|
|
|
|
tok->pos = pos; |
229
|
12450
|
|
|
|
|
|
tok->len = len; |
230
|
12450
|
|
|
|
|
|
tok->u8len = u8len; |
231
|
12450
|
|
|
|
|
|
tok->is_hot = is_hot; |
232
|
12450
|
|
|
|
|
|
tok->is_match = is_match; |
233
|
12450
|
|
|
|
|
|
tok->is_sentence_start = 0; |
234
|
12450
|
|
|
|
|
|
tok->is_sentence_end = 0; |
235
|
12450
|
|
|
|
|
|
tok->is_abbreviation = 0; |
236
|
12450
|
|
|
|
|
|
tok->str = newSVpvn(ptr, len); /* newSVpvn_utf8 not available in some perls? */ |
237
|
12450
|
|
|
|
|
|
SvUTF8_on(tok->str); |
238
|
12450
|
|
|
|
|
|
tok->ref_cnt = 1; |
239
|
12450
|
|
|
|
|
|
return tok; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
static st_token_list* |
243
|
48
|
|
|
|
|
|
st_new_token_list( |
244
|
|
|
|
|
|
|
AV *tokens, |
245
|
|
|
|
|
|
|
AV *heat, |
246
|
|
|
|
|
|
|
AV *sentence_starts, |
247
|
|
|
|
|
|
|
unsigned int num |
248
|
|
|
|
|
|
|
) { |
249
|
|
|
|
|
|
|
dTHX; |
250
|
|
|
|
|
|
|
st_token_list *tl; |
251
|
48
|
|
|
|
|
|
tl = st_malloc(sizeof(st_token_list)); |
252
|
48
|
|
|
|
|
|
tl->pos = 0; |
253
|
48
|
|
|
|
|
|
tl->tokens = tokens; |
254
|
48
|
|
|
|
|
|
tl->heat = heat; |
255
|
48
|
|
|
|
|
|
tl->sentence_starts = sentence_starts; |
256
|
48
|
|
|
|
|
|
tl->num = (IV)num; |
257
|
48
|
|
|
|
|
|
tl->ref_cnt = 1; |
258
|
48
|
|
|
|
|
|
return tl; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
static void |
262
|
12450
|
|
|
|
|
|
st_free_token(st_token *tok) { |
263
|
|
|
|
|
|
|
dTHX; |
264
|
12450
|
50
|
|
|
|
|
if (tok->ref_cnt != 0) { |
265
|
0
|
|
|
|
|
|
ST_CROAK("Won't free token 0x%x with ref_cnt != 0 [%d]", |
266
|
|
|
|
|
|
|
tok, tok->ref_cnt); |
267
|
|
|
|
|
|
|
} |
268
|
12450
|
|
|
|
|
|
SvREFCNT_dec(tok->str); |
269
|
12450
|
|
|
|
|
|
free(tok); |
270
|
12450
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
static void |
273
|
48
|
|
|
|
|
|
st_free_token_list(st_token_list *token_list) { |
274
|
|
|
|
|
|
|
dTHX; |
275
|
48
|
50
|
|
|
|
|
if (token_list->ref_cnt != 0) { |
276
|
0
|
|
|
|
|
|
ST_CROAK("Won't free token_list 0x%x with ref_cnt > 0 [%d]", |
277
|
|
|
|
|
|
|
token_list, token_list->ref_cnt); |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
//warn("about to free st_token_list C struct\n"); |
281
|
|
|
|
|
|
|
//st_dump_token_list(token_list); |
282
|
|
|
|
|
|
|
|
283
|
48
|
|
|
|
|
|
SvREFCNT_dec(token_list->tokens); |
284
|
48
|
50
|
|
|
|
|
if (SvREFCNT(token_list->tokens)) { |
285
|
0
|
|
|
|
|
|
warn("Warning: possible memory leak for token_list->tokens 0x%lx with REFCNT %d\n", |
286
|
0
|
|
|
|
|
|
(unsigned long)token_list->tokens, SvREFCNT(token_list->tokens)); |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
48
|
|
|
|
|
|
SvREFCNT_dec(token_list->heat); |
290
|
48
|
50
|
|
|
|
|
if (SvREFCNT(token_list->heat)) { |
291
|
0
|
|
|
|
|
|
warn("Warning: possible memory leak for token_list->heat 0x%lx with REFCNT %d\n", |
292
|
0
|
|
|
|
|
|
(unsigned long)token_list->heat, SvREFCNT(token_list->heat)); |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
48
|
|
|
|
|
|
SvREFCNT_dec(token_list->sentence_starts); |
296
|
48
|
50
|
|
|
|
|
if (SvREFCNT(token_list->sentence_starts)) { |
297
|
0
|
|
|
|
|
|
warn("Warning: possible memory leak for token_list->sentence_starts 0x%lx with REFCNT %d\n", |
298
|
0
|
|
|
|
|
|
(unsigned long)token_list->sentence_starts, SvREFCNT(token_list->sentence_starts)); |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
48
|
|
|
|
|
|
free(token_list); |
302
|
48
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
static void |
305
|
0
|
|
|
|
|
|
st_dump_token_list(st_token_list *tl) { |
306
|
|
|
|
|
|
|
dTHX; |
307
|
|
|
|
|
|
|
IV len, pos; |
308
|
|
|
|
|
|
|
SV* tok; |
309
|
0
|
|
|
|
|
|
len = av_len(tl->tokens); |
310
|
0
|
|
|
|
|
|
pos = 0; |
311
|
0
|
|
|
|
|
|
warn("TokenList 0x%lx", (unsigned long)tl); |
312
|
0
|
|
|
|
|
|
warn(" pos = %ld\n", (unsigned long)tl->pos); |
313
|
0
|
|
|
|
|
|
warn(" len = %ld\n", (unsigned long)len + 1); |
314
|
0
|
|
|
|
|
|
warn(" num = %ld\n", (unsigned long)tl->num); |
315
|
0
|
|
|
|
|
|
warn(" ref_cnt = %ld\n", (unsigned long)tl->ref_cnt); |
316
|
0
|
|
|
|
|
|
warn(" tokens REFCNT = %ld\n", (unsigned long)SvREFCNT(tl->tokens)); |
317
|
0
|
|
|
|
|
|
warn(" heat REFCNT = %ld\n", (unsigned long)SvREFCNT(tl->heat)); |
318
|
0
|
|
|
|
|
|
warn(" sen_starts REFCNT = %ld\n", (unsigned long)SvREFCNT(tl->sentence_starts)); |
319
|
0
|
0
|
|
|
|
|
while (pos < len) { |
320
|
0
|
|
|
|
|
|
tok = st_av_fetch(tl->tokens, pos++); |
321
|
0
|
|
|
|
|
|
warn(" Token REFCNT = %ld\n", (unsigned long)SvREFCNT(tok)); |
322
|
0
|
|
|
|
|
|
st_dump_token((st_token*)st_extract_ptr(tok)); |
323
|
|
|
|
|
|
|
} |
324
|
0
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
static void |
327
|
0
|
|
|
|
|
|
st_dump_token(st_token *tok) { |
328
|
|
|
|
|
|
|
dTHX; |
329
|
0
|
|
|
|
|
|
warn("Token 0x%lx", (unsigned long)tok); |
330
|
0
|
0
|
|
|
|
|
warn(" str = '%s'\n", SvPV_nolen(tok->str)); |
331
|
0
|
|
|
|
|
|
warn(" pos = %ld\n", (unsigned long)tok->pos); |
332
|
0
|
|
|
|
|
|
warn(" len = %ld\n", (unsigned long)tok->len); |
333
|
0
|
|
|
|
|
|
warn(" u8len = %ld\n", (unsigned long)tok->u8len); |
334
|
0
|
|
|
|
|
|
warn(" is_match = %d\n", tok->is_match); |
335
|
0
|
|
|
|
|
|
warn(" is_sentence_start = %d\n", tok->is_sentence_start); |
336
|
0
|
|
|
|
|
|
warn(" is_sentence_end = %d\n", tok->is_sentence_end); |
337
|
0
|
|
|
|
|
|
warn(" is_abbreviation = %d\n", tok->is_abbreviation); |
338
|
0
|
|
|
|
|
|
warn(" is_hot = %d\n", tok->is_hot); |
339
|
0
|
|
|
|
|
|
warn(" ref_cnt = %ld\n", (unsigned long)tok->ref_cnt); |
340
|
0
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
/* make a Perl blessed object from a C pointer */ |
343
|
|
|
|
|
|
|
static SV* |
344
|
12498
|
|
|
|
|
|
st_bless_ptr( const char *class, void * c_ptr ) { |
345
|
|
|
|
|
|
|
dTHX; |
346
|
12498
|
|
|
|
|
|
SV* obj = newSViv( PTR2IV( c_ptr ) ); // use instead of sv_newmortal(). |
347
|
12498
|
|
|
|
|
|
sv_setref_pv(obj, class, c_ptr); |
348
|
12498
|
|
|
|
|
|
return obj; |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
/* return the C pointer from a Perl blessed O_OBJECT */ |
352
|
|
|
|
|
|
|
static void * |
353
|
12691
|
|
|
|
|
|
st_extract_ptr( SV* object ) { |
354
|
|
|
|
|
|
|
dTHX; |
355
|
12691
|
50
|
|
|
|
|
return INT2PTR( void*, SvIV(SvRV( object )) ); |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
static void |
359
|
0
|
|
|
|
|
|
st_croak( |
360
|
|
|
|
|
|
|
const char *file, |
361
|
|
|
|
|
|
|
int line, |
362
|
|
|
|
|
|
|
const char *func, |
363
|
|
|
|
|
|
|
const char *msgfmt, |
364
|
|
|
|
|
|
|
... |
365
|
|
|
|
|
|
|
) |
366
|
|
|
|
|
|
|
{ |
367
|
|
|
|
|
|
|
dTHX; |
368
|
|
|
|
|
|
|
va_list args; |
369
|
0
|
|
|
|
|
|
va_start(args, msgfmt); |
370
|
0
|
|
|
|
|
|
warn("Search::Tools error at %s:%d %s: ", file, line, func); |
371
|
|
|
|
|
|
|
//warn(msgfmt, args); |
372
|
0
|
|
|
|
|
|
croak(msgfmt, args); |
373
|
|
|
|
|
|
|
/* NEVER REACH HERE */ |
374
|
|
|
|
|
|
|
va_end(args); |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
/* UNUSED |
378
|
|
|
|
|
|
|
static SV* |
379
|
|
|
|
|
|
|
st_new_hash_object(const char *class) { |
380
|
|
|
|
|
|
|
dTHX; |
381
|
|
|
|
|
|
|
HV *hash; |
382
|
|
|
|
|
|
|
SV *object; |
383
|
|
|
|
|
|
|
hash = newHV(); |
384
|
|
|
|
|
|
|
object = sv_bless( newRV((SV*)hash), gv_stashpv(class,0) ); |
385
|
|
|
|
|
|
|
return object; |
386
|
|
|
|
|
|
|
} |
387
|
|
|
|
|
|
|
*/ |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
static void |
390
|
0
|
|
|
|
|
|
st_dump_sv(SV* ref) { |
391
|
|
|
|
|
|
|
dTHX; |
392
|
|
|
|
|
|
|
HV* hash; |
393
|
|
|
|
|
|
|
HE* hash_entry; |
394
|
|
|
|
|
|
|
AV* array; |
395
|
|
|
|
|
|
|
int num_keys, i, pos, len; |
396
|
|
|
|
|
|
|
SV* sv_key; |
397
|
|
|
|
|
|
|
SV* sv_val; |
398
|
|
|
|
|
|
|
int refcnt; |
399
|
|
|
|
|
|
|
|
400
|
0
|
|
|
|
|
|
pos = 0; |
401
|
0
|
|
|
|
|
|
i = 0; |
402
|
0
|
|
|
|
|
|
len = 0; |
403
|
|
|
|
|
|
|
|
404
|
0
|
0
|
|
|
|
|
if (SvTYPE(SvRV(ref))==SVt_PVHV) { |
405
|
0
|
|
|
|
|
|
warn("SV is a hash reference"); |
406
|
0
|
|
|
|
|
|
hash = (HV*)SvRV(ref); |
407
|
0
|
|
|
|
|
|
num_keys = hv_iterinit(hash); |
408
|
0
|
0
|
|
|
|
|
for (i = 0; i < num_keys; i++) { |
409
|
0
|
|
|
|
|
|
hash_entry = hv_iternext(hash); |
410
|
0
|
|
|
|
|
|
sv_key = hv_iterkeysv(hash_entry); |
411
|
0
|
|
|
|
|
|
sv_val = hv_iterval(hash, hash_entry); |
412
|
0
|
|
|
|
|
|
refcnt = SvREFCNT(sv_val); |
413
|
0
|
0
|
|
|
|
|
warn(" %s => %s [%d]\n", |
|
|
0
|
|
|
|
|
|
414
|
0
|
|
|
|
|
|
SvPV_nolen(sv_key), SvPV_nolen(sv_val), refcnt); |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
} |
417
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(ref))==SVt_PVAV) { |
418
|
0
|
|
|
|
|
|
warn("SV is an array reference"); |
419
|
0
|
|
|
|
|
|
array = (AV*)SvRV(ref); |
420
|
0
|
|
|
|
|
|
len = av_len(array)+1; |
421
|
0
|
|
|
|
|
|
warn("SV has %d items\n", len); |
422
|
0
|
|
|
|
|
|
pos = 0; |
423
|
0
|
0
|
|
|
|
|
while (pos < len) { |
424
|
0
|
|
|
|
|
|
st_describe_object( st_av_fetch(array, pos++) ); |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
0
|
|
|
|
|
|
return; |
430
|
|
|
|
|
|
|
} |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
static void |
433
|
0
|
|
|
|
|
|
st_describe_object( SV* object ) { |
434
|
|
|
|
|
|
|
dTHX; |
435
|
|
|
|
|
|
|
char* str; |
436
|
|
|
|
|
|
|
|
437
|
0
|
|
|
|
|
|
warn("describing object\n"); |
438
|
0
|
0
|
|
|
|
|
str = SvPV_nolen( object ); |
439
|
0
|
0
|
|
|
|
|
if (SvROK(object)) |
440
|
|
|
|
|
|
|
{ |
441
|
0
|
0
|
|
|
|
|
if (SvTYPE(SvRV(object))==SVt_PVHV) |
442
|
0
|
|
|
|
|
|
warn("%s is a magic blessed reference\n", str); |
443
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(object))==SVt_PVMG) |
444
|
0
|
|
|
|
|
|
warn("%s is a magic reference", str); |
445
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(object))==SVt_IV) |
446
|
0
|
|
|
|
|
|
warn("%s is a IV reference (pointer)", str); |
447
|
|
|
|
|
|
|
else |
448
|
0
|
|
|
|
|
|
warn("%s is a reference of some kind", str); |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
else |
451
|
|
|
|
|
|
|
{ |
452
|
0
|
|
|
|
|
|
warn("%s is not a reference", str); |
453
|
0
|
0
|
|
|
|
|
if (sv_isobject(object)) |
454
|
0
|
|
|
|
|
|
warn("however, %s is an object", str); |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
} |
458
|
0
|
|
|
|
|
|
warn("object dump"); |
459
|
0
|
|
|
|
|
|
Perl_sv_dump( aTHX_ object ); |
460
|
0
|
|
|
|
|
|
warn("object ref dump"); |
461
|
0
|
|
|
|
|
|
Perl_sv_dump( aTHX_ (SV*)SvRV(object) ); |
462
|
0
|
|
|
|
|
|
st_dump_sv( object ); |
463
|
0
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
static boolean |
466
|
622
|
|
|
|
|
|
st_is_ascii( SV* str ) { |
467
|
|
|
|
|
|
|
dTHX; |
468
|
|
|
|
|
|
|
STRLEN len; |
469
|
|
|
|
|
|
|
char *bytes; |
470
|
|
|
|
|
|
|
IV i; |
471
|
|
|
|
|
|
|
|
472
|
622
|
50
|
|
|
|
|
bytes = SvPV(str, len); |
473
|
622
|
|
|
|
|
|
return st_char_is_ascii((unsigned char*)bytes, len); |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
static boolean |
477
|
658
|
|
|
|
|
|
st_char_is_ascii( unsigned char* str, STRLEN len ) { |
478
|
|
|
|
|
|
|
dTHX; |
479
|
|
|
|
|
|
|
IV i; |
480
|
|
|
|
|
|
|
|
481
|
59379
|
100
|
|
|
|
|
for(i=0; i
|
482
|
58973
|
100
|
|
|
|
|
if (str[i] >= 0x80) { |
483
|
252
|
|
|
|
|
|
return 0; |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
} |
486
|
406
|
|
|
|
|
|
return 1; |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
/* SvRX does this in Perl >= 5.10 */ |
490
|
|
|
|
|
|
|
static REGEXP* |
491
|
5619
|
|
|
|
|
|
st_get_regex_from_sv( SV *regex_sv ) { |
492
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
REGEXP *rx; |
495
|
|
|
|
|
|
|
MAGIC *mg; |
496
|
5619
|
|
|
|
|
|
mg = NULL; |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
#if ((PERL_VERSION > 9) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) |
499
|
5619
|
|
|
|
|
|
rx = SvRX(regex_sv); |
500
|
|
|
|
|
|
|
#else |
501
|
|
|
|
|
|
|
/* extract regexp struct from qr// entity */ |
502
|
|
|
|
|
|
|
if (SvROK(regex_sv)) { |
503
|
|
|
|
|
|
|
SV *sv = SvRV(regex_sv); |
504
|
|
|
|
|
|
|
if (SvMAGICAL(sv)) |
505
|
|
|
|
|
|
|
mg = mg_find(sv, PERL_MAGIC_qr); |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
if (!mg) { |
508
|
|
|
|
|
|
|
st_describe_object(regex_sv); |
509
|
|
|
|
|
|
|
ST_CROAK("regex is not a qr// entity"); |
510
|
|
|
|
|
|
|
} |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
rx = (REGEXP*)mg->mg_obj; |
513
|
|
|
|
|
|
|
#endif |
514
|
|
|
|
|
|
|
|
515
|
5619
|
50
|
|
|
|
|
if (rx == NULL) { |
516
|
0
|
|
|
|
|
|
ST_CROAK("Failed to extract REGEXP from regex_sv %s", |
517
|
|
|
|
|
|
|
SvPV_nolen( regex_sv )); |
518
|
|
|
|
|
|
|
} |
519
|
|
|
|
|
|
|
|
520
|
5619
|
|
|
|
|
|
return rx; |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
static void |
524
|
5546
|
|
|
|
|
|
st_heat_seeker( st_token *token, SV *re ) { |
525
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
REGEXP *rx; |
528
|
|
|
|
|
|
|
char *buf, *str_end; |
529
|
|
|
|
|
|
|
|
530
|
5546
|
|
|
|
|
|
rx = st_get_regex_from_sv(re); |
531
|
5546
|
|
|
|
|
|
buf = SvPVX(token->str); |
532
|
5546
|
|
|
|
|
|
str_end = buf + token->len; |
533
|
|
|
|
|
|
|
|
534
|
5546
|
100
|
|
|
|
|
if ( pregexec(rx, buf, str_end, buf, 1, token->str, 1) ) { |
535
|
123
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
50
|
|
|
|
|
|
536
|
0
|
|
|
|
|
|
warn("st_heat_seeker: token is hot: %s", buf); |
537
|
|
|
|
|
|
|
} |
538
|
123
|
|
|
|
|
|
token->is_hot = 1; |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
|
541
|
5546
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
static AV* |
544
|
25
|
|
|
|
|
|
st_heat_seeker_offsets( SV *str, SV *re ) { |
545
|
|
|
|
|
|
|
dTHX; |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
REGEXP *rx; |
548
|
|
|
|
|
|
|
char *buf, *str_end, *str_start; |
549
|
|
|
|
|
|
|
STRLEN str_len; |
550
|
|
|
|
|
|
|
AV *offsets; |
551
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
552
|
|
|
|
|
|
|
regexp *r; |
553
|
|
|
|
|
|
|
#endif |
554
|
|
|
|
|
|
|
|
555
|
25
|
|
|
|
|
|
rx = st_get_regex_from_sv(re); |
556
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
557
|
25
|
|
|
|
|
|
r = (regexp*)SvANY(rx); |
558
|
|
|
|
|
|
|
#endif |
559
|
25
|
50
|
|
|
|
|
buf = SvPV(str, str_len); |
560
|
25
|
|
|
|
|
|
str_start = buf; |
561
|
25
|
|
|
|
|
|
str_end = buf + str_len; |
562
|
25
|
|
|
|
|
|
offsets = newAV(); |
563
|
|
|
|
|
|
|
|
564
|
121
|
100
|
|
|
|
|
while ( pregexec(rx, buf, str_end, buf, 1, str, 1) ) { |
565
|
|
|
|
|
|
|
const char *start_ptr, *end_ptr; |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
#if ((PERL_VERSION == 10) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) |
568
|
|
|
|
|
|
|
start_ptr = buf + rx->offs[0].start; |
569
|
|
|
|
|
|
|
end_ptr = buf + rx->offs[0].end; |
570
|
|
|
|
|
|
|
#elif (PERL_VERSION > 10) |
571
|
96
|
|
|
|
|
|
start_ptr = buf + r->offs[0].start; |
572
|
96
|
|
|
|
|
|
end_ptr = buf + r->offs[0].end; |
573
|
|
|
|
|
|
|
#else |
574
|
|
|
|
|
|
|
start_ptr = buf + rx->startp[0]; |
575
|
|
|
|
|
|
|
end_ptr = buf + rx->endp[0]; |
576
|
|
|
|
|
|
|
#endif |
577
|
|
|
|
|
|
|
/* advance the pointer */ |
578
|
96
|
|
|
|
|
|
buf = (char*)end_ptr; |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
//warn("got heat match at %ld", start_ptr - str_start); |
581
|
96
|
|
|
|
|
|
av_push(offsets, newSViv(start_ptr - str_start)); |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
|
585
|
25
|
|
|
|
|
|
return offsets; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
/* |
589
|
|
|
|
|
|
|
st_tokenize() et al based on KinoSearch::Analysis::Tokenizer |
590
|
|
|
|
|
|
|
by Marvin Humphrey. |
591
|
|
|
|
|
|
|
He dared go where no XS regex user had gone before... |
592
|
|
|
|
|
|
|
*/ |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
static SV* |
595
|
48
|
|
|
|
|
|
st_tokenize( SV* str, SV* token_re, SV* heat_seeker, I32 match_num ) { |
596
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
/* declare */ |
599
|
|
|
|
|
|
|
IV num_tokens, prev_sentence_start; |
600
|
|
|
|
|
|
|
REGEXP *rx; |
601
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
602
|
|
|
|
|
|
|
regexp *r; |
603
|
|
|
|
|
|
|
#endif |
604
|
|
|
|
|
|
|
char *buf, *str_start, *str_end, *token_str; |
605
|
|
|
|
|
|
|
STRLEN str_len; |
606
|
|
|
|
|
|
|
const char *prev_end, *prev_start; |
607
|
|
|
|
|
|
|
AV *tokens; |
608
|
|
|
|
|
|
|
AV *heat; |
609
|
|
|
|
|
|
|
AV *sentence_starts; /* list of sentence start points for hot tokens */ |
610
|
|
|
|
|
|
|
SV *tok; |
611
|
|
|
|
|
|
|
boolean heat_seeker_is_CV, inside_sentence, prev_was_abbrev; |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
/* initialize */ |
614
|
48
|
|
|
|
|
|
num_tokens = 0; |
615
|
48
|
|
|
|
|
|
rx = st_get_regex_from_sv(token_re); |
616
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
617
|
48
|
|
|
|
|
|
r = (regexp*)SvANY(rx); |
618
|
|
|
|
|
|
|
#endif |
619
|
48
|
50
|
|
|
|
|
buf = SvPV(str, str_len); |
620
|
48
|
|
|
|
|
|
str_start = buf; |
621
|
48
|
|
|
|
|
|
str_end = str_start + str_len; |
622
|
48
|
|
|
|
|
|
prev_start = str_start; |
623
|
48
|
|
|
|
|
|
prev_end = prev_start; |
624
|
48
|
|
|
|
|
|
tokens = newAV(); |
625
|
48
|
|
|
|
|
|
heat = newAV(); |
626
|
48
|
|
|
|
|
|
sentence_starts = newAV(); |
627
|
48
|
|
|
|
|
|
prev_sentence_start = 0; |
628
|
48
|
|
|
|
|
|
inside_sentence = 0; // assume we start with a sentence start |
629
|
48
|
|
|
|
|
|
heat_seeker_is_CV = 0; |
630
|
48
|
|
|
|
|
|
prev_was_abbrev = 0; |
631
|
48
|
100
|
|
|
|
|
if (heat_seeker != NULL && (SvTYPE(SvRV(heat_seeker))==SVt_PVCV)) { |
|
|
100
|
|
|
|
|
|
632
|
12
|
|
|
|
|
|
heat_seeker_is_CV = 1; |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
|
635
|
48
|
50
|
|
|
|
|
if (ST_DEBUG) { |
|
|
50
|
|
|
|
|
|
636
|
0
|
|
|
|
|
|
warn("tokenizing string %ld bytes long\n", str_len); |
637
|
|
|
|
|
|
|
} |
638
|
|
|
|
|
|
|
|
639
|
6281
|
100
|
|
|
|
|
while ( pregexec(rx, buf, str_end, buf, 1, str, 1) ) { |
640
|
|
|
|
|
|
|
const char *start_ptr, *end_ptr; |
641
|
|
|
|
|
|
|
st_token *token; |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
#if ((PERL_VERSION == 10) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) |
644
|
|
|
|
|
|
|
start_ptr = buf + rx->offs[match_num].start; |
645
|
|
|
|
|
|
|
end_ptr = buf + rx->offs[match_num].end; |
646
|
|
|
|
|
|
|
#elif (PERL_VERSION > 10) |
647
|
6233
|
|
|
|
|
|
start_ptr = buf + r->offs[match_num].start; |
648
|
6233
|
|
|
|
|
|
end_ptr = buf + r->offs[match_num].end; |
649
|
|
|
|
|
|
|
#else |
650
|
|
|
|
|
|
|
start_ptr = buf + rx->startp[match_num]; |
651
|
|
|
|
|
|
|
end_ptr = buf + rx->endp[match_num]; |
652
|
|
|
|
|
|
|
#endif |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
/* advance the pointers */ |
655
|
6233
|
|
|
|
|
|
buf = (char*)end_ptr; |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
/* create token for the bytes between the last match and this one |
658
|
|
|
|
|
|
|
* check first that we have moved past first byte |
659
|
|
|
|
|
|
|
* and that the regex has moved us forward at least one byte |
660
|
|
|
|
|
|
|
*/ |
661
|
6233
|
100
|
|
|
|
|
if (start_ptr != str_start && start_ptr != prev_end) { |
|
|
100
|
|
|
|
|
|
662
|
6181
|
|
|
|
|
|
token = st_new_token(num_tokens++, |
663
|
6181
|
|
|
|
|
|
(start_ptr - prev_end), |
664
|
6181
|
|
|
|
|
|
utf8_distance((U8*)start_ptr, (U8*)prev_end), |
665
|
|
|
|
|
|
|
prev_end, 0, 0); |
666
|
6181
|
50
|
|
|
|
|
token_str = SvPV_nolen(token->str); |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
/* TODO |
669
|
|
|
|
|
|
|
there is an edge case here where a token that ends a sentence |
670
|
|
|
|
|
|
|
(e.g. punctuation) also matches the start of the next sentence |
671
|
|
|
|
|
|
|
(e.g. more punctuation, inverted question mark). |
672
|
|
|
|
|
|
|
Need to split that into 2 tokens in order to distinguish |
673
|
|
|
|
|
|
|
the end and start |
674
|
|
|
|
|
|
|
*/ |
675
|
|
|
|
|
|
|
|
676
|
6181
|
100
|
|
|
|
|
if (!inside_sentence) { |
677
|
7
|
50
|
|
|
|
|
if (num_tokens == 1 |
678
|
0
|
0
|
|
|
|
|
|| |
679
|
0
|
|
|
|
|
|
st_looks_like_sentence_start((unsigned char*)token_str, token->len) |
680
|
|
|
|
|
|
|
) { |
681
|
7
|
|
|
|
|
|
token->is_sentence_start = 1; |
682
|
7
|
|
|
|
|
|
inside_sentence = 1; |
683
|
|
|
|
|
|
|
} |
684
|
|
|
|
|
|
|
} |
685
|
6174
|
100
|
|
|
|
|
else if (!prev_was_abbrev |
686
|
6075
|
100
|
|
|
|
|
&& |
687
|
6075
|
|
|
|
|
|
st_looks_like_sentence_end((unsigned char*)token_str, token->len) |
688
|
|
|
|
|
|
|
) { |
689
|
184
|
|
|
|
|
|
token->is_sentence_end = 1; |
690
|
184
|
|
|
|
|
|
inside_sentence = 0; |
691
|
|
|
|
|
|
|
} |
692
|
6181
|
50
|
|
|
|
|
if (st_is_abbreviation((unsigned char*)token_str, token->len)) { |
693
|
0
|
|
|
|
|
|
token->is_abbreviation = 1; |
694
|
0
|
|
|
|
|
|
prev_was_abbrev = 1; |
695
|
|
|
|
|
|
|
} |
696
|
|
|
|
|
|
|
else { |
697
|
6181
|
|
|
|
|
|
prev_was_abbrev = 0; |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
|
700
|
6181
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
50
|
|
|
|
|
|
701
|
0
|
|
|
|
|
|
warn("prev [%d] [%d] [%d] [%s] [%d] [%d]", |
702
|
|
|
|
|
|
|
token->pos, token->len, token->u8len, token_str, |
703
|
0
|
|
|
|
|
|
token->is_sentence_start, token->is_sentence_end); |
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
|
706
|
6181
|
|
|
|
|
|
tok = st_bless_ptr(ST_CLASS_TOKEN, token); |
707
|
6181
|
|
|
|
|
|
av_push(tokens, tok); |
708
|
6181
|
100
|
|
|
|
|
if (token->is_sentence_start) { |
709
|
|
|
|
|
|
|
//av_push(sentence_starts, newSViv(token->pos)); |
710
|
7
|
|
|
|
|
|
prev_sentence_start = token->pos; |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
/* create token object for the current match */ |
715
|
6233
|
|
|
|
|
|
token = st_new_token(num_tokens++, |
716
|
6233
|
|
|
|
|
|
(end_ptr - start_ptr), |
717
|
6233
|
|
|
|
|
|
utf8_distance((U8*)end_ptr, (U8*)start_ptr), |
718
|
|
|
|
|
|
|
start_ptr, |
719
|
|
|
|
|
|
|
0, 1); |
720
|
6233
|
50
|
|
|
|
|
token_str = SvPV_nolen(token->str); |
721
|
|
|
|
|
|
|
|
722
|
6233
|
100
|
|
|
|
|
if (!inside_sentence) { |
723
|
225
|
|
|
|
|
|
token->is_sentence_start = 1; |
724
|
225
|
|
|
|
|
|
inside_sentence = 1; |
725
|
225
|
|
|
|
|
|
prev_sentence_start = token->pos; |
726
|
|
|
|
|
|
|
} |
727
|
6008
|
50
|
|
|
|
|
else if (!prev_was_abbrev |
728
|
6008
|
50
|
|
|
|
|
&& |
729
|
6008
|
|
|
|
|
|
st_looks_like_sentence_end((unsigned char*)token_str, token->len) |
730
|
|
|
|
|
|
|
) { |
731
|
0
|
|
|
|
|
|
token->is_sentence_end = 1; |
732
|
0
|
|
|
|
|
|
inside_sentence = 0; |
733
|
|
|
|
|
|
|
} |
734
|
6233
|
100
|
|
|
|
|
if (st_is_abbreviation((unsigned char*)token_str, token->len)) { |
735
|
99
|
|
|
|
|
|
token->is_abbreviation = 1; |
736
|
99
|
|
|
|
|
|
prev_was_abbrev = 1; |
737
|
|
|
|
|
|
|
} |
738
|
|
|
|
|
|
|
else { |
739
|
6134
|
|
|
|
|
|
prev_was_abbrev = 0; |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
|
742
|
6233
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
50
|
|
|
|
|
|
743
|
0
|
|
|
|
|
|
warn("main [%d] [%d] [%d] [%s] [%d] [%d]", |
744
|
|
|
|
|
|
|
token->pos, token->len, token->u8len, token_str, |
745
|
0
|
|
|
|
|
|
token->is_sentence_start, token->is_sentence_end |
746
|
|
|
|
|
|
|
); |
747
|
|
|
|
|
|
|
} |
748
|
|
|
|
|
|
|
|
749
|
6233
|
|
|
|
|
|
tok = st_bless_ptr(ST_CLASS_TOKEN, token); |
750
|
6233
|
100
|
|
|
|
|
if (heat_seeker != NULL) { |
751
|
6109
|
100
|
|
|
|
|
if (heat_seeker_is_CV) { |
752
|
563
|
|
|
|
|
|
dSP; |
753
|
563
|
|
|
|
|
|
ENTER; |
754
|
563
|
|
|
|
|
|
SAVETMPS; |
755
|
563
|
50
|
|
|
|
|
PUSHMARK(SP); |
756
|
563
|
50
|
|
|
|
|
XPUSHs(tok); |
757
|
563
|
|
|
|
|
|
PUTBACK; |
758
|
563
|
50
|
|
|
|
|
if (call_sv(heat_seeker, G_SCALAR) != 1) { |
759
|
0
|
|
|
|
|
|
croak("Invalid return value from heat_seeker SUB -- should be single integer"); |
760
|
|
|
|
|
|
|
} |
761
|
563
|
|
|
|
|
|
SPAGAIN; |
762
|
563
|
50
|
|
|
|
|
token->is_hot = POPi; |
763
|
|
|
|
|
|
|
//warn("heat_seeker CV returned %d\n", token->is_hot); |
764
|
563
|
|
|
|
|
|
PUTBACK; |
765
|
563
|
50
|
|
|
|
|
FREETMPS; |
766
|
563
|
|
|
|
|
|
LEAVE; |
767
|
|
|
|
|
|
|
} |
768
|
|
|
|
|
|
|
else { |
769
|
5546
|
|
|
|
|
|
st_heat_seeker(token, heat_seeker); |
770
|
|
|
|
|
|
|
} |
771
|
|
|
|
|
|
|
} |
772
|
6233
|
|
|
|
|
|
av_push(tokens, tok); |
773
|
6233
|
100
|
|
|
|
|
if (token->is_sentence_start) { |
774
|
|
|
|
|
|
|
//av_push(sentence_starts, newSViv(token->pos)); |
775
|
225
|
|
|
|
|
|
prev_sentence_start = token->pos; |
776
|
|
|
|
|
|
|
} |
777
|
6233
|
100
|
|
|
|
|
if (token->is_hot) { |
778
|
192
|
|
|
|
|
|
av_push(heat, newSViv(token->pos)); |
779
|
192
|
50
|
|
|
|
|
if (ST_DEBUG) |
|
|
50
|
|
|
|
|
|
780
|
0
|
|
|
|
|
|
warn("%s: sentence_start = %ld for hot token at pos %ld\n", |
781
|
0
|
|
|
|
|
|
FUNCTION__, (unsigned long)prev_sentence_start, (unsigned long)token->pos); |
782
|
|
|
|
|
|
|
|
783
|
192
|
|
|
|
|
|
av_push(sentence_starts, newSViv(prev_sentence_start)); |
784
|
|
|
|
|
|
|
} |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
/* remember where we are for next time */ |
787
|
6233
|
|
|
|
|
|
prev_end = end_ptr; |
788
|
6233
|
|
|
|
|
|
prev_start = start_ptr; |
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
|
791
|
48
|
100
|
|
|
|
|
if (prev_end != str_end) { |
792
|
|
|
|
|
|
|
/* some bytes after the last match */ |
793
|
36
|
|
|
|
|
|
st_token *token = st_new_token(num_tokens++, |
794
|
36
|
|
|
|
|
|
(str_end - prev_end), |
795
|
36
|
|
|
|
|
|
utf8_distance((U8*)str_end, (U8*)prev_end), |
796
|
|
|
|
|
|
|
prev_end, |
797
|
|
|
|
|
|
|
0, 0); |
798
|
36
|
50
|
|
|
|
|
token_str = SvPV_nolen(token->str); |
799
|
36
|
50
|
|
|
|
|
if (st_looks_like_sentence_start((unsigned char*)token_str, token->len)) { |
800
|
0
|
|
|
|
|
|
token->is_sentence_start = 1; |
801
|
|
|
|
|
|
|
} |
802
|
36
|
100
|
|
|
|
|
else if (st_looks_like_sentence_end((unsigned char*)token_str, token->len)) { |
803
|
19
|
|
|
|
|
|
token->is_sentence_end = 1; |
804
|
|
|
|
|
|
|
} |
805
|
36
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
50
|
|
|
|
|
|
806
|
0
|
|
|
|
|
|
warn("tail: [%d] [%d] [%d] [%s] [%d] [%d]", |
807
|
|
|
|
|
|
|
token->pos, token->len, token->u8len, token_str, |
808
|
0
|
|
|
|
|
|
token->is_sentence_start, token->is_sentence_end |
809
|
|
|
|
|
|
|
); |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
|
812
|
36
|
|
|
|
|
|
tok = st_bless_ptr(ST_CLASS_TOKEN, token); |
813
|
36
|
|
|
|
|
|
av_push(tokens, tok); |
814
|
|
|
|
|
|
|
} |
815
|
|
|
|
|
|
|
|
816
|
48
|
|
|
|
|
|
return st_bless_ptr( |
817
|
|
|
|
|
|
|
ST_CLASS_TOKENLIST, |
818
|
48
|
|
|
|
|
|
st_new_token_list(tokens, heat, sentence_starts, num_tokens) |
819
|
|
|
|
|
|
|
); |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
static SV* |
823
|
5
|
|
|
|
|
|
st_find_bad_utf8( SV* str ) { |
824
|
|
|
|
|
|
|
dTHX; |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
STRLEN len; |
827
|
|
|
|
|
|
|
U8 *bytes; |
828
|
|
|
|
|
|
|
const U8 *pos; |
829
|
|
|
|
|
|
|
STRLEN *el; |
830
|
|
|
|
|
|
|
|
831
|
5
|
50
|
|
|
|
|
bytes = (U8*)SvPV(str, len); |
832
|
5
|
|
|
|
|
|
el = 0; |
833
|
5
|
100
|
|
|
|
|
if (is_utf8_string(bytes, len)) { |
834
|
3
|
|
|
|
|
|
return &PL_sv_undef; |
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
else { |
837
|
2
|
|
|
|
|
|
is_utf8_string_loclen(bytes, len, &pos, el); |
838
|
5
|
|
|
|
|
|
return newSVpvn((char*)pos, strlen((char*)pos)); |
839
|
|
|
|
|
|
|
} |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
/* lifted nearly verbatim from mod_perl */ |
843
|
6
|
|
|
|
|
|
static SV *st_escape_xml(char *s) { |
844
|
|
|
|
|
|
|
dTHX; |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
int i, j; |
847
|
|
|
|
|
|
|
SV *x; |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
/* first, count the number of extra characters */ |
850
|
119
|
100
|
|
|
|
|
for (i = 0, j = 0; s[i] != '\0'; i++) |
851
|
113
|
100
|
|
|
|
|
if (s[i] == '<' || s[i] == '>') |
|
|
100
|
|
|
|
|
|
852
|
6
|
|
|
|
|
|
j += 3; |
853
|
107
|
100
|
|
|
|
|
else if (s[i] == '&') |
854
|
3
|
|
|
|
|
|
j += 4; |
855
|
104
|
100
|
|
|
|
|
else if (s[i] == '"' || s[i] == '\'') |
|
|
50
|
|
|
|
|
|
856
|
1
|
|
|
|
|
|
j += 5; |
857
|
|
|
|
|
|
|
|
858
|
6
|
100
|
|
|
|
|
if (j == 0) |
859
|
4
|
|
|
|
|
|
return newSVpv(s,i); |
860
|
|
|
|
|
|
|
|
861
|
2
|
|
|
|
|
|
x = newSV(i + j + 1); |
862
|
|
|
|
|
|
|
|
863
|
52
|
100
|
|
|
|
|
for (i = 0, j = 0; s[i] != '\0'; i++, j++) |
864
|
50
|
100
|
|
|
|
|
if (s[i] == '<') { |
865
|
3
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], "<", 4); |
866
|
3
|
|
|
|
|
|
j += 3; |
867
|
|
|
|
|
|
|
} |
868
|
47
|
100
|
|
|
|
|
else if (s[i] == '>') { |
869
|
3
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], ">", 4); |
870
|
3
|
|
|
|
|
|
j += 3; |
871
|
|
|
|
|
|
|
} |
872
|
44
|
100
|
|
|
|
|
else if (s[i] == '&') { |
873
|
3
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], "&", 5); |
874
|
3
|
|
|
|
|
|
j += 4; |
875
|
|
|
|
|
|
|
} |
876
|
41
|
100
|
|
|
|
|
else if (s[i] == '"') { |
877
|
1
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], """, 6); |
878
|
1
|
|
|
|
|
|
j += 5; |
879
|
|
|
|
|
|
|
} |
880
|
40
|
50
|
|
|
|
|
else if (s[i] == '\'') { |
881
|
0
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], "'", 5); |
882
|
0
|
|
|
|
|
|
j += 4; |
883
|
|
|
|
|
|
|
} |
884
|
|
|
|
|
|
|
else |
885
|
40
|
|
|
|
|
|
SvPVX(x)[j] = s[i]; |
886
|
|
|
|
|
|
|
|
887
|
2
|
|
|
|
|
|
SvPVX(x)[j] = '\0'; |
888
|
2
|
|
|
|
|
|
SvCUR_set(x, j); |
889
|
2
|
|
|
|
|
|
SvPOK_on(x); |
890
|
2
|
|
|
|
|
|
return x; |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
/* returns the UCS32 value for a UTF8 string -- the character's Unicode value. |
894
|
|
|
|
|
|
|
see http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA |
895
|
|
|
|
|
|
|
*/ |
896
|
|
|
|
|
|
|
static IV |
897
|
0
|
|
|
|
|
|
st_utf8_codepoint( |
898
|
|
|
|
|
|
|
const unsigned char *utf8, |
899
|
|
|
|
|
|
|
IV len |
900
|
|
|
|
|
|
|
) |
901
|
|
|
|
|
|
|
{ |
902
|
|
|
|
|
|
|
dTHX; |
903
|
|
|
|
|
|
|
|
904
|
0
|
|
|
|
|
|
switch (len) { |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
case 1: |
907
|
0
|
|
|
|
|
|
return utf8[0]; |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
case 2: |
910
|
0
|
|
|
|
|
|
return (utf8[0] - 192) * 64 + utf8[1] - 128; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
case 3: |
913
|
0
|
|
|
|
|
|
return (utf8[0] - 224) * 4096 + (utf8[1] - 128) * 64 + utf8[2] - 128; |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
case 4: |
916
|
|
|
|
|
|
|
default: |
917
|
0
|
|
|
|
|
|
return (utf8[0] - 240) * 262144 + (utf8[1] - 128) * 4096 + (utf8[2] - 128) * 64 + |
918
|
0
|
|
|
|
|
|
utf8[3] - 128; |
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
} |
921
|
|
|
|
|
|
|
} |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
static IV |
924
|
36
|
|
|
|
|
|
st_looks_like_sentence_start(const unsigned char *ptr, IV len) |
925
|
|
|
|
|
|
|
{ |
926
|
|
|
|
|
|
|
dTHX; |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
I32 u8len, u32pt; |
929
|
|
|
|
|
|
|
|
930
|
36
|
50
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
50
|
|
|
|
|
|
931
|
0
|
|
|
|
|
|
warn("%s: >%s< %ld\n", FUNCTION__, ptr, len); |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
/* optimized for ASCII */ |
934
|
36
|
50
|
|
|
|
|
if (st_char_is_ascii((unsigned char*)ptr, len)) { |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
/* if the string is more than one byte long, |
937
|
|
|
|
|
|
|
make sure the second char is NOT UPPER |
938
|
|
|
|
|
|
|
since that is likely a false positive. |
939
|
|
|
|
|
|
|
*/ |
940
|
36
|
100
|
|
|
|
|
if (len > 1) { |
941
|
16
|
50
|
|
|
|
|
if (isUPPER(ptr[0]) && !isUPPER(ptr[1])) { |
|
|
0
|
|
|
|
|
|
942
|
0
|
|
|
|
|
|
return 1; |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
else { |
945
|
16
|
|
|
|
|
|
return 0; |
946
|
|
|
|
|
|
|
} |
947
|
|
|
|
|
|
|
} |
948
|
|
|
|
|
|
|
else { |
949
|
20
|
|
|
|
|
|
return isUPPER(ptr[0]); |
950
|
|
|
|
|
|
|
} |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
|
953
|
0
|
0
|
|
|
|
|
if (!len) { |
954
|
0
|
|
|
|
|
|
return 0; |
955
|
|
|
|
|
|
|
} |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
/* TODO if any char is UPPER in the string, consider it a start? */ |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
/* get first full UTF-8 char */ |
960
|
|
|
|
|
|
|
#if (PERL_VERSION >= 16) |
961
|
|
|
|
|
|
|
//warn("WE HAVE utf8_char_buf\n"); |
962
|
0
|
0
|
|
|
|
|
u8len = is_utf8_char_buf((const U8*)ptr, (const U8*)ptr+UTF8SKIP(ptr)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
963
|
|
|
|
|
|
|
#else |
964
|
|
|
|
|
|
|
//warn("WE HAVE utf8_char\n"); |
965
|
|
|
|
|
|
|
u8len = is_utf8_char((U8*)ptr); |
966
|
|
|
|
|
|
|
#endif |
967
|
|
|
|
|
|
|
|
968
|
0
|
0
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
0
|
|
|
|
|
|
969
|
0
|
|
|
|
|
|
warn("%s: %s is utf8 u8len %d\n", FUNCTION__, ptr, u8len); |
970
|
|
|
|
|
|
|
|
971
|
0
|
|
|
|
|
|
u32pt = st_utf8_codepoint(ptr, u8len); |
972
|
|
|
|
|
|
|
|
973
|
0
|
0
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
0
|
|
|
|
|
|
974
|
0
|
|
|
|
|
|
warn("%s: u32 code point %d\n", FUNCTION__, u32pt); |
975
|
|
|
|
|
|
|
|
976
|
0
|
0
|
|
|
|
|
if (iswupper((wint_t)u32pt)) { |
977
|
0
|
|
|
|
|
|
return 1; |
978
|
|
|
|
|
|
|
} |
979
|
0
|
0
|
|
|
|
|
if (u32pt == 191) { /* INVERTED QUESTION MARK */ |
980
|
0
|
|
|
|
|
|
return 1; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
/* TODO more here? */ |
984
|
|
|
|
|
|
|
|
985
|
0
|
|
|
|
|
|
return 0; |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
} |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
/* does any char in the string look like a sentence ending? */ |
990
|
|
|
|
|
|
|
static IV |
991
|
12119
|
|
|
|
|
|
st_looks_like_sentence_end(const unsigned char *ptr, IV len) |
992
|
|
|
|
|
|
|
{ |
993
|
|
|
|
|
|
|
dTHX; |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
IV i; |
996
|
12119
|
|
|
|
|
|
IV num_dots = 0; |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
/* right now this assumes ASCII sentence punctuation. |
999
|
|
|
|
|
|
|
* if we ever wanted utf8 support we'd need to iterate |
1000
|
|
|
|
|
|
|
* per-character instead of per byte. |
1001
|
|
|
|
|
|
|
*/ |
1002
|
|
|
|
|
|
|
|
1003
|
12119
|
50
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
50
|
|
|
|
|
|
1004
|
0
|
|
|
|
|
|
warn("%s: %s\n", FUNCTION__, ptr); |
1005
|
|
|
|
|
|
|
|
1006
|
59987
|
100
|
|
|
|
|
for (i=0; i
|
1007
|
47914
|
|
|
|
|
|
switch (ptr[i]) { |
1008
|
|
|
|
|
|
|
case '.': |
1009
|
|
|
|
|
|
|
/* if abbrev like e.g. U.S.A. then check before and after */ |
1010
|
165
|
|
|
|
|
|
num_dots++; |
1011
|
165
|
|
|
|
|
|
break; |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
case '?': |
1014
|
15
|
|
|
|
|
|
return 1; |
1015
|
|
|
|
|
|
|
break; |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
case '!': |
1018
|
31
|
|
|
|
|
|
return 1; |
1019
|
|
|
|
|
|
|
break; |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
default: |
1022
|
47703
|
|
|
|
|
|
continue; |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
} |
1025
|
|
|
|
|
|
|
} |
1026
|
12073
|
100
|
|
|
|
|
if (num_dots > 1 && num_dots < len) { |
|
|
50
|
|
|
|
|
|
1027
|
3
|
|
|
|
|
|
return 0; |
1028
|
|
|
|
|
|
|
} |
1029
|
12070
|
100
|
|
|
|
|
else if (num_dots == 1) { |
1030
|
157
|
|
|
|
|
|
return 1; |
1031
|
|
|
|
|
|
|
} |
1032
|
11913
|
|
|
|
|
|
return 0; |
1033
|
|
|
|
|
|
|
} |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
static U8* |
1036
|
2996
|
|
|
|
|
|
st_string_to_lower(const unsigned char *ptr, IV len) |
1037
|
|
|
|
|
|
|
{ |
1038
|
|
|
|
|
|
|
dTHX; |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
U8 *lc, *d; |
1041
|
2996
|
|
|
|
|
|
U8 *s = (U8*)ptr; |
1042
|
2996
|
|
|
|
|
|
const U8 *const send = s + len; |
1043
|
|
|
|
|
|
|
U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; |
1044
|
2996
|
|
|
|
|
|
lc = st_malloc((UTF8_MAXBYTES_CASE*len)+1); |
1045
|
2996
|
|
|
|
|
|
d = lc; |
1046
|
12646
|
100
|
|
|
|
|
while (s < send) { |
1047
|
9650
|
|
|
|
|
|
const STRLEN u = UTF8SKIP(s); |
1048
|
|
|
|
|
|
|
STRLEN ulen; |
1049
|
|
|
|
|
|
|
#if ((PERL_VERSION > 24) || (PERL_VERSION == 26 && PERL_SUBVERSION >= 5)) |
1050
|
9650
|
|
|
|
|
|
const UV uv = toLOWER_utf8_safe(s, send, tmpbuf, &ulen); |
1051
|
|
|
|
|
|
|
#else |
1052
|
|
|
|
|
|
|
const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); |
1053
|
|
|
|
|
|
|
#endif |
1054
|
9650
|
|
|
|
|
|
Copy(tmpbuf, lc, ulen, U8); |
1055
|
9650
|
|
|
|
|
|
lc += ulen; |
1056
|
9650
|
|
|
|
|
|
s += u; |
1057
|
|
|
|
|
|
|
} |
1058
|
2996
|
|
|
|
|
|
*lc = '\0'; |
1059
|
2996
|
|
|
|
|
|
return d; |
1060
|
|
|
|
|
|
|
} |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
static IV |
1063
|
12414
|
|
|
|
|
|
st_is_abbreviation(const unsigned char *ptr, IV len) |
1064
|
|
|
|
|
|
|
{ |
1065
|
|
|
|
|
|
|
dTHX; |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
IV i; |
1068
|
|
|
|
|
|
|
unsigned char *ptr_lc; |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
/* only consider strings of abbreviation-like length */ |
1071
|
12414
|
100
|
|
|
|
|
if (len < 2 || len > 5) { |
|
|
100
|
|
|
|
|
|
1072
|
9418
|
|
|
|
|
|
return 0; |
1073
|
|
|
|
|
|
|
} |
1074
|
|
|
|
|
|
|
|
1075
|
2996
|
100
|
|
|
|
|
if (ST_ABBREVS == NULL) { |
1076
|
|
|
|
|
|
|
//warn("ST_ABBREVS not yet built\n"); |
1077
|
17
|
|
|
|
|
|
i = 0; |
1078
|
17
|
|
|
|
|
|
ST_ABBREVS = newHV(); |
1079
|
2414
|
100
|
|
|
|
|
while(en_abbrevs[i] != NULL) { |
1080
|
2397
|
|
|
|
|
|
st_hv_store_int( ST_ABBREVS, en_abbrevs[i], i); |
1081
|
2397
|
|
|
|
|
|
i++; |
1082
|
|
|
|
|
|
|
} |
1083
|
|
|
|
|
|
|
} |
1084
|
2996
|
|
|
|
|
|
ptr_lc = (unsigned char*)st_string_to_lower(ptr, len); |
1085
|
|
|
|
|
|
|
//warn("ptr=%s ptr_lc=%s\n", ptr, ptr_lc); |
1086
|
2996
|
|
|
|
|
|
i = hv_fetch(ST_ABBREVS, (const char *)ptr_lc, len, 0) ? 1 : 0; |
1087
|
2996
|
|
|
|
|
|
free(ptr_lc); |
1088
|
2996
|
|
|
|
|
|
return i; |
1089
|
|
|
|
|
|
|
} |