| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* Copyright 2009 Peter Karman |
|
2
|
|
|
|
|
|
|
* |
|
3
|
|
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify |
|
4
|
|
|
|
|
|
|
* under the same terms as Perl itself. |
|
5
|
|
|
|
|
|
|
*/ |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
/* |
|
8
|
|
|
|
|
|
|
* Search::Tools C helpers |
|
9
|
|
|
|
|
|
|
*/ |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#include |
|
12
|
|
|
|
|
|
|
#include "search-tools.h" |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
/* global vars */ |
|
15
|
|
|
|
|
|
|
static HV* ST_ABBREVS = NULL; |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
/* perl versions < 5.8.8 do not have this */ |
|
18
|
|
|
|
|
|
|
#ifndef is_utf8_string_loclen |
|
19
|
|
|
|
|
|
|
bool |
|
20
|
|
|
|
|
|
|
is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) |
|
21
|
|
|
|
|
|
|
{ |
|
22
|
|
|
|
|
|
|
dTHX; |
|
23
|
|
|
|
|
|
|
const U8* x = s; |
|
24
|
|
|
|
|
|
|
const U8* send; |
|
25
|
|
|
|
|
|
|
STRLEN c; |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
if (!len) |
|
28
|
|
|
|
|
|
|
len = strlen((const char *)s); |
|
29
|
|
|
|
|
|
|
send = s + len; |
|
30
|
|
|
|
|
|
|
if (el) |
|
31
|
|
|
|
|
|
|
*el = 0; |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
while (x < send) { |
|
34
|
|
|
|
|
|
|
/* Inline the easy bits of is_utf8_char() here for speed... */ |
|
35
|
|
|
|
|
|
|
if (UTF8_IS_INVARIANT(*x)) |
|
36
|
|
|
|
|
|
|
c = 1; |
|
37
|
|
|
|
|
|
|
else if (!UTF8_IS_START(*x)) |
|
38
|
|
|
|
|
|
|
goto out; |
|
39
|
|
|
|
|
|
|
else { |
|
40
|
|
|
|
|
|
|
/* ... and call is_utf8_char() only if really needed. */ |
|
41
|
|
|
|
|
|
|
#ifdef IS_UTF8_CHAR |
|
42
|
|
|
|
|
|
|
c = UTF8SKIP(x); |
|
43
|
|
|
|
|
|
|
if (IS_UTF8_CHAR_FAST(c)) { |
|
44
|
|
|
|
|
|
|
if (!IS_UTF8_CHAR(x, c)) |
|
45
|
|
|
|
|
|
|
c = 0; |
|
46
|
|
|
|
|
|
|
} else |
|
47
|
|
|
|
|
|
|
c = is_utf8_char_slow(x, c); |
|
48
|
|
|
|
|
|
|
#else |
|
49
|
|
|
|
|
|
|
c = is_utf8_char(x); |
|
50
|
|
|
|
|
|
|
#endif /* #ifdef IS_UTF8_CHAR */ |
|
51
|
|
|
|
|
|
|
if (!c) |
|
52
|
|
|
|
|
|
|
goto out; |
|
53
|
|
|
|
|
|
|
} |
|
54
|
|
|
|
|
|
|
x += c; |
|
55
|
|
|
|
|
|
|
if (el) |
|
56
|
|
|
|
|
|
|
(*el)++; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
out: |
|
60
|
|
|
|
|
|
|
if (ep) |
|
61
|
|
|
|
|
|
|
*ep = x; |
|
62
|
|
|
|
|
|
|
if (x != send) |
|
63
|
|
|
|
|
|
|
return FALSE; |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
return TRUE; |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
#endif |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
static SV* |
|
72
|
2397
|
|
|
|
|
|
st_hv_store( HV* h, const char* key, SV* val) { |
|
73
|
|
|
|
|
|
|
dTHX; |
|
74
|
|
|
|
|
|
|
SV** ok; |
|
75
|
2397
|
|
|
|
|
|
ok = hv_store(h, key, strlen(key), SvREFCNT_inc(val), 0); |
|
76
|
2397
|
50
|
|
|
|
|
if (ok == NULL) { |
|
77
|
0
|
|
|
|
|
|
ST_CROAK("failed to store %s in hash", key); |
|
78
|
|
|
|
|
|
|
} |
|
79
|
2397
|
|
|
|
|
|
return *ok; |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
static SV* |
|
83
|
0
|
|
|
|
|
|
st_hv_store_char( HV* h, const char *key, char *val) { |
|
84
|
|
|
|
|
|
|
dTHX; |
|
85
|
|
|
|
|
|
|
SV *value; |
|
86
|
0
|
|
|
|
|
|
value = newSVpv(val, 0); |
|
87
|
0
|
|
|
|
|
|
st_hv_store( h, key, value ); |
|
88
|
0
|
|
|
|
|
|
SvREFCNT_dec(value); |
|
89
|
0
|
|
|
|
|
|
return value; |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
static SV* |
|
93
|
2397
|
|
|
|
|
|
st_hv_store_int( HV* h, const char* key, int i) { |
|
94
|
|
|
|
|
|
|
dTHX; |
|
95
|
|
|
|
|
|
|
SV *value; |
|
96
|
2397
|
|
|
|
|
|
value = newSViv(i); |
|
97
|
2397
|
|
|
|
|
|
st_hv_store( h, key, value ); |
|
98
|
2397
|
|
|
|
|
|
SvREFCNT_dec(value); |
|
99
|
2397
|
|
|
|
|
|
return value; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
/* UNUSED |
|
103
|
|
|
|
|
|
|
static SV* |
|
104
|
|
|
|
|
|
|
st_hvref_store( SV* h, const char* key, SV* val) { |
|
105
|
|
|
|
|
|
|
dTHX; |
|
106
|
|
|
|
|
|
|
return st_hv_store( (HV*)SvRV(h), key, val ); |
|
107
|
|
|
|
|
|
|
} |
|
108
|
|
|
|
|
|
|
*/ |
|
109
|
|
|
|
|
|
|
/* UNUSED |
|
110
|
|
|
|
|
|
|
static SV* |
|
111
|
|
|
|
|
|
|
st_hvref_store_char( SV* h, const char* key, char *val) { |
|
112
|
|
|
|
|
|
|
dTHX; |
|
113
|
|
|
|
|
|
|
return st_hv_store_char( (HV*)SvRV(h), key, val ); |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
*/ |
|
116
|
|
|
|
|
|
|
/* UNUSED |
|
117
|
|
|
|
|
|
|
static SV* |
|
118
|
|
|
|
|
|
|
st_hvref_store_int( SV* h, const char* key, int i) { |
|
119
|
|
|
|
|
|
|
dTHX; |
|
120
|
|
|
|
|
|
|
return st_hv_store_int( (HV*)SvRV(h), key, i ); |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
*/ |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
static SV* |
|
125
|
6629
|
|
|
|
|
|
st_av_fetch( AV* a, I32 index ) { |
|
126
|
|
|
|
|
|
|
dTHX; |
|
127
|
|
|
|
|
|
|
SV** ok; |
|
128
|
6629
|
|
|
|
|
|
ok = av_fetch(a, index, 0); |
|
129
|
6629
|
50
|
|
|
|
|
if (ok == NULL) { |
|
130
|
0
|
|
|
|
|
|
ST_CROAK("failed to fetch index %d", index); |
|
131
|
|
|
|
|
|
|
} |
|
132
|
6629
|
|
|
|
|
|
return *ok; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
static void * |
|
136
|
0
|
|
|
|
|
|
st_av_fetch_ptr( AV* a, I32 index ) { |
|
137
|
|
|
|
|
|
|
dTHX; |
|
138
|
|
|
|
|
|
|
SV** ok; |
|
139
|
|
|
|
|
|
|
void * ptr; |
|
140
|
0
|
|
|
|
|
|
ok = av_fetch(a, index, 0); |
|
141
|
0
|
0
|
|
|
|
|
if (ok == NULL) { |
|
142
|
0
|
|
|
|
|
|
ST_CROAK("failed to fetch index %d", index); |
|
143
|
|
|
|
|
|
|
} |
|
144
|
0
|
|
|
|
|
|
ptr = st_extract_ptr(*ok); |
|
145
|
|
|
|
|
|
|
//warn("%s refcnt == %d", SvPV_nolen(*ok), SvREFCNT(*ok)); |
|
146
|
0
|
|
|
|
|
|
return ptr; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
/* fetch SV* from hash */ |
|
150
|
|
|
|
|
|
|
static SV* |
|
151
|
48
|
|
|
|
|
|
st_hv_fetch( HV* h, const char* key ) { |
|
152
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
|
153
|
|
|
|
|
|
|
SV** ok; |
|
154
|
48
|
|
|
|
|
|
ok = hv_fetch(h, key, strlen(key), 0); |
|
155
|
48
|
50
|
|
|
|
|
if (ok == NULL) { |
|
156
|
0
|
|
|
|
|
|
ST_CROAK("failed to fetch %s", key); |
|
157
|
|
|
|
|
|
|
} |
|
158
|
48
|
|
|
|
|
|
return *ok; |
|
159
|
|
|
|
|
|
|
} |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
static SV* |
|
162
|
48
|
|
|
|
|
|
st_hvref_fetch( SV* h, const char* key ) { |
|
163
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
|
164
|
48
|
|
|
|
|
|
return st_hv_fetch((HV*)SvRV(h), key); |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
/* UNUSED |
|
168
|
|
|
|
|
|
|
static char* |
|
169
|
|
|
|
|
|
|
st_hv_fetch_as_char( HV* h, const char* key ) { |
|
170
|
|
|
|
|
|
|
dTHX; |
|
171
|
|
|
|
|
|
|
SV** ok; |
|
172
|
|
|
|
|
|
|
ok = hv_fetch(h, key, strlen(key), 0); |
|
173
|
|
|
|
|
|
|
if (ok == NULL) { |
|
174
|
|
|
|
|
|
|
ST_CROAK("failed to fetch %s from hash", key); |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
return SvPV_nolen((SV*)*ok); |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
*/ |
|
179
|
|
|
|
|
|
|
/* UNUSED |
|
180
|
|
|
|
|
|
|
static char* |
|
181
|
|
|
|
|
|
|
st_hvref_fetch_as_char( SV* h, const char* key ) { |
|
182
|
|
|
|
|
|
|
dTHX; |
|
183
|
|
|
|
|
|
|
return st_hv_fetch_as_char( (HV*)SvRV(h), key ); |
|
184
|
|
|
|
|
|
|
} |
|
185
|
|
|
|
|
|
|
*/ |
|
186
|
|
|
|
|
|
|
/* UNUSED |
|
187
|
|
|
|
|
|
|
static IV |
|
188
|
|
|
|
|
|
|
st_hvref_fetch_as_int( SV* h, const char* key ) { |
|
189
|
|
|
|
|
|
|
dTHX; |
|
190
|
|
|
|
|
|
|
SV* val; |
|
191
|
|
|
|
|
|
|
IV i; |
|
192
|
|
|
|
|
|
|
val = st_hv_fetch( (HV*)SvRV(h), key ); |
|
193
|
|
|
|
|
|
|
i = SvIV(val); |
|
194
|
|
|
|
|
|
|
return i; |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
*/ |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
void * |
|
199
|
15494
|
|
|
|
|
|
st_malloc(size_t size) { |
|
200
|
|
|
|
|
|
|
dTHX; |
|
201
|
|
|
|
|
|
|
void *ptr; |
|
202
|
15494
|
|
|
|
|
|
ptr = malloc(size); |
|
203
|
15494
|
50
|
|
|
|
|
if (ptr == NULL) { |
|
204
|
0
|
|
|
|
|
|
ST_CROAK("Out of memory! Can't malloc %lu bytes", |
|
205
|
|
|
|
|
|
|
(unsigned long)size); |
|
206
|
|
|
|
|
|
|
} |
|
207
|
15494
|
|
|
|
|
|
return ptr; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
static st_token* |
|
212
|
12450
|
|
|
|
|
|
st_new_token( |
|
213
|
|
|
|
|
|
|
I32 pos, |
|
214
|
|
|
|
|
|
|
I32 len, |
|
215
|
|
|
|
|
|
|
I32 u8len, |
|
216
|
|
|
|
|
|
|
const char *ptr, |
|
217
|
|
|
|
|
|
|
I32 is_hot, |
|
218
|
|
|
|
|
|
|
boolean is_match |
|
219
|
|
|
|
|
|
|
) { |
|
220
|
|
|
|
|
|
|
dTHX; |
|
221
|
|
|
|
|
|
|
st_token *tok; |
|
222
|
|
|
|
|
|
|
|
|
223
|
12450
|
50
|
|
|
|
|
if (!len) { |
|
224
|
0
|
|
|
|
|
|
ST_CROAK("cannot create token with zero length: '%s'", ptr); |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
12450
|
|
|
|
|
|
tok = st_malloc(sizeof(st_token)); |
|
228
|
12450
|
|
|
|
|
|
tok->pos = pos; |
|
229
|
12450
|
|
|
|
|
|
tok->len = len; |
|
230
|
12450
|
|
|
|
|
|
tok->u8len = u8len; |
|
231
|
12450
|
|
|
|
|
|
tok->is_hot = is_hot; |
|
232
|
12450
|
|
|
|
|
|
tok->is_match = is_match; |
|
233
|
12450
|
|
|
|
|
|
tok->is_sentence_start = 0; |
|
234
|
12450
|
|
|
|
|
|
tok->is_sentence_end = 0; |
|
235
|
12450
|
|
|
|
|
|
tok->is_abbreviation = 0; |
|
236
|
12450
|
|
|
|
|
|
tok->str = newSVpvn(ptr, len); /* newSVpvn_utf8 not available in some perls? */ |
|
237
|
12450
|
|
|
|
|
|
SvUTF8_on(tok->str); |
|
238
|
12450
|
|
|
|
|
|
tok->ref_cnt = 1; |
|
239
|
12450
|
|
|
|
|
|
return tok; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
static st_token_list* |
|
243
|
48
|
|
|
|
|
|
st_new_token_list( |
|
244
|
|
|
|
|
|
|
AV *tokens, |
|
245
|
|
|
|
|
|
|
AV *heat, |
|
246
|
|
|
|
|
|
|
AV *sentence_starts, |
|
247
|
|
|
|
|
|
|
unsigned int num |
|
248
|
|
|
|
|
|
|
) { |
|
249
|
|
|
|
|
|
|
dTHX; |
|
250
|
|
|
|
|
|
|
st_token_list *tl; |
|
251
|
48
|
|
|
|
|
|
tl = st_malloc(sizeof(st_token_list)); |
|
252
|
48
|
|
|
|
|
|
tl->pos = 0; |
|
253
|
48
|
|
|
|
|
|
tl->tokens = tokens; |
|
254
|
48
|
|
|
|
|
|
tl->heat = heat; |
|
255
|
48
|
|
|
|
|
|
tl->sentence_starts = sentence_starts; |
|
256
|
48
|
|
|
|
|
|
tl->num = (IV)num; |
|
257
|
48
|
|
|
|
|
|
tl->ref_cnt = 1; |
|
258
|
48
|
|
|
|
|
|
return tl; |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
static void |
|
262
|
12450
|
|
|
|
|
|
st_free_token(st_token *tok) { |
|
263
|
|
|
|
|
|
|
dTHX; |
|
264
|
12450
|
50
|
|
|
|
|
if (tok->ref_cnt != 0) { |
|
265
|
0
|
|
|
|
|
|
ST_CROAK("Won't free token 0x%x with ref_cnt != 0 [%d]", |
|
266
|
|
|
|
|
|
|
tok, tok->ref_cnt); |
|
267
|
|
|
|
|
|
|
} |
|
268
|
12450
|
|
|
|
|
|
SvREFCNT_dec(tok->str); |
|
269
|
12450
|
|
|
|
|
|
free(tok); |
|
270
|
12450
|
|
|
|
|
|
} |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
static void |
|
273
|
48
|
|
|
|
|
|
st_free_token_list(st_token_list *token_list) { |
|
274
|
|
|
|
|
|
|
dTHX; |
|
275
|
48
|
50
|
|
|
|
|
if (token_list->ref_cnt != 0) { |
|
276
|
0
|
|
|
|
|
|
ST_CROAK("Won't free token_list 0x%x with ref_cnt > 0 [%d]", |
|
277
|
|
|
|
|
|
|
token_list, token_list->ref_cnt); |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
//warn("about to free st_token_list C struct\n"); |
|
281
|
|
|
|
|
|
|
//st_dump_token_list(token_list); |
|
282
|
|
|
|
|
|
|
|
|
283
|
48
|
|
|
|
|
|
SvREFCNT_dec(token_list->tokens); |
|
284
|
48
|
50
|
|
|
|
|
if (SvREFCNT(token_list->tokens)) { |
|
285
|
0
|
|
|
|
|
|
warn("Warning: possible memory leak for token_list->tokens 0x%lx with REFCNT %d\n", |
|
286
|
0
|
|
|
|
|
|
(unsigned long)token_list->tokens, SvREFCNT(token_list->tokens)); |
|
287
|
|
|
|
|
|
|
} |
|
288
|
|
|
|
|
|
|
|
|
289
|
48
|
|
|
|
|
|
SvREFCNT_dec(token_list->heat); |
|
290
|
48
|
50
|
|
|
|
|
if (SvREFCNT(token_list->heat)) { |
|
291
|
0
|
|
|
|
|
|
warn("Warning: possible memory leak for token_list->heat 0x%lx with REFCNT %d\n", |
|
292
|
0
|
|
|
|
|
|
(unsigned long)token_list->heat, SvREFCNT(token_list->heat)); |
|
293
|
|
|
|
|
|
|
} |
|
294
|
|
|
|
|
|
|
|
|
295
|
48
|
|
|
|
|
|
SvREFCNT_dec(token_list->sentence_starts); |
|
296
|
48
|
50
|
|
|
|
|
if (SvREFCNT(token_list->sentence_starts)) { |
|
297
|
0
|
|
|
|
|
|
warn("Warning: possible memory leak for token_list->sentence_starts 0x%lx with REFCNT %d\n", |
|
298
|
0
|
|
|
|
|
|
(unsigned long)token_list->sentence_starts, SvREFCNT(token_list->sentence_starts)); |
|
299
|
|
|
|
|
|
|
} |
|
300
|
|
|
|
|
|
|
|
|
301
|
48
|
|
|
|
|
|
free(token_list); |
|
302
|
48
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
static void |
|
305
|
0
|
|
|
|
|
|
st_dump_token_list(st_token_list *tl) { |
|
306
|
|
|
|
|
|
|
dTHX; |
|
307
|
|
|
|
|
|
|
IV len, pos; |
|
308
|
|
|
|
|
|
|
SV* tok; |
|
309
|
0
|
|
|
|
|
|
len = av_len(tl->tokens); |
|
310
|
0
|
|
|
|
|
|
pos = 0; |
|
311
|
0
|
|
|
|
|
|
warn("TokenList 0x%lx", (unsigned long)tl); |
|
312
|
0
|
|
|
|
|
|
warn(" pos = %ld\n", (unsigned long)tl->pos); |
|
313
|
0
|
|
|
|
|
|
warn(" len = %ld\n", (unsigned long)len + 1); |
|
314
|
0
|
|
|
|
|
|
warn(" num = %ld\n", (unsigned long)tl->num); |
|
315
|
0
|
|
|
|
|
|
warn(" ref_cnt = %ld\n", (unsigned long)tl->ref_cnt); |
|
316
|
0
|
|
|
|
|
|
warn(" tokens REFCNT = %ld\n", (unsigned long)SvREFCNT(tl->tokens)); |
|
317
|
0
|
|
|
|
|
|
warn(" heat REFCNT = %ld\n", (unsigned long)SvREFCNT(tl->heat)); |
|
318
|
0
|
|
|
|
|
|
warn(" sen_starts REFCNT = %ld\n", (unsigned long)SvREFCNT(tl->sentence_starts)); |
|
319
|
0
|
0
|
|
|
|
|
while (pos < len) { |
|
320
|
0
|
|
|
|
|
|
tok = st_av_fetch(tl->tokens, pos++); |
|
321
|
0
|
|
|
|
|
|
warn(" Token REFCNT = %ld\n", (unsigned long)SvREFCNT(tok)); |
|
322
|
0
|
|
|
|
|
|
st_dump_token((st_token*)st_extract_ptr(tok)); |
|
323
|
|
|
|
|
|
|
} |
|
324
|
0
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
static void |
|
327
|
0
|
|
|
|
|
|
st_dump_token(st_token *tok) { |
|
328
|
|
|
|
|
|
|
dTHX; |
|
329
|
0
|
|
|
|
|
|
warn("Token 0x%lx", (unsigned long)tok); |
|
330
|
0
|
0
|
|
|
|
|
warn(" str = '%s'\n", SvPV_nolen(tok->str)); |
|
331
|
0
|
|
|
|
|
|
warn(" pos = %ld\n", (unsigned long)tok->pos); |
|
332
|
0
|
|
|
|
|
|
warn(" len = %ld\n", (unsigned long)tok->len); |
|
333
|
0
|
|
|
|
|
|
warn(" u8len = %ld\n", (unsigned long)tok->u8len); |
|
334
|
0
|
|
|
|
|
|
warn(" is_match = %d\n", tok->is_match); |
|
335
|
0
|
|
|
|
|
|
warn(" is_sentence_start = %d\n", tok->is_sentence_start); |
|
336
|
0
|
|
|
|
|
|
warn(" is_sentence_end = %d\n", tok->is_sentence_end); |
|
337
|
0
|
|
|
|
|
|
warn(" is_abbreviation = %d\n", tok->is_abbreviation); |
|
338
|
0
|
|
|
|
|
|
warn(" is_hot = %d\n", tok->is_hot); |
|
339
|
0
|
|
|
|
|
|
warn(" ref_cnt = %ld\n", (unsigned long)tok->ref_cnt); |
|
340
|
0
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
/* make a Perl blessed object from a C pointer */ |
|
343
|
|
|
|
|
|
|
static SV* |
|
344
|
12498
|
|
|
|
|
|
st_bless_ptr( const char *class, void * c_ptr ) { |
|
345
|
|
|
|
|
|
|
dTHX; |
|
346
|
12498
|
|
|
|
|
|
SV* obj = newSViv( PTR2IV( c_ptr ) ); // use instead of sv_newmortal(). |
|
347
|
12498
|
|
|
|
|
|
sv_setref_pv(obj, class, c_ptr); |
|
348
|
12498
|
|
|
|
|
|
return obj; |
|
349
|
|
|
|
|
|
|
} |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
/* return the C pointer from a Perl blessed O_OBJECT */ |
|
352
|
|
|
|
|
|
|
static void * |
|
353
|
12691
|
|
|
|
|
|
st_extract_ptr( SV* object ) { |
|
354
|
|
|
|
|
|
|
dTHX; |
|
355
|
12691
|
50
|
|
|
|
|
return INT2PTR( void*, SvIV(SvRV( object )) ); |
|
356
|
|
|
|
|
|
|
} |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
static void |
|
359
|
0
|
|
|
|
|
|
st_croak( |
|
360
|
|
|
|
|
|
|
const char *file, |
|
361
|
|
|
|
|
|
|
int line, |
|
362
|
|
|
|
|
|
|
const char *func, |
|
363
|
|
|
|
|
|
|
const char *msgfmt, |
|
364
|
|
|
|
|
|
|
... |
|
365
|
|
|
|
|
|
|
) |
|
366
|
|
|
|
|
|
|
{ |
|
367
|
|
|
|
|
|
|
dTHX; |
|
368
|
|
|
|
|
|
|
va_list args; |
|
369
|
0
|
|
|
|
|
|
va_start(args, msgfmt); |
|
370
|
0
|
|
|
|
|
|
warn("Search::Tools error at %s:%d %s: ", file, line, func); |
|
371
|
|
|
|
|
|
|
//warn(msgfmt, args); |
|
372
|
0
|
|
|
|
|
|
croak(msgfmt, args); |
|
373
|
|
|
|
|
|
|
/* NEVER REACH HERE */ |
|
374
|
|
|
|
|
|
|
va_end(args); |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
/* UNUSED |
|
378
|
|
|
|
|
|
|
static SV* |
|
379
|
|
|
|
|
|
|
st_new_hash_object(const char *class) { |
|
380
|
|
|
|
|
|
|
dTHX; |
|
381
|
|
|
|
|
|
|
HV *hash; |
|
382
|
|
|
|
|
|
|
SV *object; |
|
383
|
|
|
|
|
|
|
hash = newHV(); |
|
384
|
|
|
|
|
|
|
object = sv_bless( newRV((SV*)hash), gv_stashpv(class,0) ); |
|
385
|
|
|
|
|
|
|
return object; |
|
386
|
|
|
|
|
|
|
} |
|
387
|
|
|
|
|
|
|
*/ |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
static void |
|
390
|
0
|
|
|
|
|
|
st_dump_sv(SV* ref) { |
|
391
|
|
|
|
|
|
|
dTHX; |
|
392
|
|
|
|
|
|
|
HV* hash; |
|
393
|
|
|
|
|
|
|
HE* hash_entry; |
|
394
|
|
|
|
|
|
|
AV* array; |
|
395
|
|
|
|
|
|
|
int num_keys, i, pos, len; |
|
396
|
|
|
|
|
|
|
SV* sv_key; |
|
397
|
|
|
|
|
|
|
SV* sv_val; |
|
398
|
|
|
|
|
|
|
int refcnt; |
|
399
|
|
|
|
|
|
|
|
|
400
|
0
|
|
|
|
|
|
pos = 0; |
|
401
|
0
|
|
|
|
|
|
i = 0; |
|
402
|
0
|
|
|
|
|
|
len = 0; |
|
403
|
|
|
|
|
|
|
|
|
404
|
0
|
0
|
|
|
|
|
if (SvTYPE(SvRV(ref))==SVt_PVHV) { |
|
405
|
0
|
|
|
|
|
|
warn("SV is a hash reference"); |
|
406
|
0
|
|
|
|
|
|
hash = (HV*)SvRV(ref); |
|
407
|
0
|
|
|
|
|
|
num_keys = hv_iterinit(hash); |
|
408
|
0
|
0
|
|
|
|
|
for (i = 0; i < num_keys; i++) { |
|
409
|
0
|
|
|
|
|
|
hash_entry = hv_iternext(hash); |
|
410
|
0
|
|
|
|
|
|
sv_key = hv_iterkeysv(hash_entry); |
|
411
|
0
|
|
|
|
|
|
sv_val = hv_iterval(hash, hash_entry); |
|
412
|
0
|
|
|
|
|
|
refcnt = SvREFCNT(sv_val); |
|
413
|
0
|
0
|
|
|
|
|
warn(" %s => %s [%d]\n", |
|
|
|
0
|
|
|
|
|
|
|
414
|
0
|
|
|
|
|
|
SvPV_nolen(sv_key), SvPV_nolen(sv_val), refcnt); |
|
415
|
|
|
|
|
|
|
} |
|
416
|
|
|
|
|
|
|
} |
|
417
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(ref))==SVt_PVAV) { |
|
418
|
0
|
|
|
|
|
|
warn("SV is an array reference"); |
|
419
|
0
|
|
|
|
|
|
array = (AV*)SvRV(ref); |
|
420
|
0
|
|
|
|
|
|
len = av_len(array)+1; |
|
421
|
0
|
|
|
|
|
|
warn("SV has %d items\n", len); |
|
422
|
0
|
|
|
|
|
|
pos = 0; |
|
423
|
0
|
0
|
|
|
|
|
while (pos < len) { |
|
424
|
0
|
|
|
|
|
|
st_describe_object( st_av_fetch(array, pos++) ); |
|
425
|
|
|
|
|
|
|
} |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
} |
|
428
|
|
|
|
|
|
|
|
|
429
|
0
|
|
|
|
|
|
return; |
|
430
|
|
|
|
|
|
|
} |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
static void |
|
433
|
0
|
|
|
|
|
|
st_describe_object( SV* object ) { |
|
434
|
|
|
|
|
|
|
dTHX; |
|
435
|
|
|
|
|
|
|
char* str; |
|
436
|
|
|
|
|
|
|
|
|
437
|
0
|
|
|
|
|
|
warn("describing object\n"); |
|
438
|
0
|
0
|
|
|
|
|
str = SvPV_nolen( object ); |
|
439
|
0
|
0
|
|
|
|
|
if (SvROK(object)) |
|
440
|
|
|
|
|
|
|
{ |
|
441
|
0
|
0
|
|
|
|
|
if (SvTYPE(SvRV(object))==SVt_PVHV) |
|
442
|
0
|
|
|
|
|
|
warn("%s is a magic blessed reference\n", str); |
|
443
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(object))==SVt_PVMG) |
|
444
|
0
|
|
|
|
|
|
warn("%s is a magic reference", str); |
|
445
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(object))==SVt_IV) |
|
446
|
0
|
|
|
|
|
|
warn("%s is a IV reference (pointer)", str); |
|
447
|
|
|
|
|
|
|
else |
|
448
|
0
|
|
|
|
|
|
warn("%s is a reference of some kind", str); |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
else |
|
451
|
|
|
|
|
|
|
{ |
|
452
|
0
|
|
|
|
|
|
warn("%s is not a reference", str); |
|
453
|
0
|
0
|
|
|
|
|
if (sv_isobject(object)) |
|
454
|
0
|
|
|
|
|
|
warn("however, %s is an object", str); |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
} |
|
458
|
0
|
|
|
|
|
|
warn("object dump"); |
|
459
|
0
|
|
|
|
|
|
Perl_sv_dump( aTHX_ object ); |
|
460
|
0
|
|
|
|
|
|
warn("object ref dump"); |
|
461
|
0
|
|
|
|
|
|
Perl_sv_dump( aTHX_ (SV*)SvRV(object) ); |
|
462
|
0
|
|
|
|
|
|
st_dump_sv( object ); |
|
463
|
0
|
|
|
|
|
|
} |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
static boolean |
|
466
|
622
|
|
|
|
|
|
st_is_ascii( SV* str ) { |
|
467
|
|
|
|
|
|
|
dTHX; |
|
468
|
|
|
|
|
|
|
STRLEN len; |
|
469
|
|
|
|
|
|
|
char *bytes; |
|
470
|
|
|
|
|
|
|
IV i; |
|
471
|
|
|
|
|
|
|
|
|
472
|
622
|
50
|
|
|
|
|
bytes = SvPV(str, len); |
|
473
|
622
|
|
|
|
|
|
return st_char_is_ascii((unsigned char*)bytes, len); |
|
474
|
|
|
|
|
|
|
} |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
static boolean |
|
477
|
658
|
|
|
|
|
|
st_char_is_ascii( unsigned char* str, STRLEN len ) { |
|
478
|
|
|
|
|
|
|
dTHX; |
|
479
|
|
|
|
|
|
|
IV i; |
|
480
|
|
|
|
|
|
|
|
|
481
|
59379
|
100
|
|
|
|
|
for(i=0; i
|
|
482
|
58973
|
100
|
|
|
|
|
if (str[i] >= 0x80) { |
|
483
|
252
|
|
|
|
|
|
return 0; |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
} |
|
486
|
406
|
|
|
|
|
|
return 1; |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
/* SvRX does this in Perl >= 5.10 */ |
|
490
|
|
|
|
|
|
|
static REGEXP* |
|
491
|
5619
|
|
|
|
|
|
st_get_regex_from_sv( SV *regex_sv ) { |
|
492
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
REGEXP *rx; |
|
495
|
|
|
|
|
|
|
MAGIC *mg; |
|
496
|
5619
|
|
|
|
|
|
mg = NULL; |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
#if ((PERL_VERSION > 9) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) |
|
499
|
5619
|
|
|
|
|
|
rx = SvRX(regex_sv); |
|
500
|
|
|
|
|
|
|
#else |
|
501
|
|
|
|
|
|
|
/* extract regexp struct from qr// entity */ |
|
502
|
|
|
|
|
|
|
if (SvROK(regex_sv)) { |
|
503
|
|
|
|
|
|
|
SV *sv = SvRV(regex_sv); |
|
504
|
|
|
|
|
|
|
if (SvMAGICAL(sv)) |
|
505
|
|
|
|
|
|
|
mg = mg_find(sv, PERL_MAGIC_qr); |
|
506
|
|
|
|
|
|
|
} |
|
507
|
|
|
|
|
|
|
if (!mg) { |
|
508
|
|
|
|
|
|
|
st_describe_object(regex_sv); |
|
509
|
|
|
|
|
|
|
ST_CROAK("regex is not a qr// entity"); |
|
510
|
|
|
|
|
|
|
} |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
rx = (REGEXP*)mg->mg_obj; |
|
513
|
|
|
|
|
|
|
#endif |
|
514
|
|
|
|
|
|
|
|
|
515
|
5619
|
50
|
|
|
|
|
if (rx == NULL) { |
|
516
|
0
|
|
|
|
|
|
ST_CROAK("Failed to extract REGEXP from regex_sv %s", |
|
517
|
|
|
|
|
|
|
SvPV_nolen( regex_sv )); |
|
518
|
|
|
|
|
|
|
} |
|
519
|
|
|
|
|
|
|
|
|
520
|
5619
|
|
|
|
|
|
return rx; |
|
521
|
|
|
|
|
|
|
} |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
static void |
|
524
|
5546
|
|
|
|
|
|
st_heat_seeker( st_token *token, SV *re ) { |
|
525
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
REGEXP *rx; |
|
528
|
|
|
|
|
|
|
char *buf, *str_end; |
|
529
|
|
|
|
|
|
|
|
|
530
|
5546
|
|
|
|
|
|
rx = st_get_regex_from_sv(re); |
|
531
|
5546
|
|
|
|
|
|
buf = SvPVX(token->str); |
|
532
|
5546
|
|
|
|
|
|
str_end = buf + token->len; |
|
533
|
|
|
|
|
|
|
|
|
534
|
5546
|
100
|
|
|
|
|
if ( pregexec(rx, buf, str_end, buf, 1, token->str, 1) ) { |
|
535
|
123
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
|
50
|
|
|
|
|
|
|
536
|
0
|
|
|
|
|
|
warn("st_heat_seeker: token is hot: %s", buf); |
|
537
|
|
|
|
|
|
|
} |
|
538
|
123
|
|
|
|
|
|
token->is_hot = 1; |
|
539
|
|
|
|
|
|
|
} |
|
540
|
|
|
|
|
|
|
|
|
541
|
5546
|
|
|
|
|
|
} |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
static AV* |
|
544
|
25
|
|
|
|
|
|
st_heat_seeker_offsets( SV *str, SV *re ) { |
|
545
|
|
|
|
|
|
|
dTHX; |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
REGEXP *rx; |
|
548
|
|
|
|
|
|
|
char *buf, *str_end, *str_start; |
|
549
|
|
|
|
|
|
|
STRLEN str_len; |
|
550
|
|
|
|
|
|
|
AV *offsets; |
|
551
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
|
552
|
|
|
|
|
|
|
regexp *r; |
|
553
|
|
|
|
|
|
|
#endif |
|
554
|
|
|
|
|
|
|
|
|
555
|
25
|
|
|
|
|
|
rx = st_get_regex_from_sv(re); |
|
556
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
|
557
|
25
|
|
|
|
|
|
r = (regexp*)SvANY(rx); |
|
558
|
|
|
|
|
|
|
#endif |
|
559
|
25
|
50
|
|
|
|
|
buf = SvPV(str, str_len); |
|
560
|
25
|
|
|
|
|
|
str_start = buf; |
|
561
|
25
|
|
|
|
|
|
str_end = buf + str_len; |
|
562
|
25
|
|
|
|
|
|
offsets = newAV(); |
|
563
|
|
|
|
|
|
|
|
|
564
|
121
|
100
|
|
|
|
|
while ( pregexec(rx, buf, str_end, buf, 1, str, 1) ) { |
|
565
|
|
|
|
|
|
|
const char *start_ptr, *end_ptr; |
|
566
|
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
#if ((PERL_VERSION == 10) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) |
|
568
|
|
|
|
|
|
|
start_ptr = buf + rx->offs[0].start; |
|
569
|
|
|
|
|
|
|
end_ptr = buf + rx->offs[0].end; |
|
570
|
|
|
|
|
|
|
#elif (PERL_VERSION > 10) |
|
571
|
96
|
|
|
|
|
|
start_ptr = buf + r->offs[0].start; |
|
572
|
96
|
|
|
|
|
|
end_ptr = buf + r->offs[0].end; |
|
573
|
|
|
|
|
|
|
#else |
|
574
|
|
|
|
|
|
|
start_ptr = buf + rx->startp[0]; |
|
575
|
|
|
|
|
|
|
end_ptr = buf + rx->endp[0]; |
|
576
|
|
|
|
|
|
|
#endif |
|
577
|
|
|
|
|
|
|
/* advance the pointer */ |
|
578
|
96
|
|
|
|
|
|
buf = (char*)end_ptr; |
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
//warn("got heat match at %ld", start_ptr - str_start); |
|
581
|
96
|
|
|
|
|
|
av_push(offsets, newSViv(start_ptr - str_start)); |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
} |
|
584
|
|
|
|
|
|
|
|
|
585
|
25
|
|
|
|
|
|
return offsets; |
|
586
|
|
|
|
|
|
|
} |
|
587
|
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
/* |
|
589
|
|
|
|
|
|
|
st_tokenize() et al based on KinoSearch::Analysis::Tokenizer |
|
590
|
|
|
|
|
|
|
by Marvin Humphrey. |
|
591
|
|
|
|
|
|
|
He dared go where no XS regex user had gone before... |
|
592
|
|
|
|
|
|
|
*/ |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
static SV* |
|
595
|
48
|
|
|
|
|
|
st_tokenize( SV* str, SV* token_re, SV* heat_seeker, I32 match_num ) { |
|
596
|
|
|
|
|
|
|
dTHX; /* thread-safe perlism */ |
|
597
|
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
/* declare */ |
|
599
|
|
|
|
|
|
|
IV num_tokens, prev_sentence_start; |
|
600
|
|
|
|
|
|
|
REGEXP *rx; |
|
601
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
|
602
|
|
|
|
|
|
|
regexp *r; |
|
603
|
|
|
|
|
|
|
#endif |
|
604
|
|
|
|
|
|
|
char *buf, *str_start, *str_end, *token_str; |
|
605
|
|
|
|
|
|
|
STRLEN str_len; |
|
606
|
|
|
|
|
|
|
const char *prev_end, *prev_start; |
|
607
|
|
|
|
|
|
|
AV *tokens; |
|
608
|
|
|
|
|
|
|
AV *heat; |
|
609
|
|
|
|
|
|
|
AV *sentence_starts; /* list of sentence start points for hot tokens */ |
|
610
|
|
|
|
|
|
|
SV *tok; |
|
611
|
|
|
|
|
|
|
boolean heat_seeker_is_CV, inside_sentence, prev_was_abbrev; |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
/* initialize */ |
|
614
|
48
|
|
|
|
|
|
num_tokens = 0; |
|
615
|
48
|
|
|
|
|
|
rx = st_get_regex_from_sv(token_re); |
|
616
|
|
|
|
|
|
|
#if (PERL_VERSION > 10) |
|
617
|
48
|
|
|
|
|
|
r = (regexp*)SvANY(rx); |
|
618
|
|
|
|
|
|
|
#endif |
|
619
|
48
|
50
|
|
|
|
|
buf = SvPV(str, str_len); |
|
620
|
48
|
|
|
|
|
|
str_start = buf; |
|
621
|
48
|
|
|
|
|
|
str_end = str_start + str_len; |
|
622
|
48
|
|
|
|
|
|
prev_start = str_start; |
|
623
|
48
|
|
|
|
|
|
prev_end = prev_start; |
|
624
|
48
|
|
|
|
|
|
tokens = newAV(); |
|
625
|
48
|
|
|
|
|
|
heat = newAV(); |
|
626
|
48
|
|
|
|
|
|
sentence_starts = newAV(); |
|
627
|
48
|
|
|
|
|
|
prev_sentence_start = 0; |
|
628
|
48
|
|
|
|
|
|
inside_sentence = 0; // assume we start with a sentence start |
|
629
|
48
|
|
|
|
|
|
heat_seeker_is_CV = 0; |
|
630
|
48
|
|
|
|
|
|
prev_was_abbrev = 0; |
|
631
|
48
|
100
|
|
|
|
|
if (heat_seeker != NULL && (SvTYPE(SvRV(heat_seeker))==SVt_PVCV)) { |
|
|
|
100
|
|
|
|
|
|
|
632
|
12
|
|
|
|
|
|
heat_seeker_is_CV = 1; |
|
633
|
|
|
|
|
|
|
} |
|
634
|
|
|
|
|
|
|
|
|
635
|
48
|
50
|
|
|
|
|
if (ST_DEBUG) { |
|
|
|
50
|
|
|
|
|
|
|
636
|
0
|
|
|
|
|
|
warn("tokenizing string %ld bytes long\n", str_len); |
|
637
|
|
|
|
|
|
|
} |
|
638
|
|
|
|
|
|
|
|
|
639
|
6281
|
100
|
|
|
|
|
while ( pregexec(rx, buf, str_end, buf, 1, str, 1) ) { |
|
640
|
|
|
|
|
|
|
const char *start_ptr, *end_ptr; |
|
641
|
|
|
|
|
|
|
st_token *token; |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
#if ((PERL_VERSION == 10) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) |
|
644
|
|
|
|
|
|
|
start_ptr = buf + rx->offs[match_num].start; |
|
645
|
|
|
|
|
|
|
end_ptr = buf + rx->offs[match_num].end; |
|
646
|
|
|
|
|
|
|
#elif (PERL_VERSION > 10) |
|
647
|
6233
|
|
|
|
|
|
start_ptr = buf + r->offs[match_num].start; |
|
648
|
6233
|
|
|
|
|
|
end_ptr = buf + r->offs[match_num].end; |
|
649
|
|
|
|
|
|
|
#else |
|
650
|
|
|
|
|
|
|
start_ptr = buf + rx->startp[match_num]; |
|
651
|
|
|
|
|
|
|
end_ptr = buf + rx->endp[match_num]; |
|
652
|
|
|
|
|
|
|
#endif |
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
/* advance the pointers */ |
|
655
|
6233
|
|
|
|
|
|
buf = (char*)end_ptr; |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
/* create token for the bytes between the last match and this one |
|
658
|
|
|
|
|
|
|
* check first that we have moved past first byte |
|
659
|
|
|
|
|
|
|
* and that the regex has moved us forward at least one byte |
|
660
|
|
|
|
|
|
|
*/ |
|
661
|
6233
|
100
|
|
|
|
|
if (start_ptr != str_start && start_ptr != prev_end) { |
|
|
|
100
|
|
|
|
|
|
|
662
|
6181
|
|
|
|
|
|
token = st_new_token(num_tokens++, |
|
663
|
6181
|
|
|
|
|
|
(start_ptr - prev_end), |
|
664
|
6181
|
|
|
|
|
|
utf8_distance((U8*)start_ptr, (U8*)prev_end), |
|
665
|
|
|
|
|
|
|
prev_end, 0, 0); |
|
666
|
6181
|
50
|
|
|
|
|
token_str = SvPV_nolen(token->str); |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
/* TODO |
|
669
|
|
|
|
|
|
|
there is an edge case here where a token that ends a sentence |
|
670
|
|
|
|
|
|
|
(e.g. punctuation) also matches the start of the next sentence |
|
671
|
|
|
|
|
|
|
(e.g. more punctuation, inverted question mark). |
|
672
|
|
|
|
|
|
|
Need to split that into 2 tokens in order to distinguish |
|
673
|
|
|
|
|
|
|
the end and start |
|
674
|
|
|
|
|
|
|
*/ |
|
675
|
|
|
|
|
|
|
|
|
676
|
6181
|
100
|
|
|
|
|
if (!inside_sentence) { |
|
677
|
7
|
50
|
|
|
|
|
if (num_tokens == 1 |
|
678
|
0
|
0
|
|
|
|
|
|| |
|
679
|
0
|
|
|
|
|
|
st_looks_like_sentence_start((unsigned char*)token_str, token->len) |
|
680
|
|
|
|
|
|
|
) { |
|
681
|
7
|
|
|
|
|
|
token->is_sentence_start = 1; |
|
682
|
7
|
|
|
|
|
|
inside_sentence = 1; |
|
683
|
|
|
|
|
|
|
} |
|
684
|
|
|
|
|
|
|
} |
|
685
|
6174
|
100
|
|
|
|
|
else if (!prev_was_abbrev |
|
686
|
6075
|
100
|
|
|
|
|
&& |
|
687
|
6075
|
|
|
|
|
|
st_looks_like_sentence_end((unsigned char*)token_str, token->len) |
|
688
|
|
|
|
|
|
|
) { |
|
689
|
184
|
|
|
|
|
|
token->is_sentence_end = 1; |
|
690
|
184
|
|
|
|
|
|
inside_sentence = 0; |
|
691
|
|
|
|
|
|
|
} |
|
692
|
6181
|
50
|
|
|
|
|
if (st_is_abbreviation((unsigned char*)token_str, token->len)) { |
|
693
|
0
|
|
|
|
|
|
token->is_abbreviation = 1; |
|
694
|
0
|
|
|
|
|
|
prev_was_abbrev = 1; |
|
695
|
|
|
|
|
|
|
} |
|
696
|
|
|
|
|
|
|
else { |
|
697
|
6181
|
|
|
|
|
|
prev_was_abbrev = 0; |
|
698
|
|
|
|
|
|
|
} |
|
699
|
|
|
|
|
|
|
|
|
700
|
6181
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
|
50
|
|
|
|
|
|
|
701
|
0
|
|
|
|
|
|
warn("prev [%d] [%d] [%d] [%s] [%d] [%d]", |
|
702
|
|
|
|
|
|
|
token->pos, token->len, token->u8len, token_str, |
|
703
|
0
|
|
|
|
|
|
token->is_sentence_start, token->is_sentence_end); |
|
704
|
|
|
|
|
|
|
} |
|
705
|
|
|
|
|
|
|
|
|
706
|
6181
|
|
|
|
|
|
tok = st_bless_ptr(ST_CLASS_TOKEN, token); |
|
707
|
6181
|
|
|
|
|
|
av_push(tokens, tok); |
|
708
|
6181
|
100
|
|
|
|
|
if (token->is_sentence_start) { |
|
709
|
|
|
|
|
|
|
//av_push(sentence_starts, newSViv(token->pos)); |
|
710
|
7
|
|
|
|
|
|
prev_sentence_start = token->pos; |
|
711
|
|
|
|
|
|
|
} |
|
712
|
|
|
|
|
|
|
} |
|
713
|
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
/* create token object for the current match */ |
|
715
|
6233
|
|
|
|
|
|
token = st_new_token(num_tokens++, |
|
716
|
6233
|
|
|
|
|
|
(end_ptr - start_ptr), |
|
717
|
6233
|
|
|
|
|
|
utf8_distance((U8*)end_ptr, (U8*)start_ptr), |
|
718
|
|
|
|
|
|
|
start_ptr, |
|
719
|
|
|
|
|
|
|
0, 1); |
|
720
|
6233
|
50
|
|
|
|
|
token_str = SvPV_nolen(token->str); |
|
721
|
|
|
|
|
|
|
|
|
722
|
6233
|
100
|
|
|
|
|
if (!inside_sentence) { |
|
723
|
225
|
|
|
|
|
|
token->is_sentence_start = 1; |
|
724
|
225
|
|
|
|
|
|
inside_sentence = 1; |
|
725
|
225
|
|
|
|
|
|
prev_sentence_start = token->pos; |
|
726
|
|
|
|
|
|
|
} |
|
727
|
6008
|
50
|
|
|
|
|
else if (!prev_was_abbrev |
|
728
|
6008
|
50
|
|
|
|
|
&& |
|
729
|
6008
|
|
|
|
|
|
st_looks_like_sentence_end((unsigned char*)token_str, token->len) |
|
730
|
|
|
|
|
|
|
) { |
|
731
|
0
|
|
|
|
|
|
token->is_sentence_end = 1; |
|
732
|
0
|
|
|
|
|
|
inside_sentence = 0; |
|
733
|
|
|
|
|
|
|
} |
|
734
|
6233
|
100
|
|
|
|
|
if (st_is_abbreviation((unsigned char*)token_str, token->len)) { |
|
735
|
99
|
|
|
|
|
|
token->is_abbreviation = 1; |
|
736
|
99
|
|
|
|
|
|
prev_was_abbrev = 1; |
|
737
|
|
|
|
|
|
|
} |
|
738
|
|
|
|
|
|
|
else { |
|
739
|
6134
|
|
|
|
|
|
prev_was_abbrev = 0; |
|
740
|
|
|
|
|
|
|
} |
|
741
|
|
|
|
|
|
|
|
|
742
|
6233
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
|
50
|
|
|
|
|
|
|
743
|
0
|
|
|
|
|
|
warn("main [%d] [%d] [%d] [%s] [%d] [%d]", |
|
744
|
|
|
|
|
|
|
token->pos, token->len, token->u8len, token_str, |
|
745
|
0
|
|
|
|
|
|
token->is_sentence_start, token->is_sentence_end |
|
746
|
|
|
|
|
|
|
); |
|
747
|
|
|
|
|
|
|
} |
|
748
|
|
|
|
|
|
|
|
|
749
|
6233
|
|
|
|
|
|
tok = st_bless_ptr(ST_CLASS_TOKEN, token); |
|
750
|
6233
|
100
|
|
|
|
|
if (heat_seeker != NULL) { |
|
751
|
6109
|
100
|
|
|
|
|
if (heat_seeker_is_CV) { |
|
752
|
563
|
|
|
|
|
|
dSP; |
|
753
|
563
|
|
|
|
|
|
ENTER; |
|
754
|
563
|
|
|
|
|
|
SAVETMPS; |
|
755
|
563
|
50
|
|
|
|
|
PUSHMARK(SP); |
|
756
|
563
|
50
|
|
|
|
|
XPUSHs(tok); |
|
757
|
563
|
|
|
|
|
|
PUTBACK; |
|
758
|
563
|
50
|
|
|
|
|
if (call_sv(heat_seeker, G_SCALAR) != 1) { |
|
759
|
0
|
|
|
|
|
|
croak("Invalid return value from heat_seeker SUB -- should be single integer"); |
|
760
|
|
|
|
|
|
|
} |
|
761
|
563
|
|
|
|
|
|
SPAGAIN; |
|
762
|
563
|
50
|
|
|
|
|
token->is_hot = POPi; |
|
763
|
|
|
|
|
|
|
//warn("heat_seeker CV returned %d\n", token->is_hot); |
|
764
|
563
|
|
|
|
|
|
PUTBACK; |
|
765
|
563
|
50
|
|
|
|
|
FREETMPS; |
|
766
|
563
|
|
|
|
|
|
LEAVE; |
|
767
|
|
|
|
|
|
|
} |
|
768
|
|
|
|
|
|
|
else { |
|
769
|
5546
|
|
|
|
|
|
st_heat_seeker(token, heat_seeker); |
|
770
|
|
|
|
|
|
|
} |
|
771
|
|
|
|
|
|
|
} |
|
772
|
6233
|
|
|
|
|
|
av_push(tokens, tok); |
|
773
|
6233
|
100
|
|
|
|
|
if (token->is_sentence_start) { |
|
774
|
|
|
|
|
|
|
//av_push(sentence_starts, newSViv(token->pos)); |
|
775
|
225
|
|
|
|
|
|
prev_sentence_start = token->pos; |
|
776
|
|
|
|
|
|
|
} |
|
777
|
6233
|
100
|
|
|
|
|
if (token->is_hot) { |
|
778
|
192
|
|
|
|
|
|
av_push(heat, newSViv(token->pos)); |
|
779
|
192
|
50
|
|
|
|
|
if (ST_DEBUG) |
|
|
|
50
|
|
|
|
|
|
|
780
|
0
|
|
|
|
|
|
warn("%s: sentence_start = %ld for hot token at pos %ld\n", |
|
781
|
0
|
|
|
|
|
|
FUNCTION__, (unsigned long)prev_sentence_start, (unsigned long)token->pos); |
|
782
|
|
|
|
|
|
|
|
|
783
|
192
|
|
|
|
|
|
av_push(sentence_starts, newSViv(prev_sentence_start)); |
|
784
|
|
|
|
|
|
|
} |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
/* remember where we are for next time */ |
|
787
|
6233
|
|
|
|
|
|
prev_end = end_ptr; |
|
788
|
6233
|
|
|
|
|
|
prev_start = start_ptr; |
|
789
|
|
|
|
|
|
|
} |
|
790
|
|
|
|
|
|
|
|
|
791
|
48
|
100
|
|
|
|
|
if (prev_end != str_end) { |
|
792
|
|
|
|
|
|
|
/* some bytes after the last match */ |
|
793
|
36
|
|
|
|
|
|
st_token *token = st_new_token(num_tokens++, |
|
794
|
36
|
|
|
|
|
|
(str_end - prev_end), |
|
795
|
36
|
|
|
|
|
|
utf8_distance((U8*)str_end, (U8*)prev_end), |
|
796
|
|
|
|
|
|
|
prev_end, |
|
797
|
|
|
|
|
|
|
0, 0); |
|
798
|
36
|
50
|
|
|
|
|
token_str = SvPV_nolen(token->str); |
|
799
|
36
|
50
|
|
|
|
|
if (st_looks_like_sentence_start((unsigned char*)token_str, token->len)) { |
|
800
|
0
|
|
|
|
|
|
token->is_sentence_start = 1; |
|
801
|
|
|
|
|
|
|
} |
|
802
|
36
|
100
|
|
|
|
|
else if (st_looks_like_sentence_end((unsigned char*)token_str, token->len)) { |
|
803
|
19
|
|
|
|
|
|
token->is_sentence_end = 1; |
|
804
|
|
|
|
|
|
|
} |
|
805
|
36
|
50
|
|
|
|
|
if (ST_DEBUG > 1) { |
|
|
|
50
|
|
|
|
|
|
|
806
|
0
|
|
|
|
|
|
warn("tail: [%d] [%d] [%d] [%s] [%d] [%d]", |
|
807
|
|
|
|
|
|
|
token->pos, token->len, token->u8len, token_str, |
|
808
|
0
|
|
|
|
|
|
token->is_sentence_start, token->is_sentence_end |
|
809
|
|
|
|
|
|
|
); |
|
810
|
|
|
|
|
|
|
} |
|
811
|
|
|
|
|
|
|
|
|
812
|
36
|
|
|
|
|
|
tok = st_bless_ptr(ST_CLASS_TOKEN, token); |
|
813
|
36
|
|
|
|
|
|
av_push(tokens, tok); |
|
814
|
|
|
|
|
|
|
} |
|
815
|
|
|
|
|
|
|
|
|
816
|
48
|
|
|
|
|
|
return st_bless_ptr( |
|
817
|
|
|
|
|
|
|
ST_CLASS_TOKENLIST, |
|
818
|
48
|
|
|
|
|
|
st_new_token_list(tokens, heat, sentence_starts, num_tokens) |
|
819
|
|
|
|
|
|
|
); |
|
820
|
|
|
|
|
|
|
} |
|
821
|
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
static SV* |
|
823
|
5
|
|
|
|
|
|
st_find_bad_utf8( SV* str ) { |
|
824
|
|
|
|
|
|
|
dTHX; |
|
825
|
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
STRLEN len; |
|
827
|
|
|
|
|
|
|
U8 *bytes; |
|
828
|
|
|
|
|
|
|
const U8 *pos; |
|
829
|
|
|
|
|
|
|
STRLEN *el; |
|
830
|
|
|
|
|
|
|
|
|
831
|
5
|
50
|
|
|
|
|
bytes = (U8*)SvPV(str, len); |
|
832
|
5
|
|
|
|
|
|
el = 0; |
|
833
|
5
|
100
|
|
|
|
|
if (is_utf8_string(bytes, len)) { |
|
834
|
3
|
|
|
|
|
|
return &PL_sv_undef; |
|
835
|
|
|
|
|
|
|
} |
|
836
|
|
|
|
|
|
|
else { |
|
837
|
2
|
|
|
|
|
|
is_utf8_string_loclen(bytes, len, &pos, el); |
|
838
|
5
|
|
|
|
|
|
return newSVpvn((char*)pos, strlen((char*)pos)); |
|
839
|
|
|
|
|
|
|
} |
|
840
|
|
|
|
|
|
|
} |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
/* lifted nearly verbatim from mod_perl */ |
|
843
|
6
|
|
|
|
|
|
static SV *st_escape_xml(char *s) { |
|
844
|
|
|
|
|
|
|
dTHX; |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
int i, j; |
|
847
|
|
|
|
|
|
|
SV *x; |
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
/* first, count the number of extra characters */ |
|
850
|
119
|
100
|
|
|
|
|
for (i = 0, j = 0; s[i] != '\0'; i++) |
|
851
|
113
|
100
|
|
|
|
|
if (s[i] == '<' || s[i] == '>') |
|
|
|
100
|
|
|
|
|
|
|
852
|
6
|
|
|
|
|
|
j += 3; |
|
853
|
107
|
100
|
|
|
|
|
else if (s[i] == '&') |
|
854
|
3
|
|
|
|
|
|
j += 4; |
|
855
|
104
|
100
|
|
|
|
|
else if (s[i] == '"' || s[i] == '\'') |
|
|
|
50
|
|
|
|
|
|
|
856
|
1
|
|
|
|
|
|
j += 5; |
|
857
|
|
|
|
|
|
|
|
|
858
|
6
|
100
|
|
|
|
|
if (j == 0) |
|
859
|
4
|
|
|
|
|
|
return newSVpv(s,i); |
|
860
|
|
|
|
|
|
|
|
|
861
|
2
|
|
|
|
|
|
x = newSV(i + j + 1); |
|
862
|
|
|
|
|
|
|
|
|
863
|
52
|
100
|
|
|
|
|
for (i = 0, j = 0; s[i] != '\0'; i++, j++) |
|
864
|
50
|
100
|
|
|
|
|
if (s[i] == '<') { |
|
865
|
3
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], "<", 4); |
|
866
|
3
|
|
|
|
|
|
j += 3; |
|
867
|
|
|
|
|
|
|
} |
|
868
|
47
|
100
|
|
|
|
|
else if (s[i] == '>') { |
|
869
|
3
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], ">", 4); |
|
870
|
3
|
|
|
|
|
|
j += 3; |
|
871
|
|
|
|
|
|
|
} |
|
872
|
44
|
100
|
|
|
|
|
else if (s[i] == '&') { |
|
873
|
3
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], "&", 5); |
|
874
|
3
|
|
|
|
|
|
j += 4; |
|
875
|
|
|
|
|
|
|
} |
|
876
|
41
|
100
|
|
|
|
|
else if (s[i] == '"') { |
|
877
|
1
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], """, 6); |
|
878
|
1
|
|
|
|
|
|
j += 5; |
|
879
|
|
|
|
|
|
|
} |
|
880
|
40
|
50
|
|
|
|
|
else if (s[i] == '\'') { |
|
881
|
0
|
|
|
|
|
|
memcpy(&SvPVX(x)[j], "'", 5); |
|
882
|
0
|
|
|
|
|
|
j += 4; |
|
883
|
|
|
|
|
|
|
} |
|
884
|
|
|
|
|
|
|
else |
|
885
|
40
|
|
|
|
|
|
SvPVX(x)[j] = s[i]; |
|
886
|
|
|
|
|
|
|
|
|
887
|
2
|
|
|
|
|
|
SvPVX(x)[j] = '\0'; |
|
888
|
2
|
|
|
|
|
|
SvCUR_set(x, j); |
|
889
|
2
|
|
|
|
|
|
SvPOK_on(x); |
|
890
|
2
|
|
|
|
|
|
return x; |
|
891
|
|
|
|
|
|
|
} |
|
892
|
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
/* returns the UCS32 value for a UTF8 string -- the character's Unicode value. |
|
894
|
|
|
|
|
|
|
see http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA |
|
895
|
|
|
|
|
|
|
*/ |
|
896
|
|
|
|
|
|
|
static IV |
|
897
|
0
|
|
|
|
|
|
st_utf8_codepoint( |
|
898
|
|
|
|
|
|
|
const unsigned char *utf8, |
|
899
|
|
|
|
|
|
|
IV len |
|
900
|
|
|
|
|
|
|
) |
|
901
|
|
|
|
|
|
|
{ |
|
902
|
|
|
|
|
|
|
dTHX; |
|
903
|
|
|
|
|
|
|
|
|
904
|
0
|
|
|
|
|
|
switch (len) { |
|
905
|
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
case 1: |
|
907
|
0
|
|
|
|
|
|
return utf8[0]; |
|
908
|
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
case 2: |
|
910
|
0
|
|
|
|
|
|
return (utf8[0] - 192) * 64 + utf8[1] - 128; |
|
911
|
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
case 3: |
|
913
|
0
|
|
|
|
|
|
return (utf8[0] - 224) * 4096 + (utf8[1] - 128) * 64 + utf8[2] - 128; |
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
case 4: |
|
916
|
|
|
|
|
|
|
default: |
|
917
|
0
|
|
|
|
|
|
return (utf8[0] - 240) * 262144 + (utf8[1] - 128) * 4096 + (utf8[2] - 128) * 64 + |
|
918
|
0
|
|
|
|
|
|
utf8[3] - 128; |
|
919
|
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
} |
|
921
|
|
|
|
|
|
|
} |
|
922
|
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
static IV |
|
924
|
36
|
|
|
|
|
|
st_looks_like_sentence_start(const unsigned char *ptr, IV len) |
|
925
|
|
|
|
|
|
|
{ |
|
926
|
|
|
|
|
|
|
dTHX; |
|
927
|
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
I32 u8len, u32pt; |
|
929
|
|
|
|
|
|
|
|
|
930
|
36
|
50
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
|
50
|
|
|
|
|
|
|
931
|
0
|
|
|
|
|
|
warn("%s: >%s< %ld\n", FUNCTION__, ptr, len); |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
/* optimized for ASCII */ |
|
934
|
36
|
50
|
|
|
|
|
if (st_char_is_ascii((unsigned char*)ptr, len)) { |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
/* if the string is more than one byte long, |
|
937
|
|
|
|
|
|
|
make sure the second char is NOT UPPER |
|
938
|
|
|
|
|
|
|
since that is likely a false positive. |
|
939
|
|
|
|
|
|
|
*/ |
|
940
|
36
|
100
|
|
|
|
|
if (len > 1) { |
|
941
|
16
|
50
|
|
|
|
|
if (isUPPER(ptr[0]) && !isUPPER(ptr[1])) { |
|
|
|
0
|
|
|
|
|
|
|
942
|
0
|
|
|
|
|
|
return 1; |
|
943
|
|
|
|
|
|
|
} |
|
944
|
|
|
|
|
|
|
else { |
|
945
|
16
|
|
|
|
|
|
return 0; |
|
946
|
|
|
|
|
|
|
} |
|
947
|
|
|
|
|
|
|
} |
|
948
|
|
|
|
|
|
|
else { |
|
949
|
20
|
|
|
|
|
|
return isUPPER(ptr[0]); |
|
950
|
|
|
|
|
|
|
} |
|
951
|
|
|
|
|
|
|
} |
|
952
|
|
|
|
|
|
|
|
|
953
|
0
|
0
|
|
|
|
|
if (!len) { |
|
954
|
0
|
|
|
|
|
|
return 0; |
|
955
|
|
|
|
|
|
|
} |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
/* TODO if any char is UPPER in the string, consider it a start? */ |
|
958
|
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
/* get first full UTF-8 char */ |
|
960
|
|
|
|
|
|
|
#if (PERL_VERSION >= 16) |
|
961
|
|
|
|
|
|
|
//warn("WE HAVE utf8_char_buf\n"); |
|
962
|
0
|
0
|
|
|
|
|
u8len = is_utf8_char_buf((const U8*)ptr, (const U8*)ptr+UTF8SKIP(ptr)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
#else |
|
964
|
|
|
|
|
|
|
//warn("WE HAVE utf8_char\n"); |
|
965
|
|
|
|
|
|
|
u8len = is_utf8_char((U8*)ptr); |
|
966
|
|
|
|
|
|
|
#endif |
|
967
|
|
|
|
|
|
|
|
|
968
|
0
|
0
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
|
0
|
|
|
|
|
|
|
969
|
0
|
|
|
|
|
|
warn("%s: %s is utf8 u8len %d\n", FUNCTION__, ptr, u8len); |
|
970
|
|
|
|
|
|
|
|
|
971
|
0
|
|
|
|
|
|
u32pt = st_utf8_codepoint(ptr, u8len); |
|
972
|
|
|
|
|
|
|
|
|
973
|
0
|
0
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
|
0
|
|
|
|
|
|
|
974
|
0
|
|
|
|
|
|
warn("%s: u32 code point %d\n", FUNCTION__, u32pt); |
|
975
|
|
|
|
|
|
|
|
|
976
|
0
|
0
|
|
|
|
|
if (iswupper((wint_t)u32pt)) { |
|
977
|
0
|
|
|
|
|
|
return 1; |
|
978
|
|
|
|
|
|
|
} |
|
979
|
0
|
0
|
|
|
|
|
if (u32pt == 191) { /* INVERTED QUESTION MARK */ |
|
980
|
0
|
|
|
|
|
|
return 1; |
|
981
|
|
|
|
|
|
|
} |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
/* TODO more here? */ |
|
984
|
|
|
|
|
|
|
|
|
985
|
0
|
|
|
|
|
|
return 0; |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
} |
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
/* does any char in the string look like a sentence ending? */ |
|
990
|
|
|
|
|
|
|
static IV |
|
991
|
12119
|
|
|
|
|
|
st_looks_like_sentence_end(const unsigned char *ptr, IV len) |
|
992
|
|
|
|
|
|
|
{ |
|
993
|
|
|
|
|
|
|
dTHX; |
|
994
|
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
IV i; |
|
996
|
12119
|
|
|
|
|
|
IV num_dots = 0; |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
/* right now this assumes ASCII sentence punctuation. |
|
999
|
|
|
|
|
|
|
* if we ever wanted utf8 support we'd need to iterate |
|
1000
|
|
|
|
|
|
|
* per-character instead of per byte. |
|
1001
|
|
|
|
|
|
|
*/ |
|
1002
|
|
|
|
|
|
|
|
|
1003
|
12119
|
50
|
|
|
|
|
if (ST_DEBUG > 1) |
|
|
|
50
|
|
|
|
|
|
|
1004
|
0
|
|
|
|
|
|
warn("%s: %s\n", FUNCTION__, ptr); |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
59987
|
100
|
|
|
|
|
for (i=0; i
|
|
1007
|
47914
|
|
|
|
|
|
switch (ptr[i]) { |
|
1008
|
|
|
|
|
|
|
case '.': |
|
1009
|
|
|
|
|
|
|
/* if abbrev like e.g. U.S.A. then check before and after */ |
|
1010
|
165
|
|
|
|
|
|
num_dots++; |
|
1011
|
165
|
|
|
|
|
|
break; |
|
1012
|
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
case '?': |
|
1014
|
15
|
|
|
|
|
|
return 1; |
|
1015
|
|
|
|
|
|
|
break; |
|
1016
|
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
case '!': |
|
1018
|
31
|
|
|
|
|
|
return 1; |
|
1019
|
|
|
|
|
|
|
break; |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
default: |
|
1022
|
47703
|
|
|
|
|
|
continue; |
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
} |
|
1025
|
|
|
|
|
|
|
} |
|
1026
|
12073
|
100
|
|
|
|
|
if (num_dots > 1 && num_dots < len) { |
|
|
|
50
|
|
|
|
|
|
|
1027
|
3
|
|
|
|
|
|
return 0; |
|
1028
|
|
|
|
|
|
|
} |
|
1029
|
12070
|
100
|
|
|
|
|
else if (num_dots == 1) { |
|
1030
|
157
|
|
|
|
|
|
return 1; |
|
1031
|
|
|
|
|
|
|
} |
|
1032
|
11913
|
|
|
|
|
|
return 0; |
|
1033
|
|
|
|
|
|
|
} |
|
1034
|
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
static U8* |
|
1036
|
2996
|
|
|
|
|
|
st_string_to_lower(const unsigned char *ptr, IV len) |
|
1037
|
|
|
|
|
|
|
{ |
|
1038
|
|
|
|
|
|
|
dTHX; |
|
1039
|
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
U8 *lc, *d; |
|
1041
|
2996
|
|
|
|
|
|
U8 *s = (U8*)ptr; |
|
1042
|
2996
|
|
|
|
|
|
const U8 *const send = s + len; |
|
1043
|
|
|
|
|
|
|
U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; |
|
1044
|
2996
|
|
|
|
|
|
lc = st_malloc((UTF8_MAXBYTES_CASE*len)+1); |
|
1045
|
2996
|
|
|
|
|
|
d = lc; |
|
1046
|
12646
|
100
|
|
|
|
|
while (s < send) { |
|
1047
|
9650
|
|
|
|
|
|
const STRLEN u = UTF8SKIP(s); |
|
1048
|
|
|
|
|
|
|
STRLEN ulen; |
|
1049
|
|
|
|
|
|
|
#if ((PERL_VERSION > 24) || (PERL_VERSION == 26 && PERL_SUBVERSION >= 5)) |
|
1050
|
9650
|
|
|
|
|
|
const UV uv = toLOWER_utf8_safe(s, send, tmpbuf, &ulen); |
|
1051
|
|
|
|
|
|
|
#else |
|
1052
|
|
|
|
|
|
|
const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); |
|
1053
|
|
|
|
|
|
|
#endif |
|
1054
|
9650
|
|
|
|
|
|
Copy(tmpbuf, lc, ulen, U8); |
|
1055
|
9650
|
|
|
|
|
|
lc += ulen; |
|
1056
|
9650
|
|
|
|
|
|
s += u; |
|
1057
|
|
|
|
|
|
|
} |
|
1058
|
2996
|
|
|
|
|
|
*lc = '\0'; |
|
1059
|
2996
|
|
|
|
|
|
return d; |
|
1060
|
|
|
|
|
|
|
} |
|
1061
|
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
static IV |
|
1063
|
12414
|
|
|
|
|
|
st_is_abbreviation(const unsigned char *ptr, IV len) |
|
1064
|
|
|
|
|
|
|
{ |
|
1065
|
|
|
|
|
|
|
dTHX; |
|
1066
|
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
IV i; |
|
1068
|
|
|
|
|
|
|
unsigned char *ptr_lc; |
|
1069
|
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
/* only consider strings of abbreviation-like length */ |
|
1071
|
12414
|
100
|
|
|
|
|
if (len < 2 || len > 5) { |
|
|
|
100
|
|
|
|
|
|
|
1072
|
9418
|
|
|
|
|
|
return 0; |
|
1073
|
|
|
|
|
|
|
} |
|
1074
|
|
|
|
|
|
|
|
|
1075
|
2996
|
100
|
|
|
|
|
if (ST_ABBREVS == NULL) { |
|
1076
|
|
|
|
|
|
|
//warn("ST_ABBREVS not yet built\n"); |
|
1077
|
17
|
|
|
|
|
|
i = 0; |
|
1078
|
17
|
|
|
|
|
|
ST_ABBREVS = newHV(); |
|
1079
|
2414
|
100
|
|
|
|
|
while(en_abbrevs[i] != NULL) { |
|
1080
|
2397
|
|
|
|
|
|
st_hv_store_int( ST_ABBREVS, en_abbrevs[i], i); |
|
1081
|
2397
|
|
|
|
|
|
i++; |
|
1082
|
|
|
|
|
|
|
} |
|
1083
|
|
|
|
|
|
|
} |
|
1084
|
2996
|
|
|
|
|
|
ptr_lc = (unsigned char*)st_string_to_lower(ptr, len); |
|
1085
|
|
|
|
|
|
|
//warn("ptr=%s ptr_lc=%s\n", ptr, ptr_lc); |
|
1086
|
2996
|
|
|
|
|
|
i = hv_fetch(ST_ABBREVS, (const char *)ptr_lc, len, 0) ? 1 : 0; |
|
1087
|
2996
|
|
|
|
|
|
free(ptr_lc); |
|
1088
|
2996
|
|
|
|
|
|
return i; |
|
1089
|
|
|
|
|
|
|
} |