line |
stmt |
bran |
cond |
sub |
time |
code |
1
|
|
|
|
|
|
|
2
|
|
|
|
|
|
#define PERL_NO_GET_CONTEXT /* we want efficiency */ |
3
|
|
|
|
|
|
|
4
|
|
|
|
|
|
/* private functions which need pTHX_ and aTHX_ |
5
|
|
|
|
|
|
pv_cat_decompHangul |
6
|
|
|
|
|
|
sv_2pvunicode |
7
|
|
|
|
|
|
pv_utf8_decompose |
8
|
|
|
|
|
|
pv_utf8_reorder |
9
|
|
|
|
|
|
pv_utf8_compose |
10
|
|
|
|
|
|
*/ |
11
|
|
|
|
|
|
|
12
|
|
|
|
|
|
#include "EXTERN.h" |
13
|
|
|
|
|
|
#include "perl.h" |
14
|
|
|
|
|
|
#include "XSUB.h" |
15
|
|
|
|
|
|
|
16
|
|
|
|
|
|
/* These 5 files are prepared by mkheader */ |
17
|
|
|
|
|
|
#include "unfcmb.h" |
18
|
|
|
|
|
|
#include "unfcan.h" |
19
|
|
|
|
|
|
#include "unfcpt.h" |
20
|
|
|
|
|
|
#include "unfcmp.h" |
21
|
|
|
|
|
|
#include "unfexc.h" |
22
|
|
|
|
|
|
|
23
|
|
|
|
|
|
/* Perl 5.6.1 ? */ |
24
|
|
|
|
|
|
#ifndef uvuni_to_utf8 |
25
|
|
|
|
|
|
#define uvuni_to_utf8 uv_to_utf8 |
26
|
|
|
|
|
|
#endif /* uvuni_to_utf8 */ |
27
|
|
|
|
|
|
|
28
|
|
|
|
|
|
/* Perl 5.6.1 ? */ |
29
|
|
|
|
|
|
#ifndef utf8n_to_uvuni |
30
|
|
|
|
|
|
#define utf8n_to_uvuni utf8_to_uv |
31
|
|
|
|
|
|
#endif /* utf8n_to_uvuni */ |
32
|
|
|
|
|
|
|
33
|
|
|
|
|
|
/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
34
|
|
|
|
|
|
#ifndef UTF8_ALLOW_BOM |
35
|
|
|
|
|
|
#define UTF8_ALLOW_BOM (0) |
36
|
|
|
|
|
|
#endif /* UTF8_ALLOW_BOM */ |
37
|
|
|
|
|
|
|
38
|
|
|
|
|
|
#ifndef UTF8_ALLOW_SURROGATE |
39
|
|
|
|
|
|
#define UTF8_ALLOW_SURROGATE (0) |
40
|
|
|
|
|
|
#endif /* UTF8_ALLOW_SURROGATE */ |
41
|
|
|
|
|
|
|
42
|
|
|
|
|
|
#ifndef UTF8_ALLOW_FE_FF |
43
|
|
|
|
|
|
#define UTF8_ALLOW_FE_FF (0) |
44
|
|
|
|
|
|
#endif /* UTF8_ALLOW_FE_FF */ |
45
|
|
|
|
|
|
|
46
|
|
|
|
|
|
#ifndef UTF8_ALLOW_FFFF |
47
|
|
|
|
|
|
#define UTF8_ALLOW_FFFF (0) |
48
|
|
|
|
|
|
#endif /* UTF8_ALLOW_FFFF */ |
49
|
|
|
|
|
|
|
50
|
|
|
|
|
|
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) |
51
|
|
|
|
|
|
|
52
|
|
|
|
|
|
/* check if the string buffer is enough before uvuni_to_utf8(). */ |
53
|
|
|
|
|
|
/* dstart, d, and dlen should be defined outside before. */ |
54
|
|
|
|
|
|
#define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \ |
55
|
|
|
|
|
|
if (dlen < curlen + (need)) { \ |
56
|
|
|
|
|
|
dlen += (need); \ |
57
|
|
|
|
|
|
Renew(dstart, dlen+1, U8); \ |
58
|
|
|
|
|
|
d = dstart + curlen; \ |
59
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
61
|
|
|
|
|
|
/* if utf8n_to_uvuni() sets retlen to 0 (if broken?) */ |
62
|
|
|
|
|
|
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
63
|
|
|
|
|
|
|
64
|
|
|
|
|
|
/* utf8_hop() hops back before start. Maybe broken UTF-8 */ |
65
|
|
|
|
|
|
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" |
66
|
|
|
|
|
|
|
67
|
|
|
|
|
|
/* At present, char > 0x10ffff are unaffected without complaint, right? */ |
68
|
|
|
|
|
|
#define VALID_UTF_MAX (0x10ffff) |
69
|
|
|
|
|
|
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) |
70
|
|
|
|
|
|
|
71
|
|
|
|
|
|
/* size of array for combining characters */ |
72
|
|
|
|
|
|
/* enough as an initial value? */ |
73
|
|
|
|
|
|
#define CC_SEQ_SIZE (10) |
74
|
|
|
|
|
|
#define CC_SEQ_STEP (5) |
75
|
|
|
|
|
|
|
76
|
|
|
|
|
|
/* HANGUL begin */ |
77
|
|
|
|
|
|
#define Hangul_SBase 0xAC00 |
78
|
|
|
|
|
|
#define Hangul_SFinal 0xD7A3 |
79
|
|
|
|
|
|
#define Hangul_SCount 11172 |
80
|
|
|
|
|
|
|
81
|
|
|
|
|
|
#define Hangul_NCount 588 |
82
|
|
|
|
|
|
|
83
|
|
|
|
|
|
#define Hangul_LBase 0x1100 |
84
|
|
|
|
|
|
#define Hangul_LFinal 0x1112 |
85
|
|
|
|
|
|
#define Hangul_LCount 19 |
86
|
|
|
|
|
|
|
87
|
|
|
|
|
|
#define Hangul_VBase 0x1161 |
88
|
|
|
|
|
|
#define Hangul_VFinal 0x1175 |
89
|
|
|
|
|
|
#define Hangul_VCount 21 |
90
|
|
|
|
|
|
|
91
|
|
|
|
|
|
#define Hangul_TBase 0x11A7 |
92
|
|
|
|
|
|
#define Hangul_TFinal 0x11C2 |
93
|
|
|
|
|
|
#define Hangul_TCount 28 |
94
|
|
|
|
|
|
|
95
|
|
|
|
|
|
#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) |
96
|
|
|
|
|
|
#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) |
97
|
|
|
|
|
|
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) |
98
|
|
|
|
|
|
#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) |
99
|
|
|
|
|
|
#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) |
100
|
|
|
|
|
|
#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) |
101
|
|
|
|
|
|
/* HANGUL end */ |
102
|
|
|
|
|
|
|
103
|
|
|
|
|
|
/* this is used for canonical ordering of combining characters (c.c.). */ |
104
|
|
|
|
|
|
typedef struct { |
105
|
|
|
|
|
|
U8 cc; /* combining class */ |
106
|
|
|
|
|
|
UV uv; /* codepoint */ |
107
|
|
|
|
|
|
STRLEN pos; /* position */ |
108
|
|
|
|
|
|
} UNF_cc; |
109
|
|
|
|
|
|
|
110
|
2320
|
|
|
|
|
static int compare_cc(const void *a, const void *b) |
111
|
|
|
|
|
|
{ |
112
|
|
|
|
|
|
int ret_cc; |
113
|
2320
|
|
|
|
|
ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
114
|
2320
|
|
|
|
|
if (ret_cc) |
115
|
|
|
|
|
|
return ret_cc; |
116
|
|
|
|
|
|
|
117
|
3612
|
|
|
|
|
return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
118
|
1806
|
|
|
|
|
- ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
119
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
121
|
6908
|
|
|
|
|
static U8* dec_canonical(UV uv) |
122
|
|
|
|
|
|
{ |
123
|
|
|
|
|
|
U8 ***plane, **row; |
124
|
6908
|
|
|
|
|
if (OVER_UTF_MAX(uv)) |
125
|
|
|
|
|
|
return NULL; |
126
|
6888
|
|
|
|
|
plane = (U8***)UNF_canon[uv >> 16]; |
127
|
6888
|
|
|
|
|
if (! plane) |
128
|
|
|
|
|
|
return NULL; |
129
|
6866
|
|
|
|
|
row = plane[(uv >> 8) & 0xff]; |
130
|
6866
|
|
|
|
|
return row ? row[uv & 0xff] : NULL; |
131
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
133
|
1042
|
|
|
|
|
static U8* dec_compat(UV uv) |
134
|
|
|
|
|
|
{ |
135
|
|
|
|
|
|
U8 ***plane, **row; |
136
|
1042
|
|
|
|
|
if (OVER_UTF_MAX(uv)) |
137
|
|
|
|
|
|
return NULL; |
138
|
1030
|
|
|
|
|
plane = (U8***)UNF_compat[uv >> 16]; |
139
|
1030
|
|
|
|
|
if (! plane) |
140
|
|
|
|
|
|
return NULL; |
141
|
1024
|
|
|
|
|
row = plane[(uv >> 8) & 0xff]; |
142
|
1024
|
|
|
|
|
return row ? row[uv & 0xff] : NULL; |
143
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
145
|
972
|
|
|
|
|
static UV composite_uv(UV uv, UV uv2) |
146
|
|
|
|
|
|
{ |
147
|
|
|
|
|
|
UNF_complist ***plane, **row, *cell, *i; |
148
|
|
|
|
|
|
|
149
|
972
|
|
|
|
|
if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
150
|
|
|
|
|
|
return 0; |
151
|
|
|
|
|
|
|
152
|
954
|
|
|
|
|
if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
153
|
136
|
|
|
|
|
UV lindex = uv - Hangul_LBase; |
154
|
136
|
|
|
|
|
UV vindex = uv2 - Hangul_VBase; |
155
|
136
|
|
|
|
|
return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * |
156
|
|
|
|
|
|
Hangul_TCount); |
157
|
|
|
|
|
|
} |
158
|
818
|
|
|
|
|
if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
159
|
88
|
|
|
|
|
UV tindex = uv2 - Hangul_TBase; |
160
|
88
|
|
|
|
|
return(uv + tindex); |
161
|
|
|
|
|
|
} |
162
|
730
|
|
|
|
|
plane = UNF_compos[uv >> 16]; |
163
|
730
|
|
|
|
|
if (! plane) |
164
|
|
|
|
|
|
return 0; |
165
|
730
|
|
|
|
|
row = plane[(uv >> 8) & 0xff]; |
166
|
730
|
|
|
|
|
if (! row) |
167
|
|
|
|
|
|
return 0; |
168
|
654
|
|
|
|
|
cell = row[uv & 0xff]; |
169
|
654
|
|
|
|
|
if (! cell) |
170
|
|
|
|
|
|
return 0; |
171
|
2366
|
|
|
|
|
for (i = cell; i->nextchar; i++) { |
172
|
2624
|
|
|
|
|
if (uv2 == i->nextchar) |
173
|
258
|
|
|
|
|
return i->composite; |
174
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
return 0; |
176
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
178
|
10758
|
|
|
|
|
static U8 getCombinClass(UV uv) |
179
|
|
|
|
|
|
{ |
180
|
|
|
|
|
|
U8 **plane, *row; |
181
|
10758
|
|
|
|
|
if (OVER_UTF_MAX(uv)) |
182
|
|
|
|
|
|
return 0; |
183
|
10714
|
|
|
|
|
plane = (U8**)UNF_combin[uv >> 16]; |
184
|
10714
|
|
|
|
|
if (! plane) |
185
|
|
|
|
|
|
return 0; |
186
|
10670
|
|
|
|
|
row = plane[(uv >> 8) & 0xff]; |
187
|
10670
|
|
|
|
|
return row ? row[uv & 0xff] : 0; |
188
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
190
|
470
|
|
|
|
|
static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv) |
191
|
|
|
|
|
|
{ |
192
|
470
|
|
|
|
|
UV sindex = uv - Hangul_SBase; |
193
|
470
|
|
|
|
|
UV lindex = sindex / Hangul_NCount; |
194
|
470
|
|
|
|
|
UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
195
|
470
|
|
|
|
|
UV tindex = sindex % Hangul_TCount; |
196
|
|
|
|
|
|
|
197
|
470
|
|
|
|
|
if (! Hangul_IsS(uv)) |
198
|
|
|
|
|
|
return d; |
199
|
|
|
|
|
|
|
200
|
470
|
|
|
|
|
d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); |
201
|
470
|
|
|
|
|
d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); |
202
|
470
|
|
|
|
|
if (tindex) |
203
|
22
|
|
|
|
|
d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); |
204
|
470
|
|
|
|
|
return d; |
205
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
207
|
2160
|
|
|
|
|
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp) |
208
|
|
|
|
|
|
{ |
209
|
|
|
|
|
|
char *s; |
210
|
|
|
|
|
|
STRLEN len; |
211
|
2160
|
|
|
|
|
s = SvPV(sv,len); |
212
|
2160
|
|
|
|
|
if (!SvUTF8(sv)) { |
213
|
464
|
|
|
|
|
SV* tmpsv = sv_2mortal(newSVpvn(s, len)); |
214
|
464
|
|
|
|
|
if (!SvPOK(tmpsv)) |
215
|
0
|
|
|
|
|
s = SvPV_force(tmpsv,len); |
216
|
464
|
|
|
|
|
sv_utf8_upgrade(tmpsv); |
217
|
464
|
|
|
|
|
s = SvPV(tmpsv,len); |
218
|
|
|
|
|
|
} |
219
|
2160
|
|
|
|
|
if (lp) |
220
|
2160
|
|
|
|
|
*lp = len; |
221
|
2160
|
|
|
|
|
return s; |
222
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
224
|
|
|
|
|
|
static |
225
|
1628
|
|
|
|
|
U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) |
226
|
|
|
|
|
|
{ |
227
|
|
|
|
|
|
U8* p = s; |
228
|
1628
|
|
|
|
|
U8* e = s + slen; |
229
|
1628
|
|
|
|
|
U8* dstart = *dp; |
230
|
|
|
|
|
|
U8* d = dstart; |
231
|
|
|
|
|
|
|
232
|
10856
|
|
|
|
|
while (p < e) { |
233
|
|
|
|
|
|
STRLEN retlen; |
234
|
7600
|
|
|
|
|
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
235
|
7600
|
|
|
|
|
if (!retlen) |
236
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "decompose"); |
237
|
7600
|
|
|
|
|
p += retlen; |
238
|
|
|
|
|
|
|
239
|
7600
|
|
|
|
|
if (Hangul_IsS(uv)) { |
240
|
454
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) |
241
|
454
|
|
|
|
|
d = pv_cat_decompHangul(aTHX_ d, uv); |
242
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
else { |
244
|
7146
|
|
|
|
|
U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
245
|
|
|
|
|
|
|
246
|
7146
|
|
|
|
|
if (r) { |
247
|
738
|
|
|
|
|
STRLEN len = (STRLEN)strlen((char *)r); |
248
|
738
|
|
|
|
|
Renew_d_if_not_enough_to(len) |
249
|
4254
|
|
|
|
|
while (len--) |
250
|
3516
|
|
|
|
|
*d++ = *r++; |
251
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
else { |
253
|
6408
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
254
|
6408
|
|
|
|
|
d = uvuni_to_utf8(d, uv); |
255
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
} |
258
|
1628
|
|
|
|
|
*dp = dstart; |
259
|
1628
|
|
|
|
|
return d; |
260
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
262
|
|
|
|
|
|
static |
263
|
1578
|
|
|
|
|
U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) |
264
|
|
|
|
|
|
{ |
265
|
|
|
|
|
|
U8* p = s; |
266
|
1578
|
|
|
|
|
U8* e = s + slen; |
267
|
1578
|
|
|
|
|
U8* dstart = *dp; |
268
|
|
|
|
|
|
U8* d = dstart; |
269
|
|
|
|
|
|
|
270
|
|
|
|
|
|
UNF_cc seq_ary[CC_SEQ_SIZE]; |
271
|
|
|
|
|
|
UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ |
272
|
|
|
|
|
|
UNF_cc* seq_ext = NULL; /* extend if need */ |
273
|
|
|
|
|
|
STRLEN seq_max = CC_SEQ_SIZE; |
274
|
|
|
|
|
|
STRLEN cc_pos = 0; |
275
|
|
|
|
|
|
|
276
|
10208
|
|
|
|
|
while (p < e) { |
277
|
|
|
|
|
|
U8 curCC; |
278
|
|
|
|
|
|
STRLEN retlen; |
279
|
7052
|
|
|
|
|
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
280
|
7052
|
|
|
|
|
if (!retlen) |
281
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "reorder"); |
282
|
7052
|
|
|
|
|
p += retlen; |
283
|
|
|
|
|
|
|
284
|
7052
|
|
|
|
|
curCC = getCombinClass(uv); |
285
|
|
|
|
|
|
|
286
|
7052
|
|
|
|
|
if (curCC != 0) { |
287
|
2012
|
|
|
|
|
if (seq_max < cc_pos + 1) { /* extend if need */ |
288
|
108
|
|
|
|
|
seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
289
|
108
|
|
|
|
|
if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
290
|
|
|
|
|
|
STRLEN i; |
291
|
24
|
|
|
|
|
New(0, seq_ext, seq_max, UNF_cc); |
292
|
264
|
|
|
|
|
for (i = 0; i < cc_pos; i++) |
293
|
240
|
|
|
|
|
seq_ext[i] = seq_ary[i]; |
294
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
else { |
296
|
84
|
|
|
|
|
Renew(seq_ext, seq_max, UNF_cc); |
297
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
seq_ptr = seq_ext; /* use seq_ext from now */ |
299
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
301
|
2012
|
|
|
|
|
seq_ptr[cc_pos].cc = curCC; |
302
|
2012
|
|
|
|
|
seq_ptr[cc_pos].uv = uv; |
303
|
2012
|
|
|
|
|
seq_ptr[cc_pos].pos = cc_pos; |
304
|
2012
|
|
|
|
|
++cc_pos; |
305
|
|
|
|
|
|
|
306
|
2012
|
|
|
|
|
if (p < e) |
307
|
1452
|
|
|
|
|
continue; |
308
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
310
|
|
|
|
|
|
/* output */ |
311
|
5600
|
|
|
|
|
if (cc_pos) { |
312
|
|
|
|
|
|
STRLEN i; |
313
|
|
|
|
|
|
|
314
|
852
|
|
|
|
|
if (cc_pos > 1) /* reordered if there are two c.c.'s */ |
315
|
286
|
|
|
|
|
qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); |
316
|
|
|
|
|
|
|
317
|
2012
|
|
|
|
|
for (i = 0; i < cc_pos; i++) { |
318
|
2012
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
319
|
2012
|
|
|
|
|
d = uvuni_to_utf8(d, seq_ptr[i].uv); |
320
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
cc_pos = 0; |
322
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
324
|
5600
|
|
|
|
|
if (curCC == 0) { |
325
|
5040
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
326
|
5040
|
|
|
|
|
d = uvuni_to_utf8(d, uv); |
327
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
} |
329
|
1578
|
|
|
|
|
if (seq_ext) |
330
|
24
|
|
|
|
|
Safefree(seq_ext); |
331
|
1578
|
|
|
|
|
*dp = dstart; |
332
|
1578
|
|
|
|
|
return d; |
333
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
335
|
|
|
|
|
|
static |
336
|
714
|
|
|
|
|
U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) |
337
|
|
|
|
|
|
{ |
338
|
|
|
|
|
|
U8* p = s; |
339
|
714
|
|
|
|
|
U8* e = s + slen; |
340
|
714
|
|
|
|
|
U8* dstart = *dp; |
341
|
|
|
|
|
|
U8* d = dstart; |
342
|
|
|
|
|
|
|
343
|
|
|
|
|
|
UV uvS = 0; /* code point of the starter */ |
344
|
|
|
|
|
|
bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
345
|
|
|
|
|
|
U8 preCC = 0; |
346
|
|
|
|
|
|
|
347
|
|
|
|
|
|
UV seq_ary[CC_SEQ_SIZE]; |
348
|
|
|
|
|
|
UV* seq_ptr = seq_ary; /* use array at the beginning */ |
349
|
|
|
|
|
|
UV* seq_ext = NULL; /* extend if need */ |
350
|
|
|
|
|
|
STRLEN seq_max = CC_SEQ_SIZE; |
351
|
|
|
|
|
|
STRLEN cc_pos = 0; |
352
|
|
|
|
|
|
|
353
|
3810
|
|
|
|
|
while (p < e) { |
354
|
|
|
|
|
|
U8 curCC; |
355
|
|
|
|
|
|
STRLEN retlen; |
356
|
2382
|
|
|
|
|
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
357
|
2382
|
|
|
|
|
if (!retlen) |
358
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "compose"); |
359
|
2382
|
|
|
|
|
p += retlen; |
360
|
|
|
|
|
|
|
361
|
2382
|
|
|
|
|
curCC = getCombinClass(uv); |
362
|
|
|
|
|
|
|
363
|
2382
|
|
|
|
|
if (!valid_uvS) { |
364
|
748
|
|
|
|
|
if (curCC == 0) { |
365
|
|
|
|
|
|
uvS = uv; /* the first Starter is found */ |
366
|
|
|
|
|
|
valid_uvS = TRUE; |
367
|
678
|
|
|
|
|
if (p < e) |
368
|
460
|
|
|
|
|
continue; |
369
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
else { |
371
|
70
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
372
|
70
|
|
|
|
|
d = uvuni_to_utf8(d, uv); |
373
|
70
|
|
|
|
|
continue; |
374
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
else { |
377
|
|
|
|
|
|
bool composed; |
378
|
|
|
|
|
|
|
379
|
|
|
|
|
|
/* blocked */ |
380
|
3002
|
|
|
|
|
if ((iscontig && cc_pos) || /* discontiguous combination */ |
381
|
2372
|
|
|
|
|
(curCC != 0 && preCC == curCC) || /* blocked by same CC */ |
382
|
|
|
|
|
|
(preCC > curCC)) /* blocked by higher CC: revised D2 */ |
383
|
|
|
|
|
|
composed = FALSE; |
384
|
|
|
|
|
|
|
385
|
|
|
|
|
|
/* not blocked: |
386
|
|
|
|
|
|
iscontig && cc_pos == 0 -- contiguous combination |
387
|
|
|
|
|
|
curCC == 0 && preCC == 0 -- starter + starter |
388
|
|
|
|
|
|
curCC != 0 && preCC < curCC -- lower CC */ |
389
|
|
|
|
|
|
else { |
390
|
|
|
|
|
|
/* try composition */ |
391
|
930
|
|
|
|
|
UV uvComp = composite_uv(uvS, uv); |
392
|
|
|
|
|
|
|
393
|
930
|
|
|
|
|
if (uvComp && !isExclusion(uvComp)) { |
394
|
|
|
|
|
|
uvS = uvComp; |
395
|
|
|
|
|
|
composed = TRUE; |
396
|
|
|
|
|
|
|
397
|
|
|
|
|
|
/* preCC should not be changed to curCC */ |
398
|
|
|
|
|
|
/* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ |
399
|
464
|
|
|
|
|
if (p < e) |
400
|
240
|
|
|
|
|
continue; |
401
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
else |
403
|
|
|
|
|
|
composed = FALSE; |
404
|
|
|
|
|
|
} |
405
|
|
|
|
|
|
|
406
|
1394
|
|
|
|
|
if (!composed) { |
407
|
|
|
|
|
|
preCC = curCC; |
408
|
1170
|
|
|
|
|
if (curCC != 0 || !(p < e)) { |
409
|
1012
|
|
|
|
|
if (seq_max < cc_pos + 1) { /* extend if need */ |
410
|
100
|
|
|
|
|
seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
411
|
100
|
|
|
|
|
if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
412
|
20
|
|
|
|
|
New(0, seq_ext, seq_max, UV); |
413
|
20
|
|
|
|
|
Copy(seq_ary, seq_ext, cc_pos, UV); |
414
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
else { |
416
|
80
|
|
|
|
|
Renew(seq_ext, seq_max, UV); |
417
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
seq_ptr = seq_ext; /* use seq_ext from now */ |
419
|
|
|
|
|
|
} |
420
|
1012
|
|
|
|
|
seq_ptr[cc_pos] = uv; |
421
|
1012
|
|
|
|
|
++cc_pos; |
422
|
|
|
|
|
|
} |
423
|
1170
|
|
|
|
|
if (curCC != 0 && p < e) |
424
|
776
|
|
|
|
|
continue; |
425
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
428
|
|
|
|
|
|
/* output */ |
429
|
|
|
|
|
|
{ |
430
|
836
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
431
|
836
|
|
|
|
|
d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
432
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
434
|
836
|
|
|
|
|
if (cc_pos) { |
435
|
|
|
|
|
|
STRLEN i; |
436
|
|
|
|
|
|
|
437
|
1012
|
|
|
|
|
for (i = 0; i < cc_pos; i++) { |
438
|
1012
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
439
|
1012
|
|
|
|
|
d = uvuni_to_utf8(d, seq_ptr[i]); |
440
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
cc_pos = 0; |
442
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
444
|
|
|
|
|
|
uvS = uv; |
445
|
|
|
|
|
|
} |
446
|
714
|
|
|
|
|
if (seq_ext) |
447
|
20
|
|
|
|
|
Safefree(seq_ext); |
448
|
714
|
|
|
|
|
*dp = dstart; |
449
|
714
|
|
|
|
|
return d; |
450
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
452
|
|
|
|
|
|
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
453
|
|
|
|
|
|
|
454
|
|
|
|
|
|
SV* |
455
|
|
|
|
|
|
decompose(src, compat = &PL_sv_no) |
456
|
|
|
|
|
|
SV * src |
457
|
|
|
|
|
|
SV * compat |
458
|
|
|
|
|
|
PROTOTYPE: $;$ |
459
|
|
|
|
|
|
PREINIT: |
460
|
|
|
|
|
|
SV* dst; |
461
|
|
|
|
|
|
U8 *s, *d, *dend; |
462
|
|
|
|
|
|
STRLEN slen, dlen; |
463
|
|
|
|
|
|
CODE: |
464
|
96
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
465
|
96
|
|
|
|
|
dst = newSVpvn("", 0); |
466
|
96
|
|
|
|
|
dlen = slen; |
467
|
96
|
|
|
|
|
New(0, d, dlen+1, U8); |
468
|
96
|
|
|
|
|
dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat)); |
469
|
96
|
|
|
|
|
sv_setpvn(dst, (char *)d, dend - d); |
470
|
96
|
|
|
|
|
SvUTF8_on(dst); |
471
|
96
|
|
|
|
|
Safefree(d); |
472
|
|
|
|
|
|
RETVAL = dst; |
473
|
|
|
|
|
|
OUTPUT: |
474
|
|
|
|
|
|
RETVAL |
475
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
477
|
|
|
|
|
|
SV* |
478
|
|
|
|
|
|
reorder(src) |
479
|
|
|
|
|
|
SV * src |
480
|
|
|
|
|
|
PROTOTYPE: $ |
481
|
|
|
|
|
|
PREINIT: |
482
|
|
|
|
|
|
SV* dst; |
483
|
|
|
|
|
|
U8 *s, *d, *dend; |
484
|
|
|
|
|
|
STRLEN slen, dlen; |
485
|
|
|
|
|
|
CODE: |
486
|
46
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
487
|
46
|
|
|
|
|
dst = newSVpvn("", 0); |
488
|
46
|
|
|
|
|
dlen = slen; |
489
|
46
|
|
|
|
|
New(0, d, dlen+1, U8); |
490
|
46
|
|
|
|
|
dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen); |
491
|
46
|
|
|
|
|
sv_setpvn(dst, (char *)d, dend - d); |
492
|
46
|
|
|
|
|
SvUTF8_on(dst); |
493
|
46
|
|
|
|
|
Safefree(d); |
494
|
|
|
|
|
|
RETVAL = dst; |
495
|
|
|
|
|
|
OUTPUT: |
496
|
|
|
|
|
|
RETVAL |
497
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
499
|
|
|
|
|
|
SV* |
500
|
|
|
|
|
|
compose(src) |
501
|
|
|
|
|
|
SV * src |
502
|
|
|
|
|
|
PROTOTYPE: $ |
503
|
|
|
|
|
|
ALIAS: |
504
|
|
|
|
|
|
composeContiguous = 1 |
505
|
|
|
|
|
|
PREINIT: |
506
|
|
|
|
|
|
SV* dst; |
507
|
|
|
|
|
|
U8 *s, *d, *dend; |
508
|
|
|
|
|
|
STRLEN slen, dlen; |
509
|
|
|
|
|
|
CODE: |
510
|
80
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
511
|
80
|
|
|
|
|
dst = newSVpvn("", 0); |
512
|
80
|
|
|
|
|
dlen = slen; |
513
|
80
|
|
|
|
|
New(0, d, dlen+1, U8); |
514
|
80
|
|
|
|
|
dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix); |
515
|
80
|
|
|
|
|
sv_setpvn(dst, (char *)d, dend - d); |
516
|
80
|
|
|
|
|
SvUTF8_on(dst); |
517
|
80
|
|
|
|
|
Safefree(d); |
518
|
|
|
|
|
|
RETVAL = dst; |
519
|
|
|
|
|
|
OUTPUT: |
520
|
|
|
|
|
|
RETVAL |
521
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
523
|
|
|
|
|
|
SV* |
524
|
|
|
|
|
|
NFD(src) |
525
|
|
|
|
|
|
SV * src |
526
|
|
|
|
|
|
PROTOTYPE: $ |
527
|
|
|
|
|
|
ALIAS: |
528
|
|
|
|
|
|
NFKD = 1 |
529
|
|
|
|
|
|
PREINIT: |
530
|
|
|
|
|
|
SV *dst; |
531
|
|
|
|
|
|
U8 *s, *t, *tend, *d, *dend; |
532
|
|
|
|
|
|
STRLEN slen, tlen, dlen; |
533
|
|
|
|
|
|
CODE: |
534
|
898
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
535
|
|
|
|
|
|
|
536
|
|
|
|
|
|
/* decompose */ |
537
|
898
|
|
|
|
|
tlen = slen; |
538
|
898
|
|
|
|
|
New(0, t, tlen+1, U8); |
539
|
898
|
|
|
|
|
tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); |
540
|
898
|
|
|
|
|
*tend = '\0'; |
541
|
898
|
|
|
|
|
tlen = tend - t; /* no longer know real size of t */ |
542
|
|
|
|
|
|
|
543
|
|
|
|
|
|
/* reorder */ |
544
|
|
|
|
|
|
dlen = tlen; |
545
|
898
|
|
|
|
|
New(0, d, dlen+1, U8); |
546
|
898
|
|
|
|
|
dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen); |
547
|
898
|
|
|
|
|
*dend = '\0'; |
548
|
898
|
|
|
|
|
dlen = dend - d; /* no longer know real size of d */ |
549
|
|
|
|
|
|
|
550
|
|
|
|
|
|
/* return */ |
551
|
898
|
|
|
|
|
dst = newSVpvn("", 0); |
552
|
898
|
|
|
|
|
sv_setpvn(dst, (char *)d, dlen); |
553
|
898
|
|
|
|
|
SvUTF8_on(dst); |
554
|
|
|
|
|
|
|
555
|
898
|
|
|
|
|
Safefree(t); |
556
|
898
|
|
|
|
|
Safefree(d); |
557
|
|
|
|
|
|
RETVAL = dst; |
558
|
|
|
|
|
|
OUTPUT: |
559
|
|
|
|
|
|
RETVAL |
560
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
562
|
|
|
|
|
|
SV* |
563
|
|
|
|
|
|
NFC(src) |
564
|
|
|
|
|
|
SV * src |
565
|
|
|
|
|
|
PROTOTYPE: $ |
566
|
|
|
|
|
|
ALIAS: |
567
|
|
|
|
|
|
NFKC = 1 |
568
|
|
|
|
|
|
FCC = 2 |
569
|
|
|
|
|
|
PREINIT: |
570
|
|
|
|
|
|
SV *dst; |
571
|
|
|
|
|
|
U8 *s, *t, *tend, *u, *uend, *d, *dend; |
572
|
|
|
|
|
|
STRLEN slen, tlen, ulen, dlen; |
573
|
|
|
|
|
|
CODE: |
574
|
634
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
575
|
|
|
|
|
|
|
576
|
|
|
|
|
|
/* decompose */ |
577
|
634
|
|
|
|
|
tlen = slen; |
578
|
634
|
|
|
|
|
New(0, t, tlen+1, U8); |
579
|
634
|
|
|
|
|
tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); |
580
|
634
|
|
|
|
|
*tend = '\0'; |
581
|
634
|
|
|
|
|
tlen = tend - t; /* no longer know real size of t */ |
582
|
|
|
|
|
|
|
583
|
|
|
|
|
|
/* reorder */ |
584
|
|
|
|
|
|
ulen = tlen; |
585
|
634
|
|
|
|
|
New(0, u, ulen+1, U8); |
586
|
634
|
|
|
|
|
uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen); |
587
|
634
|
|
|
|
|
*uend = '\0'; |
588
|
634
|
|
|
|
|
ulen = uend - u; /* no longer know real size of u */ |
589
|
|
|
|
|
|
|
590
|
|
|
|
|
|
/* compose */ |
591
|
|
|
|
|
|
dlen = ulen; |
592
|
634
|
|
|
|
|
New(0, d, dlen+1, U8); |
593
|
634
|
|
|
|
|
dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2)); |
594
|
634
|
|
|
|
|
*dend = '\0'; |
595
|
634
|
|
|
|
|
dlen = dend - d; /* no longer know real size of d */ |
596
|
|
|
|
|
|
|
597
|
|
|
|
|
|
/* return */ |
598
|
634
|
|
|
|
|
dst = newSVpvn("", 0); |
599
|
634
|
|
|
|
|
sv_setpvn(dst, (char *)d, dlen); |
600
|
634
|
|
|
|
|
SvUTF8_on(dst); |
601
|
|
|
|
|
|
|
602
|
634
|
|
|
|
|
Safefree(t); |
603
|
634
|
|
|
|
|
Safefree(u); |
604
|
634
|
|
|
|
|
Safefree(d); |
605
|
|
|
|
|
|
RETVAL = dst; |
606
|
|
|
|
|
|
OUTPUT: |
607
|
|
|
|
|
|
RETVAL |
608
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
610
|
|
|
|
|
|
SV* |
611
|
|
|
|
|
|
checkNFD(src) |
612
|
|
|
|
|
|
SV * src |
613
|
|
|
|
|
|
PROTOTYPE: $ |
614
|
|
|
|
|
|
ALIAS: |
615
|
|
|
|
|
|
checkNFKD = 1 |
616
|
|
|
|
|
|
PREINIT: |
617
|
|
|
|
|
|
STRLEN srclen, retlen; |
618
|
|
|
|
|
|
U8 *s, *e, *p, curCC, preCC; |
619
|
|
|
|
|
|
bool result = TRUE; |
620
|
|
|
|
|
|
CODE: |
621
|
46
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
622
|
46
|
|
|
|
|
e = s + srclen; |
623
|
|
|
|
|
|
|
624
|
|
|
|
|
|
preCC = 0; |
625
|
146
|
|
|
|
|
for (p = s; p < e; p += retlen) { |
626
|
122
|
|
|
|
|
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
627
|
122
|
|
|
|
|
if (!retlen) |
628
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "checkNFD or -NFKD"); |
629
|
|
|
|
|
|
|
630
|
122
|
|
|
|
|
curCC = getCombinClass(uv); |
631
|
122
|
|
|
|
|
if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
632
|
|
|
|
|
|
result = FALSE; |
633
|
|
|
|
|
|
break; |
634
|
|
|
|
|
|
} |
635
|
122
|
|
|
|
|
if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { |
636
|
|
|
|
|
|
result = FALSE; |
637
|
|
|
|
|
|
break; |
638
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
preCC = curCC; |
640
|
|
|
|
|
|
} |
641
|
46
|
|
|
|
|
RETVAL = boolSV(result); |
642
|
|
|
|
|
|
OUTPUT: |
643
|
|
|
|
|
|
RETVAL |
644
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
646
|
|
|
|
|
|
SV* |
647
|
|
|
|
|
|
checkNFC(src) |
648
|
|
|
|
|
|
SV * src |
649
|
|
|
|
|
|
PROTOTYPE: $ |
650
|
|
|
|
|
|
ALIAS: |
651
|
|
|
|
|
|
checkNFKC = 1 |
652
|
|
|
|
|
|
PREINIT: |
653
|
|
|
|
|
|
STRLEN srclen, retlen; |
654
|
|
|
|
|
|
U8 *s, *e, *p, curCC, preCC; |
655
|
|
|
|
|
|
bool result = TRUE; |
656
|
|
|
|
|
|
bool isMAYBE = FALSE; |
657
|
|
|
|
|
|
CODE: |
658
|
74
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
659
|
74
|
|
|
|
|
e = s + srclen; |
660
|
|
|
|
|
|
|
661
|
|
|
|
|
|
preCC = 0; |
662
|
256
|
|
|
|
|
for (p = s; p < e; p += retlen) { |
663
|
206
|
|
|
|
|
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
664
|
206
|
|
|
|
|
if (!retlen) |
665
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "checkNFC or -NFKC"); |
666
|
|
|
|
|
|
|
667
|
206
|
|
|
|
|
curCC = getCombinClass(uv); |
668
|
206
|
|
|
|
|
if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
669
|
|
|
|
|
|
result = FALSE; |
670
|
|
|
|
|
|
break; |
671
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
673
|
|
|
|
|
|
/* get NFC/NFKC property */ |
674
|
202
|
|
|
|
|
if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ |
675
|
|
|
|
|
|
; /* YES */ |
676
|
196
|
|
|
|
|
else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
677
|
|
|
|
|
|
result = FALSE; |
678
|
|
|
|
|
|
break; |
679
|
|
|
|
|
|
} |
680
|
184
|
|
|
|
|
else if (isComp2nd(uv)) |
681
|
|
|
|
|
|
isMAYBE = TRUE; |
682
|
158
|
|
|
|
|
else if (ix) { |
683
|
|
|
|
|
|
char *canon, *compat; |
684
|
|
|
|
|
|
/* NFKC_NO when having compatibility mapping. */ |
685
|
68
|
|
|
|
|
canon = (char *) dec_canonical(uv); |
686
|
68
|
|
|
|
|
compat = (char *) dec_compat(uv); |
687
|
68
|
|
|
|
|
if (compat && !(canon && strEQ(canon, compat))) { |
688
|
|
|
|
|
|
result = FALSE; |
689
|
|
|
|
|
|
break; |
690
|
|
|
|
|
|
} |
691
|
|
|
|
|
|
} /* end of get NFC/NFKC property */ |
692
|
|
|
|
|
|
|
693
|
|
|
|
|
|
preCC = curCC; |
694
|
|
|
|
|
|
} |
695
|
74
|
|
|
|
|
if (isMAYBE && result) /* NO precedes MAYBE */ |
696
|
12
|
|
|
|
|
XSRETURN_UNDEF; |
697
|
62
|
|
|
|
|
RETVAL = boolSV(result); |
698
|
|
|
|
|
|
OUTPUT: |
699
|
|
|
|
|
|
RETVAL |
700
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
702
|
|
|
|
|
|
SV* |
703
|
|
|
|
|
|
checkFCD(src) |
704
|
|
|
|
|
|
SV * src |
705
|
|
|
|
|
|
PROTOTYPE: $ |
706
|
|
|
|
|
|
ALIAS: |
707
|
|
|
|
|
|
checkFCC = 1 |
708
|
|
|
|
|
|
PREINIT: |
709
|
|
|
|
|
|
STRLEN srclen, retlen; |
710
|
|
|
|
|
|
U8 *s, *e, *p, curCC, preCC; |
711
|
|
|
|
|
|
bool result = TRUE; |
712
|
|
|
|
|
|
bool isMAYBE = FALSE; |
713
|
|
|
|
|
|
CODE: |
714
|
134
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
715
|
134
|
|
|
|
|
e = s + srclen; |
716
|
|
|
|
|
|
preCC = 0; |
717
|
450
|
|
|
|
|
for (p = s; p < e; p += retlen) { |
718
|
|
|
|
|
|
U8 *sCan; |
719
|
|
|
|
|
|
UV uvLead; |
720
|
|
|
|
|
|
STRLEN canlen = 0; |
721
|
358
|
|
|
|
|
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
722
|
358
|
|
|
|
|
if (!retlen) |
723
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
724
|
|
|
|
|
|
|
725
|
358
|
|
|
|
|
sCan = (U8*) dec_canonical(uv); |
726
|
|
|
|
|
|
|
727
|
358
|
|
|
|
|
if (sCan) { |
728
|
|
|
|
|
|
STRLEN canret; |
729
|
48
|
|
|
|
|
canlen = (STRLEN)strlen((char *) sCan); |
730
|
48
|
|
|
|
|
uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); |
731
|
48
|
|
|
|
|
if (!canret) |
732
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
733
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
else { |
735
|
|
|
|
|
|
uvLead = uv; |
736
|
|
|
|
|
|
} |
737
|
|
|
|
|
|
|
738
|
358
|
|
|
|
|
curCC = getCombinClass(uvLead); |
739
|
|
|
|
|
|
|
740
|
358
|
|
|
|
|
if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ |
741
|
|
|
|
|
|
result = FALSE; |
742
|
|
|
|
|
|
break; |
743
|
|
|
|
|
|
} |
744
|
|
|
|
|
|
|
745
|
322
|
|
|
|
|
if (ix) { |
746
|
42
|
|
|
|
|
if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
747
|
|
|
|
|
|
result = FALSE; |
748
|
|
|
|
|
|
break; |
749
|
|
|
|
|
|
} |
750
|
36
|
|
|
|
|
else if (isComp2nd(uv)) |
751
|
|
|
|
|
|
isMAYBE = TRUE; |
752
|
|
|
|
|
|
} |
753
|
|
|
|
|
|
|
754
|
316
|
|
|
|
|
if (sCan) { |
755
|
|
|
|
|
|
STRLEN canret; |
756
|
|
|
|
|
|
UV uvTrail; |
757
|
42
|
|
|
|
|
U8* eCan = sCan + canlen; |
758
|
42
|
|
|
|
|
U8* pCan = utf8_hop(eCan, -1); |
759
|
42
|
|
|
|
|
if (pCan < sCan) |
760
|
0
|
|
|
|
|
croak(ErrHopBeforeStart); |
761
|
42
|
|
|
|
|
uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); |
762
|
42
|
|
|
|
|
if (!canret) |
763
|
0
|
|
|
|
|
croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
764
|
42
|
|
|
|
|
preCC = getCombinClass(uvTrail); |
765
|
|
|
|
|
|
} |
766
|
|
|
|
|
|
else { |
767
|
|
|
|
|
|
preCC = curCC; |
768
|
|
|
|
|
|
} |
769
|
|
|
|
|
|
} |
770
|
134
|
|
|
|
|
if (isMAYBE && result) /* NO precedes MAYBE */ |
771
|
10
|
|
|
|
|
XSRETURN_UNDEF; |
772
|
124
|
|
|
|
|
RETVAL = boolSV(result); |
773
|
|
|
|
|
|
OUTPUT: |
774
|
|
|
|
|
|
RETVAL |
775
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
777
|
|
|
|
|
|
U8 |
778
|
|
|
|
|
|
getCombinClass(uv) |
779
|
|
|
|
|
|
UV uv |
780
|
|
|
|
|
|
PROTOTYPE: $ |
781
|
|
|
|
|
|
|
782
|
|
|
|
|
|
bool |
783
|
|
|
|
|
|
isExclusion(uv) |
784
|
|
|
|
|
|
UV uv |
785
|
|
|
|
|
|
PROTOTYPE: $ |
786
|
|
|
|
|
|
|
787
|
|
|
|
|
|
bool |
788
|
|
|
|
|
|
isSingleton(uv) |
789
|
|
|
|
|
|
UV uv |
790
|
|
|
|
|
|
PROTOTYPE: $ |
791
|
|
|
|
|
|
|
792
|
|
|
|
|
|
bool |
793
|
|
|
|
|
|
isNonStDecomp(uv) |
794
|
|
|
|
|
|
UV uv |
795
|
|
|
|
|
|
PROTOTYPE: $ |
796
|
|
|
|
|
|
|
797
|
|
|
|
|
|
bool |
798
|
|
|
|
|
|
isComp2nd(uv) |
799
|
|
|
|
|
|
UV uv |
800
|
|
|
|
|
|
PROTOTYPE: $ |
801
|
|
|
|
|
|
ALIAS: |
802
|
|
|
|
|
|
isNFC_MAYBE = 1 |
803
|
|
|
|
|
|
isNFKC_MAYBE = 2 |
804
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
807
|
|
|
|
|
|
SV* |
808
|
|
|
|
|
|
isNFD_NO(uv) |
809
|
|
|
|
|
|
UV uv |
810
|
|
|
|
|
|
PROTOTYPE: $ |
811
|
|
|
|
|
|
ALIAS: |
812
|
|
|
|
|
|
isNFKD_NO = 1 |
813
|
|
|
|
|
|
PREINIT: |
814
|
|
|
|
|
|
bool result = FALSE; |
815
|
|
|
|
|
|
CODE: |
816
|
88
|
|
|
|
|
if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) |
817
|
|
|
|
|
|
result = TRUE; /* NFD_NO or NFKD_NO */ |
818
|
88
|
|
|
|
|
RETVAL = boolSV(result); |
819
|
|
|
|
|
|
OUTPUT: |
820
|
|
|
|
|
|
RETVAL |
821
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
823
|
|
|
|
|
|
SV* |
824
|
|
|
|
|
|
isComp_Ex(uv) |
825
|
|
|
|
|
|
UV uv |
826
|
|
|
|
|
|
PROTOTYPE: $ |
827
|
|
|
|
|
|
ALIAS: |
828
|
|
|
|
|
|
isNFC_NO = 0 |
829
|
|
|
|
|
|
isNFKC_NO = 1 |
830
|
|
|
|
|
|
PREINIT: |
831
|
|
|
|
|
|
bool result = FALSE; |
832
|
|
|
|
|
|
CODE: |
833
|
132
|
|
|
|
|
if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
834
|
|
|
|
|
|
result = TRUE; /* NFC_NO or NFKC_NO */ |
835
|
66
|
|
|
|
|
else if (ix) { |
836
|
|
|
|
|
|
char *canon, *compat; |
837
|
22
|
|
|
|
|
canon = (char *) dec_canonical(uv); |
838
|
22
|
|
|
|
|
compat = (char *) dec_compat(uv); |
839
|
22
|
|
|
|
|
if (compat && (!canon || strNE(canon, compat))) |
840
|
|
|
|
|
|
result = TRUE; /* NFC_NO or NFKC_NO */ |
841
|
|
|
|
|
|
} |
842
|
132
|
|
|
|
|
RETVAL = boolSV(result); |
843
|
|
|
|
|
|
OUTPUT: |
844
|
|
|
|
|
|
RETVAL |
845
|
|
|
|
|
|
|
846
|
|
|
|
|
|
SV* |
847
|
|
|
|
|
|
getComposite(uv, uv2) |
848
|
|
|
|
|
|
UV uv |
849
|
|
|
|
|
|
UV uv2 |
850
|
|
|
|
|
|
PROTOTYPE: $$ |
851
|
|
|
|
|
|
PREINIT: |
852
|
|
|
|
|
|
UV composite; |
853
|
|
|
|
|
|
CODE: |
854
|
42
|
|
|
|
|
composite = composite_uv(uv, uv2); |
855
|
42
|
|
|
|
|
RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; |
856
|
|
|
|
|
|
OUTPUT: |
857
|
|
|
|
|
|
RETVAL |
858
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
861
|
|
|
|
|
|
SV* |
862
|
|
|
|
|
|
getCanon(uv) |
863
|
|
|
|
|
|
UV uv |
864
|
|
|
|
|
|
PROTOTYPE: $ |
865
|
|
|
|
|
|
ALIAS: |
866
|
|
|
|
|
|
getCompat = 1 |
867
|
|
|
|
|
|
CODE: |
868
|
76
|
|
|
|
|
if (Hangul_IsS(uv)) { |
869
|
|
|
|
|
|
U8 tmp[3 * UTF8_MAXLEN + 1]; |
870
|
|
|
|
|
|
U8 *t = tmp; |
871
|
16
|
|
|
|
|
U8 *e = pv_cat_decompHangul(aTHX_ t, uv); |
872
|
16
|
|
|
|
|
RETVAL = newSVpvn((char *)t, e - t); |
873
|
|
|
|
|
|
} else { |
874
|
60
|
|
|
|
|
U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); |
875
|
60
|
|
|
|
|
if (!rstr) |
876
|
16
|
|
|
|
|
XSRETURN_UNDEF; |
877
|
44
|
|
|
|
|
RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); |
878
|
|
|
|
|
|
} |
879
|
60
|
|
|
|
|
SvUTF8_on(RETVAL); |
880
|
|
|
|
|
|
OUTPUT: |
881
|
|
|
|
|
|
RETVAL |
882
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
884
|
|
|
|
|
|
void |
885
|
|
|
|
|
|
splitOnLastStarter(src) |
886
|
|
|
|
|
|
SV * src |
887
|
|
|
|
|
|
PREINIT: |
888
|
|
|
|
|
|
SV *svp; |
889
|
|
|
|
|
|
STRLEN srclen; |
890
|
|
|
|
|
|
U8 *s, *e, *p; |
891
|
|
|
|
|
|
PPCODE: |
892
|
152
|
|
|
|
|
s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
893
|
152
|
|
|
|
|
e = s + srclen; |
894
|
|
|
|
|
|
p = e; |
895
|
422
|
|
|
|
|
while (s < p) { |
896
|
|
|
|
|
|
UV uv; |
897
|
266
|
|
|
|
|
p = utf8_hop(p, -1); |
898
|
266
|
|
|
|
|
if (p < s) |
899
|
0
|
|
|
|
|
croak(ErrHopBeforeStart); |
900
|
266
|
|
|
|
|
uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF); |
901
|
266
|
|
|
|
|
if (getCombinClass(uv) == 0) /* Last Starter found */ |
902
|
|
|
|
|
|
break; |
903
|
|
|
|
|
|
} |
904
|
|
|
|
|
|
|
905
|
152
|
|
|
|
|
svp = sv_2mortal(newSVpvn((char*)s, p - s)); |
906
|
152
|
|
|
|
|
SvUTF8_on(svp); |
907
|
152
|
|
|
|
|
XPUSHs(svp); |
908
|
|
|
|
|
|
|
909
|
152
|
|
|
|
|
svp = sv_2mortal(newSVpvn((char*)p, e - p)); |
910
|
152
|
|
|
|
|
SvUTF8_on(svp); |
911
|
152
|
|
|
|
|
XPUSHs(svp); |
912
|
|
|
|
|
|
|