line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
#define PERL_NO_GET_CONTEXT /* we want efficiency */ |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
/* private functions which need pTHX_ and aTHX_ |
5
|
|
|
|
|
|
|
pv_cat_decompHangul |
6
|
|
|
|
|
|
|
sv_2pvunicode |
7
|
|
|
|
|
|
|
pv_utf8_decompose |
8
|
|
|
|
|
|
|
pv_utf8_reorder |
9
|
|
|
|
|
|
|
pv_utf8_compose |
10
|
|
|
|
|
|
|
*/ |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
#include "EXTERN.h" |
13
|
|
|
|
|
|
|
#include "perl.h" |
14
|
|
|
|
|
|
|
#include "XSUB.h" |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
/* These 5 files are prepared by mkheader */ |
17
|
|
|
|
|
|
|
#include "unfcmb.h" |
18
|
|
|
|
|
|
|
#include "unfcan.h" |
19
|
|
|
|
|
|
|
#include "unfcpt.h" |
20
|
|
|
|
|
|
|
#include "unfcmp.h" |
21
|
|
|
|
|
|
|
#include "unfexc.h" |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
/* The generated normalization tables since v5.20 are in native character set |
24
|
|
|
|
|
|
|
* terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for |
25
|
|
|
|
|
|
|
* later perls, and redefine that to be 'uvuni' for earlier ones */ |
26
|
|
|
|
|
|
|
#if PERL_VERSION < 20 |
27
|
|
|
|
|
|
|
# undef uvchr_to_utf8 |
28
|
|
|
|
|
|
|
# ifdef uvuni_to_utf8 |
29
|
|
|
|
|
|
|
# define uvchr_to_utf8 uvuni_to_utf8 |
30
|
|
|
|
|
|
|
# else /* Perl 5.6.1 */ |
31
|
|
|
|
|
|
|
# define uvchr_to_utf8 uv_to_utf8 |
32
|
|
|
|
|
|
|
# endif |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# undef utf8n_to_uvchr |
35
|
|
|
|
|
|
|
# ifdef utf8n_to_uvuni |
36
|
|
|
|
|
|
|
# define utf8n_to_uvchr utf8n_to_uvuni |
37
|
|
|
|
|
|
|
# else /* Perl 5.6.1 */ |
38
|
|
|
|
|
|
|
# define utf8n_to_uvchr utf8_to_uv |
39
|
|
|
|
|
|
|
# endif |
40
|
|
|
|
|
|
|
#endif |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
43
|
|
|
|
|
|
|
#ifndef UTF8_ALLOW_BOM |
44
|
|
|
|
|
|
|
#define UTF8_ALLOW_BOM (0) |
45
|
|
|
|
|
|
|
#endif /* UTF8_ALLOW_BOM */ |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
#ifndef UTF8_ALLOW_SURROGATE |
48
|
|
|
|
|
|
|
#define UTF8_ALLOW_SURROGATE (0) |
49
|
|
|
|
|
|
|
#endif /* UTF8_ALLOW_SURROGATE */ |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
#ifndef UTF8_ALLOW_FE_FF |
52
|
|
|
|
|
|
|
#define UTF8_ALLOW_FE_FF (0) |
53
|
|
|
|
|
|
|
#endif /* UTF8_ALLOW_FE_FF */ |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
#ifndef UTF8_ALLOW_FFFF |
56
|
|
|
|
|
|
|
#define UTF8_ALLOW_FFFF (0) |
57
|
|
|
|
|
|
|
#endif /* UTF8_ALLOW_FFFF */ |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
#ifndef PERL_UNUSED_VAR |
60
|
|
|
|
|
|
|
# define PERL_UNUSED_VAR(x) ((void)sizeof(x)) |
61
|
|
|
|
|
|
|
#endif |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
/*
 * Grow the output buffer, if necessary, so that at least `need` more bytes
 * can be written at `d` (typically right before uvchr_to_utf8()).
 *
 * The enclosing scope must already define dstart (buffer start), d (write
 * cursor) and dlen (current capacity, excluding the spare byte).  The
 * expansion is NOT wrapped in a block: it declares a local `curlen`, so it
 * may appear only once per scope and only where a declaration is allowed.
 * Call sites in this file write it without a trailing semicolon.
 */
#define Renew_d_if_not_enough_to(need)			\
	STRLEN curlen = d - dstart;			\
	if (curlen + (need) > dlen) {			\
	    dlen += (need);				\
	    Renew(dstart, dlen + 1, U8);		\
	    d = dstart + curlen; /* buffer may have moved */ \
	}
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
/* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */ |
75
|
|
|
|
|
|
|
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
/* utf8_hop() hops back before start. Maybe broken UTF-8 */ |
78
|
|
|
|
|
|
|
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
/* At present, char > 0x10ffff are unaffected without complaint, right? */ |
81
|
|
|
|
|
|
|
#define VALID_UTF_MAX (0x10ffff) |
82
|
|
|
|
|
|
|
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
/* size of array for combining characters */ |
85
|
|
|
|
|
|
|
/* enough as an initial value? */ |
86
|
|
|
|
|
|
|
#define CC_SEQ_SIZE (10) |
87
|
|
|
|
|
|
|
#define CC_SEQ_STEP (5) |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
/* HANGUL begin */
/* Syllable block: first/last code point and number of syllables. */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

/* Number of syllables sharing one leading consonant (VCount * TCount). */
#define Hangul_NCount    588

/* Leading consonants (L). */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

/* Vowels (V). */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

/*
 * Trailing consonants (T).  TBase itself stands for "no trailing consonant"
 * (index 0), which is why Hangul_IsT() uses a strict lower bound.
 */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

/* Range predicates; each evaluates its argument more than once. */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL end */
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
/* this is used for canonical ordering of combining characters (c.c.). */ |
117
|
|
|
|
|
|
|
typedef struct { |
118
|
|
|
|
|
|
|
U8 cc; /* combining class */ |
119
|
|
|
|
|
|
|
UV uv; /* codepoint */ |
120
|
|
|
|
|
|
|
STRLEN pos; /* position */ |
121
|
|
|
|
|
|
|
} UNF_cc; |
122
|
|
|
|
|
|
|
|
123
|
1083
|
|
|
|
|
|
static int compare_cc(const void *a, const void *b) |
124
|
|
|
|
|
|
|
{ |
125
|
|
|
|
|
|
|
int ret_cc; |
126
|
1083
|
|
|
|
|
|
ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
127
|
1083
|
100
|
|
|
|
|
if (ret_cc) |
128
|
184
|
|
|
|
|
|
return ret_cc; |
129
|
|
|
|
|
|
|
|
130
|
899
|
|
|
|
|
|
return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
131
|
899
|
|
|
|
|
|
- ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
1526
|
|
|
|
|
|
/*
 * Look up the canonical decomposition of uv in the three-level UNF_canon
 * table (plane -> row -> cell).  Returns the table entry, or NULL when uv
 * is above VALID_UTF_MAX or any level of the table is absent.  Callers in
 * this file treat a non-NULL result as a NUL-terminated byte string.
 */
static U8* dec_canonical(UV uv)
{
    U8 ***plane;
    U8 **row;

    if (!OVER_UTF_MAX(uv)) {
        plane = (U8***)UNF_canon[uv >> 16];
        if (plane) {
            row = plane[(uv >> 8) & 0xff];
            if (row)
                return row[uv & 0xff];
        }
    }
    return NULL;
}
145
|
|
|
|
|
|
|
|
146
|
429
|
|
|
|
|
|
/*
 * Look up the compatibility decomposition of uv in the three-level
 * UNF_compat table (plane -> row -> cell).  Returns the table entry, or
 * NULL when uv is above VALID_UTF_MAX or any level of the table is absent.
 * Callers in this file treat a non-NULL result as a NUL-terminated byte
 * string.
 */
static U8* dec_compat(UV uv)
{
    U8 ***plane;
    U8 **row;

    if (!OVER_UTF_MAX(uv)) {
        plane = (U8***)UNF_compat[uv >> 16];
        if (plane) {
            row = plane[(uv >> 8) & 0xff];
            if (row)
                return row[uv & 0xff];
        }
    }
    return NULL;
}
157
|
|
|
|
|
|
|
|
158
|
438
|
|
|
|
|
|
/*
 * Return the primary composite of the pair (uv, uv2), or 0 if the pair
 * does not compose.  Hangul L+V and LV+T pairs are composed
 * arithmetically; everything else is looked up in the UNF_compos table,
 * whose cells are lists terminated by an entry with nextchar == 0.
 * A zero uv2, or either code point above VALID_UTF_MAX, never composes.
 */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *entry;

    if (uv2 == 0 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    /* Hangul: leading consonant + vowel -> LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lv = (uv - Hangul_LBase) * Hangul_VCount + (uv2 - Hangul_VBase);
        return Hangul_SBase + lv * Hangul_TCount;
    }

    /* Hangul: LV syllable + trailing consonant -> LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2))
        return uv + (uv2 - Hangul_TBase);

    /* table lookup keyed by the first code point */
    plane = UNF_compos[uv >> 16];
    if (!plane)
        return 0;

    row = plane[(uv >> 8) & 0xff];
    if (!row)
        return 0;

    entry = row[uv & 0xff];
    if (!entry)
        return 0;

    /* scan the candidates for a matching second code point */
    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        entry++;
    }
    return 0;
}
190
|
|
|
|
|
|
|
|
191
|
2979
|
|
|
|
|
|
/*
 * Return the combining class of uv from the three-level UNF_combin table
 * (plane -> row -> cell).  Code points above VALID_UTF_MAX, and code
 * points whose plane or row is absent from the table, have class 0.
 */
static U8 getCombinClass(UV uv)
{
    U8 **plane;
    U8 *row;

    if (!OVER_UTF_MAX(uv)) {
        plane = (U8**)UNF_combin[uv >> 16];
        if (plane) {
            row = plane[(uv >> 8) & 0xff];
            if (row)
                return row[uv & 0xff];
        }
    }
    return 0;
}
202
|
|
|
|
|
|
|
|
203
|
233
|
|
|
|
|
|
/*
 * Append the jamo decomposition (L, V and, if present, T) of the Hangul
 * syllable uv at d, UTF-8 encoded, and return the advanced write pointer.
 * If uv is not a Hangul syllable nothing is written and d is returned
 * unchanged.  The caller must have reserved enough room at d.
 */
static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
{
    UV sindex, lead, vowel, trail;

    if (! Hangul_IsS(uv))
        return d;

    sindex = uv - Hangul_SBase;
    lead   = Hangul_LBase + sindex / Hangul_NCount;
    vowel  = Hangul_VBase + (sindex % Hangul_NCount) / Hangul_TCount;
    trail  = sindex % Hangul_TCount;   /* 0: no trailing consonant */

    d = uvchr_to_utf8(d, lead);
    d = uvchr_to_utf8(d, vowel);
    if (trail != 0)
        d = uvchr_to_utf8(d, trail + Hangul_TBase);
    return d;
}
219
|
|
|
|
|
|
|
|
220
|
671
|
|
|
|
|
|
/*
 * Return the string value of sv as UTF-8 and, when lp is non-NULL, store
 * its byte length in *lp.  If sv is not flagged UTF-8, the bytes are
 * copied into a mortal temporary which is upgraded to UTF-8, so sv itself
 * is not upgraded; in that case the returned pointer belongs to the
 * mortal temporary.
 */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN len;
    char *s = SvPV(sv, len);

    if (!SvUTF8(sv)) {
        /* work on a mortal copy rather than on the caller's scalar */
        SV *upgraded = sv_2mortal(newSVpvn(s, len));

        if (!SvPOK(upgraded))
            s = SvPV_force(upgraded, len);
        sv_utf8_upgrade(upgraded);
        s = SvPV(upgraded, len);
    }

    if (lp != NULL)
        *lp = len;
    return s;
}
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
static |
238
|
405
|
|
|
|
|
|
U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) |
239
|
|
|
|
|
|
|
{ |
240
|
405
|
|
|
|
|
|
U8* p = s; |
241
|
405
|
|
|
|
|
|
U8* e = s + slen; |
242
|
405
|
|
|
|
|
|
U8* dstart = *dp; |
243
|
405
|
|
|
|
|
|
U8* d = dstart; |
244
|
|
|
|
|
|
|
|
245
|
2183
|
100
|
|
|
|
|
while (p < e) { |
246
|
|
|
|
|
|
|
STRLEN retlen; |
247
|
1778
|
|
|
|
|
|
UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); |
248
|
1778
|
50
|
|
|
|
|
if (!retlen) |
249
|
0
|
|
|
|
|
|
croak(ErrRetlenIsZero, "decompose"); |
250
|
1778
|
|
|
|
|
|
p += retlen; |
251
|
|
|
|
|
|
|
|
252
|
2003
|
100
|
|
|
|
|
if (Hangul_IsS(uv)) { |
|
|
100
|
|
|
|
|
|
253
|
225
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) |
254
|
225
|
|
|
|
|
|
d = pv_cat_decompHangul(aTHX_ d, uv); |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
else { |
257
|
1553
|
100
|
|
|
|
|
U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
258
|
|
|
|
|
|
|
|
259
|
1553
|
100
|
|
|
|
|
if (r) { |
260
|
265
|
|
|
|
|
|
STRLEN len = (STRLEN)strlen((char *)r); |
261
|
265
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(len) |
262
|
1682
|
100
|
|
|
|
|
while (len--) |
263
|
1417
|
|
|
|
|
|
*d++ = *r++; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
else { |
266
|
1288
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
267
|
1778
|
|
|
|
|
|
d = uvchr_to_utf8(d, uv); |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
} |
271
|
405
|
|
|
|
|
|
*dp = dstart; |
272
|
405
|
|
|
|
|
|
return d; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
static |
276
|
380
|
|
|
|
|
|
U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) |
277
|
|
|
|
|
|
|
{ |
278
|
380
|
|
|
|
|
|
U8* p = s; |
279
|
380
|
|
|
|
|
|
U8* e = s + slen; |
280
|
380
|
|
|
|
|
|
U8* dstart = *dp; |
281
|
380
|
|
|
|
|
|
U8* d = dstart; |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
UNF_cc seq_ary[CC_SEQ_SIZE]; |
284
|
380
|
|
|
|
|
|
UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ |
285
|
380
|
|
|
|
|
|
UNF_cc* seq_ext = NULL; /* extend if need */ |
286
|
380
|
|
|
|
|
|
STRLEN seq_max = CC_SEQ_SIZE; |
287
|
380
|
|
|
|
|
|
STRLEN cc_pos = 0; |
288
|
|
|
|
|
|
|
|
289
|
1791
|
100
|
|
|
|
|
while (p < e) { |
290
|
|
|
|
|
|
|
U8 curCC; |
291
|
|
|
|
|
|
|
STRLEN retlen; |
292
|
1411
|
|
|
|
|
|
UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); |
293
|
1411
|
50
|
|
|
|
|
if (!retlen) |
294
|
0
|
|
|
|
|
|
croak(ErrRetlenIsZero, "reorder"); |
295
|
1411
|
|
|
|
|
|
p += retlen; |
296
|
|
|
|
|
|
|
|
297
|
1411
|
|
|
|
|
|
curCC = getCombinClass(uv); |
298
|
|
|
|
|
|
|
|
299
|
1411
|
100
|
|
|
|
|
if (curCC != 0) { |
300
|
734
|
100
|
|
|
|
|
if (seq_max < cc_pos + 1) { /* extend if need */ |
301
|
54
|
|
|
|
|
|
seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
302
|
54
|
100
|
|
|
|
|
if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
303
|
|
|
|
|
|
|
STRLEN i; |
304
|
12
|
50
|
|
|
|
|
New(0, seq_ext, seq_max, UNF_cc); |
305
|
132
|
100
|
|
|
|
|
for (i = 0; i < cc_pos; i++) |
306
|
120
|
|
|
|
|
|
seq_ext[i] = seq_ary[i]; |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
else { |
309
|
42
|
50
|
|
|
|
|
Renew(seq_ext, seq_max, UNF_cc); |
310
|
|
|
|
|
|
|
} |
311
|
54
|
|
|
|
|
|
seq_ptr = seq_ext; /* use seq_ext from now */ |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
734
|
|
|
|
|
|
seq_ptr[cc_pos].cc = curCC; |
315
|
734
|
|
|
|
|
|
seq_ptr[cc_pos].uv = uv; |
316
|
734
|
|
|
|
|
|
seq_ptr[cc_pos].pos = cc_pos; |
317
|
734
|
|
|
|
|
|
++cc_pos; |
318
|
|
|
|
|
|
|
|
319
|
734
|
100
|
|
|
|
|
if (p < e) |
320
|
635
|
|
|
|
|
|
continue; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
/* output */ |
324
|
776
|
100
|
|
|
|
|
if (cc_pos) { |
325
|
|
|
|
|
|
|
STRLEN i; |
326
|
|
|
|
|
|
|
|
327
|
217
|
100
|
|
|
|
|
if (cc_pos > 1) /* reordered if there are two c.c.'s */ |
328
|
101
|
|
|
|
|
|
qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); |
329
|
|
|
|
|
|
|
|
330
|
951
|
100
|
|
|
|
|
for (i = 0; i < cc_pos; i++) { |
331
|
734
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
332
|
734
|
|
|
|
|
|
d = uvchr_to_utf8(d, seq_ptr[i].uv); |
333
|
|
|
|
|
|
|
} |
334
|
217
|
|
|
|
|
|
cc_pos = 0; |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
|
337
|
776
|
100
|
|
|
|
|
if (curCC == 0) { |
338
|
677
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
339
|
776
|
|
|
|
|
|
d = uvchr_to_utf8(d, uv); |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
} |
342
|
380
|
100
|
|
|
|
|
if (seq_ext) |
343
|
12
|
|
|
|
|
|
Safefree(seq_ext); |
344
|
380
|
|
|
|
|
|
*dp = dstart; |
345
|
380
|
|
|
|
|
|
return d; |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
static |
349
|
273
|
|
|
|
|
|
U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) |
350
|
|
|
|
|
|
|
{ |
351
|
273
|
|
|
|
|
|
U8* p = s; |
352
|
273
|
|
|
|
|
|
U8* e = s + slen; |
353
|
273
|
|
|
|
|
|
U8* dstart = *dp; |
354
|
273
|
|
|
|
|
|
U8* d = dstart; |
355
|
|
|
|
|
|
|
|
356
|
273
|
|
|
|
|
|
UV uvS = 0; /* code point of the starter */ |
357
|
273
|
|
|
|
|
|
bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
358
|
273
|
|
|
|
|
|
U8 preCC = 0; |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
UV seq_ary[CC_SEQ_SIZE]; |
361
|
273
|
|
|
|
|
|
UV* seq_ptr = seq_ary; /* use array at the beginning */ |
362
|
273
|
|
|
|
|
|
UV* seq_ext = NULL; /* extend if need */ |
363
|
273
|
|
|
|
|
|
STRLEN seq_max = CC_SEQ_SIZE; |
364
|
273
|
|
|
|
|
|
STRLEN cc_pos = 0; |
365
|
|
|
|
|
|
|
|
366
|
1336
|
100
|
|
|
|
|
while (p < e) { |
367
|
|
|
|
|
|
|
U8 curCC; |
368
|
|
|
|
|
|
|
STRLEN retlen; |
369
|
1063
|
|
|
|
|
|
UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); |
370
|
1063
|
50
|
|
|
|
|
if (!retlen) |
371
|
0
|
|
|
|
|
|
croak(ErrRetlenIsZero, "compose"); |
372
|
1063
|
|
|
|
|
|
p += retlen; |
373
|
|
|
|
|
|
|
|
374
|
1063
|
|
|
|
|
|
curCC = getCombinClass(uv); |
375
|
|
|
|
|
|
|
|
376
|
1063
|
100
|
|
|
|
|
if (!valid_uvS) { |
377
|
294
|
100
|
|
|
|
|
if (curCC == 0) { |
378
|
259
|
|
|
|
|
|
uvS = uv; /* the first Starter is found */ |
379
|
259
|
|
|
|
|
|
valid_uvS = TRUE; |
380
|
259
|
100
|
|
|
|
|
if (p < e) |
381
|
725
|
|
|
|
|
|
continue; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
else { |
384
|
35
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
385
|
35
|
|
|
|
|
|
d = uvchr_to_utf8(d, uv); |
386
|
35
|
|
|
|
|
|
continue; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
else { |
390
|
|
|
|
|
|
|
bool composed; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
/* blocked */ |
393
|
769
|
100
|
|
|
|
|
if ((iscontig && cc_pos) || /* discontiguous combination */ |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
394
|
636
|
100
|
|
|
|
|
(curCC != 0 && preCC == curCC) || /* blocked by same CC */ |
|
|
100
|
|
|
|
|
|
395
|
|
|
|
|
|
|
(preCC > curCC)) /* blocked by higher CC: revised D2 */ |
396
|
352
|
|
|
|
|
|
composed = FALSE; |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
/* not blocked: |
399
|
|
|
|
|
|
|
iscontig && cc_pos == 0 -- contiguous combination |
400
|
|
|
|
|
|
|
curCC == 0 && preCC == 0 -- starter + starter |
401
|
|
|
|
|
|
|
curCC != 0 && preCC < curCC -- lower CC */ |
402
|
|
|
|
|
|
|
else { |
403
|
|
|
|
|
|
|
/* try composition */ |
404
|
417
|
|
|
|
|
|
UV uvComp = composite_uv(uvS, uv); |
405
|
|
|
|
|
|
|
|
406
|
417
|
100
|
|
|
|
|
if (uvComp && !isExclusion(uvComp)) { |
|
|
50
|
|
|
|
|
|
407
|
184
|
|
|
|
|
|
uvS = uvComp; |
408
|
184
|
|
|
|
|
|
composed = TRUE; |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
/* preCC should not be changed to curCC */ |
411
|
|
|
|
|
|
|
/* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ |
412
|
184
|
100
|
|
|
|
|
if (p < e) |
413
|
120
|
|
|
|
|
|
continue; |
414
|
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
|
else |
416
|
233
|
|
|
|
|
|
composed = FALSE; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
649
|
100
|
|
|
|
|
if (!composed) { |
420
|
585
|
|
|
|
|
|
preCC = curCC; |
421
|
585
|
100
|
|
|
|
|
if (curCC != 0 || !(p < e)) { |
|
|
100
|
|
|
|
|
|
422
|
506
|
100
|
|
|
|
|
if (seq_max < cc_pos + 1) { /* extend if need */ |
423
|
50
|
|
|
|
|
|
seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
424
|
50
|
100
|
|
|
|
|
if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
425
|
10
|
50
|
|
|
|
|
New(0, seq_ext, seq_max, UV); |
426
|
10
|
50
|
|
|
|
|
Copy(seq_ary, seq_ext, cc_pos, UV); |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
else { |
429
|
40
|
50
|
|
|
|
|
Renew(seq_ext, seq_max, UV); |
430
|
|
|
|
|
|
|
} |
431
|
50
|
|
|
|
|
|
seq_ptr = seq_ext; /* use seq_ext from now */ |
432
|
|
|
|
|
|
|
} |
433
|
506
|
|
|
|
|
|
seq_ptr[cc_pos] = uv; |
434
|
506
|
|
|
|
|
|
++cc_pos; |
435
|
|
|
|
|
|
|
} |
436
|
585
|
100
|
|
|
|
|
if (curCC != 0 && p < e) |
|
|
100
|
|
|
|
|
|
437
|
388
|
|
|
|
|
|
continue; |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
/* output */ |
442
|
|
|
|
|
|
|
{ |
443
|
338
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
444
|
338
|
|
|
|
|
|
d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */ |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
|
447
|
338
|
100
|
|
|
|
|
if (cc_pos) { |
448
|
|
|
|
|
|
|
STRLEN i; |
449
|
|
|
|
|
|
|
|
450
|
643
|
100
|
|
|
|
|
for (i = 0; i < cc_pos; i++) { |
451
|
506
|
100
|
|
|
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN) |
452
|
506
|
|
|
|
|
|
d = uvchr_to_utf8(d, seq_ptr[i]); |
453
|
|
|
|
|
|
|
} |
454
|
137
|
|
|
|
|
|
cc_pos = 0; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
338
|
|
|
|
|
|
uvS = uv; |
458
|
|
|
|
|
|
|
} |
459
|
273
|
100
|
|
|
|
|
if (seq_ext) |
460
|
10
|
|
|
|
|
|
Safefree(seq_ext); |
461
|
273
|
|
|
|
|
|
*dp = dstart; |
462
|
273
|
|
|
|
|
|
return d; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Decompose src (canonically, or with compatibility mappings when
       compat is true) and return the result as a new UTF-8 scalar.
       No reordering is done here. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    /* Create the result only after the helper has returned: the helper
       can croak, and an SV allocated before the call would be leaked. */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Put the combining characters of src into canonical order and
       return the result as a new UTF-8 scalar. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    /* Create the result only after the helper has returned: the helper
       can croak, and an SV allocated before the call would be leaked. */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Compose src and return the result as a new UTF-8 scalar.  Called
       through the composeContiguous alias (ix == 1), only contiguous
       composition is performed. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    /* Create the result only after the helper has returned: the helper
       can croak, and an SV allocated before the call would be leaked. */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    /* NFD (ix == 0) or NFKD (ix == 1): decompose, then reorder. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose s into t */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t;	/* from here on: used length, not capacity */

    /* step 2: reorder t into d; t is not needed afterwards */
    dlen = tlen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
    *dend = '\0';
    dlen = dend - d;	/* from here on: used length, not capacity */
    Safefree(t);

    /* step 3: wrap the result in a new UTF-8 scalar */
    dst = newSVpvn((char *)d, dlen);
    SvUTF8_on(dst);
    Safefree(d);

    RETVAL = dst;
  OUTPUT:
    RETVAL
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    /* NFC (ix == 0), NFKC (ix == 1) or FCC (ix == 2):
       decompose, reorder, then compose. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose s into t (compatibility mappings for NFKC only) */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t;	/* from here on: used length, not capacity */

    /* step 2: reorder t into u; t is not needed afterwards */
    ulen = tlen;
    New(0, u, ulen+1, U8);
    uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
    *uend = '\0';
    ulen = uend - u;	/* from here on: used length, not capacity */
    Safefree(t);

    /* step 3: compose u into d (contiguous composition for FCC only);
       u is not needed afterwards */
    dlen = ulen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
    *dend = '\0';
    dlen = dend - d;	/* from here on: used length, not capacity */
    Safefree(u);

    /* step 4: wrap the result in a new UTF-8 scalar */
    dst = newSVpvn((char *)d, dlen);
    SvUTF8_on(dst);
    Safefree(d);

    RETVAL = dst;
  OUTPUT:
    RETVAL
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    /* Quick check for NFD (ix == 0) or NFKD (ix == 1).  The answer is
       false as soon as a character breaks canonical ordering, is a
       Hangul syllable, or has a decomposition in the relevant table. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkNFD or -NFKD");

	curCC = getCombinClass(uv);
	if ((curCC != 0 && curCC < preCC)	/* canonical ordering violated */
	    || Hangul_IsS(uv)			/* decomposes into jamo */
	    || (ix ? dec_compat(uv) : dec_canonical(uv)))
	{
	    result = FALSE;
	    break;
	}

	preCC = curCC;
	p += retlen;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check for NFC (ix == 0) or NFKC (ix == 1).
       Returns false (NO), undef (MAYBE) or true (YES). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkNFC or -NFKC");

	curCC = getCombinClass(uv);
	if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}

	/* get NFC/NFKC property; Hangul syllables are canonical
	   composites, i.e. YES, so they need no further checks */
	if (!Hangul_IsS(uv)) {
	    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
		result = FALSE;
		break;
	    }

	    if (isComp2nd(uv)) {
		isMAYBE = TRUE;
	    }
	    else if (ix) {
		/* NFKC_NO when having a compatibility mapping that
		   differs from the canonical one (or has no canonical
		   counterpart at all). */
		char *canon  = (char *) dec_canonical(uv);
		char *compat = (char *) dec_compat(uv);

		if (compat && !(canon && strEQ(canon, compat))) {
		    result = FALSE;
		    break;
		}
	    }
	} /* end of get NFC/NFKC property */

	preCC = curCC;
	p += retlen;
    }

    if (isMAYBE && result) /* NO precedes MAYBE */
	XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srcBytes, charBytes;
    U8 *head, *tail, *cursor, leadCC, lastCC;
    bool passed = TRUE;
    bool sawMaybe = FALSE;
  CODE:
    /*
     * Scan src one character at a time.  For every character, the
     * combining class of the first character of its canonical
     * decomposition (or of the character itself when dec_canonical()
     * returns NULL) must be 0 or must not be lower than the class carried
     * over from the previous character; otherwise the check fails.
     * The class carried over to the next iteration is that of the last
     * character of the canonical decomposition (or of the character
     * itself when there is none).
     *
     * When called through the checkFCC alias (ix != 0), a character
     * flagged by isExclusion/isSingleton/isNonStDecomp also fails the
     * check, and one flagged by isComp2nd leaves the answer undecided.
     *
     * Returns boolSV(passed), or undef when nothing failed but the answer
     * is undecided.  Croaks if a decode consumes zero bytes or the
     * backwards hop lands before the decomposition string.
     */
    head = (U8*)sv_2pvunicode(aTHX_ src, &srcBytes);
    tail = head + srcBytes;
    lastCC = 0;
    cursor = head;
    while (cursor < tail) {
        UV uv = utf8n_to_uvchr(cursor, tail - cursor, &charBytes, AllowAnyUTF);
        UV leadUv = uv;         /* replaced below when a decomposition exists */
        STRLEN canonBytes = 0;
        U8 *canon;

        if (!charBytes)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        canon = (U8*) dec_canonical(uv);
        if (canon) {
            STRLEN used;
            canonBytes = (STRLEN)strlen((char *) canon);
            leadUv = utf8n_to_uvchr(canon, canonBytes, &used, AllowAnyUTF);
            if (!used)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }

        leadCC = getCombinClass(leadUv);

        /* canonical ordering violated */
        if (leadCC != 0 && lastCC > leadCC) {
            passed = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                passed = FALSE;
                break;
            }
            if (isComp2nd(uv))
                sawMaybe = TRUE;
        }

        if (!canon) {
            lastCC = leadCC;
        }
        else {
            /* step back one character from the end of the decomposition */
            STRLEN used;
            UV trailUv;
            U8 *canonEnd = canon + canonBytes;
            U8 *canonLast = utf8_hop(canonEnd, -1);
            if (canonLast < canon)
                croak(ErrHopBeforeStart);
            trailUv = utf8n_to_uvchr(canonLast, canonEnd - canonLast, &used, AllowAnyUTF);
            if (!used)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            lastCC = getCombinClass(trailUv);
        }

        cursor += charBytes;
    }
    if (passed && sawMaybe) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(passed);
  OUTPUT:
    RETVAL
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
# getCombinClass(uv): XSUB with no CODE section, so the generated glue
# calls the C-level getCombinClass() with the UV argument and returns its
# U8 result to Perl.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
# isExclusion(uv): XSUB with no CODE section, so the generated glue calls
# the C-level isExclusion() with the UV argument and returns its boolean
# result to Perl.
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
# isSingleton(uv): XSUB with no CODE section, so the generated glue calls
# the C-level isSingleton() with the UV argument and returns its boolean
# result to Perl.
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
# isNonStDecomp(uv): XSUB with no CODE section, so the generated glue calls
# the C-level isNonStDecomp() with the UV argument and returns its boolean
# result to Perl.
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
# isComp2nd(uv): XSUB with no CODE section, so the generated glue calls the
# C-level isComp2nd() with the UV argument.  The names isNFC_MAYBE and
# isNFKC_MAYBE are aliases of this same XSUB; the alias index ix is not
# consulted, and the INIT section only marks it as unused.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  INIT:
    PERL_UNUSED_VAR(ix);
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool hasDecomp;
  CODE:
    /*
     * True when uv satisfies Hangul_IsS(), or when the relevant mapping
     * lookup returns non-NULL: dec_compat() under the isNFKD_NO alias
     * (ix != 0), dec_canonical() otherwise.
     */
    if (Hangul_IsS(uv))
        hasDecomp = TRUE;       /* NFD_NO or NFKD_NO */
    else if (ix)
        hasDecomp = (dec_compat(uv) != NULL);
    else
        hasDecomp = (dec_canonical(uv) != NULL);
    RETVAL = boolSV(hasDecomp);
  OUTPUT:
    RETVAL
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool flagged;
  CODE:
    /*
     * True when uv is reported by isExclusion(), isSingleton() or
     * isNonStDecomp().  Under the isNFKC_NO alias (ix != 0) it is also
     * true when uv has a compatibility mapping that is not the same
     * string as its canonical mapping (including the case where no
     * canonical mapping exists).
     */
    flagged = isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv);
    if (!flagged && ix) {
        char *canonMap  = (char *) dec_canonical(uv);
        char *compatMap = (char *) dec_compat(uv);
        if (compatMap && !(canonMap && strEQ(canonMap, compatMap)))
            flagged = TRUE;     /* NFC_NO or NFKC_NO */
    }
    RETVAL = boolSV(flagged);
  OUTPUT:
    RETVAL
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV merged;
  CODE:
    /*
     * Ask composite_uv() for the composition of the pair (uv, uv2).
     * A zero result is reported to Perl as undef; any other value is
     * returned as a new unsigned-integer SV.
     */
    merged = composite_uv(uv, uv2);
    if (merged)
        RETVAL = newSVuv(merged);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /*
     * Return the decomposition mapping of uv as a UTF-8 flagged string.
     * Characters matching Hangul_IsS() are expanded into a local buffer
     * by pv_cat_decompHangul(); all others are looked up with
     * dec_compat() under the getCompat alias (ix != 0) or with
     * dec_canonical() otherwise.  Returns undef when the lookup yields
     * NULL.
     */
    if (!Hangul_IsS(uv)) {
        U8 *mapped;
        if (ix)
            mapped = dec_compat(uv);
        else
            mapped = dec_canonical(uv);
        if (!mapped)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)mapped, strlen((char *)mapped));
    }
    else {
        U8 hangulBuf[3 * UTF8_MAXLEN + 1];
        U8 *bufStart = hangulBuf;
        U8 *bufEnd = pv_cat_decompHangul(aTHX_ bufStart, uv);
        RETVAL = newSVpvn((char *)bufStart, bufEnd - bufStart);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    /*
     * Walk backwards from the end of src, one character per step, and
     * stop at the first character whose combining class is 0.  Two
     * mortal, UTF-8 flagged strings are then pushed onto the stack: the
     * bytes before the stop position and the bytes from it to the end.
     * If no such character is found the first string is empty.
     */
    s = (U8*)sv_2pvunicode(aTHX_ src, &srclen);
    e = s + srclen;
    for (p = e; s < p; ) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    /* leading part: everything before the split point */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* trailing part: from the split point to the end */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
925
|
|
|
|
|
|
|
|