| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------------- |
|
2
|
|
|
|
|
|
|
* utf16_utf8.c |
|
3
|
|
|
|
|
|
|
* ---------------------------------------------------------------------------- |
|
4
|
|
|
|
|
|
|
* $Id: utf8.c 4631 2006-04-14 05:18:55Z pho $ |
|
5
|
|
|
|
|
|
|
* ------------------------------------------------------------------------- */ |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
#include "Japanese.h" |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------------- |
|
10
|
|
|
|
|
|
|
* replace invalid UTF-8 chars with '?' |
|
11
|
|
|
|
|
|
|
* ------------------------------------------------------------------------- */ |
|
12
|
|
|
|
|
|
|
EXTERN_C |
|
13
|
|
|
|
|
|
|
SV* |
|
14
|
2676
|
|
|
|
|
|
xs_validate_utf8(SV* sv_str) { |
|
15
|
|
|
|
|
|
|
unsigned char* src; |
|
16
|
|
|
|
|
|
|
STRLEN len; |
|
17
|
|
|
|
|
|
|
SV_Buf result; |
|
18
|
|
|
|
|
|
|
const unsigned char* src_end; |
|
19
|
|
|
|
|
|
|
|
|
20
|
2676
|
50
|
|
|
|
|
if (sv_str == &PL_sv_undef) { |
|
21
|
0
|
|
|
|
|
|
return newSVpvn("", 0); |
|
22
|
|
|
|
|
|
|
} |
|
23
|
2676
|
50
|
|
|
|
|
if( SvGMAGICAL(sv_str) ) |
|
24
|
|
|
|
|
|
|
{ |
|
25
|
0
|
|
|
|
|
|
mg_get(sv_str); |
|
26
|
|
|
|
|
|
|
} |
|
27
|
2676
|
50
|
|
|
|
|
if( !SvOK(sv_str) ) |
|
28
|
|
|
|
|
|
|
{ |
|
29
|
0
|
|
|
|
|
|
return newSVpvn("", 0); |
|
30
|
|
|
|
|
|
|
} |
|
31
|
|
|
|
|
|
|
|
|
32
|
2676
|
|
|
|
|
|
src = (unsigned char*)SvPV(sv_str, len); |
|
33
|
2676
|
|
|
|
|
|
src_end = src + len; |
|
34
|
2676
|
50
|
|
|
|
|
SV_Buf_init(&result, len); |
|
|
|
100
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
|
|
36
|
11867
|
100
|
|
|
|
|
while (src < src_end) { |
|
37
|
9191
|
100
|
|
|
|
|
if (*src >= 0xC0 && *src < 0xC1) { |
|
|
|
100
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
/* 2 bytes char which is restricted 1 byte. */ |
|
39
|
2
|
50
|
|
|
|
|
if (src + 1 <= src_end) { |
|
40
|
2
|
50
|
|
|
|
|
if (src[1] >= 0x80 && src[1] <= 0xBF) { |
|
|
|
50
|
|
|
|
|
|
|
41
|
2
|
50
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
42
|
2
|
|
|
|
|
|
src += 2; |
|
43
|
2
|
|
|
|
|
|
continue; |
|
44
|
|
|
|
|
|
|
} |
|
45
|
|
|
|
|
|
|
} |
|
46
|
|
|
|
|
|
|
} |
|
47
|
9189
|
100
|
|
|
|
|
else if (*src == 0xE0) { |
|
48
|
|
|
|
|
|
|
/* 3 bytes char which is restricted <= 2 bytes. */ |
|
49
|
2
|
50
|
|
|
|
|
if (src + 2 <= src_end) { |
|
50
|
2
|
50
|
|
|
|
|
if (src[1] >= 0x80 && src[1] <= 0x9F && |
|
|
|
50
|
|
|
|
|
|
|
51
|
2
|
50
|
|
|
|
|
src[2] >= 0x80 && src[2] <= 0xBF) { |
|
|
|
50
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
|
53
|
2
|
50
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
54
|
2
|
|
|
|
|
|
src += 3; |
|
55
|
2
|
|
|
|
|
|
continue; |
|
56
|
|
|
|
|
|
|
} |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
} |
|
59
|
9187
|
100
|
|
|
|
|
else if (*src == 0xF0) { |
|
60
|
|
|
|
|
|
|
/* 4 bytes char which is restricted <= 3 bytes. */ |
|
61
|
4
|
50
|
|
|
|
|
if (src + 3 <= src_end) { |
|
62
|
4
|
50
|
|
|
|
|
if (src[1] >= 0x80 && src[1] <= 0x8F && |
|
|
|
100
|
|
|
|
|
|
|
63
|
2
|
50
|
|
|
|
|
src[2] >= 0x80 && src[2] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
64
|
2
|
50
|
|
|
|
|
src[3] >= 0x80 && src[3] <= 0xBF) { |
|
|
|
50
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
|
|
66
|
2
|
50
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
67
|
2
|
|
|
|
|
|
src += 4; |
|
68
|
2
|
|
|
|
|
|
continue; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
} |
|
72
|
9183
|
50
|
|
|
|
|
else if (*src == 0xF4) { |
|
73
|
|
|
|
|
|
|
/* > U+10FFFF (4byte) */ |
|
74
|
0
|
0
|
|
|
|
|
if (src + 3 <= src_end) { |
|
75
|
0
|
0
|
|
|
|
|
if (src[1] >= 0x90 && src[1] <= 0xBF && |
|
|
|
0
|
|
|
|
|
|
|
76
|
0
|
0
|
|
|
|
|
src[2] >= 0x80 && src[2] <= 0xBF && |
|
|
|
0
|
|
|
|
|
|
|
77
|
0
|
0
|
|
|
|
|
src[3] >= 0x80 && src[3] <= 0xBF) { |
|
|
|
0
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
|
|
79
|
0
|
0
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
80
|
0
|
|
|
|
|
|
src += 4; |
|
81
|
0
|
|
|
|
|
|
continue; |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
} |
|
85
|
9183
|
100
|
|
|
|
|
else if (*src >= 0xF5 && *src <= 0xF7) { |
|
|
|
50
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
/* ditto */ |
|
87
|
0
|
0
|
|
|
|
|
if (src + 3 <= src_end) { |
|
88
|
0
|
0
|
|
|
|
|
if (src[1] >= 0x80 && src[1] <= 0xBF && |
|
|
|
0
|
|
|
|
|
|
|
89
|
0
|
0
|
|
|
|
|
src[2] >= 0x80 && src[2] <= 0xBF && |
|
|
|
0
|
|
|
|
|
|
|
90
|
0
|
0
|
|
|
|
|
src[3] >= 0x80 && src[3] <= 0xBF) { |
|
|
|
0
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
|
|
92
|
0
|
0
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
93
|
0
|
|
|
|
|
|
src += 4; |
|
94
|
0
|
|
|
|
|
|
continue; |
|
95
|
|
|
|
|
|
|
} |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
} |
|
98
|
9183
|
100
|
|
|
|
|
else if (*src >= 0xF8 && *src <= 0xFB) { |
|
|
|
100
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
/* > U+10FFFF (5byte) */ |
|
100
|
2
|
50
|
|
|
|
|
if (src + 4 <= src_end) { |
|
101
|
2
|
50
|
|
|
|
|
if (src[1] >= 0x80 && src[1] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
102
|
2
|
50
|
|
|
|
|
src[2] >= 0x80 && src[2] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
103
|
2
|
50
|
|
|
|
|
src[3] >= 0x80 && src[3] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
104
|
2
|
50
|
|
|
|
|
src[4] >= 0x80 && src[4] <= 0xBF) { |
|
|
|
50
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
|
|
106
|
2
|
50
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
107
|
2
|
|
|
|
|
|
src += 5; |
|
108
|
2
|
|
|
|
|
|
continue; |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
} |
|
112
|
9181
|
100
|
|
|
|
|
else if (*src >= 0xFC && *src <= 0xFD) { |
|
|
|
50
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
/* > U+10FFFF (6byte) */ |
|
114
|
2
|
50
|
|
|
|
|
if (src + 5 <= src_end) { |
|
115
|
2
|
50
|
|
|
|
|
if (src[1] >= 0x80 && src[1] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
116
|
2
|
50
|
|
|
|
|
src[2] >= 0x80 && src[2] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
117
|
2
|
50
|
|
|
|
|
src[3] >= 0x80 && src[3] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
118
|
2
|
50
|
|
|
|
|
src[4] >= 0x80 && src[4] <= 0xBF && |
|
|
|
50
|
|
|
|
|
|
|
119
|
2
|
50
|
|
|
|
|
src[5] >= 0x80 && src[5] <= 0xBF) { |
|
|
|
50
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
|
121
|
2
|
50
|
|
|
|
|
SV_Buf_append_ch(&result, '?'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
122
|
2
|
|
|
|
|
|
src += 6; |
|
123
|
2
|
|
|
|
|
|
continue; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
|
|
128
|
9181
|
100
|
|
|
|
|
SV_Buf_append_ch(&result, *src); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
129
|
9181
|
|
|
|
|
|
src++; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
|
|
132
|
2676
|
|
|
|
|
|
SV_Buf_setLength(&result); |
|
133
|
2676
|
|
|
|
|
|
return SV_Buf_getSv(&result); |
|
134
|
|
|
|
|
|
|
} |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------------- |
|
137
|
|
|
|
|
|
|
* End Of File. |
|
138
|
|
|
|
|
|
|
* ------------------------------------------------------------------------- */ |