line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* |
2
|
|
|
|
|
|
|
* Notes for the casual reader ... |
3
|
|
|
|
|
|
|
* |
4
|
|
|
|
|
|
|
* This is my first attempt at writing an XS module so it's probably not the |
5
|
|
|
|
|
|
|
* finest example for a new XS coder to read. Of course if you do read it and |
6
|
|
|
|
|
|
|
* have suggestions for improvements then please let me know. |
7
|
|
|
|
|
|
|
* |
8
|
|
|
|
|
|
|
* Unlike some XS modules, this one is not wrapping an existing library. All |
9
|
|
|
|
|
|
|
* the C source is contained in this file, along with the XSUB definition. |
10
|
|
|
|
|
|
|
* |
11
|
|
|
|
|
|
|
* Although the XSUB layer allows automatic conversion between the data |
12
|
|
|
|
|
|
|
* structures used by Perl variables (different types of SV) and native C types |
13
|
|
|
|
|
|
|
* (like ints and character pointers) this module doesn't really take advantage |
14
|
|
|
|
|
|
|
* of that. Instead, it takes an SV as input and returns an SV as output. |
15
|
|
|
|
|
|
|
* This design decision was made in order to support the (premature/micro) |
16
|
|
|
|
|
|
|
* optimisation whereby if the input SV contained all-ASCII characters, then |
17
|
|
|
|
|
|
|
* the return value would be a pointer to the same SV, rather than needlessly |
18
|
|
|
|
|
|
|
* making a copy of it. |
19
|
|
|
|
|
|
|
* |
20
|
|
|
|
|
|
|
* Copyright (C) 2014 by Grant McLean |
21
|
|
|
|
|
|
|
* |
22
|
|
|
|
|
|
|
*/ |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#include "EXTERN.h" |
25
|
|
|
|
|
|
|
#include "perl.h" |
26
|
|
|
|
|
|
|
#include "XSUB.h" |
27
|
|
|
|
|
|
|
#include "ppport.h" |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
U8 _encoding_fix_latin_ms_map[] = { |
31
|
|
|
|
|
|
|
0xE2, 0x82, 0xAC, 0x00, // 80 EURO SIGN |
32
|
|
|
|
|
|
|
0x25, 0x38, 0x31, 0x00, // 81 |
33
|
|
|
|
|
|
|
0xE2, 0x80, 0x9A, 0x00, // 82 SINGLE LOW-9 QUOTATION MARK |
34
|
|
|
|
|
|
|
0xC6, 0x92, 0x00, 0x00, // 83 LATIN SMALL LETTER F WITH HOOK |
35
|
|
|
|
|
|
|
0xE2, 0x80, 0x9E, 0x00, // 84 DOUBLE LOW-9 QUOTATION MARK |
36
|
|
|
|
|
|
|
0xE2, 0x80, 0xA6, 0x00, // 85 HORIZONTAL ELLIPSIS |
37
|
|
|
|
|
|
|
0xE2, 0x80, 0xA0, 0x00, // 86 DAGGER |
38
|
|
|
|
|
|
|
0xE2, 0x80, 0xA1, 0x00, // 87 DOUBLE DAGGER |
39
|
|
|
|
|
|
|
0xCB, 0x86, 0x00, 0x00, // 88 MODIFIER LETTER CIRCUMFLEX ACCENT |
40
|
|
|
|
|
|
|
0xE2, 0x80, 0xB0, 0x00, // 89 PER MILLE SIGN |
41
|
|
|
|
|
|
|
0xC5, 0xA0, 0x00, 0x00, // 8A LATIN CAPITAL LETTER S WITH CARON |
42
|
|
|
|
|
|
|
0xE2, 0x80, 0xB9, 0x00, // 8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
43
|
|
|
|
|
|
|
0xC5, 0x92, 0x00, 0x00, // 8C LATIN CAPITAL LIGATURE OE |
44
|
|
|
|
|
|
|
0x25, 0x38, 0x44, 0x00, // 8D |
45
|
|
|
|
|
|
|
0xC5, 0xBD, 0x00, 0x00, // 8E LATIN CAPITAL LETTER Z WITH CARON |
46
|
|
|
|
|
|
|
0x25, 0x38, 0x46, 0x00, // 8F |
47
|
|
|
|
|
|
|
0x25, 0x39, 0x30, 0x00, // 90 |
48
|
|
|
|
|
|
|
0xE2, 0x80, 0x98, 0x00, // 91 LEFT SINGLE QUOTATION MARK |
49
|
|
|
|
|
|
|
0xE2, 0x80, 0x99, 0x00, // 92 RIGHT SINGLE QUOTATION MARK |
50
|
|
|
|
|
|
|
0xE2, 0x80, 0x9C, 0x00, // 93 LEFT DOUBLE QUOTATION MARK |
51
|
|
|
|
|
|
|
0xE2, 0x80, 0x9D, 0x00, // 94 RIGHT DOUBLE QUOTATION MARK |
52
|
|
|
|
|
|
|
0xE2, 0x80, 0xA2, 0x00, // 95 BULLET |
53
|
|
|
|
|
|
|
0xE2, 0x80, 0x93, 0x00, // 96 EN DASH |
54
|
|
|
|
|
|
|
0xE2, 0x80, 0x94, 0x00, // 97 EM DASH |
55
|
|
|
|
|
|
|
0xCB, 0x9C, 0x00, 0x00, // 98 SMALL TILDE |
56
|
|
|
|
|
|
|
0xE2, 0x84, 0xA2, 0x00, // 99 TRADE MARK SIGN |
57
|
|
|
|
|
|
|
0xC5, 0xA1, 0x00, 0x00, // 9A LATIN SMALL LETTER S WITH CARON |
58
|
|
|
|
|
|
|
0xE2, 0x80, 0xBA, 0x00, // 9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
59
|
|
|
|
|
|
|
0xC5, 0x93, 0x00, 0x00, // 9C LATIN SMALL LIGATURE OE |
60
|
|
|
|
|
|
|
0x25, 0x39, 0x44, 0x00, // 9D |
61
|
|
|
|
|
|
|
0xC5, 0xBE, 0x00, 0x00, // 9E LATIN SMALL LETTER Z WITH CARON |
62
|
|
|
|
|
|
|
0xC5, 0xB8, 0x00, 0x00, // 9F LATIN CAPITAL LETTER Y WITH DIAERESIS |
63
|
|
|
|
|
|
|
0x00 |
64
|
|
|
|
|
|
|
}; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
static SV* _encoding_fix_latin_xs(SV*, int, int); |
68
|
|
|
|
|
|
|
static int consume_utf8_bytes(U8*, U8*, int); |
69
|
|
|
|
|
|
|
static int consume_latin_byte(U8*, U8*, int); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
35
|
|
|
|
|
|
static SV* _encoding_fix_latin_xs(SV* source, int overlong_fatal, int ascii_hex) { |
73
|
35
|
|
|
|
|
|
SV* out = NULL; // Defer initialisation until first non-ASCII character |
74
|
|
|
|
|
|
|
U8 *ph, *pt; |
75
|
|
|
|
|
|
|
U8 ubuf[8]; |
76
|
|
|
|
|
|
|
UV i, bytes, bytes_consumed; |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
STRLEN l; |
79
|
35
|
50
|
|
|
|
|
ph = pt = SvPV(source, l); |
80
|
35
|
|
|
|
|
|
bytes = SvCUR(source); |
81
|
218
|
100
|
|
|
|
|
for(i = 0; i < bytes; i++, ph++) { |
82
|
184
|
100
|
|
|
|
|
if((*ph & 0x80) == 0) |
83
|
140
|
|
|
|
|
|
continue; |
84
|
|
|
|
|
|
|
|
85
|
44
|
100
|
|
|
|
|
if(out == NULL) { // Deferred initialisation |
86
|
27
|
|
|
|
|
|
out = newSV(bytes * 12 / 10); // Pre-allocate 20% more space |
87
|
27
|
|
|
|
|
|
SvPOK_on(out); |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
// Copy the ASCII byte sequence up to, but not including, the byte that |
91
|
|
|
|
|
|
|
// we're currently pointing at |
92
|
44
|
100
|
|
|
|
|
if(ph > pt) { |
93
|
15
|
|
|
|
|
|
sv_catpvn(out, pt, (STRLEN)(ph - pt)); |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
44
|
|
|
|
|
|
bytes_consumed = consume_utf8_bytes(ph, ubuf, overlong_fatal); |
97
|
43
|
100
|
|
|
|
|
if(!bytes_consumed) { |
98
|
24
|
|
|
|
|
|
bytes_consumed = consume_latin_byte(ph, ubuf, ascii_hex); |
99
|
|
|
|
|
|
|
} |
100
|
43
|
|
|
|
|
|
sv_catpvn(out, ubuf, strlen(ubuf)); |
101
|
43
|
|
|
|
|
|
i += bytes_consumed - 1; |
102
|
43
|
|
|
|
|
|
ph += bytes_consumed - 1; |
103
|
|
|
|
|
|
|
|
104
|
43
|
|
|
|
|
|
pt = ph + 1; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
// If the input was all ASCII, just return the input |
108
|
34
|
100
|
|
|
|
|
if(out == NULL) { |
109
|
8
|
|
|
|
|
|
return(source); |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
26
|
100
|
|
|
|
|
if(ph > pt) { |
113
|
6
|
|
|
|
|
|
sv_catpvn(out, pt, (STRLEN)(ph - pt)); |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
26
|
|
|
|
|
|
SvUTF8_on(out); |
117
|
|
|
|
|
|
|
|
118
|
34
|
|
|
|
|
|
return(sv_2mortal(out)); |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
44
|
|
|
|
|
|
static int consume_utf8_bytes(U8* in, U8* out, int overlong_fatal) { |
122
|
|
|
|
|
|
|
UV cp, min_cp, bytes, i; |
123
|
|
|
|
|
|
|
U8 *d, ebuf[8]; |
124
|
|
|
|
|
|
|
SV *exception; |
125
|
|
|
|
|
|
|
|
126
|
44
|
100
|
|
|
|
|
if((in[0] & 0xE0) == 0xC0) { |
127
|
10
|
|
|
|
|
|
cp = in[0] & 0x1F; |
128
|
10
|
|
|
|
|
|
bytes = 2; |
129
|
10
|
|
|
|
|
|
min_cp = 0x80; |
130
|
|
|
|
|
|
|
} |
131
|
34
|
100
|
|
|
|
|
else if((in[0] & 0xF0) == 0xE0) { |
132
|
11
|
|
|
|
|
|
cp = in[0] & 0x0F; |
133
|
11
|
|
|
|
|
|
bytes = 3; |
134
|
11
|
|
|
|
|
|
min_cp = 0x800; |
135
|
|
|
|
|
|
|
} |
136
|
23
|
100
|
|
|
|
|
else if((in[0] & 0xF8) == 0xF0) { |
137
|
2
|
|
|
|
|
|
cp = in[0] & 0x07; |
138
|
2
|
|
|
|
|
|
bytes = 4; |
139
|
2
|
|
|
|
|
|
min_cp = 0x10000; |
140
|
|
|
|
|
|
|
} |
141
|
21
|
100
|
|
|
|
|
else if((in[0] & 0xFC) == 0xF8) { |
142
|
1
|
|
|
|
|
|
cp = in[0] & 0x03; |
143
|
1
|
|
|
|
|
|
bytes = 5; |
144
|
1
|
|
|
|
|
|
min_cp = 0x200000; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
else { |
147
|
20
|
|
|
|
|
|
return(0); |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
58
|
100
|
|
|
|
|
for(i = 1; i < bytes; i++) { |
151
|
38
|
100
|
|
|
|
|
if((in[i] & 0xC0) != 0x80) { |
152
|
4
|
|
|
|
|
|
return(0); |
153
|
|
|
|
|
|
|
} |
154
|
34
|
|
|
|
|
|
cp <<= 6; |
155
|
34
|
|
|
|
|
|
cp += in[i] & 0x3F; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
20
|
100
|
|
|
|
|
if(overlong_fatal && cp < min_cp) { |
|
|
50
|
|
|
|
|
|
159
|
1
|
|
|
|
|
|
exception = newSV(48); |
160
|
1
|
|
|
|
|
|
SvPOK_on(exception); |
161
|
1
|
|
|
|
|
|
sv_catpv(exception, "Over-long UTF-8 byte sequence:"); |
162
|
4
|
100
|
|
|
|
|
for(i = 0; i < bytes; i++) { |
163
|
3
|
|
|
|
|
|
sprintf(ebuf, " %02X", (int)in[i]); |
164
|
3
|
|
|
|
|
|
sv_catpv(exception, ebuf); |
165
|
|
|
|
|
|
|
} |
166
|
1
|
|
|
|
|
|
croak_sv(exception); |
167
|
|
|
|
|
|
|
} |
168
|
19
|
|
|
|
|
|
d = uvchr_to_utf8(out, cp); |
169
|
19
|
|
|
|
|
|
*d = '\0'; |
170
|
43
|
|
|
|
|
|
return(bytes); |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
24
|
|
|
|
|
|
static int consume_latin_byte(U8* in, U8* out, int ascii_hex) { |
175
|
|
|
|
|
|
|
U8 *d, *utf_bytes; |
176
|
|
|
|
|
|
|
|
177
|
24
|
100
|
|
|
|
|
if(in[0] > 0x9F) { |
178
|
9
|
|
|
|
|
|
d = uvchr_to_utf8(out, (UV)in[0]); |
179
|
9
|
|
|
|
|
|
*d = '\0'; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
else { |
182
|
15
|
|
|
|
|
|
utf_bytes = _encoding_fix_latin_ms_map + (in[0] & 0x7F) * 4; |
183
|
15
|
100
|
|
|
|
|
if(ascii_hex == 0 && *utf_bytes == '%') { |
|
|
50
|
|
|
|
|
|
184
|
6
|
|
|
|
|
|
d = uvchr_to_utf8(out, (UV)in[0]); |
185
|
6
|
|
|
|
|
|
*d = '\0'; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
else { |
188
|
9
|
|
|
|
|
|
strncpy(out, utf_bytes, 4); |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
} |
191
|
24
|
|
|
|
|
|
return(1); |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
MODULE = Encoding::FixLatin::XS PACKAGE = Encoding::FixLatin::XS |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
SV * |
198
|
|
|
|
|
|
|
_fix_latin_xs(source, overlong_fatal, ascii_hex) |
199
|
|
|
|
|
|
|
SV * source |
200
|
|
|
|
|
|
|
int overlong_fatal |
201
|
|
|
|
|
|
|
int ascii_hex |
202
|
|
|
|
|
|
|
PPCODE: |
203
|
35
|
|
|
|
|
|
ST(0) = _encoding_fix_latin_xs(source, overlong_fatal, ascii_hex); |
204
|
34
|
|
|
|
|
|
XSRETURN(1); |