File Coverage

blib/lib/Encode/BOCU1.pm
Criterion Covered Total %
statement 112 144 77.7
branch 51 78 65.3
condition 21 39 53.8
subroutine 10 10 100.0
pod 2 4 50.0
total 196 275 71.2


line stmt bran cond sub pod time code
1             package Encode::BOCU1;
2              
3 1     1   40890 use 5.008;
  1         6  
  1         46  
4 1     1   5 use strict;
  1         3  
  1         32  
5 1     1   6 use warnings;
  1         16  
  1         27  
6 1     1   6 use Carp;
  1         1  
  1         82  
7              
8 1     1   5 use base qw(Encode::Encoding);
  1         2  
  1         196  
9              
10             our $VERSION = '0.03';
11              
12             __PACKAGE__->Define('bocu1');
13              
14 1     1   5 use Encode::Alias;
  1         2  
  1         1761  
15             define_alias( qr/^bocu.1$/i => '"bocu1"');
16             define_alias( qr/^bocu$/i => '"bocu1"');
17              
18              
19             #
20             # encode / decode
21             #
22             sub encode($$;$) {
23 4     4 1 6905 my ($obj, $str, $check) = @_;
24 4         12 my $octet = utf8_to_bocu1($str);
25              
26 4 50       16 $_[1] = '' if $check;
27 4         12 return $octet;
28             }
29             sub decode($$;$) {
30 3     3 1 584 my ($obj, $octet, $check) = @_;
31 3         7 my $str = bocu1_to_utf8($octet);
32              
33 3 50       9 $_[1] = '' if $check;
34 3         8 return $str;
35             }
36              
37             #
38             # subroutines
39             #
40             my @bocu1_trail_to_byte = (
41             # 0 - 19 (0x0 - 0x13)
42             0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
43             0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1c, 0x1d, 0x1e, 0x1f,
44             # 20 - 242 (0x14 - 0xf2)
45             0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
46             0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
47             0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
48             0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
49             0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
50             0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
51             0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
52             0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
53             0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
54             0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
55             0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
56             0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
57             0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
58             0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff );
59              
60             my @bocu1_byte_to_trail = (
61             # 0x00 - 0x20
62             -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, -1, -1, -1, -1, -1, -1, -1, -1,
63             0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
64             -1,
65             # 0x21 - 0xff
66             0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22,
67             0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
68             0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42,
69             0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52,
70             0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
71             0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72,
72             0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82,
73             0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92,
74             0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2,
75             0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2,
76             0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2,
77             0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2,
78             0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2,
79             0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2 );
80              
81             sub bocu1_to_utf8 {
82 3     3 0 4 my $bocu1str = shift;
83 3         26 my @chars = unpack("C*", $bocu1str);
84              
85 3         6 my $pc = 0x40;
86 3         4 my @codepoints;
87 3         10 for (my $i=0; $i<=$#chars; $i++) {
88 100         116 my $lead = $chars[$i];
89 100         99 my $cp = 0;
90 100         103 my $diff = 0;
91 100 100       513 if ($lead <= 0x20) {
    50          
    100          
    100          
    100          
    100          
    50          
    0          
    0          
92 10         14 $cp = $lead;
93             } elsif ($lead == 0x21) { # 21 t1 t2 t3
94 0         0 my $t1 = $bocu1_byte_to_trail[$chars[++$i]];
95 0         0 my $t2 = $bocu1_byte_to_trail[$chars[++$i]];
96 0         0 my $t3 = $bocu1_byte_to_trail[$chars[++$i]];
97 0 0 0     0 croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0;
      0        
98 0         0 $diff = 14161247 + $t1 * 59049 + $t2 * 243 + $t3
99             } elsif ($lead < 0x25) { # [22-24] t1 t2
100 3         5 my $t1 = $bocu1_byte_to_trail[$chars[++$i]];
101 3         5 my $t2 = $bocu1_byte_to_trail[$chars[++$i]];
102 3 50 33     13 croak "illegal trail char" if $t1 < 0 || $t2 < 0;
103 3         7 $diff = -2195326 + $lead * 59049 + $t1 * 243 + $t2;
104             } elsif ($lead < 0x50) { # [25-4F] t1
105 2         4 my $t1 = $bocu1_byte_to_trail[$chars[++$i]];
106 2 50       5 croak "illegal trail char" if $t1 < 0;
107 2         4 $diff = -19504 + $lead * 243 + $t1;
108             } elsif ($lead < 0xd0) { # [50-CF]
109 79         88 $diff = $lead - 0x90;
110             } elsif ($lead < 0xfb) { # [D0-FA] t1
111 1         3 my $t1 = $bocu1_byte_to_trail[$chars[++$i]];
112 1 50       3 croak "illegal trail char" if $t1 < 0;
113 1         2 $diff = -50480 + $lead * 243 + $t1;
114             } elsif ($lead < 0xfe) { # [FB-FD] t1 t2
115 5         8 my $t1 = $bocu1_byte_to_trail[$chars[++$i]];
116 5         8 my $t2 = $bocu1_byte_to_trail[$chars[++$i]];
117 5 50 33     25 croak "illegal trail char" if $t1 < 0 || $t2 < 0;
118 5         11 $diff = -14810786 + $lead * 59049 + $t1 * 243 + $t2;
119             } elsif ($lead == 0xfe) { # FE t1 t2 t3
120 0         0 my $t1 = $bocu1_byte_to_trail[$chars[++$i]];
121 0         0 my $t2 = $bocu1_byte_to_trail[$chars[++$i]];
122 0         0 my $t3 = $bocu1_byte_to_trail[$chars[++$i]];
123 0 0 0     0 croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0;
      0        
124 0         0 $diff = 187660 + $t1 * 59049 + $t2 * 243 + $t3;
125             } elsif ($lead == 0xff) {
126             ## reset
127 0         0 $cp = 0;
128 0         0 $diff = 0;
129             }
130              
131             # codepoint, next pc
132 100 100       244 if ($lead <= 0x20) {
    50          
133 10 50       19 $pc = 0x40 if ($lead < 0x20);
134 10         27 push(@codepoints,$lead);
135             } elsif ($lead < 0xff) {
136 90         91 $cp = $pc + $diff;
137 90 50       158 $cp = 0 if $cp < 0;
138 90         110 push(@codepoints,$cp);
139 90 50 100     520 if ($cp < 0x20) {
    50 100        
    100 66        
    100          
    50          
140 0         0 $pc = 0x40;
141             } elsif ($cp == 0x20) {
142             # keep pc
143             } elsif (0x3040 <= $cp && $cp <= 0x309f) {
144 21         50 $pc = 0x3070;
145             } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) {
146 1         4 $pc = 0x7711;
147             } elsif (0xac00 <= $cp && $cp <= 0xd7a3) {
148 0         0 $pc = 0xc1d1;
149             } else {
150 68         190 $pc = ($cp & ~0x7f) + 0x40;
151             }
152             } else { # 0xff : reset
153 0         0 $pc = 0x40;
154             }
155             }
156              
157 3         15 my $utf8str = pack("U*", @codepoints);
158 3         8 Encode::_utf8_on($utf8str);
159 3         14 $utf8str;
160             }
161              
162             sub utf8_to_bocu1 {
163 4     4 0 6 my $utf8str = shift;
164              
165 4         31 my @chars = unpack("U*", $utf8str);
166 4         15 my $bocu1str = '*' x $#chars;
167 4         8 $bocu1str = '';
168 4         5 my $pc = 0x40;
169 4         16 for (my $i=0; $i<=$#chars; $i++) {
170 102         124 my $cp = $chars[$i];
171 102 100 100     228 next if $i == 0 && $cp == 0xfeff;
172              
173 101 50       174 croak "unsupported codepoint (>0x1fffff)." if $cp > 0x001fffff;
174             # cp -> diff -> bocu1
175 101 100       165 if ($cp <= 0x20) {
176 11         14 $bocu1str .= chr($cp);
177 11 50       39 $pc = 0x40 unless $cp == 0x20;
178             } else {
179 90         104 my $diff = $cp - $pc;
180 90         82 my $b;
181 90 50       307 if ($diff < -187660) { # [...,-187660) : 21
    100          
    100          
    100          
    100          
    50          
182 0         0 $diff -= -14536567;
183 0         0 my $t3 = $diff % 243; $diff = int($diff / 243);
  0         0  
184 0         0 my $t2 = $diff % 243; $diff = int($diff / 243);
  0         0  
185 0         0 my $t1 = $diff % 243; $diff = int($diff / 243);
  0         0  
186             # my $t0 = $diff;
187 0         0 $b = pack("C4", 0x21, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]);
188             } elsif ($diff < -10513) { # [-187660,-10513) : 22-24
189 3         5 $diff -= -187660;
190 3         4 my $t2 = $diff % 243; $diff = int($diff / 243);
  3         4  
191 3         5 my $t1 = $diff % 243; $diff = int($diff / 243);
  3         5  
192 3         3 my $t0 = $diff;
193 3         9 $b = pack("C3", (0x22 + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]);
194             } elsif ($diff < -64) { # [-10513,-64) : 25-4F
195 2         3 $diff -= -10513;
196 2         4 my $t1 = $diff % 243; $diff = int($diff / 243);
  2         5  
197 2         3 my $t0 = $diff;
198 2         5 $b = pack("C2", (0x25 + $t0), $bocu1_trail_to_byte[$t1]);
199             } elsif ($diff < 64) { # [-64,63) : 50-CF
200 79         92 $diff -= -64;
201 79         85 my $t0 = $diff;
202 79         130 $b = pack("C", (0x50 + $t0));
203             } elsif ($diff < 10513) { # [64,10513) : D0-FA
204 1         2 $diff -= 64;
205 1         2 my $t1 = $diff % 243; $diff = int($diff / 243);
  1         2  
206 1         2 my $t0 = $diff;
207 1         4 $b = pack("C2", (0xd0 + $t0), $bocu1_trail_to_byte[$t1]);
208             } elsif ($diff < 187660) { # [10513,187660) : FB-FD
209 5         6 $diff -= 10513;
210 5         8 my $t2 = $diff % 243; $diff = int($diff / 243);
  5         9  
211 5         8 my $t1 = $diff % 243; $diff = int($diff / 243);
  5         6  
212 5         7 my $t0 = $diff;
213 5         17 $b = pack("C3", (0xfb + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]);
214             } else { # [187660,...) : FE
215 0         0 $diff -= 187660;
216 0         0 my $t3 = $diff % 243; $diff = int($diff / 243);
  0         0  
217 0         0 my $t2 = $diff % 243; $diff = int($diff / 243);
  0         0  
218 0         0 my $t1 = $diff % 243; $diff = int($diff / 243);
  0         0  
219             # my $t0 = $diff;
220 0         0 $b = pack("C4", 0xfe, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]);
221             }
222 90         130 $bocu1str .= $b;
223              
224             # next pc
225 90 100 100     447 if (0x3040 <= $cp && $cp <= 0x309f) {
    100 100        
    50 66        
226 21         62 $pc = 0x3070;
227             } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) {
228 1         3 $pc = 0x7711;
229             } elsif (0xac00 <= $cp && $cp <= 0xd7a3) {
230 0         0 $pc = 0xc1d1;
231             } else {
232 68         183 $pc = $cp & ~0x7f | 0x40;
233             }
234             }
235             }
236              
237 4         14 $bocu1str;
238             }
239              
240             1;
241             __END__