File Coverage

blib/lib/ZMachine/ZSCII.pm
Criterion Covered Total %
statement 152 155 98.0
branch 41 58 70.6
condition 15 22 68.1
subroutine 17 17 100.0
pod 10 10 100.0
total 235 262 89.6


line stmt bran cond sub pod time code
1             package ZMachine::ZSCII;
2             {
3             $ZMachine::ZSCII::VERSION = '0.004';
4             }
5 1     1   69975 use 5.14.0;
  1         4  
  1         35  
6 1     1   5 use warnings;
  1         2  
  1         21  
7             # ABSTRACT: an encoder/decoder for Z-Machine text
8              
9 1     1   4 use Carp ();
  1         2  
  1         13  
10 1     1   4 use charnames ':full';
  1         2  
  1         8  
11              
12              
13             my %DEFAULT_ZSCII = (
14             chr(0x00) => "\N{NULL}",
15             chr(0x08) => "\N{DELETE}",
16             chr(0x0D) => "\x0D",
17             chr(0x1B) => "\N{ESCAPE}",
18              
19             (map {; chr $_ => chr $_ } (0x20 .. 0x7E)), # ASCII maps over
20              
21             # 0x09B - 0x0FB are the "extra characters" and need Unicode translation table
22             # 0x0FF - 0x3FF are undefined and never (?) used
23             );
24              
25             # We can use these characters below because they all (save for the magic A2-C6)
26             # are the same in Unicode/ASCII/ZSCII. -- rjbs, 2013-01-18
27             my $DEFAULT_ALPHABET = join(q{},
28             'a' .. 'z', # A0
29             'A' .. 'Z', # A1
30             ( # A2
31             "\0", # special: read 2 chars for 10-bit zscii character
32             "\x0D",
33             (0 .. 9),
34 1     1   13768 do { no warnings 'qw'; qw[ . , ! ? _ # ' " / \ - : ( ) ] },
  1         3  
  1         2292  
35             ),
36             );
37              
38             my @DEFAULT_EXTRA = map chr hex, qw(
39             E4 F6 FC C4 D6 DC DF BB AB EB EF FF CB CF E1 E9
40             ED F3 FA FD C1 C9 CD D3 DA DD E0 E8 EC F2 F9 C0
41             C8 CC D2 D9
42              
43             E2 EA EE F4 FB C2 CA CE D4 DB E5 C5 F8 D8 E3 F1
44             F5 C3 D1 D5 E6 C6 E7 C7 FE F0 DE D0 A3 153 152 A1
45             BF
46             );
47              
48             sub _validate_alphabet {
49 4     4   8 my (undef, $alphabet) = @_;
50              
51 4 50       11 Carp::croak("alphabet table was not 78 entries long")
52             unless length $alphabet == 78;
53              
54 4 50       10 Carp::carp("alphabet character 52 not set to 0x000")
55             unless substr($alphabet, 52, 1) eq chr(0);
56              
57             Carp::croak("alphabet table contains characters over 0xFF")
58 4 50       47 if grep {; ord > 0xFF } split //, $alphabet;
  312         351  
59             }
60              
61             sub _shortcuts_for {
62 4     4   10 my ($self, $alphabet) = @_;
63              
64 4         12 $self->_validate_alphabet($alphabet);
65              
66 4         29 my %shortcut = (q{ } => chr(0));
67              
68 4         21 for my $i (0 .. 2) {
69 12         15 my $offset = $i * 26;
70 12 100       25 my $prefix = $i ? chr(0x03 + $i) : '';
71              
72 12         16 for my $j (0 .. 25) {
73 312 100 100     640 next if $i == 2 and $j == 0; # that guy is magic! -- rjbs, 2013-01-18
74              
75 308         549 $shortcut{ substr($alphabet, $offset + $j, 1) } = $prefix . chr($j + 6);
76             }
77             }
78              
79 4         16 return \%shortcut;
80             }
81              
82              
83             sub new {
84 4     4 1 4451 my ($class, $arg) = @_;
85              
86 4 50       13 if (! defined $arg) {
87 0         0 $arg = { version => 5 };
88 4 100       12 } if (! ref $arg) {
89 1         3 $arg = { version => $arg };
90             }
91              
92 4         12 my $guts = { version => $arg->{version} };
93              
94 4 0 33     18 Carp::croak("only Version 5, 7, and 8 ZSCII are supported at present")
      33        
95             unless $guts->{version} == 5
96             or $guts->{version} == 7
97             or $guts->{version} == 8;
98              
99 4         353 $guts->{zscii} = { %DEFAULT_ZSCII };
100              
101             # Why is this an arrayref and not, like alphabets, a string?
102             # Alphabets are strings because they're guaranteed to fit in bytestrings.
103             # You can't put a ZSCII character over 0xFF in the alphabet, because it can't
104             # be put in the story file's alphabet table! By using a string, it's easy to
105             # just pass in the alphabet from memory to/from the codec. On the other
106             # hand, the Unicode translation table stores Unicode codepoint values packed
107             # into words, and it's not a good fit for use in the codec. Maybe a
108             # ZMachine::Util will be useful for packing/unpacking Unicode translation
109             # tables.
110 4   100     37 $guts->{extra} = $arg->{extra_characters}
111             || \@DEFAULT_EXTRA;
112              
113 4         15 Carp::confess("Unicode translation table exceeds maximum length of 97")
114 4 50       6 if @{ $guts->{extra} } > 97;
115              
116 4         7 for (0 .. $#{ $guts->{extra} }) {
  4         13  
117 78 50       155 Carp::confess("tried to add ambiguous Z->U mapping")
118             if exists $guts->{zscii}{ chr(155 + $_) };
119              
120 78         104 my $u_char = $guts->{extra}[$_];
121              
122             # Extra characters must go into the Unicode substitution table, which can
123             # only represent characters with codepoints between 0 and 0xFFFF. See
124             # Z-Machine Spec v1.1 ยง 3.8.4.2.1
125 78 50       126 Carp::confess("tried to add Unicode codepoint greater than U+FFFF")
126             if ord($u_char) > 0xFFFF;
127              
128 78         186 $guts->{zscii}{ chr(155 + $_) } = $u_char;
129             }
130              
131 4         9 $guts->{zscii_for} = { };
132 4         8 for my $zscii_char (sort keys %{ $guts->{zscii} }) {
  4         176  
133 474         515 my $unicode_char = $guts->{zscii}{$zscii_char};
134              
135 474 50       763 Carp::confess("tried to add ambiguous U->Z mapping")
136             if exists $guts->{zscii_for}{ $unicode_char };
137              
138 474         1705 $guts->{zscii_for}{ $unicode_char } = $zscii_char;
139             }
140              
141 4         33 my $self = bless $guts => $class;
142              
143             # The default alphabet is entirely made up of characters that are the same in
144             # Unicode and ZSCII. If a user wants to put "extra characters" into the
145             # alphabet table, though, the alphabet should contain ZSCII values. When
146             # we're building a ZMachine::ZSCII using the contents of the story file's
147             # alphabet table, that's easy. If we're building a codec to *produce* a
148             # story file, it's less trivial, because we don't want to think about the
149             # specific ZSCII codepoints for the Unicode text we'll encode.
150             #
151             # We provide alphabet_is_unicode to let the user say "my alphabet is supplied
152             # in Unicode, please convert it to ZSCII during construction." -- rjbs,
153             # 2013-01-19
154 4   66     18 my $alphabet = $arg->{alphabet} || $DEFAULT_ALPHABET;
155              
156             # It's okay if the user supplies alphabet_is_unicode but not alphabet,
157             # because the default alphabet is all characters with the same value in both
158             # character sets! -- rjbs, 2013-01-20
159 4 100       16 $alphabet = $self->unicode_to_zscii($alphabet)
160             if $arg->{alphabet_is_unicode};
161              
162 4         14 $self->{alphabet} = $alphabet;
163 4         12 $self->{shortcut} = $class->_shortcuts_for( $self->{alphabet} );
164              
165 4         11 return $self;
166             }
167              
168              
169             sub encode {
170 3     3 1 860 my ($self, $string) = @_;
171              
172 3         10 $string =~ s/\n/\x0D/g;
173              
174 3         8 my $zscii = $self->unicode_to_zscii($string);
175 3         7 my $zchars = $self->zscii_to_zchars($zscii);
176              
177 3         9 return $self->pack_zchars($zchars);
178             }
179              
180              
181             sub decode {
182 3     3 1 686 my ($self, $bytestring) = @_;
183              
184 3         7 my $zchars = $self->unpack_zchars( $bytestring );
185 3         9 my $zscii = $self->zchars_to_zscii( $zchars );
186 3         9 my $unicode = $self->zscii_to_unicode( $zscii );
187              
188 3         9 $unicode =~ s/\x0D/\n/g;
189              
190 3         10 return $unicode;
191             }
192              
193              
194             sub unicode_to_zscii {
195 14     14 1 6594 my ($self, $unicode_text) = @_;
196              
197 14         19 my $zscii = '';
198 14         38 for (0 .. length($unicode_text) - 1) {
199 189         242 my $char = substr $unicode_text, $_, 1;
200              
201 189 100       411 Carp::croak(
202             sprintf "no ZSCII character available for Unicode U+%v05X <%s>",
203             $char,
204             charnames::viacode(ord $char),
205             ) unless defined( my $zscii_char = $self->{zscii_for}{ $char } );
206              
207 188         216 $zscii .= $zscii_char;
208             }
209              
210 13         41 return $zscii;
211             }
212              
213              
214             sub zscii_to_unicode {
215 4     4 1 411 my ($self, $zscii) = @_;
216              
217 4         6 my $unicode = '';
218 4         8 for (0 .. length($zscii) - 1) {
219 49         46 my $char = substr $zscii, $_, 1;
220              
221 49 50       97 Carp::croak(
222             sprintf "no Unicode character available for ZSCII %#v05x", $char,
223             ) unless defined(my $unicode_char = $self->{zscii}{ $char });
224              
225 49         51 $unicode .= $unicode_char;
226             }
227              
228 4         10 return $unicode;
229             }
230              
231              
232             sub zscii_to_zchars {
233 10     10 1 5850 my ($self, $zscii) = @_;
234              
235 10 50       23 return '' unless length $zscii;
236              
237 10         13 my $zchars = '';
238 10         22 for (0 .. length($zscii) - 1) {
239 103         105 my $zscii_char = substr($zscii, $_, 1);
240 103 100       208 if (defined (my $shortcut = $self->{shortcut}{ $zscii_char })) {
241 92         83 $zchars .= $shortcut;
242 92         110 next;
243             }
244              
245 11         14 my $ord = ord $zscii_char;
246              
247 11 50       23 if ($ord >= 1024) {
248 0         0 Carp::croak(
249             sprintf "can't encode ZSCII codepoint %#v05x in Z-characters",
250             $zscii_char
251             );
252             }
253              
254 11         14 my $top = ($ord & 0b1111100000) >> 5;
255 11         10 my $bot = ($ord & 0b0000011111);
256              
257 11         11 $zchars .= "\x05\x06"; # The escape code for a ten-bit ZSCII character.
258 11         23 $zchars .= chr($top) . chr($bot);
259             }
260              
261 10         24 return $zchars;
262             }
263              
264              
265             sub zchars_to_zscii {
266 7     7 1 2571 my ($self, $zchars, $arg) = @_;
267 7   100     30 $arg ||= {};
268              
269 7         10 my $text = '';
270 7         9 my $alphabet = 0;
271              
272 7         22 while (length( my $char = substr $zchars, 0, 1, '')) {
273 104         96 my $ord = ord $char;
274              
275 104 100       151 if ($ord == 0) { $text .= q{ }; next; }
  3         4  
  3         5  
276              
277 101 100       187 if ($ord == 0x04) { $alphabet = 1; next }
  7 100       8  
  7         15  
278 23         26 elsif ($ord == 0x05) { $alphabet = 2; next }
  23         44  
279              
280 71 100 100     152 if ($alphabet == 2 && $ord == 0x06) {
281 12         16 my $next_two = substr $zchars, 0, 2, '';
282 12 100       21 if (length $next_two != 2) {
283 2 100       6 last if $arg->{allow_early_termination};
284 1         133 Carp::croak("ten-bit ZSCII encoding segment terminated early")
285             }
286              
287 10         15 my $value = ord(substr $next_two, 0, 1) << 5
288             | ord(substr $next_two, 1, 1);
289              
290 10         11 $text .= chr $value;
291 10         8 $alphabet = 0;
292 10         22 next;
293             }
294              
295 59 50 33     182 if ($ord >= 0x06 && $ord <= 0x1F) {
296 59         118 $text .= substr $self->{alphabet}, (26 * $alphabet) + $ord - 6, 1;
297 59         49 $alphabet = 0;
298 59         123 next;
299             }
300              
301 0         0 Carp::croak("unknown zchar <$char> encountered in alphabet <$alphabet>");
302             }
303              
304 6         17 return $text;
305             }
306              
307              
308             sub make_dict_length {
309 3     3 1 1331 my ($self, $zchars) = @_;
310              
311 3 50       10 my $length = $self->{version} >= 5 ? 9 : 6;
312 3         6 $zchars = substr $zchars, 0, $length;
313 3         7 $zchars .= "\x05" x ($length - length($zchars));
314              
315 3         11 return $zchars;
316             }
317              
318              
319             sub pack_zchars {
320 4     4 1 800 my ($self, $zchars) = @_;
321              
322 4         4 my $bytestring = '';
323              
324 4         14 while (my $substr = substr $zchars, 0, 3, '') {
325 31         61 $substr .= chr(5) until length $substr == 3;
326              
327 31         50 my $value = ord(substr($substr, 0, 1)) << 10
328             | ord(substr($substr, 1, 1)) << 5
329             | ord(substr($substr, 2, 1));
330              
331 31 100       46 $value |= (0x8000) if ! length $zchars;
332              
333 31         80 $bytestring .= pack 'n', $value;
334             }
335              
336 4         12 return $bytestring;
337             }
338              
339              
340             sub unpack_zchars {
341 5     5 1 1289 my ($self, $bytestring) = @_;
342              
343 5 50       15 Carp::croak("bytestring of packed zchars is not an even number of bytes")
344             if length($bytestring) % 2;
345              
346 5         5 my $terminate;
347 5         7 my $zchars = '';
348 5         14 while (my $word = substr $bytestring, 0, 2, '') {
349             # XXX: Probably allow this to warn and `last` -- rjbs, 2013-01-18
350 37 50       46 Carp::croak("input continues after terminating byte") if $terminate;
351              
352 37         44 my $n = unpack 'n', $word;
353 37         38 $terminate = $n & 0x8000;
354              
355 37         35 my $c1 = chr( ($n & 0b0111110000000000) >> 10 );
356 37         35 my $c2 = chr( ($n & 0b0000001111100000) >> 5 );
357 37         30 my $c3 = chr( ($n & 0b0000000000011111) );
358              
359 37         84 $zchars .= "$c1$c2$c3";
360             }
361              
362 5         14 return $zchars;
363             }
364              
365             1;
366              
367             __END__