line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package MARC::Charset; |
2
|
|
|
|
|
|
|
|
3
|
16
|
|
|
16
|
|
302329
|
use strict; |
|
16
|
|
|
|
|
36
|
|
|
16
|
|
|
|
|
786
|
|
4
|
16
|
|
|
16
|
|
169
|
use warnings; |
|
16
|
|
|
|
|
31
|
|
|
16
|
|
|
|
|
865
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = '1.35'; |
7
|
|
|
|
|
|
|
|
8
|
16
|
|
|
16
|
|
90
|
use base qw(Exporter); |
|
16
|
|
|
|
|
34
|
|
|
16
|
|
|
|
|
2295
|
|
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); |
10
|
|
|
|
|
|
|
|
11
|
16
|
|
|
16
|
|
28563
|
use Unicode::Normalize; |
|
16
|
|
|
|
|
131206
|
|
|
16
|
|
|
|
|
1575
|
|
12
|
16
|
|
|
16
|
|
35298
|
use Encode 'decode'; |
|
16
|
|
|
|
|
302132
|
|
|
16
|
|
|
|
|
5385
|
|
13
|
16
|
|
|
16
|
|
22577
|
use charnames ':full'; |
|
16
|
|
|
|
|
900130
|
|
|
16
|
|
|
|
|
123
|
|
14
|
16
|
|
|
16
|
|
24903
|
use MARC::Charset::Table; |
|
16
|
|
|
|
|
74
|
|
|
16
|
|
|
|
|
720
|
|
15
|
16
|
|
|
16
|
|
157
|
use MARC::Charset::Constants qw(:all); |
|
16
|
|
|
|
|
38
|
|
|
16
|
|
|
|
|
79274
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
MARC::Charset - convert MARC-8 encoded strings to and from UTF-8 |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# import the marc8_to_utf8 function |
24
|
|
|
|
|
|
|
use MARC::Charset 'marc8_to_utf8'; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# prepare STDOUT for utf8 |
27
|
|
|
|
|
|
|
binmode(STDOUT, 'utf8'); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# print out some marc8 as utf8 |
30
|
|
|
|
|
|
|
print marc8_to_utf8($marc8_string); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8 |
35
|
|
|
|
|
|
|
strings. MARC-8 is a character encoding (mostly single-byte, with three-byte CJK characters) that predates Unicode, and |
36
|
|
|
|
|
|
|
allows you to put non-Roman scripts in MARC bibliographic records. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
http://www.loc.gov/marc/specifications/spechome.html |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 EXPORTS |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=cut |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# get the mapping table |
45
|
|
|
|
|
|
|
our $table = MARC::Charset::Table->new(); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# set default character sets |
48
|
|
|
|
|
|
|
# these are viewable at the package level |
49
|
|
|
|
|
|
|
# in case someone wants to set them |
50
|
|
|
|
|
|
|
our $DEFAULT_G0 = ASCII_DEFAULT; |
51
|
|
|
|
|
|
|
our $DEFAULT_G1 = EXTENDED_LATIN; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Per-character overrides consulted by utf8_to_marc8() before it falls back
# to NFD(): when a character is a key here, the mapped string is used instead
# of the character's canonical decomposition.  Most entries map a character
# to itself (i.e. "leave it precomposed"); two Vietnamese letters are
# decomposed only as far as u-hook plus one combining mark.
our %SPECIAL_DECOMPOSABLE = (
    # characters that must stay exactly as they are
    (
        map { my $char = chr($_); ($char => $char) } (
            # Vietnamese: uppercase o-hook, uppercase u-hook,
            #             lowercase o-hook, lowercase u-hook
            0x01a0, 0x01af, 0x01a1, 0x01b0,

            # Arabic to not decompose
            0x0622, 0x0623, 0x0624, 0x0625, 0x0626,
            0x0649, 0x0671, 0x06c0, 0x06D3,

            # Cyrillic to not decompose
            0x0439, 0x0419, 0x0453, 0x0451, 0x0457, 0x045C,
            0x045E, 0x0403, 0x0401, 0x0407, 0x040C, 0x040E,

            # Katakana to not decompose
            0x309B, 0x309C,
            0x30AC, 0x30AE, 0x30B0, 0x30B2, 0x30B4, 0x30B6,
            0x30B8, 0x30BA, 0x30BC, 0x30BE, 0x30C0, 0x30C2,
            0x30C5, 0x30C7, 0x30C9,
            0x30D0, 0x30D1, 0x30D3, 0x30D4, 0x30D6, 0x30D7,
            0x30D9, 0x30DA, 0x30DC, 0x30DD,
            0x30F4, 0x30F7, 0x30F8, 0x30F9, 0x30FA, 0x30FE, 0x30FF,
        )
    ),

    # characters decomposed only down to u-hook + combining mark
    chr(0x1ef1) => chr(0x01b0) . chr(0x0323),    # lowercase u-hook with dot below
    chr(0x1ee9) => chr(0x01b0) . chr(0x0301),    # lowercase u-hook with acute
);
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head2 ignore_errors() |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Tells MARC::Charset whether or not to ignore all encoding errors, and |
123
|
|
|
|
|
|
|
returns the current setting. This is helpful if you have records that |
124
|
|
|
|
|
|
|
contain both MARC8 and UNICODE characters. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
my $ignore = MARC::Charset->ignore_errors(); |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
MARC::Charset->ignore_errors(1); # ignore errors |
129
|
|
|
|
|
|
|
MARC::Charset->ignore_errors(0); # DO NOT ignore errors |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=cut |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# Package-wide "ignore encoding errors" flag.  marc8_to_utf8() and
# utf8_to_marc8() fall back to it when they are called without an explicit
# second argument.
our $_ignore_errors = 0;

# Class-method accessor for $_ignore_errors.
#
#   MARC::Charset->ignore_errors();      # read the current setting
#   MARC::Charset->ignore_errors($flag); # store $flag (any defined value)
#
# Always returns the setting that is in effect after the call.
sub ignore_errors {
    my ($class, $flag) = @_;

    if (defined $flag) {
        $_ignore_errors = $flag;
    }

    return $_ignore_errors;
}
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head2 assume_unicode() |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Tells MARC::Charset whether or not to assume UNICODE when an error is |
145
|
|
|
|
|
|
|
encountered in ignore_errors mode and returns the current setting. |
146
|
|
|
|
|
|
|
This is helpful if you have records that contain both MARC8 and UNICODE |
147
|
|
|
|
|
|
|
characters. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
my $setting = MARC::Charset->assume_unicode(); |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8) |
152
|
|
|
|
|
|
|
MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=cut |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Name of the encoding marc8_to_utf8() falls back to when it hits an
# unmappable character in ignore-errors mode; the empty string means "none".
# Shared by assume_unicode() and assume_encoding().
our $_assume = '';

# Class-method switch for the "assume UTF-8" fallback.
#
#   MARC::Charset->assume_unicode();    # query only
#   MARC::Charset->assume_unicode(1);   # assume UTF-8
#   MARC::Charset->assume_unicode(0);   # stop assuming UTF-8
#
# Returns a true value (1) when the assumed encoding is 'utf8' after the
# call, and a false value otherwise.
#
# A false argument used to be ignored, so the documented
# assume_unicode(0) call could never switch the behaviour off again.
sub assume_unicode {
    my ($self,$i) = @_;

    if (defined($i)) {
        if ($i) {
            $_assume = 'utf8';
        }
        elsif ($_assume eq 'utf8') {
            # Only undo our own setting; an encoding chosen through
            # assume_encoding() (e.g. 'cp850') is already "not unicode"
            # and is left untouched.
            $_assume = '';
        }
    }

    return $_assume eq 'utf8';
}
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head2 assume_encoding() |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
Tells MARC::Charset whether or not to assume a specific encoding when an error |
168
|
|
|
|
|
|
|
is encountered in ignore_errors mode and returns the current setting. This |
169
|
|
|
|
|
|
|
is helpful if you have records that contain both MARC8 and other characters. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
my $setting = MARC::Charset->assume_encoding(); |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
MARC::Charset->assume_encoding('cp850'); # assume characters are cp850 |
174
|
|
|
|
|
|
|
MARC::Charset->assume_encoding(''); # DO NOT assume any encoding |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=cut |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# Class-method accessor for the assumed fallback encoding ($_assume).
#
#   MARC::Charset->assume_encoding();          # read the current encoding
#   MARC::Charset->assume_encoding('cp850');   # assume cp850
#   MARC::Charset->assume_encoding('');        # assume nothing
#
# Any defined argument (including the empty string) is stored; the value in
# effect after the call is returned.
sub assume_encoding {
    my ($class, $encoding) = @_;

    # pure query: nothing to store
    return $_assume unless defined $encoding;

    $_assume = $encoding;
    return $_assume;
}
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# place holders for working graphical character sets |
187
|
|
|
|
|
|
|
my $G0; |
188
|
|
|
|
|
|
|
my $G1; |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 marc8_to_utf8() |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Converts a MARC-8 encoded string to UTF-8. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
my $utf8 = marc8_to_utf8($marc8); |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
If you'd like to ignore errors pass in a true value as the 2nd |
197
|
|
|
|
|
|
|
parameter or call MARC::Charset->ignore_errors() with a true |
198
|
|
|
|
|
|
|
value: |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
my $utf8 = marc8_to_utf8($marc8, 'ignore-errors'); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
or |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
MARC::Charset->ignore_errors(1); |
205
|
|
|
|
|
|
|
my $utf8 = marc8_to_utf8($marc8); |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Convert a MARC-8 encoded string into a Perl character string.
#
# Arguments:
#   $marc8         - the MARC-8 input
#   $ignore_errors - optional; when undefined, the package-wide
#                    $_ignore_errors setting is used instead
#
# Returns the converted string (utf8::upgrade()d).  When a position has no
# mapping in either working character set a warning is issued and then:
#   * without $ignore_errors the sub returns nothing;
#   * with $ignore_errors and an assumed encoding in $_assume, the WHOLE
#     input is decoded with that encoding and returned NFC-normalized;
#   * otherwise one character of input is skipped and conversion goes on.
# The working G0/G1 sets are reset to their defaults on entry and before
# every return.
sub marc8_to_utf8
{
    my ($marc8, $ignore_errors) = @_;
    reset_charsets();

    $ignore_errors = $_ignore_errors if (!defined($ignore_errors));

    # holder for our utf8
    my $utf8 = '';

    my $index = 0;
    my $length = length($marc8);
    my $combining = '';
    CHAR_LOOP: while ($index < $length)
    {
        # whitespace, line feeds and carriage returns just get added on unmolested
        if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so)
        {
            $utf8 .= $1;
            $index += 1;
            next CHAR_LOOP;
        }

        # look for any escape sequences; _process_escape() updates $G0/$G1
        # and returns the index just past a sequence it consumed
        my $new_index = _process_escape(\$marc8, $index, $length);
        if ($new_index > $index)
        {
            $index = $new_index;
            next CHAR_LOOP;
        }

        CHARSET_LOOP: foreach my $charset ($G0, $G1)
        {

            # cjk characters are a string of three chars
            my $char_size = $charset eq CJK ? 3 : 1;

            # extract the next code point to examine
            my $chunk = substr($marc8, $index, $char_size);

            my $code;
            if ($char_size == 1) {
                my $codepoint = ord($chunk);
                if ($codepoint >= 0x21 && $codepoint <= 0x7e) {
                    # character is G0
                    $code = $table->lookup_by_marc8($G0, $chunk);
                } elsif ($codepoint >= 0xa1 && $codepoint <= 0xfe) {
                    # character is G1, map it to G0 before attempting lookup
                    $code = $table->lookup_by_marc8($G1, chr($codepoint - 128));
                } elsif ($codepoint >= 0x88 && $codepoint <= 0x8e) {
                    # in the C1 range used by MARC8
                    $code = $table->lookup_by_marc8(EXTENDED_LATIN, $chunk);
                } elsif ($codepoint >= 0x1b && $codepoint <= 0x1f) {
                    # in the C0 range used by MARC8
                    $code = $table->lookup_by_marc8(BASIC_LATIN, $chunk);
                }
            } else {
                # EACC doesn't need G0/G1 conversion
                $code = $table->lookup_by_marc8($charset, $chunk);
            }

            # try the next character set if no mapping was found
            next CHARSET_LOOP if ! $code;

            # gobble up all combining characters for appending later
            # this is necessary because combining characters precede
            # the character they modify in MARC-8, whereas they follow
            # the character they modify in UTF-8.
            if ($code->is_combining())
            {
                # If the current character is the right half of a MARC-8
                # ligature or double tilde, we don't want to include
                # it in the UTF-8 output.  For the explanation, see
                # http://lcweb2.loc.gov/diglib/codetables/45.html#Note1
                # Note that if the MARC-8 string includes a right half
                # without the corresponding left half, the right half will
                # get dropped instead of being mapped to its UCS alternate.
                # That's OK since including only one half of a double diacritic
                # was presumably a mistake to begin with.
                unless (defined $code->marc_left_half())
                {
                    $combining .= $code->char_value();
                }
            }
            else
            {
                $utf8 .= $code->char_value() . $combining;
                $combining = '';
            }

            $index += $char_size;
            next CHAR_LOOP;
        }

        # Falling out of CHARSET_LOOP means neither G0 nor G1 produced a
        # mapping (a hit always leaves via "next CHAR_LOOP" above).
        #
        # The input and the charset names are passed as sprintf ARGUMENTS,
        # never spliced into the format: a '%' inside $marc8 would otherwise
        # be parsed as a conversion and garble the message.
        warn(sprintf("no mapping found for [0x%X] at position %d in %s g0=%s g1=%s",
            unpack('C', substr($marc8, $index, 1)),
            $index,
            $marc8,
            MARC::Charset::Constants::charset_name($G0),
            MARC::Charset::Constants::charset_name($G1)));

        if (!$ignore_errors)
        {
            reset_charsets();
            return;
        }
        if ($_assume)
        {
            # give up on MARC-8 and treat the whole input as $_assume
            reset_charsets();
            return NFC(decode($_assume => $marc8));
        }

        # ignore the unmappable character and carry on
        $index += 1;
    }

    # return the utf8
    reset_charsets();
    utf8::upgrade($utf8);
    return $utf8;
}
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=head2 utf8_to_marc8() |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
Will attempt to translate utf8 into marc8. |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
my $marc8 = utf8_to_marc8($utf8); |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
If you'd like to ignore errors, or characters that can't be |
341
|
|
|
|
|
|
|
converted to marc8 then pass in a true value as the second |
342
|
|
|
|
|
|
|
parameter: |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
my $marc8 = utf8_to_marc8($utf8, 'ignore-errors'); |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
or |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
MARC::Charset->ignore_errors(1); |
349
|
|
|
|
|
|
|
my $marc8 = utf8_to_marc8($utf8); |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=cut |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# Convert a Perl character string into MARC-8.
#
# Arguments:
#   $utf8          - the string to convert
#   $ignore_errors - optional; when undefined, the package-wide
#                    $_ignore_errors setting is used instead
#
# Returns the MARC-8 string, including the escape sequences needed to switch
# character sets and, at the end, to switch back to the default G0/G1 sets.
# A character with no entry in the mapping table triggers a warning; without
# $ignore_errors the sub then returns nothing, with $ignore_errors the
# character is left out of the output.
#
# The ignore-errors path used to fall through and call is_combining() on an
# undefined lookup result, which died instead of ignoring the error.
sub utf8_to_marc8
{
    my ($utf8, $ignore_errors) = @_;
    reset_charsets();

    $ignore_errors = $_ignore_errors if (!defined($ignore_errors));

    # decompose combined characters, except those with an explicit
    # replacement in %SPECIAL_DECOMPOSABLE
    $utf8 = join('',
        map { exists $SPECIAL_DECOMPOSABLE{$_} ? $SPECIAL_DECOMPOSABLE{$_} : NFD($_) }
        split //, $utf8
    );

    my $len = length($utf8);
    my $marc8 = '';
    for (my $i=0; $i<$len; $i++)
    {
        my $slice = substr($utf8, $i, 1);

        # spaces are copied from utf8 into marc8
        if ($slice eq ' ')
        {
            $marc8 .= ' ';
            next;
        }

        # try to find the code point in our mapping table
        my $code = $table->lookup_by_utf8($slice);

        if (! $code)
        {
            warn("no mapping found at position $i in $utf8");
            if (! $ignore_errors)
            {
                # do not rely on reset_charsets() returning a true value
                reset_charsets();
                return;
            }

            # ignoring errors: there is nothing to emit for this character
            next;
        }

        # if it's a combining character move it around: MARC-8 wants the
        # combining mark BEFORE the character it modifies
        if ($code->is_combining())
        {
            my $prev = chop($marc8);
            if ($code->marc_left_half())
            {
                # don't add the MARC-8 right half character
                # if it was already inserted when the double
                # diacritic was converted from UTF-8
                if ($code->marc_value() eq substr($marc8, -1, 1))
                {
                    $marc8 .= $prev;
                    next;
                }
            }
            $marc8 .= $code->marc_value() . $prev;
            if ($code->marc_right_half())
            {
                $marc8 .= chr(hex($code->marc_right_half()));
            }
            next;
        }

        # look to see if we need to escape to a new G0 charset
        my $charset_value = $code->charset_value();

        if ($code->default_charset_group() eq 'G0'
            and $G0 ne $charset_value)
        {
            if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
            {
                # don't bother escaping, they're functionally the same
            }
            else
            {
                $marc8 .= $code->get_escape();
                $G0 = $charset_value;
            }
        }

        # look to see if we need to escape to a new G1 charset
        elsif ($code->default_charset_group() eq 'G1'
            and $G1 ne $charset_value)
        {
            $marc8 .= $code->get_escape();
            $G1 = $charset_value;
        }

        $marc8 .= $code->marc_value();
    }

    # escape back to default G0 if necessary
    if ($G0 ne $DEFAULT_G0)
    {
        if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
        elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
        else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
    }

    # escape back to default G1 if necessary
    if ($G1 ne $DEFAULT_G1)
    {
        if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
        else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
    }

    return $marc8;
}
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=head1 DEFAULT CHARACTER SETS |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
If you need to alter the default character sets you can set the |
462
|
|
|
|
|
|
|
$MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the |
463
|
|
|
|
|
|
|
appropriate character set code: |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
use MARC::Charset::Constants qw(:all); |
466
|
|
|
|
|
|
|
$MARC::Charset::DEFAULT_G0 = BASIC_ARABIC; |
467
|
|
|
|
|
|
|
$MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC; |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
=head1 SEE ALSO |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=over 4 |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=item * L<MARC::Charset::Constants> |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=item * L<MARC::Charset::Table> |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=item * L<MARC::Charset::Code> |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=item * L<MARC::Charset::Compiler> |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=item * L<MARC::Record> |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=item * L<MARC::XML> |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=back |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=head1 AUTHOR |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
Ed Summers (ehs@pobox.com) |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
=cut |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
# Recognise a MARC-8 character-set escape sequence at a given position and
# switch the working G0/G1 character set accordingly.
#
# Arguments:
#   $str_ref - reference to the MARC-8 string being converted
#   $left    - index of the character to examine
#   $right   - index one past the last usable character (the caller passes
#              the string length)
#
# Returns the index just past the escape sequence when one was consumed, or
# $left unchanged when there is no (complete, recognised) escape sequence at
# that position.  Side effect: assigns the file-level $G0 or $G1.
#
## this stuff is kind of scary ... for an explanation of what is
## going on here check out the MARC-8 specs at LC.
## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
sub _process_escape
{
    my ($str_ref, $left, $right) = @_;

    # first char needs to be an escape or else this isn't an escape sequence
    return $left unless substr($$str_ref, $left, 1) eq ESCAPE;

    ## if we don't have at least one character after the escape
    ## then this can't be a character escape sequence
    return $left if ($left+1 >= $right);

    ## pull off the first escape
    my $esc_char_1 = substr($$str_ref, $left+1, 1);

    ## the first method of escaping to small character sets
    if ( $esc_char_1 eq GREEK_SYMBOLS
        or $esc_char_1 eq SUBSCRIPTS
        or $esc_char_1 eq SUPERSCRIPTS
        or $esc_char_1 eq ASCII_DEFAULT)
    {
        $G0 = $esc_char_1;
        return $left+2;
    }

    ## the second more complicated method of escaping to bigger charsets
    return $left if $left+2 >= $right;

    my $esc_char_2 = substr($$str_ref, $left+2, 1);
    my $esc_chars = $esc_char_1 . $esc_char_2;

    if ($esc_char_1 eq SINGLE_G0_A
        or $esc_char_1 eq SINGLE_G0_B)
    {
        $G0 = $esc_char_2;
        return $left+3;
    }

    elsif ($esc_char_1 eq SINGLE_G1_A
        or $esc_char_1 eq SINGLE_G1_B)
    {
        $G1 = $esc_char_2;
        return $left+3;
    }

    # The two-character multibyte introducers are tested BEFORE the
    # one-character MULTI_G0_A.  NOTE(review): the constant values live in
    # MARC::Charset::Constants and are not visible here; presumably the
    # two-character forms begin with the MULTI_G0_A character (as in the
    # MARC-8 specification), in which case testing MULTI_G0_A first made
    # these branches unreachable.  If they do not share a prefix the order
    # is irrelevant and this behaves exactly as before.
    elsif ($esc_chars eq MULTI_G0_B
        and ($left+3 < $right))
    {
        $G0 = substr($$str_ref, $left+3, 1);
        return $left+4;
    }

    elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
        and ($left + 3 < $right))
    {
        $G1 = substr($$str_ref, $left+3, 1);
        return $left+4;
    }

    # plain multibyte G0 designation; also what a two-character introducer
    # with no character after it falls back to, as it did before
    elsif ( $esc_char_1 eq MULTI_G0_A ) {
        $G0 = $esc_char_2;
        return $left+3;
    }

    # we should never get here
    warn("seem to have fallen through in _process_escape()");
    return $left;
}
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# Put the working G0/G1 character sets back to the package defaults
# ($DEFAULT_G0 / $DEFAULT_G1).  Returns the new value of $G1, which is what
# the sub has always (implicitly) returned.
sub reset_charsets
{
    ($G0, $G1) = ($DEFAULT_G0, $DEFAULT_G1);
    return $G1;
}
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
1; |