File Coverage

blib/lib/MARC/Charset.pm

Criterion	Covered	Total	%
statement	123	152	80.9
branch	57	90	63.3
condition	30	50	60.0
subroutine	12	15	80.0
pod	5	6	83.3
total	227	313	72.5

line	stmt	bran	cond	sub	pod	time	code
1							package MARC::Charset;
2
3	16			16		302329	use strict;
	16					36
	16					786
4	16			16		169	use warnings;
	16					31
	16					865
5
6							our $VERSION = '1.35';
7
8	16			16		90	use base qw(Exporter);
	16					34
	16					2295
9							our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
10
11	16			16		28563	use Unicode::Normalize;
	16					131206
	16					1575
12	16			16		35298	use Encode 'decode';
	16					302132
	16					5385
13	16			16		22577	use charnames ':full';
	16					900130
	16					123
14	16			16		24903	use MARC::Charset::Table;
	16					74
	16					720
15	16			16		157	use MARC::Charset::Constants qw(:all);
	16					38
	16					79274
16
17							=head1 NAME
18
19							MARC::Charset - convert MARC-8 encoded strings to UTF-8
20
21							=head1 SYNOPSIS
22
23							# import the marc8_to_utf8 function
24							use MARC::Charset 'marc8_to_utf8';
25
26							# prepare STDOUT for utf8
27							binmode(STDOUT, 'utf8');
28
29							# print out some marc8 as utf8
30							print marc8_to_utf8($marc8_string);
31
32							=head1 DESCRIPTION
33
34							MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8
35							strings. MARC-8 is a single byte character encoding that predates unicode, and
36							allows you to put non-Roman scripts in MARC bibliographic records.
37
38							http://www.loc.gov/marc/specifications/spechome.html
39
40							=head1 EXPORTS
41
42							=cut
43
44							# get the mapping table
45							our $table = MARC::Charset::Table->new();
46
47							# set default character sets
48							# these are viewable at the package level
49							# in case someone wants to set them
50							our $DEFAULT_G0 = ASCII_DEFAULT;
51							our $DEFAULT_G1 = EXTENDED_LATIN;
52
53							our %SPECIAL_DECOMPOSABLE = (
54							chr(0x01a0) => chr(0x01a0), # uppercase o-hook
55							chr(0x01af) => chr(0x01af), # uppercase u-hook
56							chr(0x01a1) => chr(0x01a1), # lowercase o-hook
57							chr(0x01b0) => chr(0x01b0), # lowercase u-hook
58							chr(0x1ef1) => chr(0x01b0) . chr(0x0323), # lowercase u-hook with dot below
59							chr(0x1ee9) => chr(0x01b0) . chr(0x0301), # lowercase u-hook with acute
60							# Arabic to not decompose
61							chr(0x0622) => chr(0x0622),
62							chr(0x0623) => chr(0x0623),
63							chr(0x0624) => chr(0x0624),
64							chr(0x0625) => chr(0x0625),
65							chr(0x0626) => chr(0x0626),
66							chr(0x0649) => chr(0x0649),
67							chr(0x0671) => chr(0x0671),
68							chr(0x06c0) => chr(0x06c0),
69							chr(0x06D3) => chr(0x06D3),
70							# Cyrillic to not decompose
71							chr(0x0439) => chr(0x0439),
72							chr(0x0419) => chr(0x0419),
73							chr(0x0453) => chr(0x0453),
74							chr(0x0451) => chr(0x0451),
75							chr(0x0457) => chr(0x0457),
76							chr(0x045C) => chr(0x045C),
77							chr(0x045E) => chr(0x045E),
78							chr(0x0403) => chr(0x0403),
79							chr(0x0401) => chr(0x0401),
80							chr(0x0407) => chr(0x0407),
81							chr(0x040C) => chr(0x040C),
82							chr(0x040E) => chr(0x040E),
83							# Katakana to not decompose
84							chr(0x309B) => chr(0x309B),
85							chr(0x309C) => chr(0x309C),
86							chr(0x30AC) => chr(0x30AC),
87							chr(0x30AE) => chr(0x30AE),
88							chr(0x30B0) => chr(0x30B0),
89							chr(0x30B2) => chr(0x30B2),
90							chr(0x30B4) => chr(0x30B4),
91							chr(0x30B6) => chr(0x30B6),
92							chr(0x30B8) => chr(0x30B8),
93							chr(0x30BA) => chr(0x30BA),
94							chr(0x30BC) => chr(0x30BC),
95							chr(0x30BE) => chr(0x30BE),
96							chr(0x30C0) => chr(0x30C0),
97							chr(0x30C2) => chr(0x30C2),
98							chr(0x30C5) => chr(0x30C5),
99							chr(0x30C7) => chr(0x30C7),
100							chr(0x30C9) => chr(0x30C9),
101							chr(0x30D0) => chr(0x30D0),
102							chr(0x30D1) => chr(0x30D1),
103							chr(0x30D3) => chr(0x30D3),
104							chr(0x30D4) => chr(0x30D4),
105							chr(0x30D6) => chr(0x30D6),
106							chr(0x30D7) => chr(0x30D7),
107							chr(0x30D9) => chr(0x30D9),
108							chr(0x30DA) => chr(0x30DA),
109							chr(0x30DC) => chr(0x30DC),
110							chr(0x30DD) => chr(0x30DD),
111							chr(0x30F4) => chr(0x30F4),
112							chr(0x30F7) => chr(0x30F7),
113							chr(0x30F8) => chr(0x30F8),
114							chr(0x30F9) => chr(0x30F9),
115							chr(0x30FA) => chr(0x30FA),
116							chr(0x30FE) => chr(0x30FE),
117							chr(0x30FF) => chr(0x30FF),
118							);
119
120							=head2 ignore_errors()
121
122							Tells MARC::Charset whether or not to ignore all encoding errors, and
123							returns the current setting. This is helpful if you have records that
124							contain both MARC8 and UNICODE characters.
125
126							my $ignore = MARC::Charset->ignore_errors();
127
128							MARC::Charset->ignore_errors(1); # ignore errors
129							MARC::Charset->ignore_errors(0); # DO NOT ignore errors
130
131							=cut
132
133
134							our $_ignore_errors = 0;
135							sub ignore_errors {
136	0			0	1	0	my ($self,$i) = @_;
137	0	0				0	$_ignore_errors = $i if (defined($i));
138	0					0	return $_ignore_errors;
139							}
140
141
142							=head2 assume_unicode()
143
144							Tells MARC::Charset whether or not to assume UNICODE when an error is
145							encountered in ignore_errors mode and returns the current setting.
146							This is helpful if you have records that contain both MARC8 and UNICODE
147							characters.
148
149							my $setting = MARC::Charset->assume_unicode();
150
151							MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8)
152							MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode
153
154							=cut
155
156
157							our $_assume = '';
158							sub assume_unicode {
159	0			0	1	0	my ($self,$i) = @_;
160	0	0	0			0	$_assume = 'utf8' if (defined($i) and $i);
161	0	0				0	return 1 if ($_assume eq 'utf8');
162							}
163
164
165							=head2 assume_encoding()
166
167							Tells MARC::Charset whether or not to assume a specific encoding when an error
168							is encountered in ignore_errors mode and returns the current setting. This
169							is helpful if you have records that contain both MARC8 and other characters.
170
171							my $setting = MARC::Charset->assume_encoding();
172
173							MARC::Charset->assume_encoding('cp850'); # assume characters are cp850
174							MARC::Charset->assume_encoding(''); # DO NOT assume any encoding
175
176							=cut
177
178
179							sub assume_encoding {
180	0			0	1	0	my ($self,$i) = @_;
181	0	0				0	$_assume = $i if (defined($i));
182	0					0	return $_assume;
183							}
184
185
186							# place holders for working graphical character sets
187							my $G0;
188							my $G1;
189
190							=head2 marc8_to_utf8()
191
192							Converts a MARC-8 encoded string to UTF-8.
193
194							my $utf8 = marc8_to_utf8($marc8);
195
196							If you'd like to ignore errors pass in a true value as the 2nd
197							parameter or call MARC::Charset->ignore_errors() with a true
198							value:
199
200							my $utf8 = marc8_to_utf8($marc8, 'ignore-errors');
201
202							or
203
204							MARC::Charset->ignore_errors(1);
205							my $utf8 = marc8_to_utf8($marc8);
206
207							=cut
208
209
210							sub marc8_to_utf8
211							{
212	27			27	1	2238	my ($marc8, $ignore_errors) = @_;
213	27					182	reset_charsets();
214
215	27	50				91	$ignore_errors = $_ignore_errors if (!defined($ignore_errors));
216
217							# holder for our utf8
218	27					54	my $utf8 = '';
219
220	27					40	my $index = 0;
221	27					47	my $length = length($marc8);
222	27					41	my $combining = '';
223	27					96	CHAR_LOOP: while ($index < $length)
224							{
225							# whitespace, line feeds and carriage returns just get added on unmolested
226	480	100				2506	if (substr($marc8, $index, 1) =~ m/(\s+\|\x0A+\|\x0D+)/so)
227							{
228	58					144	$utf8 .= $1;
229	58					76	$index += 1;
230	58					143	next CHAR_LOOP;
231							}
232
233							# look for any escape sequences
234	422					868	my $new_index = _process_escape(\$marc8, $index, $length);
235	422	100				977	if ($new_index > $index)
236							{
237	56					61	$index = $new_index;
238	56					132	next CHAR_LOOP;
239							}
240
241	366					351	my $found;
242	366					523	CHARSET_LOOP: foreach my $charset ($G0, $G1)
243							{
244
245							# cjk characters are a string of three chars
246	366	100				663	my $char_size = $charset eq CJK ? 3 : 1;
247
248							# extract the next code point to examine
249	366					528	my $chunk = substr($marc8, $index, $char_size);
250
251	366					338	my $code;
252	366	100				600	if ($char_size == 1) {
253	358					398	my $codepoint = ord($chunk);
254	358	100	100			1789	if ($codepoint >= 0x21 && $codepoint <= 0x7e) {
		100	66
		100	66
		50	33
255							# character is G0
256	338					1085	$code = $table->lookup_by_marc8($G0, $chunk);
257							} elsif ($codepoint >= 0xa1 && $codepoint <= 0xfe) {
258							# character is G1, map it to G0 before atttempting lookup
259	17					88	$code = $table->lookup_by_marc8($G1, chr($codepoint - 128));
260							} elsif ($codepoint >= 0x88 && $codepoint <= 0x8e) {
261							# in the C1 range used by MARC8
262	1					5	$code = $table->lookup_by_marc8(EXTENDED_LATIN, $chunk);
263							} elsif ($codepoint >= 0x1b && $codepoint <= 0x1f) {
264							# in the C0 range used by MARC8
265	2					9	$code = $table->lookup_by_marc8(BASIC_LATIN, $chunk);
266							}
267							} else {
268							# EACC doesn't need G0/G1 conversion
269	8					28	$code = $table->lookup_by_marc8($charset, $chunk);
270							}
271
272							# try the next character set if no mapping was found
273	366	50				7769	next CHARSET_LOOP if ! $code;
274	366					642	$found = 1;
275
276							# gobble up all combining characters for appending later
277							# this is necessary because combinging characters precede
278							# the character they modify in MARC-8, whereas they follow
279							# the character they modify in UTF-8.
280	366	100				999	if ($code->is_combining())
281							{
282							# If the current character is the right half of a MARC-8
283							# ligature or double tilde, we don't want to include
284							# it in the UTF-8 output. For the explanation, see
285							# http://lcweb2.loc.gov/diglib/codetables/45.html#Note1
286							# Note that if the MARC-8 string includes a right half
287							# without the corresponding left half, the right half will
288							# get dropped instead of being mapped to its UCS alternate.
289							# That's OK since including only one half of a double diacritic
290							# was presumably a mistake to begin with.
291	11	100				130	unless (defined $code->marc_left_half())
292							{
293	6					69	$combining .= $code->char_value();
294							}
295							}
296							else
297							{
298	355					3847	$utf8 .= $code->char_value() . $combining;
299	355					26244	$combining = '';
300							}
301
302	366					574	$index += $char_size;
303	366					1712	next CHAR_LOOP;
304							}
305
306	0	0				0	if (!$found)
307							{
308	0					0	warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
309							"g0=".MARC::Charset::Constants::charset_name($G0) . " " .
310							"g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
311	0	0				0	if (!$ignore_errors)
312							{
313	0					0	reset_charsets();
314	0					0	return;
315							}
316	0	0				0	if ($_assume)
317							{
318	0					0	reset_charsets();
319	0					0	return NFC(decode($_assume => $marc8));
320							}
321	0					0	$index += 1;
322							}
323
324							}
325
326							# return the utf8
327	27					73	reset_charsets();
328	27					77	utf8::upgrade($utf8);
329	27					788	return $utf8;
330							}
331
332
333
334							=head2 utf8_to_marc8()
335
336							Will attempt to translate utf8 into marc8.
337
338							my $marc8 = utf8_to_marc8($utf8);
339
340							If you'd like to ignore errors, or characters that can't be
341							converted to marc8 then pass in a true value as the second
342							parameter:
343
344							my $marc8 = utf8_to_marc8($utf8, 'ignore-errors');
345
346							or
347
348							MARC::Charset->ignore_errors(1);
349							my $utf8 = marc8_to_utf8($marc8);
350
351							=cut
352
353							sub utf8_to_marc8
354							{
355	21			21	1	3754	my ($utf8, $ignore_errors) = @_;
356	21					120	reset_charsets();
357
358	21	50				76	$ignore_errors = $_ignore_errors if (!defined($ignore_errors));
359
360							# decompose combined characters
361	117	100				1731	$utf8 = join('',
362	21					127	map { exists $SPECIAL_DECOMPOSABLE{$_} ? $SPECIAL_DECOMPOSABLE{$_} : NFD($_) }
363							split //, $utf8
364							);
365
366	21					136	my $len = length($utf8);
367	21					42	my $marc8 = '';
368	21					95	for (my $i=0; $i<$len; $i++)
369							{
370	120					326	my $slice = substr($utf8, $i, 1);
371
372							# spaces are copied from utf8 into marc8
373	120	100				296	if ($slice eq ' ')
374							{
375	10					26	$marc8 .= ' ';
376	10					29	next;
377							}
378
379							# try to find the code point in our mapping table
380	110					441	my $code = $table->lookup_by_utf8($slice);
381
382	110	50				4937	if (! $code)
383							{
384	0					0	warn("no mapping found at position $i in $utf8");
385	0	0	0			0	reset_charsets() and return unless $ignore_errors;
386							}
387
388							# if it's a combining character move it around
389	110	100				631	if ($code->is_combining())
390							{
391	17					340	my $prev = chop($marc8);
392	17	100				76	if ($code->marc_left_half())
393							{
394							# don't add the MARC-8 right half character
395							# if it was already inserted when the double
396							# diacritic was converted from UTF-8
397	2	50				24	if ($code->marc_value() eq substr($marc8, -1, 1))
398							{
399	2					5	$marc8 .= $prev;
400	2					12	next;
401							}
402							}
403	15					279	$marc8 .= $code->marc_value() . $prev;
404	15	100				71	if ($code->marc_right_half())
405							{
406	4					47	$marc8 .= chr(hex($code->marc_right_half()));
407							}
408	15					438	next;
409							}
410
411							# look to see if we need to escape to a new G0 charset
412	93					1481	my $charset_value = $code->charset_value();
413
414	93	100	100			1636	if ($code->default_charset_group() eq 'G0'
		100	100
415							and $G0 ne $charset_value)
416							{
417	87	100	100			584	if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
418							{
419							# don't bother escaping, they're functionally the same
420							}
421							else
422							{
423	11					44	$marc8 .= $code->get_escape();
424	11					29	$G0 = $charset_value;
425							}
426							}
427
428							# look to see if we need to escape to a new G1 charset
429							elsif ($code->default_charset_group() eq 'G1'
430							and $G1 ne $charset_value)
431							{
432	2					11	$marc8 .= $code->get_escape();
433	2					4	$G1 = $charset_value;
434							}
435
436	93					352	$marc8 .= $code->marc_value();
437							}
438
439							# escape back to default G0 if necessary
440	21	100				74	if ($G0 ne $DEFAULT_G0)
441							{
442	10	50				22	if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
	10	0				21
443	0					0	elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
444	0					0	else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
445							}
446
447							# escape back to default G1 if necessary
448	21	100				63	if ($G1 ne $DEFAULT_G1)
449							{
450	2	50				8	if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
	0					0
451	2					5	else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
452							}
453
454	21					161	return $marc8;
455							}
456
457
458
459							=head1 DEFAULT CHARACTER SETS
460
461							If you need to alter the default character sets you can set the
462							$MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the
463							appropriate character set code:
464
465							use MARC::Charset::Constants qw(:all);
466							$MARC::Charset::DEFAULT_G0 = BASIC_ARABIC;
467							$MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC;
468
469							=head1 SEE ALSO
470
471							=over 4
472
473							=item * L<MARC::Charset::Constant>
474
475							=item * L<MARC::Charset::Table>
476
477							=item * L<MARC::Charset::Code>
478
479							=item * L<MARC::Charset::Compiler>
480
481							=item * L<MARC::Record>
482
483							=item * L<MARC::XML>
484
485							=back
486
487							=head1 AUTHOR
488
489							Ed Summers (ehs@pobox.com)
490
491							=cut
492
493
494							sub _process_escape
495							{
496							## this stuff is kind of scary ... for an explanation of what is
497							## going on here check out the MARC-8 specs at LC.
498							## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
499	422			422		625	my ($str_ref, $left, $right) = @_;
500
501							# first char needs to be an escape or else this isn't an escape sequence
502	422	100				1350	return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
503
504							## if we don't have at least one character after the escape
505							## then this can't be a character escape sequence
506	56	50				124	return $left if ($left+1 >= $right);
507
508							## pull off the first escape
509	56					96	my $esc_char_1 = substr($$str_ref, $left+1, 1);
510
511							## the first method of escaping to small character sets
512	56	100	100			458	if ( $esc_char_1 eq GREEK_SYMBOLS
			100
			100
513							or $esc_char_1 eq SUBSCRIPTS
514							or $esc_char_1 eq SUPERSCRIPTS
515							or $esc_char_1 eq ASCII_DEFAULT)
516							{
517	14					16	$G0 = $esc_char_1;
518	14					28	return $left+2;
519							}
520
521							## the second more complicated method of escaping to bigger charsets
522	42	50				89	return $left if $left+2 >= $right;
523
524	42					80	my $esc_char_2 = substr($$str_ref, $left+2, 1);
525	42					61	my $esc_chars = $esc_char_1 . $esc_char_2;
526
527	42	100	66			169	if ($esc_char_1 eq SINGLE_G0_A
		100	66
		50	0
		0	0
		0	0
528							or $esc_char_1 eq SINGLE_G0_B)
529							{
530	33					40	$G0 = $esc_char_2;
531	33					202	return $left+3;
532							}
533
534							elsif ($esc_char_1 eq SINGLE_G1_A
535							or $esc_char_1 eq SINGLE_G1_B)
536							{
537	7					12	$G1 = $esc_char_2;
538	7					14	return $left+3;
539							}
540
541							elsif ( $esc_char_1 eq MULTI_G0_A ) {
542	2					4	$G0 = $esc_char_2;
543	2					6	return $left+3;
544							}
545
546							elsif ($esc_chars eq MULTI_G0_B
547							and ($left+3 < $right))
548							{
549	0					0	$G0 = substr($$str_ref, $left+3, 1);
550	0					0	return $left+4;
551							}
552
553							elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
554							and ($left + 3 < $right))
555							{
556	0					0	$G1 = substr($$str_ref, $left+3, 1);
557	0					0	return $left+4;
558							}
559
560							# we should never get here
561	0					0	warn("seem to have fallen through in _process_escape()");
562	0					0	return $left;
563							}
564
565							sub reset_charsets
566							{
567	75			75	0	154	$G0 = $DEFAULT_G0;
568	75					133	$G1 = $DEFAULT_G1;
569							}
570
571							1;