File Coverage

blib/lib/Cz/Cstocs.pm

Criterion	Covered	Total	%
statement	222	261	85.0
branch	100	150	66.6
condition	25	33	75.7
subroutine	15	17	88.2
pod	0	11	0.0
total	362	472	76.6

line	stmt	bran	cond	sub	pod	time	code
1
2							=head1 NAME
3
4							Cz::Cstocs - conversions of charset encodings for the Czech language
5
6							=cut
7
8							package Cz::Cstocs;
9
10	1			1		1772	use strict;
	1					2
	1					40
11	1			1		6	use Exporter;
	1					2
	1					50
12
13	1			1		6	use vars qw( $VERSION $DEBUG $cstocsdir @ISA @EXPORT_OK %EXPORT $errstr);
	1					1
	1					10621
14
15							@ISA = qw(Exporter);
16							@EXPORT_OK = ( '_stupidity_workaround' );
17							%EXPORT = ( '_stupidity_workaround' => 1 );
18
19	0			0		0	sub _stupidity_workaround {
20							}
21
22							sub import {
23	2			2		10688	my $class = shift;
24	2					5	my @data = @_;
25	2	100				9	if (@data) {
26	1					5	my @avail = Cz::Cstocs->available_enc();
27	1					3	my $fn;
28	1					3	for $fn (@data) {
29	1					7	local $^W = 0;
30	1	50				2	next if grep { $_ eq $fn } @EXPORT_OK;
	1					6
31	1					11	my ($in, $out) = $fn =~ /^_?(.?)_(?:to_)?(.)$/;
32	1	50				5	next unless defined $out;
33	1					6	my $fnref = new Cz::Cstocs $in, $out;
34	1	50				4	die "Definition of $fn failed: $errstr"
35							unless defined $fnref;;
36	1			1		51	eval "sub $fn { \$fnref->conv(\@_); }; ";
	1					46
37	1	50				5	if ($@) {
38	0					0	die "Creating conversion function $fn failed: $@";
39							}
40	1					4	push @EXPORT_OK, $fn;
41	1					10	$EXPORT{$fn} = 1;
42							}
43							}
44	2					15357	Cz::Cstocs->export_to_level(1, '_stupidity_workaround', @data);
45							}
46
47							$VERSION = '3.4';
48
49							# Debugging option
50							$DEBUG = 0 unless defined $DEBUG;
51	20			20	0	109	sub DEBUG () { $DEBUG; }
52
53
54							# Where to get the encoding files from
55							# Start with some default
56							my $defaultcstocsdir = '/packages/share/cstocs/lib';
57
58							# Look at the environment variable
59							if (defined $ENV{'CSTOCSDIR'}) {
60							$defaultcstocsdir = $ENV{'CSTOCSDIR'};
61							print STDERR "Using enc-dir $defaultcstocsdir from the CSTOCSDIR env-var\n"
62							if DEBUG;
63							}
64							# Or take the encoding files from the Perl tree
65							elsif (defined $INC{'Cz/Cstocs.pm'}) {
66							$defaultcstocsdir = $INC{'Cz/Cstocs.pm'};
67							$defaultcstocsdir =~ s!\.pm$!/enc!;
68							print STDERR "Using enc-dir $defaultcstocsdir from \@INC\n"
69							if DEBUG;
70							}
71
72							# We have unless hare because you could have overriden $cstocsdir
73							$cstocsdir = $defaultcstocsdir unless defined $cstocsdir;
74
75
76							# Hash that holds the accent file and a tag saying if the accent
77							# file has already been read
78							my %accent = ();
79							my $accent_read = 0;
80
81							# Hash of alias covnersions
82							my %alias = ();
83							my $alias_read = 0;
84
85							# Input and output hashes
86							my %input_hashes = ();
87							my %output_hashes = ();
88
89							# Array of regexp parts
90							my %regexp_matches = ();
91
92							# Table of conversion functions, so that we do not need to create them twice
93							my %functions = ();
94
95							# List of diacritics
96							my @diacritics = qw( abovedot acute breve caron cedilla circumflex
97							diaeresis doubleacute ogonek ring );
98
99
100
101							# ######################################################
102							# Now, the function -- loading encoding and accent files
103
104							# Filling input and output_hashes tables for given encoding
105							sub load_encoding {
106	19			19	0	33	my $enc = lc shift;
107
108	19	100				51	return if defined $input_hashes{$enc}; # has already been loaded
109
110	5	50				13	if ($enc eq 'mime') {
111	0					0	eval 'use MIME::Words ()';
112	0	0				0	if ($@) {
113	0					0	die "Error loading encofing $enc: $@\n";
114							}
115	0					0	return;
116							}
117
118	5					15	my $file = "$cstocsdir/$enc.enc";
119	5	50				361	open FILE, $file or die "Error reading $file: $!\n";
120	5	50				15	print STDERR "Parsing encoding file $file\n" if DEBUG;
121
122	5					15	my ($input, $output) = ({}, {}); # just speedup thing
123	5					7	local $_;
124	5					3170	while () {
125	811	100				2351	next if /^(#\|\s*$)/;
126	804					3100	my ($tag, $desc) = /^\s(\S+)\s+(\S+)\s$/;
127	804	50	33			3138	unless (defined $tag and defined $desc) {
128	0					0	chomp;
129	0					0	warn "Syntax error in $file at line $: `$_'.\n";
130	0					0	next;
131							}
132	804	100				2131	if ($tag =~ /^\d+\|0x\d+$/) {
133	970	100				3690	$tag = pack 'C*', map {
134	779					9474	/^0/ ? oct($_) : $_
135							} split /,/, $tag;
136							}
137	804					1990	$input->{$tag} = $desc;
138	804	100				4733	$output->{$desc} = $tag unless defined $output->{$desc};
139							}
140	5					85	close FILE;
141
142	5					21	$input_hashes{$enc} = $input;
143	5					12	$output_hashes{$enc} = $output;
144
145	5	100				27	if ($enc eq "tex") {
146	1					7	fixup_tex_encoding();
147							}
148							}
149
150							sub fixup_tex_encoding {
151	1			1	0	2	my $tag;
152
153	1	50				4	print STDERR "Doing tex fixup\n" if DEBUG;
154
155	1					5	my $input = $input_hashes{"tex"};
156	1					3	my $output = $output_hashes{"tex"};
157
158							# we need this to fill the defaults
159	1					9	load_encoding('ascii');
160	1					2	my $asciiref = $output_hashes{'ascii'};
161	1					26	for $tag (keys %$asciiref) {
162	95	50				268	$output->{$tag} = $asciiref->{$tag}
163							unless defined $output->{$tag};
164							}
165
166	1					7	my %processed = ();
167
168	1					7	my (@dialetters, @dianonletters, @nondialetters, @nondianonletters);
169	1					9	my (@inputs) = keys %$input;
170	1					4	for $tag (@inputs) {
171	24					39	my $value = $input->{$tag};
172
173	24					33	my $az = 0;
174	24	100				94	$az = 1 if $tag =~ /[a-zA-Z]$/;
175
176	24	100	100			106	if ($az and $output->{$value} eq $tag) {
177	16					25	$output->{$value} = $tag . '{}';
178							}
179	24					56	$input->{$tag . ' '} = $value;
180
181	24	100				33	if (grep { $_ eq $value } @diacritics) {
	240	100				349
182	10					10	my $e;
183	10	100				14	if ($az) {
184	5					7	push @dialetters, $tag;
185	5					12	for $e ('a'..'h', 'k'..'z', 'A'..'Z') {
186	250					675	$output->{$e.$value} = $tag.' '.$e
187							}
188							} else {
189	5					6	push @dianonletters, $tag;
190	5					33	for $e ('a'..'h', 'k'..'z', 'A'..'Z') {
191	250					652	$output->{$e.$value} = $tag.$e
192							}
193	5					15	for $e ('a'..'z', 'A'..'Z') {
194	260					637	$input->{$tag.$e} = $e.$value;
195							}
196							}
197	10					21	for $e ('i', 'j') {
198	20					63	$output->{$e.$value} = $tag.'\\'.$e.'{}'
199							}
200	10					22	for $e ('a'..'z', 'A'..'Z') {
201	520					1179	$input->{$tag.' '.$e} = $e.$value;
202							}
203	10					12	for $e ('i', 'j') {
204	20					45	$input->{$tag.'\\'.$e} = $e.$value;
205	20					66	$input->{$tag.' \\'.$e} = $e.$value;
206							}
207							} elsif ($az) {
208	13					28	push @nondialetters, $tag;
209							} else {
210	1					3	push @nondianonletters, $tag;
211							}
212							}
213
214	1					5	my $regexp = '';
215
216	1	50				5	if (@dialetters) {
217	5					14	$regexp .= join '', '(',
218	1					2	join('\|', map { "\Q$_"; } @dialetters),
219							")([ \\t]+[a-zA-Z]\|[ \\t](\\\\[ij]([ \\t]+(\{\})?\|[ \\t](\$\|\{\}))\|{([a-zA-Z]\|\\\\[ij][ \\t]*(\{\})?)}))";
220							}
221	1	50				10	if (@dianonletters) {
222	1	50				4	$regexp .= '\|' if $regexp ne '';
223	5					14	$regexp .= '(' . join '',
224	1					4	join('\|', map { "\Q$_"; } @dianonletters),
225							")[ \\t]([a-zA-Z]\|\\\\[ij]([ \\t]+(\{\})?\|[ \\t](\$\|\{\}))\|{([a-zA-Z]\|\\\\[ij][ \\t]*(\{\})?)})";
226							}
227	1	50				4	if (@nondialetters) {
228	1	50				12	$regexp .= '\|' if $regexp ne '';
229	13					27	$regexp .= '(' . join '',
230	1					2	join('\|', map { "\Q$_"; } @nondialetters),
231							")([ \\t]+(\{\})?\|[ \\t]*\$)"
232							}
233	1	50				6	if (@nondianonletters) {
234	1	50				3	$regexp .= '\|' if $regexp ne '';
235	1					4	$regexp .= '(' . join '',
236	1					3	join('\|', map { "\Q$_"; } @nondianonletters),
237							")[ \\t]*(\{\})?"
238							}
239
240	1					4	$regexp_matches{'tex'} = $regexp;
241	1					8	1;
242							}
243
244							# Loading accent file
245							sub load_accent {
246	8	100		8	0	34	return if $accent_read;
247	1					3	$accent_read = 1;
248
249	1					4	my $file = "$cstocsdir/accent";
250	1	50				58	open FILE, $file or die "Error reading accent file $file: $!\n";
251	1	50				4	print STDERR "Parsing accent file $file\n" if DEBUG;
252
253	1					2	local $_;
254	1					26	while () {
255	201	50				505	next if /^\s*(#\|$)/;
256	201					844	my ($key, $val) = /^\s(\S+)\s+(.+?)\s$/;
257	201	50	33			734	unless (defined $key and defined $val) {
258	0					0	chomp;
259	0					0	warn "Syntax error in $file at line $: `$_'.\n";
260	0					0	next;
261							}
262	201					774	$accent{$key} = $val;
263							}
264	1					12	close FILE;
265							}
266
267							# Load the alias file, fill the global %alias hash;
268							sub load_alias {
269	22	100		22	0	51	return if $alias_read;
270	1					2	$alias_read = 1;
271	1					3	my $file = "$cstocsdir/alias";
272
273	1	50				51	open FILE, $file or die "Error reading alias file $file: $!\n";
274	1					2	local $_;
275	1					24	while () {
276	17					19	chomp;
277	17					36	my ($alias, $enc) = split;
278	17					63	$alias{$alias} = $enc;
279							}
280	1					11	close FILE;
281							}
282
283							# Normalizes the encoding name -- expands aliases
284							sub normalize_enc_name {
285	22			22	0	47	load_alias();
286	22					46	my $enc = lc shift;
287	22					48	$enc =~ s/[^a-z0-9]//g;
288	22	100				77	( defined $alias{$enc} ? $alias{$enc} : $enc );
289							}
290
291							# Recursively lookup the target
292							sub lookup_accent {
293	229			229	0	346	my ($outenc, $accent, $in) = @_;
294	229					544	my @target = split /\s+/, $in;
295	229					288	my $out = '';
296	229					341	for my $desc (@target) {
297	294	50				541	if (defined $outenc->{$desc}) {
		0
298	294					966	$out .= $outenc->{$desc};
299							} elsif (defined $accent->{$desc}) {
300	0					0	$out .= lookup_accent($outenc, $accent, $accent->{$desc});
301							} else {
302	0					0	die;
303							}
304							}
305	229					944	return $out;
306							}
307
308							# Constructor -- takes two arguments, input and output encodings,
309							# a optionally hash of options. Returns reference to code that will
310							# do the conversion, or undef
311							sub new {
312	11			11	0	253	my $class = shift;
313	11					28	my ($inputenc, $outputenc) = (shift, shift);
314
315	11					51	local $/ = "\n";
316
317							# check input values
318	11	50	33			78	unless (defined $inputenc and defined $outputenc) {
319	0					0	print STDERR "Both input and output encodings must be specified in call to ", __PACKAGE__, "::new\n";
320	0					0	return;
321							}
322
323							# Default options
324	11					17	my $fillstring = ' ';
325	11					17	my $use_fillstring = 1;
326	11					12	my $use_accent = 1;
327	11					13	my $one_by_one = 0;
328
329							# this is exception for TeX
330	11	100				32	$use_fillstring = 0 if $inputenc eq "tex";
331
332	11					28	my %opts = @_;
333	11					13	my ($tag, $value);
334	11					43	while (($tag, $value) = each %opts) {
335	3	50				9	print STDERR "Option: $tag = '$value'\n" if DEBUG;
336	3	100				16	$tag eq 'fillstring' and $fillstring = $value;
337	3	50				15	$tag eq 'use_accent' and
		100
338							$use_accent = (defined $value ? $value : 0);
339	3	0				8	$tag eq 'nofillstring' and
		0
		50
340							$use_fillstring = (defined $value ?
341							( $value ? 0 : 1) : 0);
342	3	50				8	$tag eq 'cstocsdir' and $cstocsdir = $value;
343	3	100				13	$tag eq 'one_by_one' and $one_by_one = $value;
344							}
345
346	11					31	$inputenc = normalize_enc_name($inputenc);
347	11					24	$outputenc = normalize_enc_name($outputenc);
348
349							# encode settings into the function name
350	11	100				54	if (defined $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"}) {
351	2					14	return $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"};
352							}
353
354	9					16	eval {
355	9					32	load_encoding($inputenc);
356	9					26	load_encoding($outputenc);
357	9	100				41	load_accent() if $use_accent;
358							};
359	9	50				27	if ($@) {
360	0					0	$errstr = $@;
361	0					0	return;
362							}
363
364	9					23	my $conv = {};
365
366	9					18	my ($is_one_by_one, $has_space) = (1, 0);
367
368	9	50				22	if ($outputenc ne 'mime') {
369	9					15	my $key;
370	9					10	for $key (keys %{$input_hashes{$inputenc}}) {
	9					702
371	3111					5407	my $desc = $input_hashes{$inputenc}{$key};
372	3111					4986	my $output = $output_hashes{$outputenc}{$desc};
373
374	3111	100	100			8996	if (not defined $output and $use_accent) {
375							# Doesn't have friend in output encoding
376
377
378	928					1239	$output = eval {
379	928	100				3056	lookup_accent($output_hashes{$outputenc},
380							\%accent, $accent{$desc}) if defined $accent{$desc};
381							};
382	928	50				1683	if ($@) {
383	0					0	$errstr = "Error processing translitaration for $inputenc -> $outputenc for character $desc.\n";
384	0					0	return;
385							}
386
387	928	100	66			2019	$output = undef if $one_by_one and defined $output
			100
388							and length $key < length $output;
389							}
390	3111	100	100			7455	if (not defined $output and $use_fillstring) {
391	11					16	$output = $fillstring;
392							}
393
394	3111	100	100			12203	next if (not defined $output
			66
395							or ($inputenc ne 'utf8' and $key eq $output));
396	1673	100	100			4491	if (length $key != 1 or length $output != 1)
397	1419					1717	{ $is_one_by_one = 0; }
398	1673					4454	$conv->{$key} = $output;
399							}
400							}
401
402	9					721	my $fntext = ' sub { my @converted = map { my $e = $_; if (defined $e) {';
403
404	9	50				90	if ($inputenc eq 'mime') {
		50
		100
		100
		100
		100
405	0					0	$fntext .= qq!
406							\$e =~ s/=\\s*=/==/g;
407							\$e = join '', map {
408							my \$conv;
409							if (defined \$_->[1]) {
410							(defined(\$conv = new Cz::Cstocs \$_->[1], '$outputenc', %{ \\%opts }))
411							? \$conv->conv(\$_->[0])
412							: ()
413							} else {
414							\$_->[0]
415							}
416							} MIME::Words::decode_mimewords(\$e);
417							!;
418							} elsif ($outputenc eq 'mime') {
419	0					0	my %MIME_NAMES = (
420							il1 => 'ISO-8859-1',
421							il2 => 'ISO-8859-2',
422							utf8 => 'UTF-8',
423							1250 => 'Windows-1250',
424							1252 => 'Windows-1252',
425							);
426	0					0	my $charset = $MIME_NAMES{$inputenc};
427	0	0				0	if (not defined $charset) {
428	0					0	die "Couldn't find MIME name for encoding $inputenc\n";
429							}
430	0					0	$fntext .= qq!
431							\$e = MIME::Words::encode_mimewords(\$e, Charset => '$charset');
432							\$e =~ s/\\?=( +)=\\?.*?\\?Q\\?/'_' x length \$1/egi;
433							!;
434							} elsif (not keys %$conv) {
435							# do nothing;
436							} elsif ($is_one_by_one) {
437	1					12	my $src = join "", keys %$conv;
438	1					5	$src = "\Q$src";
439	1					8	my $dst = join "", values %$conv;
440	1					3	$dst = "\Q$dst";
441	1					5	$fntext .= qq! \$e =~ tr/$src/$dst/; !;
442							} elsif ($inputenc eq 'tex') {
443	2					6	my $src = $regexp_matches{'tex'};
444	2					13	$fntext .= qq! \$e =~ s/$src/ my \$e = \$&; my \$orig = \$e; \$e =~ s#[{}]# #sog; \$e =~ s#[ \\t]+# #sog; \$e =~ s# \$##o; (defined \$conv->{\$e} ? \$conv->{\$e} : \$orig); /esog; !;
445							} elsif ($inputenc eq 'utf8') {
446	1					3	$fntext .= qq! \$e =~ s/[\\x21-\\x7f]\|[\\xc0-\\xdf].\|[\\xe0-\\xef]..\|[\\xf0-\\xf7]...\|[\\xf8-\\xfb]....\|[\\xfc\\xfd]...../defined \$conv->{\$&} ? \$conv->{\$&} : (
447							$use_fillstring ? \$fillstring : '') /esog; !;
448							} else {
449	4					41	my $singles = join "", grep { length $_ == 1 } keys %$conv;
	328					814
450	4					32	$singles = "[". "\Q$singles" . "]";
451
452	0					0	my $src = join "\|",
453	0					0	( map { my $e = "\Q$_"; $e; }
	0					0
454	328					423	sort { length $b <=> length $a }
455	4					275	grep { length $_ != 1 } keys %$conv);
456	4	50				25	if ($singles ne "[]") {
457	4	50				13	$src .= "\|" unless $src eq '';
458	4					7	$src .= $singles;
459							}
460
461	4					17	$fntext .= qq! \$e =~ s/$src/\$conv->{\$&}/sog; !;
462							}
463
464	9					19	$fntext .= ' $e; } else { undef; }} @_; if (wantarray) { return @converted; } else { return join "", map { defined $_ ? $_ : "" } @converted; } }';
465
466	9	50				31	print STDERR "Conversion function for $inputenc to $outputenc:\n$fntext\n" if DEBUG;
467
468	9					3361	my $fn = eval $fntext;
469	9	50				37	do { chomp $@;
	0					0
470	0					0	die "Fatal error in Cz::Cstocs: please report this to adelton\@fi.muni.cz so\n that we could find out what happened. Thanks.\n$@, line ", __LINE__, "\n";
471							} if $@;
472	9					30	bless $fn, $class;
473
474	9					59	$functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"} = $fn;
475	9					88	$fn;
476							}
477
478							sub conv {
479	12			12	0	496	my $self = shift;
480	12					310	return &$self($_[0]);
481							}
482
483							sub available_enc {
484	1	50		1	0	72	opendir DIR, $cstocsdir or warn "Error reading $cstocsdir\n";
485	1					27	my @list = sort map { s/\.enc$//; $_ } grep { /\.enc$/ } readdir DIR;
	15					35
	15					63
	19					49
486	1					19	closedir DIR;
487	1					8	return @list;
488							}
489
490							sub diacritic_char {
491	0			0	0		my ($encoding, $char) = @_;
492	0						load_encoding($encoding);
493
494	0						my @result = ();
495	0						my $dia;
496	0						for $dia (@diacritics) {
497	0						my $name = $char . $dia;
498	0	0					push @result, $output_hashes{$encoding}{$name}
499							if defined $output_hashes{$encoding}{$name};
500							}
501	0						@result;
502							}
503
504							1;
505
506							=head1 SYNOPSIS
507
508							use Cz::Cstocs;
509							my $il2_to_ascii = new Cz::Cstocs 'il2', 'ascii';
510							while (<>) {
511							print &$il2_to_ascii($_);
512							}
513
514							use Cz::Cstocs 'il2_ascii';
515							while (<>) {
516							print il2_ascii($_);
517							}
518
519							use Cz::Cstocs;
520							sub il2toascii;
521							# inform the parser that there is a function il2toascii
522							*il2toascii = new Cz::Cstocs 'il2', 'ascii';
523							# now define the function
524							print il2toascii $data;
525							# thanks to Jan Krynicky for poining this out
526
527							=head1 DESCRIPTION
528
529							This module helps in converting texts between various charset
530							encodings, used for Czech and Slovak languages. The instance of the
531							object B is created using method B. It takes at
532							least two parameters for input and output encoding and can be
533							afterwards used as a function reference to convert strings/lists.
534							Cz::Cstocs supports fairly free form of aliases, so iso8859-2,
535							ISO-8859-2, iso88592 and il2 are all aliases of the same encoding.
536							For backward compatibility, method I is supported as well,
537							so the example above could also read
538
539							while (<>) {
540							print $il2_to_ascii->conv($_);
541							}
542
543							You can also use typeglob syntax.
544
545							The conversion function takes a list and returns list of converted
546							strings (in the list context) or one string consisting of concatenated
547							results (in the scalar context).
548
549							You can modify the behaviour of the conversion function by specifying
550							hash of other options after the encoding names in call to B.
551
552							=over 4
553
554							=item fillstring
555
556							Gives alternate string that will replace characters from input
557							encoding that are not present in the output encoding. Default is
558							space.
559
560							=item use_accent
561
562							Defines whether the accent file should be used. Default is 1 (true).
563
564							=item nofillstring
565
566							When 1 (true), will keep characters that do not have friends in
567							accent nor output encoding, will no replace them with fillstring.
568							Default is 0 except for tex, because you probably rather want to keep
569							backslashed symbols than loose them.
570
571							=item cstocsdir
572
573							Alternate location for encoding and accent files. The default is the
574							F directory in Perl library tree. This location can
575							also be changed with the I environment variable.
576
577							=back
578
579							There is an alternate way to define the conversion function: any
580							arguments after use Cz::Cstocs that have form encoding_encoding or
581							encoding_to_encoding are processed and the appropriate functions are
582							imported. So,
583
584							use Cz::Cstocs qw(pc2_to_il2 il2_ascii);
585
586							define two functions, that are loaded into caller's namespace and
587							can be used directly. In this case, you cannot specify additional
588							options, you only have default behaviour.
589
590							=head1 ERROR HANDLING
591
592							If you request an unknown encoding in the call to new Cz::Cstocs,
593							the conversion object is not defined and the variable
594							$Cz::Cstocs::errstr is set to the error message. When you specify
595							unknown encoding in the use call style (like C
596							'il2_ascii';>), the die is called.
597
598							=head1 AUTHOR
599
600							Jan Pazdziora, adelton@fi.muni.cz, created the module version.
601
602							Jan "Yenya" Kasprzak has done the original Un*x implementation.
603
604							=head1 VERSION
605
606							3.4
607
608							=head1 SEE ALSO
609
610							cstocs(1), perl(1), or Xcstocs at
611							http://www.lut.fi/~kurz/programs/xcstocs.tar.gz.
612
613							=cut
614
615