File Coverage

blib/lib/Cz/Cstocs.pm

Criterion	Covered	Total	%
statement	222	260	85.3
branch	100	150	66.6
condition	26	33	78.7
subroutine	15	17	88.2
pod	0	11	0.0
total	363	471	77.0

line	stmt	bran	cond	sub	pod	time	code
1
2							=head1 NAME
3
4							Cz::Cstocs - conversions of charset encodings for the Czech language
5
6							=cut
7
8							package Cz::Cstocs;
9
10	1			1		483	use strict;
	1					6
	1					23
11	1			1		5	use Exporter;
	1					1
	1					43
12
13	1			1		5	use vars qw( $VERSION $DEBUG $cstocsdir @ISA @EXPORT_OK %EXPORT $errstr);
	1					1
	1					2968
14
15							@ISA = qw(Exporter);
16							@EXPORT_OK = ( '_stupidity_workaround' );
17							%EXPORT = ( '_stupidity_workaround' => 1 );
18
19				0			sub _stupidity_workaround {
20							}
21
22							sub import {
23	2			2		2346	my $class = shift;
24	2					5	my @data = @_;
25	2	100				5	if (@data) {
26	1					3	my @avail = Cz::Cstocs->available_enc();
27	1					2	my $fn;
28	1					2	for $fn (@data) {
29	1					8	local $^W = 0;
30	1	50				3	next if grep { $_ eq $fn } @EXPORT_OK;
	1					4
31	1					7	my ($in, $out) = $fn =~ /^_?(.?)_(?:to_)?(.)$/;
32	1	50				3	next unless defined $out;
33	1					3	my $fnref = new Cz::Cstocs $in, $out;
34	1	50				3	die "Definition of $fn failed: $errstr"
35							unless defined $fnref;;
36	1			1		46	eval "sub $fn { \$fnref->conv(\@_); }; ";
	1					50
37	1	50				4	if ($@) {
38	0					0	die "Creating conversion function $fn failed: $@";
39							}
40	1					2	push @EXPORT_OK, $fn;
41	1					5	$EXPORT{$fn} = 1;
42							}
43							}
44	2					1778	Cz::Cstocs->export_to_level(1, '_stupidity_workaround', @data);
45							}
46
47							$VERSION = '3.4';
48
49							# Debugging option
50							$DEBUG = 0 unless defined $DEBUG;
51	20			20	0	43	sub DEBUG () { $DEBUG; }
52
53
54							# Where to get the encoding files from
55							# Start with some default
56							my $defaultcstocsdir = '/packages/share/cstocs/lib';
57
58							# Look at the environment variable
59							if (defined $ENV{'CSTOCSDIR'}) {
60							$defaultcstocsdir = $ENV{'CSTOCSDIR'};
61							print STDERR "Using enc-dir $defaultcstocsdir from the CSTOCSDIR env-var\n"
62							if DEBUG;
63							}
64							# Or take the encoding files from the Perl tree
65							elsif (defined $INC{'Cz/Cstocs.pm'}) {
66							$defaultcstocsdir = $INC{'Cz/Cstocs.pm'};
67							$defaultcstocsdir =~ s!\.pm$!/enc!;
68							print STDERR "Using enc-dir $defaultcstocsdir from \@INC\n"
69							if DEBUG;
70							}
71
72							# We have unless hare because you could have overriden $cstocsdir
73							$cstocsdir = $defaultcstocsdir unless defined $cstocsdir;
74
75
76							# Hash that holds the accent file and a tag saying if the accent
77							# file has already been read
78							my %accent = ();
79							my $accent_read = 0;
80
81							# Hash of alias covnersions
82							my %alias = ();
83							my $alias_read = 0;
84
85							# Input and output hashes
86							my %input_hashes = ();
87							my %output_hashes = ();
88
89							# Array of regexp parts
90							my %regexp_matches = ();
91
92							# Table of conversion functions, so that we do not need to create them twice
93							my %functions = ();
94
95							# List of diacritics
96							my @diacritics = qw( abovedot acute breve caron cedilla circumflex
97							diaeresis doubleacute ogonek ring );
98
99
100
101							# ######################################################
102							# Now, the function -- loading encoding and accent files
103
104							# Filling input and output_hashes tables for given encoding
105							sub load_encoding {
106	19			19	0	26	my $enc = lc shift;
107
108	19	100				41	return if defined $input_hashes{$enc}; # has already been loaded
109
110	5	50				8	if ($enc eq 'mime') {
111	0					0	eval 'use MIME::Words ()';
112	0	0				0	if ($@) {
113	0					0	die "Error loading encofing $enc: $@\n";
114							}
115	0					0	return;
116							}
117
118	5					11	my $file = "$cstocsdir/$enc.enc";
119	5	50				157	open FILE, $file or die "Error reading $file: $!\n";
120	5	50				17	print STDERR "Parsing encoding file $file\n" if DEBUG;
121
122	5					12	my ($input, $output) = ({}, {}); # just speedup thing
123	5					8	local $_;
124	5					86	while () {
125	811	100				1698	next if /^(#\|\s*$)/;
126	804					2252	my ($tag, $desc) = /^\s(\S+)\s+(\S+)\s$/;
127	804	50	33			1909	unless (defined $tag and defined $desc) {
128	0					0	chomp;
129	0					0	warn "Syntax error in $file at line $: `$_'.\n";
130	0					0	next;
131							}
132	804	100				1489	if ($tag =~ /^\d+\|0x\d+$/) {
133							$tag = pack 'C*', map {
134	779	100				1188	/^0/ ? oct($_) : $_
	970					2476
135							} split /,/, $tag;
136							}
137	804					1555	$input->{$tag} = $desc;
138	804	100				3794	$output->{$desc} = $tag unless defined $output->{$desc};
139							}
140	5					39	close FILE;
141
142	5					19	$input_hashes{$enc} = $input;
143	5					10	$output_hashes{$enc} = $output;
144
145	5	100				18	if ($enc eq "tex") {
146	1					3	fixup_tex_encoding();
147							}
148							}
149
150							sub fixup_tex_encoding {
151	1			1	0	2	my $tag;
152
153	1	50				2	print STDERR "Doing tex fixup\n" if DEBUG;
154
155	1					3	my $input = $input_hashes{"tex"};
156	1					2	my $output = $output_hashes{"tex"};
157
158							# we need this to fill the defaults
159	1					9	load_encoding('ascii');
160	1					2	my $asciiref = $output_hashes{'ascii'};
161	1					30	for $tag (keys %$asciiref) {
162							$output->{$tag} = $asciiref->{$tag}
163	95	50				184	unless defined $output->{$tag};
164							}
165
166	1					9	my %processed = ();
167
168	1					2	my (@dialetters, @dianonletters, @nondialetters, @nondianonletters);
169	1					5	my (@inputs) = keys %$input;
170	1					3	for $tag (@inputs) {
171	24					31	my $value = $input->{$tag};
172
173	24					25	my $az = 0;
174	24	100				75	$az = 1 if $tag =~ /[a-zA-Z]$/;
175
176	24	100	100			67	if ($az and $output->{$value} eq $tag) {
177	16					23	$output->{$value} = $tag . '{}';
178							}
179	24					40	$input->{$tag . ' '} = $value;
180
181	24	100				35	if (grep { $_ eq $value } @diacritics) {
	240	100				305
182	10					15	my $e;
183	10	100				17	if ($az) {
184	5					18	push @dialetters, $tag;
185	5					10	for $e ('a'..'h', 'k'..'z', 'A'..'Z') {
186	250					561	$output->{$e.$value} = $tag.' '.$e
187							}
188							} else {
189	5					7	push @dianonletters, $tag;
190	5					9	for $e ('a'..'h', 'k'..'z', 'A'..'Z') {
191	250					543	$output->{$e.$value} = $tag.$e
192							}
193	5					8	for $e ('a'..'z', 'A'..'Z') {
194	260					524	$input->{$tag.$e} = $e.$value;
195							}
196							}
197	10					13	for $e ('i', 'j') {
198	20					47	$output->{$e.$value} = $tag.'\\'.$e.'{}'
199							}
200	10					20	for $e ('a'..'z', 'A'..'Z') {
201	520					1036	$input->{$tag.' '.$e} = $e.$value;
202							}
203	10					12	for $e ('i', 'j') {
204	20					41	$input->{$tag.'\\'.$e} = $e.$value;
205	20					47	$input->{$tag.' \\'.$e} = $e.$value;
206							}
207							} elsif ($az) {
208	13					24	push @nondialetters, $tag;
209							} else {
210	1					6	push @nondianonletters, $tag;
211							}
212							}
213
214	1					2	my $regexp = '';
215
216	1	50				3	if (@dialetters) {
217							$regexp .= join '', '(',
218	1					7	join('\|', map { "\Q$_"; } @dialetters),
	5					11
219							")([ \\t]+[a-zA-Z]\|[ \\t](\\\\[ij]([ \\t]+(\\{\\})?\|[ \\t](\$\|\\{\\}))\|\\{([a-zA-Z]\|\\\\[ij][ \\t]*(\\{\\})?)\\}))";
220							}
221	1	50				3	if (@dianonletters) {
222	1	50				3	$regexp .= '\|' if $regexp ne '';
223							$regexp .= '(' . join '',
224	1					7	join('\|', map { "\Q$_"; } @dianonletters),
	5					11
225							")[ \\t]([a-zA-Z]\|\\\\[ij]([ \\t]+(\\{\\})?\|[ \\t](\$\|\\{\\}))\|\\{([a-zA-Z]\|\\\\[ij][ \\t]*(\\{\\})?)\\})";
226							}
227	1	50				4	if (@nondialetters) {
228	1	50				3	$regexp .= '\|' if $regexp ne '';
229							$regexp .= '(' . join '',
230	1					2	join('\|', map { "\Q$_"; } @nondialetters),
	13					24
231							")([ \\t]+(\\{\\})?\|[ \\t]*\$)"
232							}
233	1	50				3	if (@nondianonletters) {
234	1	50				3	$regexp .= '\|' if $regexp ne '';
235							$regexp .= '(' . join '',
236	1					1	join('\|', map { "\Q$_"; } @nondianonletters),
	1					4
237							")[ \\t]*(\\{\\})?"
238							}
239
240	1					3	$regexp_matches{'tex'} = $regexp;
241	1					8	1;
242							}
243
244							# Loading accent file
245							sub load_accent {
246	8	100		8	0	21	return if $accent_read;
247	1					2	$accent_read = 1;
248
249	1					2	my $file = "$cstocsdir/accent";
250	1	50				29	open FILE, $file or die "Error reading accent file $file: $!\n";
251	1	50				5	print STDERR "Parsing accent file $file\n" if DEBUG;
252
253	1					2	local $_;
254	1					18	while () {
255	201	50				421	next if /^\s*(#\|$)/;
256	201					611	my ($key, $val) = /^\s(\S+)\s+(.+?)\s$/;
257	201	50	33			475	unless (defined $key and defined $val) {
258	0					0	chomp;
259	0					0	warn "Syntax error in $file at line $: `$_'.\n";
260	0					0	next;
261							}
262	201					552	$accent{$key} = $val;
263							}
264	1					10	close FILE;
265							}
266
267							# Load the alias file, fill the global %alias hash;
268							sub load_alias {
269	22	100		22	0	52	return if $alias_read;
270	1					3	$alias_read = 1;
271	1					2	my $file = "$cstocsdir/alias";
272
273	1	50				31	open FILE, $file or die "Error reading alias file $file: $!\n";
274	1					3	local $_;
275	1					18	while () {
276	17					21	chomp;
277	17					28	my ($alias, $enc) = split;
278	17					50	$alias{$alias} = $enc;
279							}
280	1					9	close FILE;
281							}
282
283							# Normalizes the encoding name -- expands aliases
284							sub normalize_enc_name {
285	22			22	0	40	load_alias();
286	22					38	my $enc = lc shift;
287	22					45	$enc =~ s/[^a-z0-9]//g;
288	22	100				56	( defined $alias{$enc} ? $alias{$enc} : $enc );
289							}
290
291							# Recursively lookup the target
292							sub lookup_accent {
293	229			229	0	354	my ($outenc, $accent, $in) = @_;
294	229					465	my @target = split /\s+/, $in;
295	229					249	my $out = '';
296	229					281	for my $desc (@target) {
297	294	50				405	if (defined $outenc->{$desc}) {
		0
298	294					381	$out .= $outenc->{$desc};
299							} elsif (defined $accent->{$desc}) {
300	0					0	$out .= lookup_accent($outenc, $accent, $accent->{$desc});
301							} else {
302	0					0	die;
303							}
304							}
305	229					586	return $out;
306							}
307
308							# Constructor -- takes two arguments, input and output encodings,
309							# a optionally hash of options. Returns reference to code that will
310							# do the conversion, or undef
311							sub new {
312	11			11	0	299	my $class = shift;
313	11					29	my ($inputenc, $outputenc) = (shift, shift);
314
315	11					49	local $/ = "\n";
316
317							# check input values
318	11	50	33			56	unless (defined $inputenc and defined $outputenc) {
319	0					0	print STDERR "Both input and output encodings must be specified in call to ", __PACKAGE__, "::new\n";
320	0					0	return;
321							}
322
323							# Default options
324	11					17	my $fillstring = ' ';
325	11					14	my $use_fillstring = 1;
326	11					12	my $use_accent = 1;
327	11					12	my $one_by_one = 0;
328
329							# this is exception for TeX
330	11	100				18	$use_fillstring = 0 if $inputenc eq "tex";
331
332	11					19	my %opts = @_;
333	11					20	my ($tag, $value);
334	11					31	while (($tag, $value) = each %opts) {
335	3	50				7	print STDERR "Option: $tag = '$value'\n" if DEBUG;
336	3	100				7	$tag eq 'fillstring' and $fillstring = $value;
337	3	50				5	$tag eq 'use_accent' and
		100
338							$use_accent = (defined $value ? $value : 0);
339	3	0				6	$tag eq 'nofillstring' and
		0
		50
340							$use_fillstring = (defined $value ?
341							( $value ? 0 : 1) : 0);
342	3	50				4	$tag eq 'cstocsdir' and $cstocsdir = $value;
343	3	100				8	$tag eq 'one_by_one' and $one_by_one = $value;
344							}
345
346	11					21	$inputenc = normalize_enc_name($inputenc);
347	11					16	$outputenc = normalize_enc_name($outputenc);
348
349							# encode settings into the function name
350	11	100				41	if (defined $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"}) {
351	2					11	return $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"};
352							}
353
354	9					16	eval {
355	9					26	load_encoding($inputenc);
356	9					20	load_encoding($outputenc);
357	9	100				25	load_accent() if $use_accent;
358							};
359	9	50				19	if ($@) {
360	0					0	$errstr = $@;
361	0					0	return;
362							}
363
364	9					22	my $conv = {};
365
366	9					16	my ($is_one_by_one, $has_space) = (1, 0);
367
368	9	50				19	if ($outputenc ne 'mime') {
369	9					10	my $key;
370	9					9	for $key (keys %{$input_hashes{$inputenc}}) {
	9					476
371	3111					3655	my $desc = $input_hashes{$inputenc}{$key};
372	3111					3526	my $output = $output_hashes{$outputenc}{$desc};
373
374	3111	100	100			5442	if (not defined $output and $use_accent) {
375							# Doesn't have friend in output encoding
376
377
378	928					927	$output = eval {
379							lookup_accent($output_hashes{$outputenc},
380	928	100				1781	\%accent, $accent{$desc}) if defined $accent{$desc};
381							};
382	928	50				1238	if ($@) {
383	0					0	$errstr = "Error processing translitaration for $inputenc -> $outputenc for character $desc.\n";
384	0					0	return;
385							}
386
387	928	100	66			1242	$output = undef if $one_by_one and defined $output
			100
388							and length $key < length $output;
389							}
390	3111	100	100			4457	if (not defined $output and $use_fillstring) {
391	11					12	$output = $fillstring;
392							}
393
394	3111	100	100			7248	next if (not defined $output
			100
395							or ($inputenc ne 'utf8' and $key eq $output));
396	1673	100	100			2678	if (length $key != 1 or length $output != 1)
397	1419					1286	{ $is_one_by_one = 0; }
398	1673					2709	$conv->{$key} = $output;
399							}
400							}
401
402	9					99	my $fntext = ' sub { my @converted = map { my $e = $_; if (defined $e) {';
403
404	9	50				52	if ($inputenc eq 'mime') {
		50
		100
		100
		100
		100
405	0					0	$fntext .= qq!
406							\$e =~ s/=\\s*=/==/g;
407							\$e = join '', map {
408							my \$conv;
409							if (defined \$_->[1]) {
410							(defined(\$conv = new Cz::Cstocs \$_->[1], '$outputenc', %{ \\%opts }))
411							? \$conv->conv(\$_->[0])
412							: ()
413							} else {
414							\$_->[0]
415							}
416							} MIME::Words::decode_mimewords(\$e);
417							!;
418							} elsif ($outputenc eq 'mime') {
419	0					0	my %MIME_NAMES = (
420							il1 => 'ISO-8859-1',
421							il2 => 'ISO-8859-2',
422							utf8 => 'UTF-8',
423							1250 => 'Windows-1250',
424							1252 => 'Windows-1252',
425							);
426	0					0	my $charset = $MIME_NAMES{$inputenc};
427	0	0				0	if (not defined $charset) {
428	0					0	die "Couldn't find MIME name for encoding $inputenc\n";
429							}
430	0					0	$fntext .= qq!
431							\$e = MIME::Words::encode_mimewords(\$e, Charset => '$charset');
432							\$e =~ s/\\?=( +)=\\?.*?\\?Q\\?/'_' x length \$1/egi;
433							!;
434							} elsif (not keys %$conv) {
435							# do nothing;
436							} elsif ($is_one_by_one) {
437	1					8	my $src = join "", keys %$conv;
438	1					3	$src = "\Q$src";
439	1					18	my $dst = join "", values %$conv;
440	1					3	$dst = "\Q$dst";
441	1					3	$fntext .= qq! \$e =~ tr/$src/$dst/; !;
442							} elsif ($inputenc eq 'tex') {
443	2					4	my $src = $regexp_matches{'tex'};
444	2					6	$fntext .= qq! \$e =~ s/$src/ my \$e = \$&; my \$orig = \$e; \$e =~ s#[{}]# #sog; \$e =~ s#[ \\t]+# #sog; \$e =~ s# \$##o; (defined \$conv->{\$e} ? \$conv->{\$e} : \$orig); /esog; !;
445							} elsif ($inputenc eq 'utf8') {
446	1					5	$fntext .= qq! \$e =~ s/[\\x21-\\x7f]\|[\\xc0-\\xdf].\|[\\xe0-\\xef]..\|[\\xf0-\\xf7]...\|[\\xf8-\\xfb]....\|[\\xfc\\xfd]...../defined \$conv->{\$&} ? \$conv->{\$&} : (
447							$use_fillstring ? \$fillstring : '') /esog; !;
448							} else {
449	4					28	my $singles = join "", grep { length $_ == 1 } keys %$conv;
	328					410
450	4					19	$singles = "[". "\Q$singles" . "]";
451
452							my $src = join "\|",
453	0					0	( map { my $e = "\Q$_"; $e; }
	0					0
454	0					0	sort { length $b <=> length $a }
455	4					26	grep { length $_ != 1 } keys %$conv);
	328					360
456	4	50				17	if ($singles ne "[]") {
457	4	50				14	$src .= "\|" unless $src eq '';
458	4					7	$src .= $singles;
459							}
460
461	4					10	$fntext .= qq! \$e =~ s/$src/\$conv->{\$&}/sog; !;
462							}
463
464	9					12	$fntext .= ' $e; } else { undef; }} @_; if (wantarray) { return @converted; } else { return join "", map { defined $_ ? $_ : "" } @converted; } }';
465
466	9	50				17	print STDERR "Conversion function for $inputenc to $outputenc:\n$fntext\n" if DEBUG;
467
468	9					1926	my $fn = eval $fntext;
469	9	50				28	do { chomp $@;
	0					0
470	0					0	die "Fatal error in Cz::Cstocs: $@, line ", __LINE__, "\n";
471							} if $@;
472	9					16	bless $fn, $class;
473
474	9					37	$functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"} = $fn;
475	9					47	$fn;
476							}
477
478							sub conv {
479	12			12	0	443	my $self = shift;
480	12					250	return &$self($_[0]);
481							}
482
483							sub available_enc {
484	1	50		1	0	33	opendir DIR, $cstocsdir or warn "Error reading $cstocsdir\n";
485	1					41	my @list = sort map { s/\.enc$//; $_ } grep { /\.enc$/ } readdir DIR;
	15					25
	15					29
	19					35
486	1					16	closedir DIR;
487	1					6	return @list;
488							}
489
490							sub diacritic_char {
491	0			0	0		my ($encoding, $char) = @_;
492	0						load_encoding($encoding);
493
494	0						my @result = ();
495	0						my $dia;
496	0						for $dia (@diacritics) {
497	0						my $name = $char . $dia;
498							push @result, $output_hashes{$encoding}{$name}
499	0	0					if defined $output_hashes{$encoding}{$name};
500							}
501	0						@result;
502							}
503
504							1;
505
506							=head1 SYNOPSIS
507
508							use Cz::Cstocs;
509							my $il2_to_ascii = new Cz::Cstocs 'il2', 'ascii';
510							while (<>) {
511							print &$il2_to_ascii($_);
512							}
513
514							use Cz::Cstocs 'il2_ascii';
515							while (<>) {
516							print il2_ascii($_);
517							}
518
519							use Cz::Cstocs;
520							sub il2toascii;
521							# inform the parser that there is a function il2toascii
522							*il2toascii = new Cz::Cstocs 'il2', 'ascii';
523							# now define the function
524							print il2toascii $data;
525							# thanks to Jan Krynicky for poining this out
526
527							=head1 DESCRIPTION
528
529							This module helps in converting texts between various charset
530							encodings, used for Czech and Slovak languages. The instance of the
531							object B is created using method B. It takes at
532							least two parameters for input and output encoding and can be
533							afterwards used as a function reference to convert strings/lists.
534							Cz::Cstocs supports fairly free form of aliases, so iso8859-2,
535							ISO-8859-2, iso88592 and il2 are all aliases of the same encoding.
536							For backward compatibility, method I is supported as well,
537							so the example above could also read
538
539							while (<>) {
540							print $il2_to_ascii->conv($_);
541							}
542
543							You can also use typeglob syntax.
544
545							The conversion function takes a list and returns list of converted
546							strings (in the list context) or one string consisting of concatenated
547							results (in the scalar context).
548
549							You can modify the behaviour of the conversion function by specifying
550							hash of other options after the encoding names in call to B.
551
552							=over 4
553
554							=item fillstring
555
556							Gives alternate string that will replace characters from input
557							encoding that are not present in the output encoding. Default is
558							space.
559
560							=item use_accent
561
562							Defines whether the accent file should be used. Default is 1 (true).
563
564							=item nofillstring
565
566							When 1 (true), will keep characters that do not have friends in
567							accent nor output encoding, will no replace them with fillstring.
568							Default is 0 except for tex, because you probably rather want to keep
569							backslashed symbols than loose them.
570
571							=item cstocsdir
572
573							Alternate location for encoding and accent files. The default is the
574							F directory in Perl library tree. This location can
575							also be changed with the I environment variable.
576
577							=back
578
579							There is an alternate way to define the conversion function: any
580							arguments after use Cz::Cstocs that have form encoding_encoding or
581							encoding_to_encoding are processed and the appropriate functions are
582							imported. So,
583
584							use Cz::Cstocs qw(pc2_to_il2 il2_ascii);
585
586							define two functions, that are loaded into caller's namespace and
587							can be used directly. In this case, you cannot specify additional
588							options, you only have default behaviour.
589
590							=head1 ERROR HANDLING
591
592							If you request an unknown encoding in the call to new Cz::Cstocs,
593							the conversion object is not defined and the variable
594							$Cz::Cstocs::errstr is set to the error message. When you specify
595							unknown encoding in the use call style (like C
596							'il2_ascii';>), the die is called.
597
598							=head1 AUTHOR
599
600							Jan Pazdziora created the module version.
601
602							Jan "Yenya" Kasprzak has done the original Un*x implementation.
603
604							=head1 VERSION
605
606							3.4
607
608							=head1 SEE ALSO
609
610							cstocs(1), perl(1), or Xcstocs at
611							http://www.lut.fi/~kurz/programs/xcstocs.tar.gz.
612
613							=cut
614
615