File Coverage

blib/lib/Cz/Cstocs.pm

Criterion	Covered	Total	%
statement	222	260	85.3
branch	100	150	66.6
condition	26	33	78.7
subroutine	15	17	88.2
pod	0	11	0.0
total	363	471	77.0

line	stmt	bran	cond	sub	pod	time	code
1
2							=head1 NAME
3
4							Cz::Cstocs - conversions of charset encodings for the Czech language
5
6							=cut
7
8							package Cz::Cstocs;
9
10	1			1		609	use strict;
	1					8
	1					28
11	1			1		5	use Exporter;
	1					2
	1					49
12
13	1			1		6	use vars qw( $VERSION $DEBUG $cstocsdir @ISA @EXPORT_OK %EXPORT $errstr);
	1					2
	1					3634
14
15							@ISA = qw(Exporter);
16							@EXPORT_OK = ( '_stupidity_workaround' );
17							%EXPORT = ( '_stupidity_workaround' => 1 );
18
19				0			sub _stupidity_workaround {
20							}
21
22							sub import {
23	2			2		2880	my $class = shift;
24	2					5	my @data = @_;
25	2	100				7	if (@data) {
26	1					4	my @avail = Cz::Cstocs->available_enc();
27	1					2	my $fn;
28	1					2	for $fn (@data) {
29	1					11	local $^W = 0;
30	1	50				3	next if grep { $_ eq $fn } @EXPORT_OK;
	1					5
31	1					9	my ($in, $out) = $fn =~ /^_?(.?)_(?:to_)?(.)$/;
32	1	50				4	next unless defined $out;
33	1					11	my $fnref = new Cz::Cstocs $in, $out;
34	1	50				4	die "Definition of $fn failed: $errstr"
35							unless defined $fnref;;
36	1			1		58	eval "sub $fn { \$fnref->conv(\@_); }; ";
	1					51
37	1	50				5	if ($@) {
38	0					0	die "Creating conversion function $fn failed: $@";
39							}
40	1					3	push @EXPORT_OK, $fn;
41	1					7	$EXPORT{$fn} = 1;
42							}
43							}
44	2					2465	Cz::Cstocs->export_to_level(1, '_stupidity_workaround', @data);
45							}
46
47							$VERSION = '3.43';
48
49							# Debugging option
50							$DEBUG = 0 unless defined $DEBUG;
51	20			20	0	57	sub DEBUG () { $DEBUG; }
52
53
54							# Where to get the encoding files from
55							# Start with some default
56							my $defaultcstocsdir = '/packages/share/cstocs/lib';
57
58							# Look at the environment variable
59							if (defined $ENV{'CSTOCSDIR'}) {
60							$defaultcstocsdir = $ENV{'CSTOCSDIR'};
61							print STDERR "Using enc-dir $defaultcstocsdir from the CSTOCSDIR env-var\n"
62							if DEBUG;
63							}
64							# Or take the encoding files from the Perl tree
65							elsif (defined $INC{'Cz/Cstocs.pm'}) {
66							$defaultcstocsdir = $INC{'Cz/Cstocs.pm'};
67							$defaultcstocsdir =~ s!\.pm$!/enc!;
68							print STDERR "Using enc-dir $defaultcstocsdir from \@INC\n"
69							if DEBUG;
70							}
71
72							# We use unless here because you could have overridden $cstocsdir
73							$cstocsdir = $defaultcstocsdir unless defined $cstocsdir;
74
75
76							# Hash that holds the accent file and a tag saying if the accent
77							# file has already been read
78							my %accent = ();
79							my $accent_read = 0;
80
81							# Hash of alias conversions
82							my %alias = ();
83							my $alias_read = 0;
84
85							# Input and output hashes
86							my %input_hashes = ();
87							my %output_hashes = ();
88
89							# Hash of regexp parts, keyed by encoding name
90							my %regexp_matches = ();
91
92							# Table of conversion functions, so that we do not need to create them twice
93							my %functions = ();
94
95							# List of diacritics
96							my @diacritics = qw( abovedot acute breve caron cedilla circumflex
97							diaeresis doubleacute ogonek ring );
98
99
100
101							# ######################################################
102							# Now, the function -- loading encoding and accent files
103
104							# Filling input and output_hashes tables for given encoding
105							sub load_encoding {
106	19			19	0	35	my $enc = lc shift;
107
108	19	100				80	return if defined $input_hashes{$enc}; # has already been loaded
109
110	5	50				16	if ($enc eq 'mime') {
111	0					0	eval 'use MIME::Words ()';
112	0	0				0	if ($@) {
113	0					0	die "Error loading encofing $enc: $@\n";
114							}
115	0					0	return;
116							}
117
118	5					20	my $file = "$cstocsdir/$enc.enc";
119	5	50				262	open FILE, $file or die "Error reading $file: $!\n";
120	5	50				21	print STDERR "Parsing encoding file $file\n" if DEBUG;
121
122	5					22	my ($input, $output) = ({}, {}); # just speedup thing
123	5					11	local $_;
124	5					107	while () {
125	811	100				2084	next if /^(#\|\s*$)/;
126	804					2741	my ($tag, $desc) = /^\s(\S+)\s+(\S+)\s$/;
127	804	50	33			2401	unless (defined $tag and defined $desc) {
128	0					0	chomp;
129	0					0	warn "Syntax error in $file at line $: `$_'.\n";
130	0					0	next;
131							}
132	804	100				1807	if ($tag =~ /^\d+\|0x\d+$/) {
133							$tag = pack 'C*', map {
134	779	100				1386	/^0/ ? oct($_) : $_
	970					3056
135							} split /,/, $tag;
136							}
137	804					1972	$input->{$tag} = $desc;
138	804	100				5032	$output->{$desc} = $tag unless defined $output->{$desc};
139							}
140	5					54	close FILE;
141
142	5					20	$input_hashes{$enc} = $input;
143	5					11	$output_hashes{$enc} = $output;
144
145	5	100				34	if ($enc eq "tex") {
146	1					5	fixup_tex_encoding();
147							}
148							}
149
150							sub fixup_tex_encoding {
151	1			1	0	7	my $tag;
152
153	1	50				5	print STDERR "Doing tex fixup\n" if DEBUG;
154
155	1					3	my $input = $input_hashes{"tex"};
156	1					3	my $output = $output_hashes{"tex"};
157
158							# we need this to fill the defaults
159	1					15	load_encoding('ascii');
160	1					2	my $asciiref = $output_hashes{'ascii'};
161	1					31	for $tag (keys %$asciiref) {
162							$output->{$tag} = $asciiref->{$tag}
163	95	50				249	unless defined $output->{$tag};
164							}
165
166	1					13	my %processed = ();
167
168	1					3	my (@dialetters, @dianonletters, @nondialetters, @nondianonletters);
169	1					9	my (@inputs) = keys %$input;
170	1					9	for $tag (@inputs) {
171	24					44	my $value = $input->{$tag};
172
173	24					29	my $az = 0;
174	24	100				86	$az = 1 if $tag =~ /[a-zA-Z]$/;
175
176	24	100	100			98	if ($az and $output->{$value} eq $tag) {
177	16					36	$output->{$value} = $tag . '{}';
178							}
179	24					56	$input->{$tag . ' '} = $value;
180
181	24	100				42	if (grep { $_ eq $value } @diacritics) {
	240	100				389
182	10					18	my $e;
183	10	100				19	if ($az) {
184	5					10	push @dialetters, $tag;
185	5					12	for $e ('a'..'h', 'k'..'z', 'A'..'Z') {
186	250					712	$output->{$e.$value} = $tag.' '.$e
187							}
188							} else {
189	5					8	push @dianonletters, $tag;
190	5					12	for $e ('a'..'h', 'k'..'z', 'A'..'Z') {
191	250					739	$output->{$e.$value} = $tag.$e
192							}
193	5					12	for $e ('a'..'z', 'A'..'Z') {
194	260					728	$input->{$tag.$e} = $e.$value;
195							}
196							}
197	10					22	for $e ('i', 'j') {
198	20					66	$output->{$e.$value} = $tag.'\\'.$e.'{}'
199							}
200	10					19	for $e ('a'..'z', 'A'..'Z') {
201	520					1432	$input->{$tag.' '.$e} = $e.$value;
202							}
203	10					23	for $e ('i', 'j') {
204	20					54	$input->{$tag.'\\'.$e} = $e.$value;
205	20					67	$input->{$tag.' \\'.$e} = $e.$value;
206							}
207							} elsif ($az) {
208	13					36	push @nondialetters, $tag;
209							} else {
210	1					3	push @nondianonletters, $tag;
211							}
212							}
213
214	1					4	my $regexp = '';
215
216	1	50				4	if (@dialetters) {
217							$regexp .= join '', '(',
218	1					4	join('\|', map { "\Q$_"; } @dialetters),
	5					21
219							")([ \\t]+[a-zA-Z]\|[ \\t](\\\\[ij]([ \\t]+(\\{\\})?\|[ \\t](\$\|\\{\\}))\|\\{([a-zA-Z]\|\\\\[ij][ \\t]*(\\{\\})?)\\}))";
220							}
221	1	50				12	if (@dianonletters) {
222	1	50				9	$regexp .= '\|' if $regexp ne '';
223							$regexp .= '(' . join '',
224	1					4	join('\|', map { "\Q$_"; } @dianonletters),
	5					14
225							")[ \\t]([a-zA-Z]\|\\\\[ij]([ \\t]+(\\{\\})?\|[ \\t](\$\|\\{\\}))\|\\{([a-zA-Z]\|\\\\[ij][ \\t]*(\\{\\})?)\\})";
226							}
227	1	50				5	if (@nondialetters) {
228	1	50				5	$regexp .= '\|' if $regexp ne '';
229							$regexp .= '(' . join '',
230	1					3	join('\|', map { "\Q$_"; } @nondialetters),
	13					31
231							")([ \\t]+(\\{\\})?\|[ \\t]*\$)"
232							}
233	1	50				5	if (@nondianonletters) {
234	1	50				4	$regexp .= '\|' if $regexp ne '';
235							$regexp .= '(' . join '',
236	1					4	join('\|', map { "\Q$_"; } @nondianonletters),
	1					12
237							")[ \\t]*(\\{\\})?"
238							}
239
240	1					5	$regexp_matches{'tex'} = $regexp;
241	1					6	1;
242							}
243
244							# Loading accent file
245							sub load_accent {
246	8	100		8	0	27	return if $accent_read;
247	1					3	$accent_read = 1;
248
249	1					4	my $file = "$cstocsdir/accent";
250	1	50				41	open FILE, $file or die "Error reading accent file $file: $!\n";
251	1	50				7	print STDERR "Parsing accent file $file\n" if DEBUG;
252
253	1					2	local $_;
254	1					29	while () {
255	201	50				465	next if /^\s*(#\|$)/;
256	201					754	my ($key, $val) = /^\s(\S+)\s+(.+?)\s$/;
257	201	50	33			579	unless (defined $key and defined $val) {
258	0					0	chomp;
259	0					0	warn "Syntax error in $file at line $: `$_'.\n";
260	0					0	next;
261							}
262	201					675	$accent{$key} = $val;
263							}
264	1					13	close FILE;
265							}
266
267							# Load the alias file, fill the global %alias hash;
268							sub load_alias {
269	22	100		22	0	79	return if $alias_read;
270	1					2	$alias_read = 1;
271	1					3	my $file = "$cstocsdir/alias";
272
273	1	50				40	open FILE, $file or die "Error reading alias file $file: $!\n";
274	1					3	local $_;
275	1					22	while () {
276	17					25	chomp;
277	17					32	my ($alias, $enc) = split;
278	17					66	$alias{$alias} = $enc;
279							}
280	1					10	close FILE;
281							}
282
283							# Normalizes the encoding name -- expands aliases
284							sub normalize_enc_name {
285	22			22	0	45	load_alias();
286	22					77	my $enc = lc shift;
287	22					63	$enc =~ s/[^a-z0-9]//g;
288	22	100				74	( defined $alias{$enc} ? $alias{$enc} : $enc );
289							}
290
291							# Recursively lookup the target
292							sub lookup_accent {
293	229			229	0	412	my ($outenc, $accent, $in) = @_;
294	229					651	my @target = split /\s+/, $in;
295	229					313	my $out = '';
296	229					325	for my $desc (@target) {
297	294	50				485	if (defined $outenc->{$desc}) {
		0
298	294					479	$out .= $outenc->{$desc};
299							} elsif (defined $accent->{$desc}) {
300	0					0	$out .= lookup_accent($outenc, $accent, $accent->{$desc});
301							} else {
302	0					0	die;
303							}
304							}
305	229					649	return $out;
306							}
307
308							# Constructor -- takes two arguments, input and output encodings,
309							# and optionally a hash of options. Returns reference to code that will
310							# do the conversion, or undef
311							sub new {
312	11			11	0	397	my $class = shift;
313	11					42	my ($inputenc, $outputenc) = (shift, shift);
314
315	11					61	local $/ = "\n";
316
317							# check input values
318	11	50	33			67	unless (defined $inputenc and defined $outputenc) {
319	0					0	print STDERR "Both input and output encodings must be specified in call to ", __PACKAGE__, "::new\n";
320	0					0	return;
321							}
322
323							# Default options
324	11					18	my $fillstring = ' ';
325	11					20	my $use_fillstring = 1;
326	11					16	my $use_accent = 1;
327	11					13	my $one_by_one = 0;
328
329							# this is exception for TeX
330	11	100				25	$use_fillstring = 0 if $inputenc eq "tex";
331
332	11					26	my %opts = @_;
333	11					16	my ($tag, $value);
334	11					45	while (($tag, $value) = each %opts) {
335	3	50				7	print STDERR "Option: $tag = '$value'\n" if DEBUG;
336	3	100				8	$tag eq 'fillstring' and $fillstring = $value;
337	3	50				9	$tag eq 'use_accent' and
		100
338							$use_accent = (defined $value ? $value : 0);
339	3	0				9	$tag eq 'nofillstring' and
		0
		50
340							$use_fillstring = (defined $value ?
341							( $value ? 0 : 1) : 0);
342	3	50				7	$tag eq 'cstocsdir' and $cstocsdir = $value;
343	3	100				59	$tag eq 'one_by_one' and $one_by_one = $value;
344							}
345
346	11					37	$inputenc = normalize_enc_name($inputenc);
347	11					29	$outputenc = normalize_enc_name($outputenc);
348
349							# encode settings into the function name
350	11	100				56	if (defined $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"}) {
351	2					18	return $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"};
352							}
353
354	9					18	eval {
355	9					32	load_encoding($inputenc);
356	9					24	load_encoding($outputenc);
357	9	100				43	load_accent() if $use_accent;
358							};
359	9	50				26	if ($@) {
360	0					0	$errstr = $@;
361	0					0	return;
362							}
363
364	9					22	my $conv = {};
365
366	9					22	my ($is_one_by_one, $has_space) = (1, 0);
367
368	9	50				24	if ($outputenc ne 'mime') {
369	9					11	my $key;
370	9					16	for $key (keys %{$input_hashes{$inputenc}}) {
	9					726
371	3111					4763	my $desc = $input_hashes{$inputenc}{$key};
372	3111					4369	my $output = $output_hashes{$outputenc}{$desc};
373
374	3111	100	100			6532	if (not defined $output and $use_accent) {
375							# Doesn't have friend in output encoding
376
377
378	928					1109	$output = eval {
379							lookup_accent($output_hashes{$outputenc},
380	928	100				2261	\%accent, $accent{$desc}) if defined $accent{$desc};
381							};
382	928	50				1538	if ($@) {
383	0					0	$errstr = "Error processing translitaration for $inputenc -> $outputenc for character $desc.\n";
384	0					0	return;
385							}
386
387	928	100	66			1597	$output = undef if $one_by_one and defined $output
			100
388							and length $key < length $output;
389							}
390	3111	100	100			5807	if (not defined $output and $use_fillstring) {
391	11					18	$output = $fillstring;
392							}
393
394	3111	100	100			8998	next if (not defined $output
			100
395							or ($inputenc ne 'utf8' and $key eq $output));
396	1673	100	100			3274	if (length $key != 1 or length $output != 1)
397	1419					1627	{ $is_one_by_one = 0; }
398	1673					3419	$conv->{$key} = $output;
399							}
400							}
401
402	9					128	my $fntext = ' sub { my @converted = map { my $e = $_; if (defined $e) {';
403
404	9	50				93	if ($inputenc eq 'mime') {
		50
		100
		100
		100
		100
405	0					0	$fntext .= qq!
406							\$e =~ s/=\\s*=/==/g;
407							\$e = join '', map {
408							my \$conv;
409							if (defined \$_->[1]) {
410							(defined(\$conv = new Cz::Cstocs \$_->[1], '$outputenc', %{ \\%opts }))
411							? \$conv->conv(\$_->[0])
412							: ()
413							} else {
414							\$_->[0]
415							}
416							} MIME::Words::decode_mimewords(\$e);
417							!;
418							} elsif ($outputenc eq 'mime') {
419	0					0	my %MIME_NAMES = (
420							il1 => 'ISO-8859-1',
421							il2 => 'ISO-8859-2',
422							utf8 => 'UTF-8',
423							1250 => 'Windows-1250',
424							1252 => 'Windows-1252',
425							);
426	0					0	my $charset = $MIME_NAMES{$inputenc};
427	0	0				0	if (not defined $charset) {
428	0					0	die "Couldn't find MIME name for encoding $inputenc\n";
429							}
430	0					0	$fntext .= qq!
431							\$e = MIME::Words::encode_mimewords(\$e, Charset => '$charset');
432							\$e =~ s/\\?=( +)=\\?.*?\\?Q\\?/'_' x length \$1/egi;
433							!;
434							} elsif (not keys %$conv) {
435							# do nothing;
436							} elsif ($is_one_by_one) {
437	1					10	my $src = join "", keys %$conv;
438	1					6	$src = "\Q$src";
439	1					6	my $dst = join "", values %$conv;
440	1					4	$dst = "\Q$dst";
441	1					4	$fntext .= qq! \$e =~ tr/$src/$dst/; !;
442							} elsif ($inputenc eq 'tex') {
443	2					7	my $src = $regexp_matches{'tex'};
444	2					8	$fntext .= qq! \$e =~ s/$src/ my \$e = \$&; my \$orig = \$e; \$e =~ s#[{}]# #sog; \$e =~ s#[ \\t]+# #sog; \$e =~ s# \$##o; (defined \$conv->{\$e} ? \$conv->{\$e} : \$orig); /esog; !;
445							} elsif ($inputenc eq 'utf8') {
446	1					4	$fntext .= qq! \$e =~ s/[\\x21-\\x7f]\|[\\xc0-\\xdf].\|[\\xe0-\\xef]..\|[\\xf0-\\xf7]...\|[\\xf8-\\xfb]....\|[\\xfc\\xfd]...../defined \$conv->{\$&} ? \$conv->{\$&} : (
447							$use_fillstring ? \$fillstring : '') /esog; !;
448							} else {
449	4					39	my $singles = join "", grep { length $_ == 1 } keys %$conv;
	328					477
450	4					29	$singles = "[". "\Q$singles" . "]";
451
452							my $src = join "\|",
453	0					0	( map { my $e = "\Q$_"; $e; }
	0					0
454	0					0	sort { length $b <=> length $a }
455	4					37	grep { length $_ != 1 } keys %$conv);
	328					456
456	4	50				23	if ($singles ne "[]") {
457	4	50				11	$src .= "\|" unless $src eq '';
458	4					8	$src .= $singles;
459							}
460
461	4					12	$fntext .= qq! \$e =~ s/$src/\$conv->{\$&}/sog; !;
462							}
463
464	9					27	$fntext .= ' $e; } else { undef; }} @_; if (wantarray) { return @converted; } else { return join "", map { defined $_ ? $_ : "" } @converted; } }';
465
466	9	50				24	print STDERR "Conversion function for $inputenc to $outputenc:\n$fntext\n" if DEBUG;
467
468	9					2597	my $fn = eval $fntext;
469	9	50				36	do { chomp $@;
	0					0
470	0					0	die "Fatal error in Cz::Cstocs: $@, line ", __LINE__, "\n";
471							} if $@;
472	9					23	bless $fn, $class;
473
474	9					51	$functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"} = $fn;
475	9					63	$fn;
476							}
477
478							sub conv {
479	12			12	0	623	my $self = shift;
480	12					303	return &$self($_[0]);
481							}
482
483							sub available_enc {
484	1	50		1	0	45	opendir DIR, $cstocsdir or warn "Error reading $cstocsdir\n";
485	1					52	my @list = sort map { s/\.enc$//; $_ } grep { /\.enc$/ } readdir DIR;
	15					30
	15					38
	19					42
486	1					19	closedir DIR;
487	1					8	return @list;
488							}
489
490							sub diacritic_char {
491	0			0	0		my ($encoding, $char) = @_;
492	0						load_encoding($encoding);
493
494	0						my @result = ();
495	0						my $dia;
496	0						for $dia (@diacritics) {
497	0						my $name = $char . $dia;
498							push @result, $output_hashes{$encoding}{$name}
499	0	0					if defined $output_hashes{$encoding}{$name};
500							}
501	0						@result;
502							}
503
504							1;
505
506							=head1 SYNOPSIS
507
508							use Cz::Cstocs;
509							my $il2_to_ascii = new Cz::Cstocs 'il2', 'ascii';
510							while (<>) {
511							print &$il2_to_ascii($_);
512							}
513
514							use Cz::Cstocs 'il2_ascii';
515							while (<>) {
516							print il2_ascii($_);
517							}
518
519							use Cz::Cstocs;
520							sub il2toascii;
521							# inform the parser that there is a function il2toascii
522							*il2toascii = new Cz::Cstocs 'il2', 'ascii';
523							# now define the function
524							print il2toascii $data;
525							# thanks to Jan Krynicky for pointing this out
526
527							=head1 DESCRIPTION
528
529							This module helps in converting texts between various charset
530							encodings, used for Czech and Slovak languages. The instance of the
531							object B<Cz::Cstocs> is created using method B<new>. It takes at
532							least two parameters for input and output encoding and can be
533							afterwards used as a function reference to convert strings/lists.
534							Cz::Cstocs supports fairly free form of aliases, so iso8859-2,
535							ISO-8859-2, iso88592 and il2 are all aliases of the same encoding.
536							For backward compatibility, method I<conv> is supported as well,
537							so the example above could also read
538
539							while (<>) {
540							print $il2_to_ascii->conv($_);
541							}
542
543							You can also use typeglob syntax.
544
545							The conversion function takes a list and returns list of converted
546							strings (in the list context) or one string consisting of concatenated
547							results (in the scalar context).
548
549							You can modify the behaviour of the conversion function by specifying
550							hash of other options after the encoding names in call to B<new>.
551
552							=over 4
553
554							=item fillstring
555
556							Gives alternate string that will replace characters from input
557							encoding that are not present in the output encoding. Default is
558							space.
559
560							=item use_accent
561
562							Defines whether the accent file should be used. Default is 1 (true).
563
564							=item nofillstring
565
566							When 1 (true), will keep characters that do not have friends in
567							accent nor output encoding, and will not replace them with fillstring.
568							Default is 0 except for tex, because you probably rather want to keep
569							backslashed symbols than lose them.
570
571							=item cstocsdir
572
573							Alternate location for encoding and accent files. The default is the
574							F<Cz/Cstocs/enc> directory in Perl library tree. This location can
575							also be changed with the I<CSTOCSDIR> environment variable.
576
577							=back
578
579							There is an alternate way to define the conversion function: any
580							arguments after use Cz::Cstocs that have form encoding_encoding or
581							encoding_to_encoding are processed and the appropriate functions are
582							imported. So,
583
584							use Cz::Cstocs qw(pc2_to_il2 il2_ascii);
585
586							defines two functions that are loaded into the caller's namespace and
587							can be used directly. In this case, you cannot specify additional
588							options, you only have default behaviour.
589
590							=head1 ERROR HANDLING
591
592							If you request an unknown encoding in the call to new Cz::Cstocs,
593							the conversion object is not defined and the variable
594							$Cz::Cstocs::errstr is set to the error message. When you specify
595							unknown encoding in the use call style (like C<use Cz::Cstocs
596							'il2_ascii';>), the die is called.
597
598							=head1 AUTHOR
599
600							Jan Pazdziora created the module version.
601
602							Jan "Yenya" Kasprzak has done the original Un*x implementation.
603
604							=head1 VERSION
605
606							3.43
607
608							=head1 SEE ALSO
609
610							cstocs(1), perl(1), or Xcstocs at
611							http://www.lut.fi/~kurz/programs/xcstocs.tar.gz.
612
613							=cut
614
615