line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
=head1 NAME |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
Cz::Cstocs - conversions of charset encodings for the Czech language |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=cut |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package Cz::Cstocs; |
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
609
|
use strict; |
|
1
|
|
|
|
|
8
|
|
|
1
|
|
|
|
|
28
|
|
11
|
1
|
|
|
1
|
|
5
|
use Exporter; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
49
|
|
12
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
6
|
use vars qw( $VERSION $DEBUG $cstocsdir @ISA @EXPORT_OK %EXPORT $errstr); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
3634
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
16
|
|
|
|
|
|
|
@EXPORT_OK = ( '_stupidity_workaround' ); |
17
|
|
|
|
|
|
|
%EXPORT = ( '_stupidity_workaround' => 1 ); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
0
|
|
|
sub _stupidity_workaround { |
20
|
|
|
|
|
|
|
} |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub import { |
23
|
2
|
|
|
2
|
|
2880
|
my $class = shift; |
24
|
2
|
|
|
|
|
5
|
my @data = @_; |
25
|
2
|
100
|
|
|
|
7
|
if (@data) { |
26
|
1
|
|
|
|
|
4
|
my @avail = Cz::Cstocs->available_enc(); |
27
|
1
|
|
|
|
|
2
|
my $fn; |
28
|
1
|
|
|
|
|
2
|
for $fn (@data) { |
29
|
1
|
|
|
|
|
11
|
local $^W = 0; |
30
|
1
|
50
|
|
|
|
3
|
next if grep { $_ eq $fn } @EXPORT_OK; |
|
1
|
|
|
|
|
5
|
|
31
|
1
|
|
|
|
|
9
|
my ($in, $out) = $fn =~ /^_?(.*?)_(?:to_)?(.*)$/; |
32
|
1
|
50
|
|
|
|
4
|
next unless defined $out; |
33
|
1
|
|
|
|
|
11
|
my $fnref = new Cz::Cstocs $in, $out; |
34
|
1
|
50
|
|
|
|
4
|
die "Definition of $fn failed: $errstr" |
35
|
|
|
|
|
|
|
unless defined $fnref;; |
36
|
1
|
|
|
1
|
|
58
|
eval "sub $fn { \$fnref->conv(\@_); }; "; |
|
1
|
|
|
|
|
51
|
|
37
|
1
|
50
|
|
|
|
5
|
if ($@) { |
38
|
0
|
|
|
|
|
0
|
die "Creating conversion function $fn failed: $@"; |
39
|
|
|
|
|
|
|
} |
40
|
1
|
|
|
|
|
3
|
push @EXPORT_OK, $fn; |
41
|
1
|
|
|
|
|
7
|
$EXPORT{$fn} = 1; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
} |
44
|
2
|
|
|
|
|
2465
|
Cz::Cstocs->export_to_level(1, '_stupidity_workaround', @data); |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
$VERSION = '3.43'; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Debugging option |
50
|
|
|
|
|
|
|
$DEBUG = 0 unless defined $DEBUG; |
51
|
20
|
|
|
20
|
0
|
57
|
sub DEBUG () { $DEBUG; } |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Where to get the encoding files from |
55
|
|
|
|
|
|
|
# Start with some default |
56
|
|
|
|
|
|
|
my $defaultcstocsdir = '/packages/share/cstocs/lib'; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Look at the environment variable |
59
|
|
|
|
|
|
|
if (defined $ENV{'CSTOCSDIR'}) { |
60
|
|
|
|
|
|
|
$defaultcstocsdir = $ENV{'CSTOCSDIR'}; |
61
|
|
|
|
|
|
|
print STDERR "Using enc-dir $defaultcstocsdir from the CSTOCSDIR env-var\n" |
62
|
|
|
|
|
|
|
if DEBUG; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
# Or take the encoding files from the Perl tree |
65
|
|
|
|
|
|
|
elsif (defined $INC{'Cz/Cstocs.pm'}) { |
66
|
|
|
|
|
|
|
$defaultcstocsdir = $INC{'Cz/Cstocs.pm'}; |
67
|
|
|
|
|
|
|
$defaultcstocsdir =~ s!\.pm$!/enc!; |
68
|
|
|
|
|
|
|
print STDERR "Using enc-dir $defaultcstocsdir from \@INC\n" |
69
|
|
|
|
|
|
|
if DEBUG; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# We have unless hare because you could have overriden $cstocsdir |
73
|
|
|
|
|
|
|
$cstocsdir = $defaultcstocsdir unless defined $cstocsdir; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Hash that holds the accent file and a tag saying if the accent |
77
|
|
|
|
|
|
|
# file has already been read |
78
|
|
|
|
|
|
|
my %accent = (); |
79
|
|
|
|
|
|
|
my $accent_read = 0; |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# Hash of alias covnersions |
82
|
|
|
|
|
|
|
my %alias = (); |
83
|
|
|
|
|
|
|
my $alias_read = 0; |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# Input and output hashes |
86
|
|
|
|
|
|
|
my %input_hashes = (); |
87
|
|
|
|
|
|
|
my %output_hashes = (); |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Array of regexp parts |
90
|
|
|
|
|
|
|
my %regexp_matches = (); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Table of conversion functions, so that we do not need to create them twice |
93
|
|
|
|
|
|
|
my %functions = (); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# List of diacritics |
96
|
|
|
|
|
|
|
my @diacritics = qw( abovedot acute breve caron cedilla circumflex |
97
|
|
|
|
|
|
|
diaeresis doubleacute ogonek ring ); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# ###################################################### |
102
|
|
|
|
|
|
|
# Now, the function -- loading encoding and accent files |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# Filling input and output_hashes tables for given encoding |
105
|
|
|
|
|
|
|
sub load_encoding { |
106
|
19
|
|
|
19
|
0
|
35
|
my $enc = lc shift; |
107
|
|
|
|
|
|
|
|
108
|
19
|
100
|
|
|
|
80
|
return if defined $input_hashes{$enc}; # has already been loaded |
109
|
|
|
|
|
|
|
|
110
|
5
|
50
|
|
|
|
16
|
if ($enc eq 'mime') { |
111
|
0
|
|
|
|
|
0
|
eval 'use MIME::Words ()'; |
112
|
0
|
0
|
|
|
|
0
|
if ($@) { |
113
|
0
|
|
|
|
|
0
|
die "Error loading encofing $enc: $@\n"; |
114
|
|
|
|
|
|
|
} |
115
|
0
|
|
|
|
|
0
|
return; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
5
|
|
|
|
|
20
|
my $file = "$cstocsdir/$enc.enc"; |
119
|
5
|
50
|
|
|
|
262
|
open FILE, $file or die "Error reading $file: $!\n"; |
120
|
5
|
50
|
|
|
|
21
|
print STDERR "Parsing encoding file $file\n" if DEBUG; |
121
|
|
|
|
|
|
|
|
122
|
5
|
|
|
|
|
22
|
my ($input, $output) = ({}, {}); # just speedup thing |
123
|
5
|
|
|
|
|
11
|
local $_; |
124
|
5
|
|
|
|
|
107
|
while () { |
125
|
811
|
100
|
|
|
|
2084
|
next if /^(#|\s*$)/; |
126
|
804
|
|
|
|
|
2741
|
my ($tag, $desc) = /^\s*(\S+)\s+(\S+)\s*$/; |
127
|
804
|
50
|
33
|
|
|
2401
|
unless (defined $tag and defined $desc) { |
128
|
0
|
|
|
|
|
0
|
chomp; |
129
|
0
|
|
|
|
|
0
|
warn "Syntax error in $file at line $: `$_'.\n"; |
130
|
0
|
|
|
|
|
0
|
next; |
131
|
|
|
|
|
|
|
} |
132
|
804
|
100
|
|
|
|
1807
|
if ($tag =~ /^\d+|0x\d+$/) { |
133
|
|
|
|
|
|
|
$tag = pack 'C*', map { |
134
|
779
|
100
|
|
|
|
1386
|
/^0/ ? oct($_) : $_ |
|
970
|
|
|
|
|
3056
|
|
135
|
|
|
|
|
|
|
} split /,/, $tag; |
136
|
|
|
|
|
|
|
} |
137
|
804
|
|
|
|
|
1972
|
$input->{$tag} = $desc; |
138
|
804
|
100
|
|
|
|
5032
|
$output->{$desc} = $tag unless defined $output->{$desc}; |
139
|
|
|
|
|
|
|
} |
140
|
5
|
|
|
|
|
54
|
close FILE; |
141
|
|
|
|
|
|
|
|
142
|
5
|
|
|
|
|
20
|
$input_hashes{$enc} = $input; |
143
|
5
|
|
|
|
|
11
|
$output_hashes{$enc} = $output; |
144
|
|
|
|
|
|
|
|
145
|
5
|
100
|
|
|
|
34
|
if ($enc eq "tex") { |
146
|
1
|
|
|
|
|
5
|
fixup_tex_encoding(); |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub fixup_tex_encoding { |
151
|
1
|
|
|
1
|
0
|
7
|
my $tag; |
152
|
|
|
|
|
|
|
|
153
|
1
|
50
|
|
|
|
5
|
print STDERR "Doing tex fixup\n" if DEBUG; |
154
|
|
|
|
|
|
|
|
155
|
1
|
|
|
|
|
3
|
my $input = $input_hashes{"tex"}; |
156
|
1
|
|
|
|
|
3
|
my $output = $output_hashes{"tex"}; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# we need this to fill the defaults |
159
|
1
|
|
|
|
|
15
|
load_encoding('ascii'); |
160
|
1
|
|
|
|
|
2
|
my $asciiref = $output_hashes{'ascii'}; |
161
|
1
|
|
|
|
|
31
|
for $tag (keys %$asciiref) { |
162
|
|
|
|
|
|
|
$output->{$tag} = $asciiref->{$tag} |
163
|
95
|
50
|
|
|
|
249
|
unless defined $output->{$tag}; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
1
|
|
|
|
|
13
|
my %processed = (); |
167
|
|
|
|
|
|
|
|
168
|
1
|
|
|
|
|
3
|
my (@dialetters, @dianonletters, @nondialetters, @nondianonletters); |
169
|
1
|
|
|
|
|
9
|
my (@inputs) = keys %$input; |
170
|
1
|
|
|
|
|
9
|
for $tag (@inputs) { |
171
|
24
|
|
|
|
|
44
|
my $value = $input->{$tag}; |
172
|
|
|
|
|
|
|
|
173
|
24
|
|
|
|
|
29
|
my $az = 0; |
174
|
24
|
100
|
|
|
|
86
|
$az = 1 if $tag =~ /[a-zA-Z]$/; |
175
|
|
|
|
|
|
|
|
176
|
24
|
100
|
100
|
|
|
98
|
if ($az and $output->{$value} eq $tag) { |
177
|
16
|
|
|
|
|
36
|
$output->{$value} = $tag . '{}'; |
178
|
|
|
|
|
|
|
} |
179
|
24
|
|
|
|
|
56
|
$input->{$tag . ' '} = $value; |
180
|
|
|
|
|
|
|
|
181
|
24
|
100
|
|
|
|
42
|
if (grep { $_ eq $value } @diacritics) { |
|
240
|
100
|
|
|
|
389
|
|
182
|
10
|
|
|
|
|
18
|
my $e; |
183
|
10
|
100
|
|
|
|
19
|
if ($az) { |
184
|
5
|
|
|
|
|
10
|
push @dialetters, $tag; |
185
|
5
|
|
|
|
|
12
|
for $e ('a'..'h', 'k'..'z', 'A'..'Z') { |
186
|
250
|
|
|
|
|
712
|
$output->{$e.$value} = $tag.' '.$e |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
} else { |
189
|
5
|
|
|
|
|
8
|
push @dianonletters, $tag; |
190
|
5
|
|
|
|
|
12
|
for $e ('a'..'h', 'k'..'z', 'A'..'Z') { |
191
|
250
|
|
|
|
|
739
|
$output->{$e.$value} = $tag.$e |
192
|
|
|
|
|
|
|
} |
193
|
5
|
|
|
|
|
12
|
for $e ('a'..'z', 'A'..'Z') { |
194
|
260
|
|
|
|
|
728
|
$input->{$tag.$e} = $e.$value; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
10
|
|
|
|
|
22
|
for $e ('i', 'j') { |
198
|
20
|
|
|
|
|
66
|
$output->{$e.$value} = $tag.'\\'.$e.'{}' |
199
|
|
|
|
|
|
|
} |
200
|
10
|
|
|
|
|
19
|
for $e ('a'..'z', 'A'..'Z') { |
201
|
520
|
|
|
|
|
1432
|
$input->{$tag.' '.$e} = $e.$value; |
202
|
|
|
|
|
|
|
} |
203
|
10
|
|
|
|
|
23
|
for $e ('i', 'j') { |
204
|
20
|
|
|
|
|
54
|
$input->{$tag.'\\'.$e} = $e.$value; |
205
|
20
|
|
|
|
|
67
|
$input->{$tag.' \\'.$e} = $e.$value; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} elsif ($az) { |
208
|
13
|
|
|
|
|
36
|
push @nondialetters, $tag; |
209
|
|
|
|
|
|
|
} else { |
210
|
1
|
|
|
|
|
3
|
push @nondianonletters, $tag; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
1
|
|
|
|
|
4
|
my $regexp = ''; |
215
|
|
|
|
|
|
|
|
216
|
1
|
50
|
|
|
|
4
|
if (@dialetters) { |
217
|
|
|
|
|
|
|
$regexp .= join '', '(', |
218
|
1
|
|
|
|
|
4
|
join('|', map { "\Q$_"; } @dialetters), |
|
5
|
|
|
|
|
21
|
|
219
|
|
|
|
|
|
|
")([ \\t]+[a-zA-Z]|[ \\t]*(\\\\[ij]([ \\t]+(\\{\\})?|[ \\t]*(\$|\\{\\}))|\\{([a-zA-Z]|\\\\[ij][ \\t]*(\\{\\})?)\\}))"; |
220
|
|
|
|
|
|
|
} |
221
|
1
|
50
|
|
|
|
12
|
if (@dianonletters) { |
222
|
1
|
50
|
|
|
|
9
|
$regexp .= '|' if $regexp ne ''; |
223
|
|
|
|
|
|
|
$regexp .= '(' . join '', |
224
|
1
|
|
|
|
|
4
|
join('|', map { "\Q$_"; } @dianonletters), |
|
5
|
|
|
|
|
14
|
|
225
|
|
|
|
|
|
|
")[ \\t]*([a-zA-Z]|\\\\[ij]([ \\t]+(\\{\\})?|[ \\t]*(\$|\\{\\}))|\\{([a-zA-Z]|\\\\[ij][ \\t]*(\\{\\})?)\\})"; |
226
|
|
|
|
|
|
|
} |
227
|
1
|
50
|
|
|
|
5
|
if (@nondialetters) { |
228
|
1
|
50
|
|
|
|
5
|
$regexp .= '|' if $regexp ne ''; |
229
|
|
|
|
|
|
|
$regexp .= '(' . join '', |
230
|
1
|
|
|
|
|
3
|
join('|', map { "\Q$_"; } @nondialetters), |
|
13
|
|
|
|
|
31
|
|
231
|
|
|
|
|
|
|
")([ \\t]+(\\{\\})?|[ \\t]*\$)" |
232
|
|
|
|
|
|
|
} |
233
|
1
|
50
|
|
|
|
5
|
if (@nondianonletters) { |
234
|
1
|
50
|
|
|
|
4
|
$regexp .= '|' if $regexp ne ''; |
235
|
|
|
|
|
|
|
$regexp .= '(' . join '', |
236
|
1
|
|
|
|
|
4
|
join('|', map { "\Q$_"; } @nondianonletters), |
|
1
|
|
|
|
|
12
|
|
237
|
|
|
|
|
|
|
")[ \\t]*(\\{\\})?" |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
1
|
|
|
|
|
5
|
$regexp_matches{'tex'} = $regexp; |
241
|
1
|
|
|
|
|
6
|
1; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# Loading accent file |
245
|
|
|
|
|
|
|
sub load_accent { |
246
|
8
|
100
|
|
8
|
0
|
27
|
return if $accent_read; |
247
|
1
|
|
|
|
|
3
|
$accent_read = 1; |
248
|
|
|
|
|
|
|
|
249
|
1
|
|
|
|
|
4
|
my $file = "$cstocsdir/accent"; |
250
|
1
|
50
|
|
|
|
41
|
open FILE, $file or die "Error reading accent file $file: $!\n"; |
251
|
1
|
50
|
|
|
|
7
|
print STDERR "Parsing accent file $file\n" if DEBUG; |
252
|
|
|
|
|
|
|
|
253
|
1
|
|
|
|
|
2
|
local $_; |
254
|
1
|
|
|
|
|
29
|
while () { |
255
|
201
|
50
|
|
|
|
465
|
next if /^\s*(#|$)/; |
256
|
201
|
|
|
|
|
754
|
my ($key, $val) = /^\s*(\S+)\s+(.+?)\s*$/; |
257
|
201
|
50
|
33
|
|
|
579
|
unless (defined $key and defined $val) { |
258
|
0
|
|
|
|
|
0
|
chomp; |
259
|
0
|
|
|
|
|
0
|
warn "Syntax error in $file at line $: `$_'.\n"; |
260
|
0
|
|
|
|
|
0
|
next; |
261
|
|
|
|
|
|
|
} |
262
|
201
|
|
|
|
|
675
|
$accent{$key} = $val; |
263
|
|
|
|
|
|
|
} |
264
|
1
|
|
|
|
|
13
|
close FILE; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# Load the alias file, fill the global %alias hash; |
268
|
|
|
|
|
|
|
sub load_alias { |
269
|
22
|
100
|
|
22
|
0
|
79
|
return if $alias_read; |
270
|
1
|
|
|
|
|
2
|
$alias_read = 1; |
271
|
1
|
|
|
|
|
3
|
my $file = "$cstocsdir/alias"; |
272
|
|
|
|
|
|
|
|
273
|
1
|
50
|
|
|
|
40
|
open FILE, $file or die "Error reading alias file $file: $!\n"; |
274
|
1
|
|
|
|
|
3
|
local $_; |
275
|
1
|
|
|
|
|
22
|
while () { |
276
|
17
|
|
|
|
|
25
|
chomp; |
277
|
17
|
|
|
|
|
32
|
my ($alias, $enc) = split; |
278
|
17
|
|
|
|
|
66
|
$alias{$alias} = $enc; |
279
|
|
|
|
|
|
|
} |
280
|
1
|
|
|
|
|
10
|
close FILE; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# Normalizes the encoding name -- expands aliases |
284
|
|
|
|
|
|
|
sub normalize_enc_name { |
285
|
22
|
|
|
22
|
0
|
45
|
load_alias(); |
286
|
22
|
|
|
|
|
77
|
my $enc = lc shift; |
287
|
22
|
|
|
|
|
63
|
$enc =~ s/[^a-z0-9]//g; |
288
|
22
|
100
|
|
|
|
74
|
( defined $alias{$enc} ? $alias{$enc} : $enc ); |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# Recursively lookup the target |
292
|
|
|
|
|
|
|
sub lookup_accent { |
293
|
229
|
|
|
229
|
0
|
412
|
my ($outenc, $accent, $in) = @_; |
294
|
229
|
|
|
|
|
651
|
my @target = split /\s+/, $in; |
295
|
229
|
|
|
|
|
313
|
my $out = ''; |
296
|
229
|
|
|
|
|
325
|
for my $desc (@target) { |
297
|
294
|
50
|
|
|
|
485
|
if (defined $outenc->{$desc}) { |
|
|
0
|
|
|
|
|
|
298
|
294
|
|
|
|
|
479
|
$out .= $outenc->{$desc}; |
299
|
|
|
|
|
|
|
} elsif (defined $accent->{$desc}) { |
300
|
0
|
|
|
|
|
0
|
$out .= lookup_accent($outenc, $accent, $accent->{$desc}); |
301
|
|
|
|
|
|
|
} else { |
302
|
0
|
|
|
|
|
0
|
die; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
} |
305
|
229
|
|
|
|
|
649
|
return $out; |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Constructor -- takes two arguments, input and output encodings, |
309
|
|
|
|
|
|
|
# a optionally hash of options. Returns reference to code that will |
310
|
|
|
|
|
|
|
# do the conversion, or undef |
311
|
|
|
|
|
|
|
sub new { |
312
|
11
|
|
|
11
|
0
|
397
|
my $class = shift; |
313
|
11
|
|
|
|
|
42
|
my ($inputenc, $outputenc) = (shift, shift); |
314
|
|
|
|
|
|
|
|
315
|
11
|
|
|
|
|
61
|
local $/ = "\n"; |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# check input values |
318
|
11
|
50
|
33
|
|
|
67
|
unless (defined $inputenc and defined $outputenc) { |
319
|
0
|
|
|
|
|
0
|
print STDERR "Both input and output encodings must be specified in call to ", __PACKAGE__, "::new\n"; |
320
|
0
|
|
|
|
|
0
|
return; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# Default options |
324
|
11
|
|
|
|
|
18
|
my $fillstring = ' '; |
325
|
11
|
|
|
|
|
20
|
my $use_fillstring = 1; |
326
|
11
|
|
|
|
|
16
|
my $use_accent = 1; |
327
|
11
|
|
|
|
|
13
|
my $one_by_one = 0; |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# this is exception for TeX |
330
|
11
|
100
|
|
|
|
25
|
$use_fillstring = 0 if $inputenc eq "tex"; |
331
|
|
|
|
|
|
|
|
332
|
11
|
|
|
|
|
26
|
my %opts = @_; |
333
|
11
|
|
|
|
|
16
|
my ($tag, $value); |
334
|
11
|
|
|
|
|
45
|
while (($tag, $value) = each %opts) { |
335
|
3
|
50
|
|
|
|
7
|
print STDERR "Option: $tag = '$value'\n" if DEBUG; |
336
|
3
|
100
|
|
|
|
8
|
$tag eq 'fillstring' and $fillstring = $value; |
337
|
3
|
50
|
|
|
|
9
|
$tag eq 'use_accent' and |
|
|
100
|
|
|
|
|
|
338
|
|
|
|
|
|
|
$use_accent = (defined $value ? $value : 0); |
339
|
3
|
0
|
|
|
|
9
|
$tag eq 'nofillstring' and |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
340
|
|
|
|
|
|
|
$use_fillstring = (defined $value ? |
341
|
|
|
|
|
|
|
( $value ? 0 : 1) : 0); |
342
|
3
|
50
|
|
|
|
7
|
$tag eq 'cstocsdir' and $cstocsdir = $value; |
343
|
3
|
100
|
|
|
|
59
|
$tag eq 'one_by_one' and $one_by_one = $value; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
11
|
|
|
|
|
37
|
$inputenc = normalize_enc_name($inputenc); |
347
|
11
|
|
|
|
|
29
|
$outputenc = normalize_enc_name($outputenc); |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# encode settings into the function name |
350
|
11
|
100
|
|
|
|
56
|
if (defined $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"}) { |
351
|
2
|
|
|
|
|
18
|
return $functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"}; |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
|
354
|
9
|
|
|
|
|
18
|
eval { |
355
|
9
|
|
|
|
|
32
|
load_encoding($inputenc); |
356
|
9
|
|
|
|
|
24
|
load_encoding($outputenc); |
357
|
9
|
100
|
|
|
|
43
|
load_accent() if $use_accent; |
358
|
|
|
|
|
|
|
}; |
359
|
9
|
50
|
|
|
|
26
|
if ($@) { |
360
|
0
|
|
|
|
|
0
|
$errstr = $@; |
361
|
0
|
|
|
|
|
0
|
return; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
9
|
|
|
|
|
22
|
my $conv = {}; |
365
|
|
|
|
|
|
|
|
366
|
9
|
|
|
|
|
22
|
my ($is_one_by_one, $has_space) = (1, 0); |
367
|
|
|
|
|
|
|
|
368
|
9
|
50
|
|
|
|
24
|
if ($outputenc ne 'mime') { |
369
|
9
|
|
|
|
|
11
|
my $key; |
370
|
9
|
|
|
|
|
16
|
for $key (keys %{$input_hashes{$inputenc}}) { |
|
9
|
|
|
|
|
726
|
|
371
|
3111
|
|
|
|
|
4763
|
my $desc = $input_hashes{$inputenc}{$key}; |
372
|
3111
|
|
|
|
|
4369
|
my $output = $output_hashes{$outputenc}{$desc}; |
373
|
|
|
|
|
|
|
|
374
|
3111
|
100
|
100
|
|
|
6532
|
if (not defined $output and $use_accent) { |
375
|
|
|
|
|
|
|
# Doesn't have friend in output encoding |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
|
378
|
928
|
|
|
|
|
1109
|
$output = eval { |
379
|
|
|
|
|
|
|
lookup_accent($output_hashes{$outputenc}, |
380
|
928
|
100
|
|
|
|
2261
|
\%accent, $accent{$desc}) if defined $accent{$desc}; |
381
|
|
|
|
|
|
|
}; |
382
|
928
|
50
|
|
|
|
1538
|
if ($@) { |
383
|
0
|
|
|
|
|
0
|
$errstr = "Error processing translitaration for $inputenc -> $outputenc for character $desc.\n"; |
384
|
0
|
|
|
|
|
0
|
return; |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
928
|
100
|
66
|
|
|
1597
|
$output = undef if $one_by_one and defined $output |
|
|
|
100
|
|
|
|
|
388
|
|
|
|
|
|
|
and length $key < length $output; |
389
|
|
|
|
|
|
|
} |
390
|
3111
|
100
|
100
|
|
|
5807
|
if (not defined $output and $use_fillstring) { |
391
|
11
|
|
|
|
|
18
|
$output = $fillstring; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
3111
|
100
|
100
|
|
|
8998
|
next if (not defined $output |
|
|
|
100
|
|
|
|
|
395
|
|
|
|
|
|
|
or ($inputenc ne 'utf8' and $key eq $output)); |
396
|
1673
|
100
|
100
|
|
|
3274
|
if (length $key != 1 or length $output != 1) |
397
|
1419
|
|
|
|
|
1627
|
{ $is_one_by_one = 0; } |
398
|
1673
|
|
|
|
|
3419
|
$conv->{$key} = $output; |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
|
402
|
9
|
|
|
|
|
128
|
my $fntext = ' sub { my @converted = map { my $e = $_; if (defined $e) {'; |
403
|
|
|
|
|
|
|
|
404
|
9
|
50
|
|
|
|
93
|
if ($inputenc eq 'mime') { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
405
|
0
|
|
|
|
|
0
|
$fntext .= qq! |
406
|
|
|
|
|
|
|
\$e =~ s/=\\s*=/==/g; |
407
|
|
|
|
|
|
|
\$e = join '', map { |
408
|
|
|
|
|
|
|
my \$conv; |
409
|
|
|
|
|
|
|
if (defined \$_->[1]) { |
410
|
|
|
|
|
|
|
(defined(\$conv = new Cz::Cstocs \$_->[1], '$outputenc', %{ \\%opts })) |
411
|
|
|
|
|
|
|
? \$conv->conv(\$_->[0]) |
412
|
|
|
|
|
|
|
: () |
413
|
|
|
|
|
|
|
} else { |
414
|
|
|
|
|
|
|
\$_->[0] |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
} MIME::Words::decode_mimewords(\$e); |
417
|
|
|
|
|
|
|
!; |
418
|
|
|
|
|
|
|
} elsif ($outputenc eq 'mime') { |
419
|
0
|
|
|
|
|
0
|
my %MIME_NAMES = ( |
420
|
|
|
|
|
|
|
il1 => 'ISO-8859-1', |
421
|
|
|
|
|
|
|
il2 => 'ISO-8859-2', |
422
|
|
|
|
|
|
|
utf8 => 'UTF-8', |
423
|
|
|
|
|
|
|
1250 => 'Windows-1250', |
424
|
|
|
|
|
|
|
1252 => 'Windows-1252', |
425
|
|
|
|
|
|
|
); |
426
|
0
|
|
|
|
|
0
|
my $charset = $MIME_NAMES{$inputenc}; |
427
|
0
|
0
|
|
|
|
0
|
if (not defined $charset) { |
428
|
0
|
|
|
|
|
0
|
die "Couldn't find MIME name for encoding $inputenc\n"; |
429
|
|
|
|
|
|
|
} |
430
|
0
|
|
|
|
|
0
|
$fntext .= qq! |
431
|
|
|
|
|
|
|
\$e = MIME::Words::encode_mimewords(\$e, Charset => '$charset'); |
432
|
|
|
|
|
|
|
\$e =~ s/\\?=( +)=\\?.*?\\?Q\\?/'_' x length \$1/egi; |
433
|
|
|
|
|
|
|
!; |
434
|
|
|
|
|
|
|
} elsif (not keys %$conv) { |
435
|
|
|
|
|
|
|
# do nothing; |
436
|
|
|
|
|
|
|
} elsif ($is_one_by_one) { |
437
|
1
|
|
|
|
|
10
|
my $src = join "", keys %$conv; |
438
|
1
|
|
|
|
|
6
|
$src = "\Q$src"; |
439
|
1
|
|
|
|
|
6
|
my $dst = join "", values %$conv; |
440
|
1
|
|
|
|
|
4
|
$dst = "\Q$dst"; |
441
|
1
|
|
|
|
|
4
|
$fntext .= qq! \$e =~ tr/$src/$dst/; !; |
442
|
|
|
|
|
|
|
} elsif ($inputenc eq 'tex') { |
443
|
2
|
|
|
|
|
7
|
my $src = $regexp_matches{'tex'}; |
444
|
2
|
|
|
|
|
8
|
$fntext .= qq! \$e =~ s/$src/ my \$e = \$&; my \$orig = \$e; \$e =~ s#[{}]# #sog; \$e =~ s#[ \\t]+# #sog; \$e =~ s# \$##o; (defined \$conv->{\$e} ? \$conv->{\$e} : \$orig); /esog; !; |
445
|
|
|
|
|
|
|
} elsif ($inputenc eq 'utf8') { |
446
|
1
|
|
|
|
|
4
|
$fntext .= qq! \$e =~ s/[\\x21-\\x7f]|[\\xc0-\\xdf].|[\\xe0-\\xef]..|[\\xf0-\\xf7]...|[\\xf8-\\xfb]....|[\\xfc\\xfd]...../defined \$conv->{\$&} ? \$conv->{\$&} : ( |
447
|
|
|
|
|
|
|
$use_fillstring ? \$fillstring : '') /esog; !; |
448
|
|
|
|
|
|
|
} else { |
449
|
4
|
|
|
|
|
39
|
my $singles = join "", grep { length $_ == 1 } keys %$conv; |
|
328
|
|
|
|
|
477
|
|
450
|
4
|
|
|
|
|
29
|
$singles = "[". "\Q$singles" . "]"; |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
my $src = join "|", |
453
|
0
|
|
|
|
|
0
|
( map { my $e = "\Q$_"; $e; } |
|
0
|
|
|
|
|
0
|
|
454
|
0
|
|
|
|
|
0
|
sort { length $b <=> length $a } |
455
|
4
|
|
|
|
|
37
|
grep { length $_ != 1 } keys %$conv); |
|
328
|
|
|
|
|
456
|
|
456
|
4
|
50
|
|
|
|
23
|
if ($singles ne "[]") { |
457
|
4
|
50
|
|
|
|
11
|
$src .= "|" unless $src eq ''; |
458
|
4
|
|
|
|
|
8
|
$src .= $singles; |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
4
|
|
|
|
|
12
|
$fntext .= qq! \$e =~ s/$src/\$conv->{\$&}/sog; !; |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
9
|
|
|
|
|
27
|
$fntext .= ' $e; } else { undef; }} @_; if (wantarray) { return @converted; } else { return join "", map { defined $_ ? $_ : "" } @converted; } }'; |
465
|
|
|
|
|
|
|
|
466
|
9
|
50
|
|
|
|
24
|
print STDERR "Conversion function for $inputenc to $outputenc:\n$fntext\n" if DEBUG; |
467
|
|
|
|
|
|
|
|
468
|
9
|
|
|
|
|
2597
|
my $fn = eval $fntext; |
469
|
9
|
50
|
|
|
|
36
|
do { chomp $@; |
|
0
|
|
|
|
|
0
|
|
470
|
0
|
|
|
|
|
0
|
die "Fatal error in Cz::Cstocs: $@, line ", __LINE__, "\n"; |
471
|
|
|
|
|
|
|
} if $@; |
472
|
9
|
|
|
|
|
23
|
bless $fn, $class; |
473
|
|
|
|
|
|
|
|
474
|
9
|
|
|
|
|
51
|
$functions{"${inputenc}_${outputenc}_${fillstring}_${use_fillstring}_${use_accent}_${one_by_one}"} = $fn; |
475
|
9
|
|
|
|
|
63
|
$fn; |
476
|
|
|
|
|
|
|
} |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
sub conv { |
479
|
12
|
|
|
12
|
0
|
623
|
my $self = shift; |
480
|
12
|
|
|
|
|
303
|
return &$self($_[0]); |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
sub available_enc { |
484
|
1
|
50
|
|
1
|
0
|
45
|
opendir DIR, $cstocsdir or warn "Error reading $cstocsdir\n"; |
485
|
1
|
|
|
|
|
52
|
my @list = sort map { s/\.enc$//; $_ } grep { /\.enc$/ } readdir DIR; |
|
15
|
|
|
|
|
30
|
|
|
15
|
|
|
|
|
38
|
|
|
19
|
|
|
|
|
42
|
|
486
|
1
|
|
|
|
|
19
|
closedir DIR; |
487
|
1
|
|
|
|
|
8
|
return @list; |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub diacritic_char { |
491
|
0
|
|
|
0
|
0
|
|
my ($encoding, $char) = @_; |
492
|
0
|
|
|
|
|
|
load_encoding($encoding); |
493
|
|
|
|
|
|
|
|
494
|
0
|
|
|
|
|
|
my @result = (); |
495
|
0
|
|
|
|
|
|
my $dia; |
496
|
0
|
|
|
|
|
|
for $dia (@diacritics) { |
497
|
0
|
|
|
|
|
|
my $name = $char . $dia; |
498
|
|
|
|
|
|
|
push @result, $output_hashes{$encoding}{$name} |
499
|
0
|
0
|
|
|
|
|
if defined $output_hashes{$encoding}{$name}; |
500
|
|
|
|
|
|
|
} |
501
|
0
|
|
|
|
|
|
@result; |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
1; |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=head1 SYNOPSIS |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
use Cz::Cstocs; |
509
|
|
|
|
|
|
|
my $il2_to_ascii = new Cz::Cstocs 'il2', 'ascii'; |
510
|
|
|
|
|
|
|
while (<>) { |
511
|
|
|
|
|
|
|
print &$il2_to_ascii($_); |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
use Cz::Cstocs 'il2_ascii'; |
515
|
|
|
|
|
|
|
while (<>) { |
516
|
|
|
|
|
|
|
print il2_ascii($_); |
517
|
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
use Cz::Cstocs; |
520
|
|
|
|
|
|
|
sub il2toascii; |
521
|
|
|
|
|
|
|
# inform the parser that there is a function il2toascii |
522
|
|
|
|
|
|
|
*il2toascii = new Cz::Cstocs 'il2', 'ascii'; |
523
|
|
|
|
|
|
|
# now define the function |
524
|
|
|
|
|
|
|
print il2toascii $data; |
525
|
|
|
|
|
|
|
# thanks to Jan Krynicky for poining this out |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
=head1 DESCRIPTION |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
This module helps in converting texts between various charset |
530
|
|
|
|
|
|
|
encodings, used for Czech and Slovak languages. The instance of the |
531
|
|
|
|
|
|
|
object B is created using method B. It takes at |
532
|
|
|
|
|
|
|
least two parameters for input and output encoding and can be |
533
|
|
|
|
|
|
|
afterwards used as a function reference to convert strings/lists. |
534
|
|
|
|
|
|
|
Cz::Cstocs supports fairly free form of aliases, so iso8859-2, |
535
|
|
|
|
|
|
|
ISO-8859-2, iso88592 and il2 are all aliases of the same encoding. |
536
|
|
|
|
|
|
|
For backward compatibility, method I is supported as well, |
537
|
|
|
|
|
|
|
so the example above could also read |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
while (<>) { |
540
|
|
|
|
|
|
|
print $il2_to_ascii->conv($_); |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
You can also use typeglob syntax. |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
The conversion function takes a list and returns list of converted |
546
|
|
|
|
|
|
|
strings (in the list context) or one string consisting of concatenated |
547
|
|
|
|
|
|
|
results (in the scalar context). |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
You can modify the behaviour of the conversion function by specifying |
550
|
|
|
|
|
|
|
hash of other options after the encoding names in call to B. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=over 4 |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=item fillstring |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
Gives alternate string that will replace characters from input |
557
|
|
|
|
|
|
|
encoding that are not present in the output encoding. Default is |
558
|
|
|
|
|
|
|
space. |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=item use_accent |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Defines whether the accent file should be used. Default is 1 (true). |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=item nofillstring |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
When 1 (true), will keep characters that do not have friends in |
567
|
|
|
|
|
|
|
accent nor output encoding, will no replace them with fillstring. |
568
|
|
|
|
|
|
|
Default is 0 except for tex, because you probably rather want to keep |
569
|
|
|
|
|
|
|
backslashed symbols than loose them. |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=item cstocsdir |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
Alternate location for encoding and accent files. The default is the |
574
|
|
|
|
|
|
|
F directory in Perl library tree. This location can |
575
|
|
|
|
|
|
|
also be changed with the I environment variable. |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=back |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
There is an alternate way to define the conversion function: any |
580
|
|
|
|
|
|
|
arguments after use Cz::Cstocs that have form encoding_encoding or |
581
|
|
|
|
|
|
|
encoding_to_encoding are processed and the appropriate functions are |
582
|
|
|
|
|
|
|
imported. So, |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
use Cz::Cstocs qw(pc2_to_il2 il2_ascii); |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
define two functions, that are loaded into caller's namespace and |
587
|
|
|
|
|
|
|
can be used directly. In this case, you cannot specify additional |
588
|
|
|
|
|
|
|
options, you only have default behaviour. |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=head1 ERROR HANDLING |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
If you request an unknown encoding in the call to new Cz::Cstocs, |
593
|
|
|
|
|
|
|
the conversion object is not defined and the variable |
594
|
|
|
|
|
|
|
$Cz::Cstocs::errstr is set to the error message. When you specify |
595
|
|
|
|
|
|
|
unknown encoding in the use call style (like C |
596
|
|
|
|
|
|
|
'il2_ascii';>), the die is called. |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
=head1 AUTHOR |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
Jan Pazdziora created the module version. |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
Jan "Yenya" Kasprzak has done the original Un*x implementation. |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
=head1 VERSION |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
3.43 |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
=head1 SEE ALSO |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
cstocs(1), perl(1), or Xcstocs at |
611
|
|
|
|
|
|
|
http://www.lut.fi/~kurz/programs/xcstocs.tar.gz. |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=cut |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
|