line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!perl |
2
|
|
|
|
|
|
|
# "Translit.pm" by Genji Schmeder 4 October 1996 |
3
|
|
|
|
|
|
|
# creates transliteration map between character sets defined in IETF RFC 1345 |
4
|
|
|
|
|
|
|
# with acknowledgements to Chris Leach, author of "EBCDIC.pm" |
5
|
|
|
|
|
|
|
# and to Keld Simonsen, author of RFC 1345 |
6
|
|
|
|
|
|
|
# Copyright (c) 1997 Genji Schmeder. All rights reserved. |
7
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
8
|
|
|
|
|
|
|
# it under the same terms as Perl itself. |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
package Convert::Translit; |
11
|
1
|
|
|
1
|
|
765
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
46
|
|
12
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT_OK); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
130
|
|
13
|
|
|
|
|
|
|
require Exporter; |
14
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
15
|
|
|
|
|
|
|
@EXPORT_OK = qw(transliterate build_substitutes); |
16
|
|
|
|
|
|
|
$VERSION = '1.03'; # dated 5 November 1997 |
17
|
1
|
|
|
1
|
|
925
|
use integer; |
|
1
|
|
|
|
|
14
|
|
|
1
|
|
|
|
|
6
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
if ($] < 5) {die "Perl version must be at least 5.\n";} |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my $here = where_module(); |
22
|
|
|
|
|
|
|
my $rfc_fnam = $here."rfc1345"; |
23
|
|
|
|
|
|
|
my $substi_fnam = $here."substitutes"; |
24
|
|
|
|
|
|
|
undef &where_module; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my %nam_mne; # hash of name keyed by mnemonic |
27
|
|
|
|
|
|
|
my %mne_nam; # hash of mnemonic keyed by name |
28
|
|
|
|
|
|
|
my %aprox_mne; # hash of substitute list keyed by mnemonic |
29
|
|
|
|
|
|
|
# if new() never called, transliterate() returns unchanged arg |
30
|
|
|
|
|
|
|
my @transform = (0 .. 255); |
31
|
|
|
|
|
|
|
my $verbose = 0; |
32
|
|
|
|
|
|
|
1; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub where_module { |
36
|
|
|
|
|
|
|
my @xx = caller; |
37
|
1
|
|
|
1
|
|
1458
|
use File::Basename; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
6696
|
|
38
|
|
|
|
|
|
|
my $yy = dirname($xx[1]); |
39
|
|
|
|
|
|
|
# adjust for Macintosh and Unix different return from dirname() |
40
|
|
|
|
|
|
|
if ( ($^O ne "MacOS") && ($yy =~ /[^\/]$/) ) { |
41
|
|
|
|
|
|
|
$yy .= "/"; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
return $yy; |
44
|
|
|
|
|
|
|
} # end of sub where_module |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub new { |
48
|
1
|
|
|
1
|
1
|
59
|
my ($bb, $cc, $dd, $ee, $gg, $ii, $jj, $kk, $line); |
49
|
0
|
|
|
|
|
0
|
my ($mm, $pp, $qq, $uu, $vv, $ww, $xx, $yy, $zz); |
50
|
0
|
|
|
|
|
0
|
my (@ch_tab, @thistab, @bitmap, @ff, @tt, @pp, @oo); |
51
|
0
|
|
|
|
|
0
|
my (%duplcatfrom, %duplcatto); |
52
|
1
|
|
|
|
|
3
|
my $this = shift; |
53
|
1
|
|
33
|
|
|
9
|
my $class = ref($this) || $this; |
54
|
|
|
|
|
|
|
|
55
|
1
|
|
|
|
|
3
|
%nam_mne = (); |
56
|
1
|
|
|
|
|
2
|
%mne_nam = (); |
57
|
1
|
|
|
|
|
2
|
%aprox_mne = (); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# syntax: new([FROM charset], TO charset, [verbose]) |
60
|
1
|
|
|
|
|
3
|
my @chset; |
61
|
1
|
50
|
|
|
|
5
|
$chset[0] = ($#_ > 0) ? shift : "ascii"; # ascii is default FROM charset |
62
|
1
|
|
|
|
|
3
|
$chset[1] = shift; |
63
|
1
|
|
|
|
|
2
|
$verbose = shift; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# typical rfc1345 table header: |
66
|
|
|
|
|
|
|
# &charset CSA_Z243.4-1985-1 |
67
|
|
|
|
|
|
|
# &rem source: ECMA registry |
68
|
|
|
|
|
|
|
# &alias iso-ir-121 |
69
|
|
|
|
|
|
|
# &g0esc x2877 &g1esc x2977 &g2esc x2a77 &g3esc x2b77 |
70
|
|
|
|
|
|
|
# &alias ISO646-CA |
71
|
|
|
|
|
|
|
# &alias csa7-1 |
72
|
|
|
|
|
|
|
# &alias ca |
73
|
|
|
|
|
|
|
# &code 0 |
74
|
|
|
|
|
|
|
# NU SH SX EX ET EQ AK BL BS HT LF VT FF CR SO SI |
75
|
|
|
|
|
|
|
|
76
|
1
|
50
|
|
|
|
5
|
load_dsf( $substi_fnam) or return undef; |
77
|
1
|
50
|
|
|
|
5
|
if ($verbose) {print "Creating transliteration from \"$chset[0]\" to \"$chset[1]\"\n";} |
|
0
|
|
|
|
|
0
|
|
78
|
1
|
|
|
|
|
7
|
for $ii ( 0 .. 1 ){ |
79
|
|
|
|
|
|
|
#traverse whole file twice for simplicity |
80
|
1
|
50
|
|
|
|
109
|
unless (open(RFC, $rfc_fnam)) {print "Can't open $rfc_fnam: $!.\n"; return undef;} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
81
|
1
|
|
|
|
|
41
|
while() { #skip to relevant section |
82
|
2709
|
100
|
|
|
|
8809
|
if (/5.\s+CHARSET TABLES/) { |
83
|
1
|
|
|
|
|
4
|
last; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
} |
86
|
1
|
|
|
|
|
4
|
@thistab = (); |
87
|
1
|
|
|
|
|
8
|
while() { # read table lines |
88
|
10
|
|
|
|
|
18
|
$line = $_; |
89
|
10
|
|
|
|
|
12
|
chomp $line; |
90
|
10
|
50
|
|
|
|
45
|
if ($line =~ /^ACKNOWLEDGEMENTS/) { # past relevant section |
91
|
0
|
0
|
|
|
|
0
|
if (@thistab) { # was previous table collected? |
92
|
0
|
0
|
|
|
|
0
|
if (relvnt_tab($chset[$ii], \@thistab) ) { |
93
|
0
|
|
|
|
|
0
|
$ch_tab[$ii] = [ @thistab ]; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
} |
96
|
0
|
|
|
|
|
0
|
last; |
97
|
|
|
|
|
|
|
} |
98
|
10
|
100
|
|
|
|
29
|
if ($line !~ /^ [^\s]/){ # distinguish table lines by 2 leading spaces |
99
|
2
|
|
|
|
|
6
|
next; |
100
|
|
|
|
|
|
|
} |
101
|
8
|
|
|
|
|
24
|
$line =~ s/^\s*//; # trim leading spaces |
102
|
|
|
|
|
|
|
# be sure of ending a table by encountering the next |
103
|
8
|
100
|
|
|
|
20
|
if ($line !~ /^&charset\s/) { |
104
|
6
|
|
|
|
|
11
|
push @thistab, $line; |
105
|
6
|
|
|
|
|
13
|
next; |
106
|
|
|
|
|
|
|
} |
107
|
2
|
100
|
|
|
|
6
|
if (@thistab) { # was previous table collected? |
108
|
1
|
0
|
|
|
|
8
|
if (relvnt_tab($chset[$ii], \@thistab)) { |
109
|
0
|
|
|
|
|
0
|
$ch_tab[$ii] = [ @thistab ]; |
110
|
0
|
|
|
|
|
0
|
last; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
} |
113
|
1
|
|
|
|
|
20
|
@thistab = ($line); # start another table collection |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
} |
116
|
0
|
|
|
|
|
0
|
close RFC; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
#examine charsets |
119
|
0
|
|
|
|
|
0
|
$qq = 0; |
120
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
121
|
0
|
0
|
|
|
|
0
|
if(! $ch_tab[$ii][0]){ |
122
|
0
|
|
|
|
|
0
|
print STDERR "Couldn't find a character set for \"$chset[$ii]\".\n"; |
123
|
0
|
|
|
|
|
0
|
$qq = 1; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
} |
126
|
0
|
0
|
|
|
|
0
|
if ($qq) {return undef;} |
|
0
|
|
|
|
|
0
|
|
127
|
|
|
|
|
|
|
|
128
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
129
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#{$ch_tab[$ii]} ) { |
|
0
|
|
|
|
|
0
|
|
130
|
0
|
0
|
|
|
|
0
|
if($ch_tab[$ii][$jj] =~ /^&bits\b\s*(\d*)/){ |
131
|
0
|
0
|
0
|
|
|
0
|
if ($1 != 0 || $1 != 8) { |
132
|
0
|
|
|
|
|
0
|
print STDERR "Can't handle $1\-bit charsets like $chset[$ii].\n"; |
133
|
0
|
|
|
|
|
0
|
$qq = 1; |
134
|
0
|
|
|
|
|
0
|
last; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
0
|
0
|
|
|
|
0
|
if ($qq) {return undef;} |
|
0
|
|
|
|
|
0
|
|
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
142
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#{$ch_tab[$ii]} ) { |
|
0
|
|
|
|
|
0
|
|
143
|
0
|
0
|
|
|
|
0
|
if($ch_tab[$ii][$jj] =~ /^&(code2|codex|comb2)\b/){ |
144
|
0
|
|
|
|
|
0
|
print STDERR "Can't handle $1 terms in charset $chset[$ii].\n"; |
145
|
0
|
|
|
|
|
0
|
$qq = 1; |
146
|
0
|
|
|
|
|
0
|
last; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
0
|
0
|
|
|
|
0
|
if ($qq) {return undef;} |
|
0
|
|
|
|
|
0
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
#create bit maps |
153
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
154
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nCharacter set $chset[$ii]:\n";} |
|
0
|
|
|
|
|
0
|
|
155
|
0
|
|
|
|
|
0
|
$dd = -1; # code offset unless negative |
156
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#{$ch_tab[$ii]} ) { |
|
0
|
|
|
|
|
0
|
|
157
|
0
|
|
|
|
|
0
|
$line = $ch_tab[$ii][$jj]; |
158
|
0
|
0
|
|
|
|
0
|
if($line =~ /^&code\b\s*(\d)/){ |
159
|
0
|
|
|
|
|
0
|
$dd = $1; |
160
|
0
|
|
|
|
|
0
|
next; |
161
|
|
|
|
|
|
|
} |
162
|
0
|
0
|
|
|
|
0
|
if($line =~ /^&duplicate\s+([\da-fA-F]+)\s+([^\s]+)/){ |
163
|
0
|
|
|
|
|
0
|
$pp = $1; $mm = $2; # position already in base 10 |
|
0
|
|
|
|
|
0
|
|
164
|
|
|
|
|
|
|
# examples: "duplicate 91 AE", "duplicate 92 O/" (position, dup mnemonic) |
165
|
|
|
|
|
|
|
# organize duplicate keeping differently for FROM and TO charsets |
166
|
0
|
0
|
|
|
|
0
|
if ($ii == 0) { |
167
|
0
|
0
|
|
|
|
0
|
if ($duplcatfrom{$pp} ) { |
168
|
|
|
|
|
|
|
# FROM keyed by position |
169
|
0
|
|
|
|
|
0
|
push @{$duplcatfrom{$pp}{Mnemonics}}, "$mm"; |
|
0
|
|
|
|
|
0
|
|
170
|
|
|
|
|
|
|
} else { |
171
|
0
|
|
|
|
|
0
|
$duplcatfrom{$pp} = {Position => $pp, Mnemonics => [ "$mm" ],}; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
#print "FROM Duplicate for position $pp: $duplcatfrom{$pp}{Mnemonics}[-1]\n"; |
174
|
|
|
|
|
|
|
} else { |
175
|
0
|
|
|
|
|
0
|
$duplcatto{$mm} = $pp; # TO keyed by mnemonic |
176
|
|
|
|
|
|
|
#print "TO Duplicate for position $duplcatto{$mm}: $mm\n"; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
# flush control info (just use all defined keywords) |
180
|
0
|
0
|
|
|
|
0
|
if($line =~ |
181
|
|
|
|
|
|
|
/^&(charset|alias|g\desc|bits|code|code2|codex|duplicate|rem|comb2)\b/) { |
182
|
0
|
|
|
|
|
0
|
$dd = -1; |
183
|
0
|
|
|
|
|
0
|
next; |
184
|
|
|
|
|
|
|
} |
185
|
0
|
0
|
|
|
|
0
|
if ($dd < 0) { |
186
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
0
|
|
|
|
|
0
|
|
187
|
|
|
|
|
|
|
{print "Strange! You should examine the charset table: \"$line\"\n";} |
188
|
0
|
|
|
|
|
0
|
next; |
189
|
|
|
|
|
|
|
} |
190
|
0
|
|
|
|
|
0
|
@pp = split(/\s+/, $line); |
191
|
0
|
|
|
|
|
0
|
foreach $uu (@pp) { |
192
|
|
|
|
|
|
|
# mnemonic "??" means position unused. |
193
|
|
|
|
|
|
|
# mnemonic "__" means character set not completely defined here |
194
|
0
|
0
|
0
|
|
|
0
|
if ( $uu eq "??" || $uu eq "__" ) { |
195
|
0
|
|
|
|
|
0
|
$uu = ""; |
196
|
|
|
|
|
|
|
} |
197
|
0
|
0
|
0
|
|
|
0
|
if (($uu) && (! $nam_mne{$uu})) { |
198
|
0
|
|
|
|
|
0
|
printf "Warning: position %x (hex) has invalid mnemonic \"%s\".\n", |
199
|
|
|
|
|
|
|
$dd, $uu; |
200
|
0
|
|
|
|
|
0
|
$uu = ""; |
201
|
|
|
|
|
|
|
} |
202
|
0
|
|
|
|
|
0
|
$bitmap[$ii][$dd++] = $uu; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
# print ($#pp, $dd, (1 + $#{$bitmap[$ii]}), "\n"); |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
# normalize char set length |
207
|
0
|
|
|
|
|
0
|
$#{$bitmap[$ii]} += ((128 - ( (1 + $#{$bitmap[$ii]}) % 128)) % 128); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
208
|
|
|
|
|
|
|
# print ("highest index is", $#{$bitmap[$ii]}, "\n" ); |
209
|
0
|
|
|
|
|
0
|
for $kk ( 0 .. $#{$bitmap[$ii]} ) { |
|
0
|
|
|
|
|
0
|
|
210
|
0
|
0
|
|
|
|
0
|
if ($verbose) { print (length($bitmap[$ii][$kk])?"x":"_");} |
|
0
|
0
|
|
|
|
0
|
|
211
|
0
|
0
|
|
|
|
0
|
if ( ! (($kk +1) % 16)) { if ($verbose) { print "\n";}} |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# explanation from RFC 1345: |
216
|
|
|
|
|
|
|
# "&duplicate" has a special meaning indicating that a position |
217
|
|
|
|
|
|
|
# is being used for more than one character. This is an ugly |
218
|
|
|
|
|
|
|
# convention but it is a sad fact of life that same code in one |
219
|
|
|
|
|
|
|
# coded character set can mean different characters. |
220
|
|
|
|
|
|
|
# "&duplicate" takes two parameters - the first is the code to |
221
|
|
|
|
|
|
|
# be duplicated, the other is the new mnemonic. |
222
|
|
|
|
|
|
|
# &duplicate 91 AE |
223
|
|
|
|
|
|
|
# &duplicate 92 O/ |
224
|
|
|
|
|
|
|
|
225
|
0
|
|
|
|
|
0
|
$#transform = $#{$bitmap[0]}; # create transliterator table |
|
0
|
|
|
|
|
0
|
|
226
|
0
|
|
|
|
|
0
|
PIERWSZY: for $jj ( 0 .. $#{$bitmap[0]} ) { |
|
0
|
|
|
|
|
0
|
|
227
|
0
|
|
|
|
|
0
|
$transform[$jj] = -1; # initialize since 0 is valid value |
228
|
0
|
|
|
|
|
0
|
@tt = ( $bitmap[0][$jj] ); # start list with one element |
229
|
0
|
0
|
|
|
|
0
|
if ($duplcatfrom{$jj}) { # add any duplicates in the FROM charset |
230
|
0
|
|
|
|
|
0
|
push @tt, @{$duplcatfrom{$jj}{Mnemonics}}; |
|
0
|
|
|
|
|
0
|
|
231
|
|
|
|
|
|
|
#print "BB $jj $#tt <<< @tt >>>\n"; |
232
|
|
|
|
|
|
|
} |
233
|
0
|
|
|
|
|
0
|
@oo = (); |
234
|
0
|
|
|
|
|
0
|
for $xx ( 0 .. $#tt ) { # refine list |
235
|
0
|
0
|
|
|
|
0
|
if (length($tt[$xx]) > 0) { |
236
|
0
|
|
|
|
|
0
|
push @oo, $tt[$xx]; |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
} |
239
|
0
|
0
|
|
|
|
0
|
if (! @oo) { |
240
|
0
|
|
|
|
|
0
|
next; |
241
|
|
|
|
|
|
|
} |
242
|
0
|
|
|
|
|
0
|
for $bb ( 0 .. $#oo ) { # match any mnemonics for this position in FROM charset |
243
|
0
|
|
|
|
|
0
|
$uu = $oo[$bb]; |
244
|
0
|
|
|
|
|
0
|
for $kk ( 0 .. $#{$bitmap[1]} ) { #search TO charset |
|
0
|
|
|
|
|
0
|
|
245
|
0
|
0
|
|
|
|
0
|
if ($uu eq $bitmap[1][$kk]) { |
246
|
0
|
|
|
|
|
0
|
$transform[$jj] = $kk; |
247
|
0
|
|
|
|
|
0
|
next PIERWSZY; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
} |
250
|
0
|
0
|
|
|
|
0
|
if (exists $duplcatto{$uu}) { # search TO charset duplicates |
251
|
0
|
|
|
|
|
0
|
$transform[$jj] = $duplcatto{$uu}; |
252
|
0
|
|
|
|
|
0
|
next PIERWSZY; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
0
|
0
|
|
|
|
0
|
if ($verbose) { print "\nTransliterator:\n";} |
|
0
|
|
|
|
|
0
|
|
258
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { |
259
|
0
|
0
|
|
|
|
0
|
if ($verbose) { print (($transform[$jj]>=0)?"x":"_");} |
|
0
|
0
|
|
|
|
0
|
|
260
|
0
|
0
|
|
|
|
0
|
if ( ! (($jj+1) % 16)) { if ($verbose) { print "\n";}} |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
0
|
|
|
|
|
0
|
$vv = $ww = 0; |
264
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # try to approximate using substitute lists |
265
|
0
|
0
|
|
|
|
0
|
if ($transform[$jj] >= 0) { |
266
|
0
|
|
|
|
|
0
|
next; |
267
|
|
|
|
|
|
|
} |
268
|
0
|
0
|
|
|
|
0
|
if (! ($gg = $bitmap[0][$jj]) ) { # undefined |
269
|
0
|
|
|
|
|
0
|
next; |
270
|
|
|
|
|
|
|
} |
271
|
0
|
|
|
|
|
0
|
TRZECI: foreach $pp (@{$aprox_mne{$gg}}) { |
|
0
|
|
|
|
|
0
|
|
272
|
0
|
|
|
|
|
0
|
for $cc ( 0 .. $#{$bitmap[1]} ) { |
|
0
|
|
|
|
|
0
|
|
273
|
0
|
|
|
|
|
0
|
$zz = $bitmap[1][$cc]; |
274
|
0
|
0
|
|
|
|
0
|
if ($zz eq $pp) { # substitute must be in TO char set |
275
|
0
|
|
|
|
|
0
|
$transform[$jj] = $cc; # success |
276
|
0
|
0
|
|
|
|
0
|
if (!$ww) { |
277
|
0
|
|
|
|
|
0
|
$ww = 1; |
278
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nApproximate substitutes:\n";} |
|
0
|
|
|
|
|
0
|
|
279
|
|
|
|
|
|
|
} |
280
|
0
|
0
|
|
|
|
0
|
if ($verbose) { printf "%X==>%X\t%s==>%s\n", $jj, $cc, |
|
0
|
|
|
|
|
0
|
|
281
|
|
|
|
|
|
|
$nam_mne{ $gg}, $nam_mne{ $zz};} |
282
|
0
|
|
|
|
|
0
|
last TRZECI; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
} |
286
|
0
|
|
|
|
|
0
|
$vv = 1; # defined character but no substitute |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
0
|
0
|
|
|
|
0
|
if ($ww) { |
290
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nTransliterator with aproximate substitutions:\n";} |
|
0
|
|
|
|
|
0
|
|
291
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { |
292
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print (($transform[$jj]>=0)?"x":"_");} |
|
0
|
0
|
|
|
|
0
|
|
293
|
0
|
0
|
|
|
|
0
|
if ( ! (($jj+1) % 16)) { if ($verbose) {print "\n";}} |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# non-equivalent substitutions (like "RIGHT SQUARE BRACKET" for "CENT SIGN") |
298
|
0
|
0
|
|
|
|
0
|
if ($vv) { |
299
|
0
|
|
|
|
|
0
|
@ff = (); |
300
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # list those in need in FROM charset |
301
|
0
|
0
|
0
|
|
|
0
|
if (($transform[$jj] < 0) && $bitmap[0][$jj] ) { |
302
|
0
|
|
|
|
|
0
|
push @ff, $jj; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
} |
305
|
0
|
0
|
|
|
|
0
|
if (@ff) { |
306
|
0
|
|
|
|
|
0
|
@tt = (); |
307
|
0
|
|
|
|
|
0
|
PIATY: for $bb ( 0 .. $#{$bitmap[1]} ) { # list availables in TO charset |
|
0
|
|
|
|
|
0
|
|
308
|
|
|
|
|
|
|
# start at usual second ascii graphic (hoping to maximize graphics) |
309
|
0
|
|
|
|
|
0
|
$cc = (34 + $bb) % (1 + $#{$bitmap[1]} ); |
|
0
|
|
|
|
|
0
|
|
310
|
0
|
0
|
|
|
|
0
|
if ($zz = $bitmap[1][$cc]) { |
311
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. $#{$bitmap[0]} ) { |
|
0
|
|
|
|
|
0
|
|
312
|
0
|
0
|
|
|
|
0
|
if ($zz eq $bitmap[0][$ii]) { |
313
|
0
|
|
|
|
|
0
|
next PIATY; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
} |
316
|
0
|
|
|
|
|
0
|
push @tt, $cc; |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
} |
320
|
0
|
|
|
|
|
0
|
$gg = $ee = 0; |
321
|
0
|
|
0
|
|
|
0
|
SIODMY: while ( ($#ff >= 0) && ($#tt >= 0) ) { |
322
|
0
|
|
|
|
|
0
|
for $jj ($gg .. $#ff) { # first try to match equal hex values |
323
|
0
|
|
|
|
|
0
|
$ww = $ff[$jj]; |
324
|
0
|
|
|
|
|
0
|
for $kk (0 .. $#tt) { |
325
|
0
|
0
|
|
|
|
0
|
if ( $ww == $tt[$kk]) { |
326
|
0
|
|
|
|
|
0
|
$transform[ $ww] = $ww; |
327
|
0
|
0
|
|
|
|
0
|
if (!$ee) { |
328
|
0
|
|
|
|
|
0
|
$ee = 1; |
329
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nNon-equivalent substitutes:\n";} |
|
0
|
|
|
|
|
0
|
|
330
|
|
|
|
|
|
|
} |
331
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
0
|
|
|
|
|
0
|
|
332
|
|
|
|
|
|
|
{printf "%X==>%X\t%s==>%s\n", $ww, $ww, |
333
|
|
|
|
|
|
|
$nam_mne{$bitmap[0][ $ww]}, $nam_mne{$bitmap[1][ $ww]};} |
334
|
0
|
|
|
|
|
0
|
splice @ff, $jj, 1; |
335
|
0
|
|
|
|
|
0
|
splice @tt, $kk, 1; |
336
|
0
|
|
|
|
|
0
|
$gg = $jj; |
337
|
0
|
|
|
|
|
0
|
next SIODMY; |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
} |
341
|
0
|
|
|
|
|
0
|
last; |
342
|
|
|
|
|
|
|
} |
343
|
0
|
0
|
|
|
|
0
|
$gg = ($#ff < $#tt)? $#ff : $#tt; |
344
|
0
|
|
|
|
|
0
|
for $jj (0 .. $gg ) { |
345
|
0
|
|
|
|
|
0
|
$transform[$ff[$jj]] = $tt[$jj]; |
346
|
0
|
0
|
|
|
|
0
|
if (!$ee) { |
347
|
0
|
|
|
|
|
0
|
$ee = 1; |
348
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nNon-equivalent substitutes:\n";} |
|
0
|
|
|
|
|
0
|
|
349
|
|
|
|
|
|
|
} |
350
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
0
|
|
|
|
|
0
|
|
351
|
|
|
|
|
|
|
{printf "%X==>%X\t%s==>%s\n", $ff[$jj], $tt[$jj], |
352
|
|
|
|
|
|
|
$nam_mne{$bitmap[0][$ff[$jj]]}, $nam_mne{$bitmap[1][$tt[$jj]]};} |
353
|
|
|
|
|
|
|
} |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
0
|
$ee = $yy = 0; # anything left untranslated? |
357
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { |
358
|
0
|
0
|
|
|
|
0
|
if ($transform[$jj] < 0) { |
359
|
0
|
|
|
|
|
0
|
$yy = 1; |
360
|
0
|
0
|
|
|
|
0
|
if ($mm = $bitmap[0][$jj]) { |
361
|
0
|
0
|
|
|
|
0
|
if (!$ee) { |
362
|
0
|
|
|
|
|
0
|
$ee = 1; |
363
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nNon-equivalent remnant:\n";} |
|
0
|
|
|
|
|
0
|
|
364
|
|
|
|
|
|
|
} |
365
|
0
|
0
|
|
|
|
0
|
if ($verbose) {printf "%X\t%s\n", $jj, $nam_mne{$mm};} |
|
0
|
|
|
|
|
0
|
|
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# if non-equivalent remnant, then select a hopefully unique and graphic indicator |
371
|
0
|
0
|
|
|
|
0
|
if ($ee) { |
372
|
0
|
|
|
|
|
0
|
DRUGI: for $bb ( 0 .. $#{$bitmap[1]} ) { |
|
0
|
|
|
|
|
0
|
|
373
|
0
|
|
|
|
|
0
|
$cc = (34 + $bb) % (1 + $#{$bitmap[1]} ); |
|
0
|
|
|
|
|
0
|
|
374
|
0
|
0
|
|
|
|
0
|
if ($xx = $bitmap[1][$cc]) { |
375
|
0
|
|
|
|
|
0
|
for $dd ( 0 .. $#{$bitmap[0]} ) { |
|
0
|
|
|
|
|
0
|
|
376
|
0
|
0
|
|
|
|
0
|
if ( $xx eq $bitmap[0][$dd]) { # reject if in FROM charset |
377
|
0
|
|
|
|
|
0
|
next DRUGI; |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
} |
380
|
0
|
|
|
|
|
0
|
last; # success |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
} |
383
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
0
|
|
|
|
|
0
|
|
384
|
|
|
|
|
|
|
{printf "\nNon-equivalence indicator: %X\t%s\n", $cc, $nam_mne{$xx}}; |
385
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # interpolate non-equiv character |
386
|
0
|
0
|
0
|
|
|
0
|
if (($transform[$jj] < 0) && $bitmap[0][$jj]) { |
387
|
0
|
|
|
|
|
0
|
$transform[$jj] = $cc; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# if undefined remnant, then select a possibly undefined indicator |
393
|
0
|
0
|
|
|
|
0
|
if ($yy) { |
394
|
0
|
|
|
|
|
0
|
SZOSTY: for ($cc = $#{$bitmap[1]}; $cc >= 0; --$cc ) { |
|
0
|
|
|
|
|
0
|
|
395
|
0
|
0
|
|
|
|
0
|
if (! ($xx = $bitmap[1][$cc]) ) { |
396
|
0
|
|
|
|
|
0
|
last; |
397
|
|
|
|
|
|
|
} |
398
|
0
|
|
|
|
|
0
|
for $dd ( 0 .. $#{$bitmap[0]} ) { |
|
0
|
|
|
|
|
0
|
|
399
|
0
|
0
|
|
|
|
0
|
if ($xx eq $bitmap[0][$dd] ) { |
400
|
0
|
|
|
|
|
0
|
next SZOSTY; |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
} |
403
|
0
|
|
|
|
|
0
|
last; # success |
404
|
|
|
|
|
|
|
} |
405
|
0
|
0
|
|
|
|
0
|
if ( ! ($yy = $nam_mne{$xx}) ) {$yy = "(undefined character)";} |
|
0
|
|
|
|
|
0
|
|
406
|
0
|
0
|
|
|
|
0
|
if ($verbose) {printf "\nUndefined indicator: %X\t%s\n", $cc, $yy}; |
|
0
|
|
|
|
|
0
|
|
407
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # interpolate non-equiv character |
408
|
0
|
0
|
|
|
|
0
|
if ($transform[$jj] < 0) { |
409
|
0
|
|
|
|
|
0
|
$transform[$jj] = $cc; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# if FROM charset length < 256, assume repeated for upper 128 chars |
415
|
0
|
0
|
|
|
|
0
|
if ($#transform < 128) {push @transform, @transform;} |
|
0
|
|
|
|
|
0
|
|
416
|
0
|
|
|
|
|
0
|
$yy = "$chset[0]".".to."."$chset[1]"; |
417
|
0
|
|
|
|
|
0
|
my $self = {TRN_NAM=>"$yy", TRN_ARY=>[@transform]}; |
418
|
0
|
|
|
|
|
0
|
bless $self, $class; |
419
|
0
|
|
|
|
|
0
|
return $self; |
420
|
|
|
|
|
|
|
} # end of sub new |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub relvnt_tab { # true if this is the sought char table |
424
|
1
|
|
|
1
|
0
|
2
|
my ($jj, $uu, $ww, $xx, $yy, $chch, $tabref); |
425
|
1
|
|
|
|
|
3
|
$chch = shift; # $chset[0 or 1] |
426
|
1
|
|
|
|
|
2
|
$tabref = shift; # reference to @thistab |
427
|
1
|
|
|
|
|
3
|
for $jj ( 0 .. $#{@$tabref} ){ |
|
1
|
|
|
|
|
185
|
|
428
|
0
|
|
|
|
|
0
|
$ww = $$tabref[$jj]; |
429
|
0
|
0
|
|
|
|
0
|
if ($ww !~ /^&(charset|alias)\s+([^\s]*)/){ |
430
|
0
|
|
|
|
|
0
|
next; |
431
|
|
|
|
|
|
|
} |
432
|
0
|
|
|
|
|
0
|
$xx = $1; $yy = $2; |
|
0
|
|
|
|
|
0
|
|
433
|
0
|
0
|
|
|
|
0
|
if ($chch =~ /^$yy$/i) { |
434
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "Found $xx $yy";} |
|
0
|
|
|
|
|
0
|
|
435
|
0
|
0
|
|
|
|
0
|
if ($xx eq "alias") { |
436
|
0
|
|
|
|
|
0
|
$uu = $$tabref[0]; |
437
|
0
|
|
|
|
|
0
|
$uu =~ s/^&//; |
438
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print " for $uu";} |
|
0
|
|
|
|
|
0
|
|
439
|
|
|
|
|
|
|
} |
440
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\n";} |
|
0
|
|
|
|
|
0
|
|
441
|
0
|
|
|
|
|
0
|
return 1; # true |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
} |
444
|
0
|
|
|
|
|
0
|
return undef; # false |
445
|
|
|
|
|
|
|
} # end of sub relvnt_tab |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub load_dsf { # load code definitions and approximate substitutes |
449
|
1
|
|
|
1
|
0
|
2
|
my ($mm, $nn, @ww); |
450
|
1
|
50
|
|
|
|
91
|
unless (open(DSF, "$_[0]")) {print "Can't open $_[0]: $!.\n"; return undef;} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
451
|
1
|
|
|
|
|
56
|
while () { |
452
|
1
|
|
|
|
|
4
|
chomp; |
453
|
1
|
50
|
|
|
|
8
|
if (/^hash mnemonic=name/) { # first group header |
454
|
1
|
|
|
|
|
139
|
last; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
} |
457
|
1
|
|
|
|
|
16
|
while () { |
458
|
1895
|
|
|
|
|
2358
|
chomp; |
459
|
1895
|
100
|
|
|
|
4165
|
if (/^hash mnemonic=substitute list/) { # next group header |
460
|
1
|
|
|
|
|
6
|
last; |
461
|
|
|
|
|
|
|
} |
462
|
1894
|
|
|
|
|
5980
|
($mm, $nn) = split(/\t/); |
463
|
1894
|
|
|
|
|
11970
|
$nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic |
464
|
1894
|
|
|
|
|
7604
|
$mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name |
465
|
|
|
|
|
|
|
} |
466
|
1
|
|
|
|
|
12
|
while () { |
467
|
985
|
|
|
|
|
1024
|
chomp; |
468
|
985
|
|
|
|
|
7556
|
($mm, @ww) = split(/\t/); |
469
|
985
|
|
|
|
|
5334
|
$aprox_mne{$mm} = [@ww]; |
470
|
|
|
|
|
|
|
} |
471
|
1
|
|
|
|
|
57
|
close DSF; |
472
|
1
|
|
|
|
|
12
|
return 1; |
473
|
|
|
|
|
|
|
} # end of sub load_dsf |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub build_substitutes { |
477
|
|
|
|
|
|
|
# creates lists of approximate substitutes |
478
|
|
|
|
|
|
|
# it takes about 90 minutes to recreate the file |
479
|
|
|
|
|
|
|
# there should be no need to run this since its result file never needs changing |
480
|
0
|
|
|
0
|
1
|
|
my ($xx, $aa, $yy, $bb, $ff, $hh, $pp, $jj, $kk, $gg, $rr, $mm, $nn, $start); |
481
|
0
|
|
|
|
|
|
my @ww; |
482
|
0
|
|
|
|
|
|
my $this = shift; |
483
|
0
|
|
0
|
|
|
|
my $class = ref($this) || $this; |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
#$start = time(); |
486
|
0
|
|
|
|
|
|
load_rfcdoc( $rfc_fnam); |
487
|
|
|
|
|
|
|
# find approximate substitutes |
488
|
|
|
|
|
|
|
# (mnemonic, hexcode, name) example: |
489
|
|
|
|
|
|
|
# mnemonic: j+- feef ARABIC LETTER ALEF MAKSURA ISOLATED FORM |
490
|
|
|
|
|
|
|
# substitute: j+ 0649 ARABIC LETTER ALEF MAKSURA |
491
|
|
|
|
|
|
|
# substitute: a+: e022 ARABIC LETTER ALEF FINAL FORM COMPATIBILITY (IBM868 144) |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
#print (time() - $start); print " delete words from right\n"; |
494
|
0
|
|
|
|
|
|
while (($xx, $aa) = each %mne_nam) { # (name, nmemonic) |
495
|
0
|
0
|
|
|
|
|
if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character |
496
|
0
|
|
|
|
|
|
next; |
497
|
|
|
|
|
|
|
} |
498
|
0
|
|
|
|
|
|
$yy = $xx; |
499
|
0
|
|
|
|
|
|
while ($yy =~ /[^\s]\s+[^\s]+\s*$/) { |
500
|
0
|
|
|
|
|
|
$yy =~ s/^(.*[^\s])\s+[^\s]+\s*$/$1/; # delete words from right |
501
|
0
|
0
|
|
|
|
|
if ($bb = $mne_nam{$yy}) { |
502
|
0
|
|
|
|
|
|
push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list |
|
0
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
#print "$aa\t@{$aprox_mne{$aa}}\n"; |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
#print (time() - $start); print " delete words from left\n"; |
509
|
0
|
|
|
|
|
|
while (($xx, $aa) = each %mne_nam) { # (name, nmemonic) |
510
|
0
|
0
|
|
|
|
|
if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character |
511
|
0
|
|
|
|
|
|
next; |
512
|
|
|
|
|
|
|
} |
513
|
0
|
|
|
|
|
|
$yy = $xx; |
514
|
0
|
|
|
|
|
|
while ($yy =~ /[^\s]\s+[^\s]+\s*$/) { |
515
|
0
|
|
|
|
|
|
$yy =~ s/^[^\s]+\s+(.*)$/$1/; # delete words from left |
516
|
0
|
0
|
|
|
|
|
if ($yy =~ /^WITH\s/i) { # avoid matching overly broad phrases |
517
|
0
|
|
|
|
|
|
last; |
518
|
|
|
|
|
|
|
} |
519
|
0
|
0
|
|
|
|
|
if ($bb = $mne_nam{$yy}) { |
520
|
0
|
|
|
|
|
|
push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list |
|
0
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
#print "$aa\t@{$aprox_mne{$aa}}\n"; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
} |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# look for (string1 inside string2) or (string2 inside string1) |
527
|
|
|
|
|
|
|
# also look fcr "DIGIT ONE", "NUMBER TWO" strings |
528
|
|
|
|
|
|
|
# (disabled) also equate the 2 kinds of Japanese syllabic character sets |
529
|
|
|
|
|
|
|
# since subs are assigned reciprocally, algorithm avoids redundancy: for example, if |
530
|
|
|
|
|
|
|
# elements are (A, B, C, D, E), then comparisions will be E with (A, B, C, D), |
531
|
|
|
|
|
|
|
# D with (A, B, C), C with (A, B), B with (A) and A with nothing. |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
#print (time() - $start); print " look for string1 inside string2, number terms\n"; |
534
|
0
|
|
|
|
|
|
$ff = "DIGIT|NUMBER"; |
535
|
|
|
|
|
|
|
#$hh = "HIRAGANA|KATAKANA"; |
536
|
|
|
|
|
|
|
#$pp = "LETTER|LIGATURE|".$ff."|".$hh; |
537
|
0
|
|
|
|
|
|
$pp = "LETTER|LIGATURE|".$ff; |
538
|
0
|
|
|
|
|
|
$jj = scalar (keys %mne_nam); |
539
|
0
|
|
|
|
|
|
foreach $xx ( reverse keys %mne_nam ) { |
540
|
0
|
0
|
|
|
|
|
if ( ($kk = (--$jj)) <= 0) { |
541
|
0
|
|
|
|
|
|
last; |
542
|
|
|
|
|
|
|
} |
543
|
0
|
0
|
|
|
|
|
if ($xx !~ /($pp)/i) { # only certain types |
544
|
0
|
|
|
|
|
|
next; |
545
|
|
|
|
|
|
|
} |
546
|
0
|
0
|
|
|
|
|
if ($xx !~ /\b\w+\b\s+\b\w+/) { # at least 2 words |
547
|
0
|
|
|
|
|
|
next; |
548
|
|
|
|
|
|
|
} |
549
|
0
|
0
|
|
|
|
|
if ($xx =~ /\b($ff) \b(\w+)\b/i) { # digit, numeral, et al. match |
550
|
0
|
|
|
|
|
|
$gg = $2; |
551
|
|
|
|
|
|
|
} else { |
552
|
0
|
|
|
|
|
|
$gg = ""; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
# if ($xx =~ /(.*)\b($hh)\b(.*)/i) { # equivalent Japanese syllabic sets |
555
|
|
|
|
|
|
|
# $xx = "$1"."HIRAGANA"."$2"; |
556
|
|
|
|
|
|
|
# $rr = "$1"."KATAKANA"."$2"; |
557
|
|
|
|
|
|
|
# } else { |
558
|
|
|
|
|
|
|
# $rr = ""; |
559
|
|
|
|
|
|
|
# } |
560
|
0
|
|
|
|
|
|
$aa = $mne_nam{$xx}; |
561
|
0
|
|
|
|
|
|
foreach $yy (keys %mne_nam) { |
562
|
0
|
0
|
|
|
|
|
if ( ($kk--) <= 0) { |
563
|
0
|
|
|
|
|
|
last; |
564
|
|
|
|
|
|
|
} |
565
|
0
|
0
|
|
|
|
|
if ($yy !~ /($pp)/i) { |
566
|
0
|
|
|
|
|
|
next; |
567
|
|
|
|
|
|
|
} |
568
|
0
|
0
|
|
|
|
|
if ($yy !~ /\b\w+\b\s+\b\w+/) { |
569
|
0
|
|
|
|
|
|
next; |
570
|
|
|
|
|
|
|
} |
571
|
0
|
0
|
0
|
|
|
|
if ( (($xx =~ /\b($yy)\b/i) || ($yy =~ /\b($xx)\b/i)) || |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
572
|
|
|
|
|
|
|
# (($rr) && ($xx =~ /\b($yy)\b/i) || ($yy =~ /\b($xx)\b/i)) || |
573
|
|
|
|
|
|
|
(($gg) && ($yy !~ /FRACTION/i) && ($yy =~ /\b($ff)\s+\b$gg\b/i)) ) { |
574
|
0
|
|
|
|
|
|
$bb = $mne_nam{$yy}; |
575
|
0
|
|
|
|
|
|
push @{ $aprox_mne{ $aa} }, "$bb"; # add to sub lists repciprocally |
|
0
|
|
|
|
|
|
|
576
|
0
|
|
|
|
|
|
push @{ $aprox_mne{ $bb} }, "$aa"; |
|
0
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
#print "$aa\t@{$aprox_mne{ $aa}}\n"; |
578
|
|
|
|
|
|
|
#print "$bb\t@{$aprox_mne{ $bb}}\n"; |
579
|
|
|
|
|
|
|
} |
580
|
|
|
|
|
|
|
} |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
#print (time() - $start); print "\n"; |
584
|
|
|
|
|
|
|
#print "No substitutes found for these:\n"; |
585
|
0
|
|
|
|
|
|
foreach $aa ( keys %aprox_mne ) { |
586
|
0
|
0
|
|
|
|
|
if ( $#{ $aprox_mne{$aa}} < 0) { |
|
0
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# print "$aa\t$nam_mne{$aa}\n"; |
588
|
0
|
|
|
|
|
|
delete $aprox_mne{$aa}; |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
} |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
#print (time() - $start); print " eliminate duplicate substitutions\n"; |
593
|
0
|
|
|
|
|
|
foreach $aa ( keys %aprox_mne ) { # eliminate duplicate substitutions |
594
|
0
|
|
|
|
|
|
@ww = @{$aprox_mne{$aa}}; |
|
0
|
|
|
|
|
|
|
595
|
0
|
|
|
|
|
|
CZWARTY: for($gg = 0;;){ |
596
|
0
|
|
|
|
|
|
for ($jj = $#ww; $jj > 0; --$jj){ |
597
|
0
|
|
|
|
|
|
for ($kk = ($jj -1); $kk >= 0; --$kk){ |
598
|
0
|
0
|
|
|
|
|
if ("$ww[$kk]" eq "$ww[$jj]"){ |
599
|
0
|
|
|
|
|
|
splice @ww, $jj, 1; |
600
|
0
|
|
|
|
|
|
$gg = 1; |
601
|
0
|
|
|
|
|
|
next CZWARTY; |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
} |
605
|
0
|
|
|
|
|
|
last; |
606
|
|
|
|
|
|
|
} |
607
|
0
|
0
|
|
|
|
|
if ($gg) { |
608
|
0
|
|
|
|
|
|
$aprox_mne{$aa} = [@ww]; |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
} |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
#print (time() - $start); print "\n"; |
613
|
|
|
|
|
|
|
# for user's protection, save old file if any |
614
|
0
|
|
|
|
|
|
$gg = "$substi_fnam".".bkp"; |
615
|
0
|
0
|
|
|
|
|
if (! -e $gg) {rename( $substi_fnam, "$gg")}; |
|
0
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
# contrived loop wherein second pass only when open or write failure |
617
|
0
|
|
|
|
|
|
OSMY: for ($yy = 0; $yy == 0; $yy = 1) { |
618
|
0
|
0
|
|
|
|
|
unless (open(DSF, ">$substi_fnam")) {next OSMY}; |
|
0
|
|
|
|
|
|
|
619
|
0
|
0
|
|
|
|
|
unless (print DSF "hash mnemonic=name\n") {next OSMY}; # header |
|
0
|
|
|
|
|
|
|
620
|
0
|
|
|
|
|
|
foreach $mm (sort keys %nam_mne ) { |
621
|
|
|
|
|
|
|
# mnemonic tab name newline |
622
|
0
|
0
|
|
|
|
|
unless (print DSF "$mm\t$nam_mne{$mm}\n") {next OSMY}; |
|
0
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
} |
624
|
0
|
0
|
|
|
|
|
unless (print DSF "hash mnemonic=substitute list\n") {next OSMY}; # header |
|
0
|
|
|
|
|
|
|
625
|
0
|
|
|
|
|
|
foreach $mm (sort keys %aprox_mne ) { |
626
|
0
|
0
|
|
|
|
|
unless (print DSF "$mm") {next OSMY}; # mnemonic |
|
0
|
|
|
|
|
|
|
627
|
0
|
|
|
|
|
|
foreach $aa (0 .. $#{$aprox_mne{$mm}}) { # each substitute |
|
0
|
|
|
|
|
|
|
628
|
0
|
0
|
|
|
|
|
unless (print DSF "\t$aprox_mne{$mm}[$aa]") {next OSMY}; |
|
0
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
} |
630
|
0
|
0
|
|
|
|
|
unless (print DSF "\n") {next OSMY}; # newline |
|
0
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
} |
632
|
0
|
|
|
|
|
|
last OSMY; |
633
|
|
|
|
|
|
|
} |
634
|
0
|
|
|
|
|
|
close DSF; |
635
|
0
|
0
|
|
|
|
|
if ($yy) { |
636
|
0
|
0
|
|
|
|
|
if (! -e $substi_fnam) {rename( "$gg", $substi_fnam)}; |
|
0
|
|
|
|
|
|
|
637
|
0
|
|
|
|
|
|
print "Failed to create $substi_fnam: $!\n"; |
638
|
0
|
|
|
|
|
|
return undef; |
639
|
|
|
|
|
|
|
} |
640
|
0
|
|
|
|
|
|
return 1; |
641
|
|
|
|
|
|
|
} # end of sub build_substitutes |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
sub load_rfcdoc { #load code definition list |
645
|
0
|
|
|
0
|
0
|
|
my ($mm, $nn, $jj, $catch, $hh, $xx, $kk, $yy); |
646
|
0
|
|
|
|
|
|
my $digits = \"ZERO|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE"; |
647
|
0
|
0
|
|
|
|
|
unless (open(RFC, $_[0])) {print "Can't open $_[0]: $!\n"; return undef;} |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
648
|
0
|
|
|
|
|
|
while() { |
649
|
0
|
|
|
|
|
|
chomp; |
650
|
0
|
0
|
|
|
|
|
if (/^3. CHARACTER MNEMONIC TABLE/) { |
651
|
0
|
|
|
|
|
|
last; |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
} |
654
|
0
|
|
|
|
|
|
$jj = 1; |
655
|
0
|
|
|
|
|
|
while() { |
656
|
0
|
|
|
|
|
|
chomp; |
657
|
0
|
0
|
|
|
|
|
if (/^4. CHARSETS/) { |
658
|
0
|
|
|
|
|
|
last; |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
# page head, foot, effectively empty lines |
661
|
0
|
0
|
0
|
|
|
|
if (/^(Simonsen|RFC 1345) / || /^.{0,3}$/) { |
662
|
0
|
|
|
|
|
|
next; |
663
|
|
|
|
|
|
|
} |
664
|
0
|
0
|
|
|
|
|
if (/SP\s+0020\s+SPACE/) { # first code def line |
665
|
0
|
|
|
|
|
|
$catch = "1"; |
666
|
|
|
|
|
|
|
} |
667
|
0
|
0
|
|
|
|
|
if (! $catch) { |
668
|
0
|
|
|
|
|
|
next; |
669
|
|
|
|
|
|
|
} |
670
|
0
|
|
|
|
|
|
++$jj; |
671
|
0
|
0
|
0
|
|
|
|
if(/^\s*([^\s]+)\s+([\da-fA-F]{4})\s{4}\b(.+)\s*$/ || |
672
|
|
|
|
|
|
|
/^(\s+)(e000)\s{4}\b(.+)\s*$/) { |
673
|
0
|
|
|
|
|
|
$mm = $1; $hh = $2; $xx = hex $2; $nn = $3; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# normalize unusual e000 format which indicates unfinished (Mnemonic) |
675
|
0
|
0
|
|
|
|
|
if ($hh eq "e000") { |
676
|
0
|
|
|
|
|
|
$mm = " "; # single space |
677
|
|
|
|
|
|
|
} |
678
|
|
|
|
|
|
|
# correct mistake in LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW |
679
|
0
|
0
|
|
|
|
|
if ($hh eq "1e4b") { |
680
|
0
|
|
|
|
|
|
$mm = "n->"; |
681
|
|
|
|
|
|
|
} |
682
|
|
|
|
|
|
|
# correct mistake in LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE |
683
|
0
|
0
|
|
|
|
|
if ($hh eq "1e69") { |
684
|
0
|
|
|
|
|
|
$mm = "s.-."; |
685
|
|
|
|
|
|
|
} |
686
|
0
|
0
|
0
|
|
|
|
if ($yy && ($jj != ($kk +1)) && ($xx != ($yy +1))) { |
|
|
|
0
|
|
|
|
|
687
|
0
|
|
|
|
|
|
print "Check sequence around this entry: $_\n"; |
688
|
|
|
|
|
|
|
} |
689
|
0
|
|
|
|
|
|
$yy = $xx; $kk = $jj; |
|
0
|
|
|
|
|
|
|
690
|
0
|
0
|
|
|
|
|
if ($nam_mne{$mm} ne "") { # already exists |
691
|
0
|
|
|
|
|
|
print "Same mnemonic $mm for $nam_mne{$mm} and hex code $hh \n"; |
692
|
|
|
|
|
|
|
} |
693
|
0
|
0
|
|
|
|
|
if ($mne_nam{$nn} ne "") { # already exists |
694
|
0
|
|
|
|
|
|
print "Same mnemonic $mne_nam{$nn} for $nn and hex code $hh \n"; |
695
|
|
|
|
|
|
|
} |
696
|
|
|
|
|
|
|
# normalize for more thorough substitution |
697
|
0
|
|
|
|
|
|
$nn =~ s/\b(SUBSCRIPT|SUPERSCRIPT)\s+($$digits)\b/$1 DIGIT $2/; |
698
|
0
|
|
|
|
|
|
$nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic |
699
|
0
|
|
|
|
|
|
$mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name |
700
|
0
|
|
|
|
|
|
$aprox_mne{$mm} = []; # list approx subs keyed by mnemonic |
701
|
0
|
|
|
|
|
|
next; |
702
|
|
|
|
|
|
|
} |
703
|
0
|
0
|
|
|
|
|
if (/^[ ]{16}([^\s].*)/) { # continuation line |
704
|
0
|
|
|
|
|
|
delete $mne_nam{$nn}; |
705
|
0
|
|
|
|
|
|
$nn = "$nn $1"; # append to name in prev line |
706
|
0
|
|
|
|
|
|
$nam_mne{$mm} = "$nn"; |
707
|
0
|
|
|
|
|
|
$mne_nam{$nn} = "$mm"; |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
} |
710
|
0
|
|
|
|
|
|
close RFC; |
711
|
0
|
|
|
|
|
|
return 1; |
712
|
|
|
|
|
|
|
} # end of sub load_rfcdoc |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
sub transliterate { |
716
|
0
|
0
|
|
0
|
1
|
|
my $self = ($#_ > 0) ? shift : 0; |
717
|
0
|
0
|
|
|
|
|
my $arref = $self ? \@{$self->{TRN_ARY}} : \@transform; |
|
0
|
|
|
|
|
|
|
718
|
0
|
|
|
|
|
|
my @xx = unpack "C*", $_[0]; |
719
|
0
|
|
|
|
|
|
my $yy = ""; |
720
|
0
|
|
|
|
|
|
foreach ( @xx ) { |
721
|
0
|
|
|
|
|
|
$yy .= pack "C", @$arref[$_]; |
722
|
|
|
|
|
|
|
} |
723
|
0
|
|
|
|
|
|
return $yy; |
724
|
|
|
|
|
|
|
} # end of sub transliterate |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
__END__ |