| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!perl |
|
2
|
|
|
|
|
|
|
# "Translit.pm" by Genji Schmeder 4 October 1996 |
|
3
|
|
|
|
|
|
|
# creates transliteration map between character sets defined in IETF RFC 1345 |
|
4
|
|
|
|
|
|
|
# with acknowledgements to Chris Leach, author of "EBCDIC.pm" |
|
5
|
|
|
|
|
|
|
# and to Keld Simonsen, author of RFC 1345 |
|
6
|
|
|
|
|
|
|
# Copyright (c) 1997 Genji Schmeder. All rights reserved. |
|
7
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
|
8
|
|
|
|
|
|
|
# it under the same terms as Perl itself. |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
package Convert::Translit; |
|
11
|
1
|
|
|
1
|
|
765
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
46
|
|
|
12
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT_OK); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
130
|
|
|
13
|
|
|
|
|
|
|
require Exporter; |
|
14
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
|
15
|
|
|
|
|
|
|
@EXPORT_OK = qw(transliterate build_substitutes); |
|
16
|
|
|
|
|
|
|
$VERSION = '1.03'; # dated 5 November 1997 |
|
17
|
1
|
|
|
1
|
|
925
|
use integer; |
|
|
1
|
|
|
|
|
14
|
|
|
|
1
|
|
|
|
|
6
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
if ($] < 5) {die "Perl version must be at least 5.\n";} |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my $here = where_module(); |
|
22
|
|
|
|
|
|
|
my $rfc_fnam = $here."rfc1345"; |
|
23
|
|
|
|
|
|
|
my $substi_fnam = $here."substitutes"; |
|
24
|
|
|
|
|
|
|
undef &where_module; |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my %nam_mne; # hash of name keyed by mnemonic |
|
27
|
|
|
|
|
|
|
my %mne_nam; # hash of mnemonic keyed by name |
|
28
|
|
|
|
|
|
|
my %aprox_mne; # hash of substitute list keyed by mnemonic |
|
29
|
|
|
|
|
|
|
# if new() never called, transliterate() returns unchanged arg |
|
30
|
|
|
|
|
|
|
my @transform = (0 .. 255); |
|
31
|
|
|
|
|
|
|
my $verbose = 0; |
|
32
|
|
|
|
|
|
|
1; |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub where_module { |
|
36
|
|
|
|
|
|
|
my @xx = caller; |
|
37
|
1
|
|
|
1
|
|
1458
|
use File::Basename; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
6696
|
|
|
38
|
|
|
|
|
|
|
my $yy = dirname($xx[1]); |
|
39
|
|
|
|
|
|
|
# adjust for Macintosh and Unix different return from dirname() |
|
40
|
|
|
|
|
|
|
if ( ($^O ne "MacOS") && ($yy =~ /[^\/]$/) ) { |
|
41
|
|
|
|
|
|
|
$yy .= "/"; |
|
42
|
|
|
|
|
|
|
} |
|
43
|
|
|
|
|
|
|
return $yy; |
|
44
|
|
|
|
|
|
|
} # end of sub where_module |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub new { |
|
48
|
1
|
|
|
1
|
1
|
59
|
my ($bb, $cc, $dd, $ee, $gg, $ii, $jj, $kk, $line); |
|
49
|
0
|
|
|
|
|
0
|
my ($mm, $pp, $qq, $uu, $vv, $ww, $xx, $yy, $zz); |
|
50
|
0
|
|
|
|
|
0
|
my (@ch_tab, @thistab, @bitmap, @ff, @tt, @pp, @oo); |
|
51
|
0
|
|
|
|
|
0
|
my (%duplcatfrom, %duplcatto); |
|
52
|
1
|
|
|
|
|
3
|
my $this = shift; |
|
53
|
1
|
|
33
|
|
|
9
|
my $class = ref($this) || $this; |
|
54
|
|
|
|
|
|
|
|
|
55
|
1
|
|
|
|
|
3
|
%nam_mne = (); |
|
56
|
1
|
|
|
|
|
2
|
%mne_nam = (); |
|
57
|
1
|
|
|
|
|
2
|
%aprox_mne = (); |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# syntax: new([FROM charset], TO charset, [verbose]) |
|
60
|
1
|
|
|
|
|
3
|
my @chset; |
|
61
|
1
|
50
|
|
|
|
5
|
$chset[0] = ($#_ > 0) ? shift : "ascii"; # ascii is default FROM charset |
|
62
|
1
|
|
|
|
|
3
|
$chset[1] = shift; |
|
63
|
1
|
|
|
|
|
2
|
$verbose = shift; |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# typical rfc1345 table header: |
|
66
|
|
|
|
|
|
|
# &charset CSA_Z243.4-1985-1 |
|
67
|
|
|
|
|
|
|
# &rem source: ECMA registry |
|
68
|
|
|
|
|
|
|
# &alias iso-ir-121 |
|
69
|
|
|
|
|
|
|
# &g0esc x2877 &g1esc x2977 &g2esc x2a77 &g3esc x2b77 |
|
70
|
|
|
|
|
|
|
# &alias ISO646-CA |
|
71
|
|
|
|
|
|
|
# &alias csa7-1 |
|
72
|
|
|
|
|
|
|
# &alias ca |
|
73
|
|
|
|
|
|
|
# &code 0 |
|
74
|
|
|
|
|
|
|
# NU SH SX EX ET EQ AK BL BS HT LF VT FF CR SO SI |
|
75
|
|
|
|
|
|
|
|
|
76
|
1
|
50
|
|
|
|
5
|
load_dsf( $substi_fnam) or return undef; |
|
77
|
1
|
50
|
|
|
|
5
|
if ($verbose) {print "Creating transliteration from \"$chset[0]\" to \"$chset[1]\"\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
78
|
1
|
|
|
|
|
7
|
for $ii ( 0 .. 1 ){ |
|
79
|
|
|
|
|
|
|
#traverse whole file twice for simplicity |
|
80
|
1
|
50
|
|
|
|
109
|
unless (open(RFC, $rfc_fnam)) {print "Can't open $rfc_fnam: $!.\n"; return undef;} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
81
|
1
|
|
|
|
|
41
|
while() { #skip to relevant section |
|
82
|
2709
|
100
|
|
|
|
8809
|
if (/5.\s+CHARSET TABLES/) { |
|
83
|
1
|
|
|
|
|
4
|
last; |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
} |
|
86
|
1
|
|
|
|
|
4
|
@thistab = (); |
|
87
|
1
|
|
|
|
|
8
|
while() { # read table lines |
|
88
|
10
|
|
|
|
|
18
|
$line = $_; |
|
89
|
10
|
|
|
|
|
12
|
chomp $line; |
|
90
|
10
|
50
|
|
|
|
45
|
if ($line =~ /^ACKNOWLEDGEMENTS/) { # past relevant section |
|
91
|
0
|
0
|
|
|
|
0
|
if (@thistab) { # was previous table collected? |
|
92
|
0
|
0
|
|
|
|
0
|
if (relvnt_tab($chset[$ii], \@thistab) ) { |
|
93
|
0
|
|
|
|
|
0
|
$ch_tab[$ii] = [ @thistab ]; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
} |
|
96
|
0
|
|
|
|
|
0
|
last; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
10
|
100
|
|
|
|
29
|
if ($line !~ /^ [^\s]/){ # distinguish table lines by 2 leading spaces |
|
99
|
2
|
|
|
|
|
6
|
next; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
8
|
|
|
|
|
24
|
$line =~ s/^\s*//; # trim leading spaces |
|
102
|
|
|
|
|
|
|
# be sure of ending a table by encountering the next |
|
103
|
8
|
100
|
|
|
|
20
|
if ($line !~ /^&charset\s/) { |
|
104
|
6
|
|
|
|
|
11
|
push @thistab, $line; |
|
105
|
6
|
|
|
|
|
13
|
next; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
2
|
100
|
|
|
|
6
|
if (@thistab) { # was previous table collected? |
|
108
|
1
|
0
|
|
|
|
8
|
if (relvnt_tab($chset[$ii], \@thistab)) { |
|
109
|
0
|
|
|
|
|
0
|
$ch_tab[$ii] = [ @thistab ]; |
|
110
|
0
|
|
|
|
|
0
|
last; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
} |
|
113
|
1
|
|
|
|
|
20
|
@thistab = ($line); # start another table collection |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
} |
|
116
|
0
|
|
|
|
|
0
|
close RFC; |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
#examine charsets |
|
119
|
0
|
|
|
|
|
0
|
$qq = 0; |
|
120
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
|
121
|
0
|
0
|
|
|
|
0
|
if(! $ch_tab[$ii][0]){ |
|
122
|
0
|
|
|
|
|
0
|
print STDERR "Couldn't find a character set for \"$chset[$ii]\".\n"; |
|
123
|
0
|
|
|
|
|
0
|
$qq = 1; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
} |
|
126
|
0
|
0
|
|
|
|
0
|
if ($qq) {return undef;} |
|
|
0
|
|
|
|
|
0
|
|
|
127
|
|
|
|
|
|
|
|
|
128
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
|
129
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#{$ch_tab[$ii]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
130
|
0
|
0
|
|
|
|
0
|
if($ch_tab[$ii][$jj] =~ /^&bits\b\s*(\d*)/){ |
|
131
|
0
|
0
|
0
|
|
|
0
|
if ($1 != 0 || $1 != 8) { |
|
132
|
0
|
|
|
|
|
0
|
print STDERR "Can't handle $1\-bit charsets like $chset[$ii].\n"; |
|
133
|
0
|
|
|
|
|
0
|
$qq = 1; |
|
134
|
0
|
|
|
|
|
0
|
last; |
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
} |
|
138
|
|
|
|
|
|
|
} |
|
139
|
0
|
0
|
|
|
|
0
|
if ($qq) {return undef;} |
|
|
0
|
|
|
|
|
0
|
|
|
140
|
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
|
142
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#{$ch_tab[$ii]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
143
|
0
|
0
|
|
|
|
0
|
if($ch_tab[$ii][$jj] =~ /^&(code2|codex|comb2)\b/){ |
|
144
|
0
|
|
|
|
|
0
|
print STDERR "Can't handle $1 terms in charset $chset[$ii].\n"; |
|
145
|
0
|
|
|
|
|
0
|
$qq = 1; |
|
146
|
0
|
|
|
|
|
0
|
last; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
} |
|
150
|
0
|
0
|
|
|
|
0
|
if ($qq) {return undef;} |
|
|
0
|
|
|
|
|
0
|
|
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
#create bit maps |
|
153
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. 1 ){ |
|
154
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nCharacter set $chset[$ii]:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
155
|
0
|
|
|
|
|
0
|
$dd = -1; # code offset unless negative |
|
156
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#{$ch_tab[$ii]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
157
|
0
|
|
|
|
|
0
|
$line = $ch_tab[$ii][$jj]; |
|
158
|
0
|
0
|
|
|
|
0
|
if($line =~ /^&code\b\s*(\d)/){ |
|
159
|
0
|
|
|
|
|
0
|
$dd = $1; |
|
160
|
0
|
|
|
|
|
0
|
next; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
0
|
0
|
|
|
|
0
|
if($line =~ /^&duplicate\s+([\da-fA-F]+)\s+([^\s]+)/){ |
|
163
|
0
|
|
|
|
|
0
|
$pp = $1; $mm = $2; # position already in base 10 |
|
|
0
|
|
|
|
|
0
|
|
|
164
|
|
|
|
|
|
|
# examples: "duplicate 91 AE", "duplicate 92 O/" (position, dup mnemonic) |
|
165
|
|
|
|
|
|
|
# organize duplicate keeping differently for FROM and TO charsets |
|
166
|
0
|
0
|
|
|
|
0
|
if ($ii == 0) { |
|
167
|
0
|
0
|
|
|
|
0
|
if ($duplcatfrom{$pp} ) { |
|
168
|
|
|
|
|
|
|
# FROM keyed by position |
|
169
|
0
|
|
|
|
|
0
|
push @{$duplcatfrom{$pp}{Mnemonics}}, "$mm"; |
|
|
0
|
|
|
|
|
0
|
|
|
170
|
|
|
|
|
|
|
} else { |
|
171
|
0
|
|
|
|
|
0
|
$duplcatfrom{$pp} = {Position => $pp, Mnemonics => [ "$mm" ],}; |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
#print "FROM Duplicate for position $pp: $duplcatfrom{$pp}{Mnemonics}[-1]\n"; |
|
174
|
|
|
|
|
|
|
} else { |
|
175
|
0
|
|
|
|
|
0
|
$duplcatto{$mm} = $pp; # TO keyed by mnemonic |
|
176
|
|
|
|
|
|
|
#print "TO Duplicate for position $duplcatto{$mm}: $mm\n"; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
# flush control info (just use all defined keywords) |
|
180
|
0
|
0
|
|
|
|
0
|
if($line =~ |
|
181
|
|
|
|
|
|
|
/^&(charset|alias|g\desc|bits|code|code2|codex|duplicate|rem|comb2)\b/) { |
|
182
|
0
|
|
|
|
|
0
|
$dd = -1; |
|
183
|
0
|
|
|
|
|
0
|
next; |
|
184
|
|
|
|
|
|
|
} |
|
185
|
0
|
0
|
|
|
|
0
|
if ($dd < 0) { |
|
186
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
|
0
|
|
|
|
|
0
|
|
|
187
|
|
|
|
|
|
|
{print "Strange! You should examine the charset table: \"$line\"\n";} |
|
188
|
0
|
|
|
|
|
0
|
next; |
|
189
|
|
|
|
|
|
|
} |
|
190
|
0
|
|
|
|
|
0
|
@pp = split(/\s+/, $line); |
|
191
|
0
|
|
|
|
|
0
|
foreach $uu (@pp) { |
|
192
|
|
|
|
|
|
|
# mnemonic "??" means position unused. |
|
193
|
|
|
|
|
|
|
# mnemonic "__" means character set not completely defined here |
|
194
|
0
|
0
|
0
|
|
|
0
|
if ( $uu eq "??" || $uu eq "__" ) { |
|
195
|
0
|
|
|
|
|
0
|
$uu = ""; |
|
196
|
|
|
|
|
|
|
} |
|
197
|
0
|
0
|
0
|
|
|
0
|
if (($uu) && (! $nam_mne{$uu})) { |
|
198
|
0
|
|
|
|
|
0
|
printf "Warning: position %x (hex) has invalid mnemonic \"%s\".\n", |
|
199
|
|
|
|
|
|
|
$dd, $uu; |
|
200
|
0
|
|
|
|
|
0
|
$uu = ""; |
|
201
|
|
|
|
|
|
|
} |
|
202
|
0
|
|
|
|
|
0
|
$bitmap[$ii][$dd++] = $uu; |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
# print ($#pp, $dd, (1 + $#{$bitmap[$ii]}), "\n"); |
|
205
|
|
|
|
|
|
|
} |
|
206
|
|
|
|
|
|
|
# normalize char set length |
|
207
|
0
|
|
|
|
|
0
|
$#{$bitmap[$ii]} += ((128 - ( (1 + $#{$bitmap[$ii]}) % 128)) % 128); |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
208
|
|
|
|
|
|
|
# print ("highest index is", $#{$bitmap[$ii]}, "\n" ); |
|
209
|
0
|
|
|
|
|
0
|
for $kk ( 0 .. $#{$bitmap[$ii]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
210
|
0
|
0
|
|
|
|
0
|
if ($verbose) { print (length($bitmap[$ii][$kk])?"x":"_");} |
|
|
0
|
0
|
|
|
|
0
|
|
|
211
|
0
|
0
|
|
|
|
0
|
if ( ! (($kk +1) % 16)) { if ($verbose) { print "\n";}} |
|
|
0
|
0
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
212
|
|
|
|
|
|
|
} |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# explanation from RFC 1345: |
|
216
|
|
|
|
|
|
|
# "&duplicate" has a special meaning indicating that a position |
|
217
|
|
|
|
|
|
|
# is being used for more than one character. This is an ugly |
|
218
|
|
|
|
|
|
|
# convention but it is a sad fact of life that same code in one |
|
219
|
|
|
|
|
|
|
# coded character set can mean different characters. |
|
220
|
|
|
|
|
|
|
# "&duplicate" takes two parameters - the first is the code to |
|
221
|
|
|
|
|
|
|
# be duplicated, the other is the new mnemonic. |
|
222
|
|
|
|
|
|
|
# &duplicate 91 AE |
|
223
|
|
|
|
|
|
|
# &duplicate 92 O/ |
|
224
|
|
|
|
|
|
|
|
|
225
|
0
|
|
|
|
|
0
|
$#transform = $#{$bitmap[0]}; # create transliterator table |
|
|
0
|
|
|
|
|
0
|
|
|
226
|
0
|
|
|
|
|
0
|
PIERWSZY: for $jj ( 0 .. $#{$bitmap[0]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
227
|
0
|
|
|
|
|
0
|
$transform[$jj] = -1; # initialize since 0 is valid value |
|
228
|
0
|
|
|
|
|
0
|
@tt = ( $bitmap[0][$jj] ); # start list with one element |
|
229
|
0
|
0
|
|
|
|
0
|
if ($duplcatfrom{$jj}) { # add any duplicates in the FROM charset |
|
230
|
0
|
|
|
|
|
0
|
push @tt, @{$duplcatfrom{$jj}{Mnemonics}}; |
|
|
0
|
|
|
|
|
0
|
|
|
231
|
|
|
|
|
|
|
#print "BB $jj $#tt <<< @tt >>>\n"; |
|
232
|
|
|
|
|
|
|
} |
|
233
|
0
|
|
|
|
|
0
|
@oo = (); |
|
234
|
0
|
|
|
|
|
0
|
for $xx ( 0 .. $#tt ) { # refine list |
|
235
|
0
|
0
|
|
|
|
0
|
if (length($tt[$xx]) > 0) { |
|
236
|
0
|
|
|
|
|
0
|
push @oo, $tt[$xx]; |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
} |
|
239
|
0
|
0
|
|
|
|
0
|
if (! @oo) { |
|
240
|
0
|
|
|
|
|
0
|
next; |
|
241
|
|
|
|
|
|
|
} |
|
242
|
0
|
|
|
|
|
0
|
for $bb ( 0 .. $#oo ) { # match any mnemonics for this position in FROM charset |
|
243
|
0
|
|
|
|
|
0
|
$uu = $oo[$bb]; |
|
244
|
0
|
|
|
|
|
0
|
for $kk ( 0 .. $#{$bitmap[1]} ) { #search TO charset |
|
|
0
|
|
|
|
|
0
|
|
|
245
|
0
|
0
|
|
|
|
0
|
if ($uu eq $bitmap[1][$kk]) { |
|
246
|
0
|
|
|
|
|
0
|
$transform[$jj] = $kk; |
|
247
|
0
|
|
|
|
|
0
|
next PIERWSZY; |
|
248
|
|
|
|
|
|
|
} |
|
249
|
|
|
|
|
|
|
} |
|
250
|
0
|
0
|
|
|
|
0
|
if (exists $duplcatto{$uu}) { # search TO charset duplicates |
|
251
|
0
|
|
|
|
|
0
|
$transform[$jj] = $duplcatto{$uu}; |
|
252
|
0
|
|
|
|
|
0
|
next PIERWSZY; |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
} |
|
256
|
|
|
|
|
|
|
|
|
257
|
0
|
0
|
|
|
|
0
|
if ($verbose) { print "\nTransliterator:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
258
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { |
|
259
|
0
|
0
|
|
|
|
0
|
if ($verbose) { print (($transform[$jj]>=0)?"x":"_");} |
|
|
0
|
0
|
|
|
|
0
|
|
|
260
|
0
|
0
|
|
|
|
0
|
if ( ! (($jj+1) % 16)) { if ($verbose) { print "\n";}} |
|
|
0
|
0
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
|
|
263
|
0
|
|
|
|
|
0
|
$vv = $ww = 0; |
|
264
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # try to approximate using substitute lists |
|
265
|
0
|
0
|
|
|
|
0
|
if ($transform[$jj] >= 0) { |
|
266
|
0
|
|
|
|
|
0
|
next; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
0
|
0
|
|
|
|
0
|
if (! ($gg = $bitmap[0][$jj]) ) { # undefined |
|
269
|
0
|
|
|
|
|
0
|
next; |
|
270
|
|
|
|
|
|
|
} |
|
271
|
0
|
|
|
|
|
0
|
TRZECI: foreach $pp (@{$aprox_mne{$gg}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
272
|
0
|
|
|
|
|
0
|
for $cc ( 0 .. $#{$bitmap[1]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
273
|
0
|
|
|
|
|
0
|
$zz = $bitmap[1][$cc]; |
|
274
|
0
|
0
|
|
|
|
0
|
if ($zz eq $pp) { # substitute must be in TO char set |
|
275
|
0
|
|
|
|
|
0
|
$transform[$jj] = $cc; # success |
|
276
|
0
|
0
|
|
|
|
0
|
if (!$ww) { |
|
277
|
0
|
|
|
|
|
0
|
$ww = 1; |
|
278
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nApproximate substitutes:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
279
|
|
|
|
|
|
|
} |
|
280
|
0
|
0
|
|
|
|
0
|
if ($verbose) { printf "%X==>%X\t%s==>%s\n", $jj, $cc, |
|
|
0
|
|
|
|
|
0
|
|
|
281
|
|
|
|
|
|
|
$nam_mne{ $gg}, $nam_mne{ $zz};} |
|
282
|
0
|
|
|
|
|
0
|
last TRZECI; |
|
283
|
|
|
|
|
|
|
} |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
} |
|
286
|
0
|
|
|
|
|
0
|
$vv = 1; # defined character but no substitute |
|
287
|
|
|
|
|
|
|
} |
|
288
|
|
|
|
|
|
|
|
|
289
|
0
|
0
|
|
|
|
0
|
if ($ww) { |
|
290
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nTransliterator with aproximate substitutions:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
291
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { |
|
292
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print (($transform[$jj]>=0)?"x":"_");} |
|
|
0
|
0
|
|
|
|
0
|
|
|
293
|
0
|
0
|
|
|
|
0
|
if ( ! (($jj+1) % 16)) { if ($verbose) {print "\n";}} |
|
|
0
|
0
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# non-equivalent substitutions (like "RIGHT SQUARE BRACKET" for "CENT SIGN") |
|
298
|
0
|
0
|
|
|
|
0
|
if ($vv) { |
|
299
|
0
|
|
|
|
|
0
|
@ff = (); |
|
300
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # list those in need in FROM charset |
|
301
|
0
|
0
|
0
|
|
|
0
|
if (($transform[$jj] < 0) && $bitmap[0][$jj] ) { |
|
302
|
0
|
|
|
|
|
0
|
push @ff, $jj; |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
} |
|
305
|
0
|
0
|
|
|
|
0
|
if (@ff) { |
|
306
|
0
|
|
|
|
|
0
|
@tt = (); |
|
307
|
0
|
|
|
|
|
0
|
PIATY: for $bb ( 0 .. $#{$bitmap[1]} ) { # list availables in TO charset |
|
|
0
|
|
|
|
|
0
|
|
|
308
|
|
|
|
|
|
|
# start at usual second ascii graphic (hoping to maximize graphics) |
|
309
|
0
|
|
|
|
|
0
|
$cc = (34 + $bb) % (1 + $#{$bitmap[1]} ); |
|
|
0
|
|
|
|
|
0
|
|
|
310
|
0
|
0
|
|
|
|
0
|
if ($zz = $bitmap[1][$cc]) { |
|
311
|
0
|
|
|
|
|
0
|
for $ii ( 0 .. $#{$bitmap[0]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
312
|
0
|
0
|
|
|
|
0
|
if ($zz eq $bitmap[0][$ii]) { |
|
313
|
0
|
|
|
|
|
0
|
next PIATY; |
|
314
|
|
|
|
|
|
|
} |
|
315
|
|
|
|
|
|
|
} |
|
316
|
0
|
|
|
|
|
0
|
push @tt, $cc; |
|
317
|
|
|
|
|
|
|
} |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
} |
|
320
|
0
|
|
|
|
|
0
|
$gg = $ee = 0; |
|
321
|
0
|
|
0
|
|
|
0
|
SIODMY: while ( ($#ff >= 0) && ($#tt >= 0) ) { |
|
322
|
0
|
|
|
|
|
0
|
for $jj ($gg .. $#ff) { # first try to match equal hex values |
|
323
|
0
|
|
|
|
|
0
|
$ww = $ff[$jj]; |
|
324
|
0
|
|
|
|
|
0
|
for $kk (0 .. $#tt) { |
|
325
|
0
|
0
|
|
|
|
0
|
if ( $ww == $tt[$kk]) { |
|
326
|
0
|
|
|
|
|
0
|
$transform[ $ww] = $ww; |
|
327
|
0
|
0
|
|
|
|
0
|
if (!$ee) { |
|
328
|
0
|
|
|
|
|
0
|
$ee = 1; |
|
329
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nNon-equivalent substitutes:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
330
|
|
|
|
|
|
|
} |
|
331
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
|
0
|
|
|
|
|
0
|
|
|
332
|
|
|
|
|
|
|
{printf "%X==>%X\t%s==>%s\n", $ww, $ww, |
|
333
|
|
|
|
|
|
|
$nam_mne{$bitmap[0][ $ww]}, $nam_mne{$bitmap[1][ $ww]};} |
|
334
|
0
|
|
|
|
|
0
|
splice @ff, $jj, 1; |
|
335
|
0
|
|
|
|
|
0
|
splice @tt, $kk, 1; |
|
336
|
0
|
|
|
|
|
0
|
$gg = $jj; |
|
337
|
0
|
|
|
|
|
0
|
next SIODMY; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
} |
|
341
|
0
|
|
|
|
|
0
|
last; |
|
342
|
|
|
|
|
|
|
} |
|
343
|
0
|
0
|
|
|
|
0
|
$gg = ($#ff < $#tt)? $#ff : $#tt; |
|
344
|
0
|
|
|
|
|
0
|
for $jj (0 .. $gg ) { |
|
345
|
0
|
|
|
|
|
0
|
$transform[$ff[$jj]] = $tt[$jj]; |
|
346
|
0
|
0
|
|
|
|
0
|
if (!$ee) { |
|
347
|
0
|
|
|
|
|
0
|
$ee = 1; |
|
348
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nNon-equivalent substitutes:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
349
|
|
|
|
|
|
|
} |
|
350
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
|
0
|
|
|
|
|
0
|
|
|
351
|
|
|
|
|
|
|
{printf "%X==>%X\t%s==>%s\n", $ff[$jj], $tt[$jj], |
|
352
|
|
|
|
|
|
|
$nam_mne{$bitmap[0][$ff[$jj]]}, $nam_mne{$bitmap[1][$tt[$jj]]};} |
|
353
|
|
|
|
|
|
|
} |
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
0
|
$ee = $yy = 0; # anything left untranslated? |
|
357
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { |
|
358
|
0
|
0
|
|
|
|
0
|
if ($transform[$jj] < 0) { |
|
359
|
0
|
|
|
|
|
0
|
$yy = 1; |
|
360
|
0
|
0
|
|
|
|
0
|
if ($mm = $bitmap[0][$jj]) { |
|
361
|
0
|
0
|
|
|
|
0
|
if (!$ee) { |
|
362
|
0
|
|
|
|
|
0
|
$ee = 1; |
|
363
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\nNon-equivalent remnant:\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
364
|
|
|
|
|
|
|
} |
|
365
|
0
|
0
|
|
|
|
0
|
if ($verbose) {printf "%X\t%s\n", $jj, $nam_mne{$mm};} |
|
|
0
|
|
|
|
|
0
|
|
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
} |
|
368
|
|
|
|
|
|
|
} |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# if non-equivalent remnant, then select a hopefully unique and graphic indicator |
|
371
|
0
|
0
|
|
|
|
0
|
if ($ee) { |
|
372
|
0
|
|
|
|
|
0
|
DRUGI: for $bb ( 0 .. $#{$bitmap[1]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
373
|
0
|
|
|
|
|
0
|
$cc = (34 + $bb) % (1 + $#{$bitmap[1]} ); |
|
|
0
|
|
|
|
|
0
|
|
|
374
|
0
|
0
|
|
|
|
0
|
if ($xx = $bitmap[1][$cc]) { |
|
375
|
0
|
|
|
|
|
0
|
for $dd ( 0 .. $#{$bitmap[0]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
376
|
0
|
0
|
|
|
|
0
|
if ( $xx eq $bitmap[0][$dd]) { # reject if in FROM charset |
|
377
|
0
|
|
|
|
|
0
|
next DRUGI; |
|
378
|
|
|
|
|
|
|
} |
|
379
|
|
|
|
|
|
|
} |
|
380
|
0
|
|
|
|
|
0
|
last; # success |
|
381
|
|
|
|
|
|
|
} |
|
382
|
|
|
|
|
|
|
} |
|
383
|
0
|
0
|
|
|
|
0
|
if ($verbose) |
|
|
0
|
|
|
|
|
0
|
|
|
384
|
|
|
|
|
|
|
{printf "\nNon-equivalence indicator: %X\t%s\n", $cc, $nam_mne{$xx}}; |
|
385
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # interpolate non-equiv character |
|
386
|
0
|
0
|
0
|
|
|
0
|
if (($transform[$jj] < 0) && $bitmap[0][$jj]) { |
|
387
|
0
|
|
|
|
|
0
|
$transform[$jj] = $cc; |
|
388
|
|
|
|
|
|
|
} |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# if undefined remnant, then select a possibly undefined indicator |
|
393
|
0
|
0
|
|
|
|
0
|
if ($yy) { |
|
394
|
0
|
|
|
|
|
0
|
SZOSTY: for ($cc = $#{$bitmap[1]}; $cc >= 0; --$cc ) { |
|
|
0
|
|
|
|
|
0
|
|
|
395
|
0
|
0
|
|
|
|
0
|
if (! ($xx = $bitmap[1][$cc]) ) { |
|
396
|
0
|
|
|
|
|
0
|
last; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
0
|
|
|
|
|
0
|
for $dd ( 0 .. $#{$bitmap[0]} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
399
|
0
|
0
|
|
|
|
0
|
if ($xx eq $bitmap[0][$dd] ) { |
|
400
|
0
|
|
|
|
|
0
|
next SZOSTY; |
|
401
|
|
|
|
|
|
|
} |
|
402
|
|
|
|
|
|
|
} |
|
403
|
0
|
|
|
|
|
0
|
last; # success |
|
404
|
|
|
|
|
|
|
} |
|
405
|
0
|
0
|
|
|
|
0
|
if ( ! ($yy = $nam_mne{$xx}) ) {$yy = "(undefined character)";} |
|
|
0
|
|
|
|
|
0
|
|
|
406
|
0
|
0
|
|
|
|
0
|
if ($verbose) {printf "\nUndefined indicator: %X\t%s\n", $cc, $yy}; |
|
|
0
|
|
|
|
|
0
|
|
|
407
|
0
|
|
|
|
|
0
|
for $jj ( 0 .. $#transform ) { # interpolate non-equiv character |
|
408
|
0
|
0
|
|
|
|
0
|
if ($transform[$jj] < 0) { |
|
409
|
0
|
|
|
|
|
0
|
$transform[$jj] = $cc; |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
} |
|
412
|
|
|
|
|
|
|
} |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# if FROM charset length < 256, assume repeated for upper 128 chars |
|
415
|
0
|
0
|
|
|
|
0
|
if ($#transform < 128) {push @transform, @transform;} |
|
|
0
|
|
|
|
|
0
|
|
|
416
|
0
|
|
|
|
|
0
|
$yy = "$chset[0]".".to."."$chset[1]"; |
|
417
|
0
|
|
|
|
|
0
|
my $self = {TRN_NAM=>"$yy", TRN_ARY=>[@transform]}; |
|
418
|
0
|
|
|
|
|
0
|
bless $self, $class; |
|
419
|
0
|
|
|
|
|
0
|
return $self; |
|
420
|
|
|
|
|
|
|
} # end of sub new |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub relvnt_tab { # true if this is the sought char table |
|
424
|
1
|
|
|
1
|
0
|
2
|
my ($jj, $uu, $ww, $xx, $yy, $chch, $tabref); |
|
425
|
1
|
|
|
|
|
3
|
$chch = shift; # $chset[0 or 1] |
|
426
|
1
|
|
|
|
|
2
|
$tabref = shift; # reference to @thistab |
|
427
|
1
|
|
|
|
|
3
|
for $jj ( 0 .. $#{@$tabref} ){ |
|
|
1
|
|
|
|
|
185
|
|
|
428
|
0
|
|
|
|
|
0
|
$ww = $$tabref[$jj]; |
|
429
|
0
|
0
|
|
|
|
0
|
if ($ww !~ /^&(charset|alias)\s+([^\s]*)/){ |
|
430
|
0
|
|
|
|
|
0
|
next; |
|
431
|
|
|
|
|
|
|
} |
|
432
|
0
|
|
|
|
|
0
|
$xx = $1; $yy = $2; |
|
|
0
|
|
|
|
|
0
|
|
|
433
|
0
|
0
|
|
|
|
0
|
if ($chch =~ /^$yy$/i) { |
|
434
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "Found $xx $yy";} |
|
|
0
|
|
|
|
|
0
|
|
|
435
|
0
|
0
|
|
|
|
0
|
if ($xx eq "alias") { |
|
436
|
0
|
|
|
|
|
0
|
$uu = $$tabref[0]; |
|
437
|
0
|
|
|
|
|
0
|
$uu =~ s/^&//; |
|
438
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print " for $uu";} |
|
|
0
|
|
|
|
|
0
|
|
|
439
|
|
|
|
|
|
|
} |
|
440
|
0
|
0
|
|
|
|
0
|
if ($verbose) {print "\n";} |
|
|
0
|
|
|
|
|
0
|
|
|
441
|
0
|
|
|
|
|
0
|
return 1; # true |
|
442
|
|
|
|
|
|
|
} |
|
443
|
|
|
|
|
|
|
} |
|
444
|
0
|
|
|
|
|
0
|
return undef; # false |
|
445
|
|
|
|
|
|
|
} # end of sub relvnt_tab |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub load_dsf { # load code definitions and approximate substitutes |
|
449
|
1
|
|
|
1
|
0
|
2
|
my ($mm, $nn, @ww); |
|
450
|
1
|
50
|
|
|
|
91
|
unless (open(DSF, "$_[0]")) {print "Can't open $_[0]: $!.\n"; return undef;} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
451
|
1
|
|
|
|
|
56
|
while () { |
|
452
|
1
|
|
|
|
|
4
|
chomp; |
|
453
|
1
|
50
|
|
|
|
8
|
if (/^hash mnemonic=name/) { # first group header |
|
454
|
1
|
|
|
|
|
139
|
last; |
|
455
|
|
|
|
|
|
|
} |
|
456
|
|
|
|
|
|
|
} |
|
457
|
1
|
|
|
|
|
16
|
while () { |
|
458
|
1895
|
|
|
|
|
2358
|
chomp; |
|
459
|
1895
|
100
|
|
|
|
4165
|
if (/^hash mnemonic=substitute list/) { # next group header |
|
460
|
1
|
|
|
|
|
6
|
last; |
|
461
|
|
|
|
|
|
|
} |
|
462
|
1894
|
|
|
|
|
5980
|
($mm, $nn) = split(/\t/); |
|
463
|
1894
|
|
|
|
|
11970
|
$nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic |
|
464
|
1894
|
|
|
|
|
7604
|
$mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name |
|
465
|
|
|
|
|
|
|
} |
|
466
|
1
|
|
|
|
|
12
|
while () { |
|
467
|
985
|
|
|
|
|
1024
|
chomp; |
|
468
|
985
|
|
|
|
|
7556
|
($mm, @ww) = split(/\t/); |
|
469
|
985
|
|
|
|
|
5334
|
$aprox_mne{$mm} = [@ww]; |
|
470
|
|
|
|
|
|
|
} |
|
471
|
1
|
|
|
|
|
57
|
close DSF; |
|
472
|
1
|
|
|
|
|
12
|
return 1; |
|
473
|
|
|
|
|
|
|
} # end of sub load_dsf |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub build_substitutes { |
|
477
|
|
|
|
|
|
|
# creates lists of approximate substitutes |
|
478
|
|
|
|
|
|
|
# it takes about 90 minutes to recreate the file |
|
479
|
|
|
|
|
|
|
# there should be no need to run this since its result file never needs changing |
|
480
|
0
|
|
|
0
|
1
|
|
my ($xx, $aa, $yy, $bb, $ff, $hh, $pp, $jj, $kk, $gg, $rr, $mm, $nn, $start); |
|
481
|
0
|
|
|
|
|
|
my @ww; |
|
482
|
0
|
|
|
|
|
|
my $this = shift; |
|
483
|
0
|
|
0
|
|
|
|
my $class = ref($this) || $this; |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
#$start = time(); |
|
486
|
0
|
|
|
|
|
|
load_rfcdoc( $rfc_fnam); |
|
487
|
|
|
|
|
|
|
# find approximate substitutes |
|
488
|
|
|
|
|
|
|
# (mnemonic, hexcode, name) example: |
|
489
|
|
|
|
|
|
|
# mnemonic: j+- feef ARABIC LETTER ALEF MAKSURA ISOLATED FORM |
|
490
|
|
|
|
|
|
|
# substitute: j+ 0649 ARABIC LETTER ALEF MAKSURA |
|
491
|
|
|
|
|
|
|
# substitute: a+: e022 ARABIC LETTER ALEF FINAL FORM COMPATIBILITY (IBM868 144) |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
#print (time() - $start); print " delete words from right\n"; |
|
494
|
0
|
|
|
|
|
|
while (($xx, $aa) = each %mne_nam) { # (name, nmemonic) |
|
495
|
0
|
0
|
|
|
|
|
if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character |
|
496
|
0
|
|
|
|
|
|
next; |
|
497
|
|
|
|
|
|
|
} |
|
498
|
0
|
|
|
|
|
|
$yy = $xx; |
|
499
|
0
|
|
|
|
|
|
while ($yy =~ /[^\s]\s+[^\s]+\s*$/) { |
|
500
|
0
|
|
|
|
|
|
$yy =~ s/^(.*[^\s])\s+[^\s]+\s*$/$1/; # delete words from right |
|
501
|
0
|
0
|
|
|
|
|
if ($bb = $mne_nam{$yy}) { |
|
502
|
0
|
|
|
|
|
|
push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list |
|
|
0
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
#print "$aa\t@{$aprox_mne{$aa}}\n"; |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
} |
|
506
|
|
|
|
|
|
|
} |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
#print (time() - $start); print " delete words from left\n"; |
|
509
|
0
|
|
|
|
|
|
while (($xx, $aa) = each %mne_nam) { # (name, nmemonic) |
|
510
|
0
|
0
|
|
|
|
|
if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character |
|
511
|
0
|
|
|
|
|
|
next; |
|
512
|
|
|
|
|
|
|
} |
|
513
|
0
|
|
|
|
|
|
$yy = $xx; |
|
514
|
0
|
|
|
|
|
|
while ($yy =~ /[^\s]\s+[^\s]+\s*$/) { |
|
515
|
0
|
|
|
|
|
|
$yy =~ s/^[^\s]+\s+(.*)$/$1/; # delete words from left |
|
516
|
0
|
0
|
|
|
|
|
if ($yy =~ /^WITH\s/i) { # avoid matching overly broad phrases |
|
517
|
0
|
|
|
|
|
|
last; |
|
518
|
|
|
|
|
|
|
} |
|
519
|
0
|
0
|
|
|
|
|
if ($bb = $mne_nam{$yy}) { |
|
520
|
0
|
|
|
|
|
|
push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list |
|
|
0
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
#print "$aa\t@{$aprox_mne{$aa}}\n"; |
|
522
|
|
|
|
|
|
|
} |
|
523
|
|
|
|
|
|
|
} |
|
524
|
|
|
|
|
|
|
} |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# look for (string1 inside string2) or (string2 inside string1) |
|
527
|
|
|
|
|
|
|
# also look fcr "DIGIT ONE", "NUMBER TWO" strings |
|
528
|
|
|
|
|
|
|
# (disabled) also equate the 2 kinds of Japanese syllabic character sets |
|
529
|
|
|
|
|
|
|
# since subs are assigned reciprocally, algorithm avoids redundancy: for example, if |
|
530
|
|
|
|
|
|
|
# elements are (A, B, C, D, E), then comparisions will be E with (A, B, C, D), |
|
531
|
|
|
|
|
|
|
# D with (A, B, C), C with (A, B), B with (A) and A with nothing. |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
#print (time() - $start); print " look for string1 inside string2, number terms\n"; |
|
534
|
0
|
|
|
|
|
|
$ff = "DIGIT|NUMBER"; |
|
535
|
|
|
|
|
|
|
#$hh = "HIRAGANA|KATAKANA"; |
|
536
|
|
|
|
|
|
|
#$pp = "LETTER|LIGATURE|".$ff."|".$hh; |
|
537
|
0
|
|
|
|
|
|
$pp = "LETTER|LIGATURE|".$ff; |
|
538
|
0
|
|
|
|
|
|
$jj = scalar (keys %mne_nam); |
|
539
|
0
|
|
|
|
|
|
foreach $xx ( reverse keys %mne_nam ) { |
|
540
|
0
|
0
|
|
|
|
|
if ( ($kk = (--$jj)) <= 0) { |
|
541
|
0
|
|
|
|
|
|
last; |
|
542
|
|
|
|
|
|
|
} |
|
543
|
0
|
0
|
|
|
|
|
if ($xx !~ /($pp)/i) { # only certain types |
|
544
|
0
|
|
|
|
|
|
next; |
|
545
|
|
|
|
|
|
|
} |
|
546
|
0
|
0
|
|
|
|
|
if ($xx !~ /\b\w+\b\s+\b\w+/) { # at least 2 words |
|
547
|
0
|
|
|
|
|
|
next; |
|
548
|
|
|
|
|
|
|
} |
|
549
|
0
|
0
|
|
|
|
|
if ($xx =~ /\b($ff) \b(\w+)\b/i) { # digit, numeral, et al. match |
|
550
|
0
|
|
|
|
|
|
$gg = $2; |
|
551
|
|
|
|
|
|
|
} else { |
|
552
|
0
|
|
|
|
|
|
$gg = ""; |
|
553
|
|
|
|
|
|
|
} |
|
554
|
|
|
|
|
|
|
# if ($xx =~ /(.*)\b($hh)\b(.*)/i) { # equivalent Japanese syllabic sets |
|
555
|
|
|
|
|
|
|
# $xx = "$1"."HIRAGANA"."$2"; |
|
556
|
|
|
|
|
|
|
# $rr = "$1"."KATAKANA"."$2"; |
|
557
|
|
|
|
|
|
|
# } else { |
|
558
|
|
|
|
|
|
|
# $rr = ""; |
|
559
|
|
|
|
|
|
|
# } |
|
560
|
0
|
|
|
|
|
|
$aa = $mne_nam{$xx}; |
|
561
|
0
|
|
|
|
|
|
foreach $yy (keys %mne_nam) { |
|
562
|
0
|
0
|
|
|
|
|
if ( ($kk--) <= 0) { |
|
563
|
0
|
|
|
|
|
|
last; |
|
564
|
|
|
|
|
|
|
} |
|
565
|
0
|
0
|
|
|
|
|
if ($yy !~ /($pp)/i) { |
|
566
|
0
|
|
|
|
|
|
next; |
|
567
|
|
|
|
|
|
|
} |
|
568
|
0
|
0
|
|
|
|
|
if ($yy !~ /\b\w+\b\s+\b\w+/) { |
|
569
|
0
|
|
|
|
|
|
next; |
|
570
|
|
|
|
|
|
|
} |
|
571
|
0
|
0
|
0
|
|
|
|
if ( (($xx =~ /\b($yy)\b/i) || ($yy =~ /\b($xx)\b/i)) || |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
572
|
|
|
|
|
|
|
# (($rr) && ($xx =~ /\b($yy)\b/i) || ($yy =~ /\b($xx)\b/i)) || |
|
573
|
|
|
|
|
|
|
(($gg) && ($yy !~ /FRACTION/i) && ($yy =~ /\b($ff)\s+\b$gg\b/i)) ) { |
|
574
|
0
|
|
|
|
|
|
$bb = $mne_nam{$yy}; |
|
575
|
0
|
|
|
|
|
|
push @{ $aprox_mne{ $aa} }, "$bb"; # add to sub lists repciprocally |
|
|
0
|
|
|
|
|
|
|
|
576
|
0
|
|
|
|
|
|
push @{ $aprox_mne{ $bb} }, "$aa"; |
|
|
0
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
#print "$aa\t@{$aprox_mne{ $aa}}\n"; |
|
578
|
|
|
|
|
|
|
#print "$bb\t@{$aprox_mne{ $bb}}\n"; |
|
579
|
|
|
|
|
|
|
} |
|
580
|
|
|
|
|
|
|
} |
|
581
|
|
|
|
|
|
|
} |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
#print (time() - $start); print "\n"; |
|
584
|
|
|
|
|
|
|
#print "No substitutes found for these:\n"; |
|
585
|
0
|
|
|
|
|
|
foreach $aa ( keys %aprox_mne ) { |
|
586
|
0
|
0
|
|
|
|
|
if ( $#{ $aprox_mne{$aa}} < 0) { |
|
|
0
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# print "$aa\t$nam_mne{$aa}\n"; |
|
588
|
0
|
|
|
|
|
|
delete $aprox_mne{$aa}; |
|
589
|
|
|
|
|
|
|
} |
|
590
|
|
|
|
|
|
|
} |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
#print (time() - $start); print " eliminate duplicate substitutions\n"; |
|
593
|
0
|
|
|
|
|
|
foreach $aa ( keys %aprox_mne ) { # eliminate duplicate substitutions |
|
594
|
0
|
|
|
|
|
|
@ww = @{$aprox_mne{$aa}}; |
|
|
0
|
|
|
|
|
|
|
|
595
|
0
|
|
|
|
|
|
CZWARTY: for($gg = 0;;){ |
|
596
|
0
|
|
|
|
|
|
for ($jj = $#ww; $jj > 0; --$jj){ |
|
597
|
0
|
|
|
|
|
|
for ($kk = ($jj -1); $kk >= 0; --$kk){ |
|
598
|
0
|
0
|
|
|
|
|
if ("$ww[$kk]" eq "$ww[$jj]"){ |
|
599
|
0
|
|
|
|
|
|
splice @ww, $jj, 1; |
|
600
|
0
|
|
|
|
|
|
$gg = 1; |
|
601
|
0
|
|
|
|
|
|
next CZWARTY; |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
} |
|
605
|
0
|
|
|
|
|
|
last; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
0
|
0
|
|
|
|
|
if ($gg) { |
|
608
|
0
|
|
|
|
|
|
$aprox_mne{$aa} = [@ww]; |
|
609
|
|
|
|
|
|
|
} |
|
610
|
|
|
|
|
|
|
} |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
#print (time() - $start); print "\n"; |
|
613
|
|
|
|
|
|
|
# for user's protection, save old file if any |
|
614
|
0
|
|
|
|
|
|
$gg = "$substi_fnam".".bkp"; |
|
615
|
0
|
0
|
|
|
|
|
if (! -e $gg) {rename( $substi_fnam, "$gg")}; |
|
|
0
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
# contrived loop wherein second pass only when open or write failure |
|
617
|
0
|
|
|
|
|
|
OSMY: for ($yy = 0; $yy == 0; $yy = 1) { |
|
618
|
0
|
0
|
|
|
|
|
unless (open(DSF, ">$substi_fnam")) {next OSMY}; |
|
|
0
|
|
|
|
|
|
|
|
619
|
0
|
0
|
|
|
|
|
unless (print DSF "hash mnemonic=name\n") {next OSMY}; # header |
|
|
0
|
|
|
|
|
|
|
|
620
|
0
|
|
|
|
|
|
foreach $mm (sort keys %nam_mne ) { |
|
621
|
|
|
|
|
|
|
# mnemonic tab name newline |
|
622
|
0
|
0
|
|
|
|
|
unless (print DSF "$mm\t$nam_mne{$mm}\n") {next OSMY}; |
|
|
0
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
} |
|
624
|
0
|
0
|
|
|
|
|
unless (print DSF "hash mnemonic=substitute list\n") {next OSMY}; # header |
|
|
0
|
|
|
|
|
|
|
|
625
|
0
|
|
|
|
|
|
foreach $mm (sort keys %aprox_mne ) { |
|
626
|
0
|
0
|
|
|
|
|
unless (print DSF "$mm") {next OSMY}; # mnemonic |
|
|
0
|
|
|
|
|
|
|
|
627
|
0
|
|
|
|
|
|
foreach $aa (0 .. $#{$aprox_mne{$mm}}) { # each substitute |
|
|
0
|
|
|
|
|
|
|
|
628
|
0
|
0
|
|
|
|
|
unless (print DSF "\t$aprox_mne{$mm}[$aa]") {next OSMY}; |
|
|
0
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
} |
|
630
|
0
|
0
|
|
|
|
|
unless (print DSF "\n") {next OSMY}; # newline |
|
|
0
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
} |
|
632
|
0
|
|
|
|
|
|
last OSMY; |
|
633
|
|
|
|
|
|
|
} |
|
634
|
0
|
|
|
|
|
|
close DSF; |
|
635
|
0
|
0
|
|
|
|
|
if ($yy) { |
|
636
|
0
|
0
|
|
|
|
|
if (! -e $substi_fnam) {rename( "$gg", $substi_fnam)}; |
|
|
0
|
|
|
|
|
|
|
|
637
|
0
|
|
|
|
|
|
print "Failed to create $substi_fnam: $!\n"; |
|
638
|
0
|
|
|
|
|
|
return undef; |
|
639
|
|
|
|
|
|
|
} |
|
640
|
0
|
|
|
|
|
|
return 1; |
|
641
|
|
|
|
|
|
|
} # end of sub build_substitutes |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
sub load_rfcdoc { #load code definition list |
|
645
|
0
|
|
|
0
|
0
|
|
my ($mm, $nn, $jj, $catch, $hh, $xx, $kk, $yy); |
|
646
|
0
|
|
|
|
|
|
my $digits = \"ZERO|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE"; |
|
647
|
0
|
0
|
|
|
|
|
unless (open(RFC, $_[0])) {print "Can't open $_[0]: $!\n"; return undef;} |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
648
|
0
|
|
|
|
|
|
while() { |
|
649
|
0
|
|
|
|
|
|
chomp; |
|
650
|
0
|
0
|
|
|
|
|
if (/^3. CHARACTER MNEMONIC TABLE/) { |
|
651
|
0
|
|
|
|
|
|
last; |
|
652
|
|
|
|
|
|
|
} |
|
653
|
|
|
|
|
|
|
} |
|
654
|
0
|
|
|
|
|
|
$jj = 1; |
|
655
|
0
|
|
|
|
|
|
while() { |
|
656
|
0
|
|
|
|
|
|
chomp; |
|
657
|
0
|
0
|
|
|
|
|
if (/^4. CHARSETS/) { |
|
658
|
0
|
|
|
|
|
|
last; |
|
659
|
|
|
|
|
|
|
} |
|
660
|
|
|
|
|
|
|
# page head, foot, effectively empty lines |
|
661
|
0
|
0
|
0
|
|
|
|
if (/^(Simonsen|RFC 1345) / || /^.{0,3}$/) { |
|
662
|
0
|
|
|
|
|
|
next; |
|
663
|
|
|
|
|
|
|
} |
|
664
|
0
|
0
|
|
|
|
|
if (/SP\s+0020\s+SPACE/) { # first code def line |
|
665
|
0
|
|
|
|
|
|
$catch = "1"; |
|
666
|
|
|
|
|
|
|
} |
|
667
|
0
|
0
|
|
|
|
|
if (! $catch) { |
|
668
|
0
|
|
|
|
|
|
next; |
|
669
|
|
|
|
|
|
|
} |
|
670
|
0
|
|
|
|
|
|
++$jj; |
|
671
|
0
|
0
|
0
|
|
|
|
if(/^\s*([^\s]+)\s+([\da-fA-F]{4})\s{4}\b(.+)\s*$/ || |
|
672
|
|
|
|
|
|
|
/^(\s+)(e000)\s{4}\b(.+)\s*$/) { |
|
673
|
0
|
|
|
|
|
|
$mm = $1; $hh = $2; $xx = hex $2; $nn = $3; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# normalize unusual e000 format which indicates unfinished (Mnemonic) |
|
675
|
0
|
0
|
|
|
|
|
if ($hh eq "e000") { |
|
676
|
0
|
|
|
|
|
|
$mm = " "; # single space |
|
677
|
|
|
|
|
|
|
} |
|
678
|
|
|
|
|
|
|
# correct mistake in LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW |
|
679
|
0
|
0
|
|
|
|
|
if ($hh eq "1e4b") { |
|
680
|
0
|
|
|
|
|
|
$mm = "n->"; |
|
681
|
|
|
|
|
|
|
} |
|
682
|
|
|
|
|
|
|
# correct mistake in LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE |
|
683
|
0
|
0
|
|
|
|
|
if ($hh eq "1e69") { |
|
684
|
0
|
|
|
|
|
|
$mm = "s.-."; |
|
685
|
|
|
|
|
|
|
} |
|
686
|
0
|
0
|
0
|
|
|
|
if ($yy && ($jj != ($kk +1)) && ($xx != ($yy +1))) { |
|
|
|
|
0
|
|
|
|
|
|
687
|
0
|
|
|
|
|
|
print "Check sequence around this entry: $_\n"; |
|
688
|
|
|
|
|
|
|
} |
|
689
|
0
|
|
|
|
|
|
$yy = $xx; $kk = $jj; |
|
|
0
|
|
|
|
|
|
|
|
690
|
0
|
0
|
|
|
|
|
if ($nam_mne{$mm} ne "") { # already exists |
|
691
|
0
|
|
|
|
|
|
print "Same mnemonic $mm for $nam_mne{$mm} and hex code $hh \n"; |
|
692
|
|
|
|
|
|
|
} |
|
693
|
0
|
0
|
|
|
|
|
if ($mne_nam{$nn} ne "") { # already exists |
|
694
|
0
|
|
|
|
|
|
print "Same mnemonic $mne_nam{$nn} for $nn and hex code $hh \n"; |
|
695
|
|
|
|
|
|
|
} |
|
696
|
|
|
|
|
|
|
# normalize for more thorough substitution |
|
697
|
0
|
|
|
|
|
|
$nn =~ s/\b(SUBSCRIPT|SUPERSCRIPT)\s+($$digits)\b/$1 DIGIT $2/; |
|
698
|
0
|
|
|
|
|
|
$nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic |
|
699
|
0
|
|
|
|
|
|
$mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name |
|
700
|
0
|
|
|
|
|
|
$aprox_mne{$mm} = []; # list approx subs keyed by mnemonic |
|
701
|
0
|
|
|
|
|
|
next; |
|
702
|
|
|
|
|
|
|
} |
|
703
|
0
|
0
|
|
|
|
|
if (/^[ ]{16}([^\s].*)/) { # continuation line |
|
704
|
0
|
|
|
|
|
|
delete $mne_nam{$nn}; |
|
705
|
0
|
|
|
|
|
|
$nn = "$nn $1"; # append to name in prev line |
|
706
|
0
|
|
|
|
|
|
$nam_mne{$mm} = "$nn"; |
|
707
|
0
|
|
|
|
|
|
$mne_nam{$nn} = "$mm"; |
|
708
|
|
|
|
|
|
|
} |
|
709
|
|
|
|
|
|
|
} |
|
710
|
0
|
|
|
|
|
|
close RFC; |
|
711
|
0
|
|
|
|
|
|
return 1; |
|
712
|
|
|
|
|
|
|
} # end of sub load_rfcdoc |
|
713
|
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
sub transliterate { |
|
716
|
0
|
0
|
|
0
|
1
|
|
my $self = ($#_ > 0) ? shift : 0; |
|
717
|
0
|
0
|
|
|
|
|
my $arref = $self ? \@{$self->{TRN_ARY}} : \@transform; |
|
|
0
|
|
|
|
|
|
|
|
718
|
0
|
|
|
|
|
|
my @xx = unpack "C*", $_[0]; |
|
719
|
0
|
|
|
|
|
|
my $yy = ""; |
|
720
|
0
|
|
|
|
|
|
foreach ( @xx ) { |
|
721
|
0
|
|
|
|
|
|
$yy .= pack "C", @$arref[$_]; |
|
722
|
|
|
|
|
|
|
} |
|
723
|
0
|
|
|
|
|
|
return $yy; |
|
724
|
|
|
|
|
|
|
} # end of sub transliterate |
|
725
|
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
__END__ |