File Coverage

blib/lib/Convert/Translit.pm
Criterion Covered Total %
statement 67 485 13.8
branch 17 272 6.2
condition 1 48 2.0
subroutine 7 10 70.0
pod 3 6 50.0
total 95 821 11.5


line stmt bran cond sub pod time code
1             #!perl
2             # "Translit.pm" by Genji Schmeder 4 October 1996
3             # creates transliteration map between character sets defined in IETF RFC 1345
4             # with acknowledgements to Chris Leach, author of "EBCDIC.pm"
5             # and to Keld Simonsen, author of RFC 1345
6             # Copyright (c) 1997 Genji Schmeder. All rights reserved.
7             # This program is free software; you can redistribute it and/or modify
8             # it under the same terms as Perl itself.
9              
10             package Convert::Translit;
11 1     1   765 use strict;
  1         2  
  1         46  
12 1     1   5 use vars qw($VERSION @ISA @EXPORT_OK);
  1         3  
  1         130  
13             require Exporter;
14             @ISA = qw(Exporter);
15             @EXPORT_OK = qw(transliterate build_substitutes);
16             $VERSION = '1.03'; # dated 5 November 1997
17 1     1   925 use integer;
  1         14  
  1         6  
18              
19             if ($] < 5) {die "Perl version must be at least 5.\n";}
20              
21             my $here = where_module();
22             my $rfc_fnam = $here."rfc1345";
23             my $substi_fnam = $here."substitutes";
24             undef &where_module;
25              
26             my %nam_mne; # hash of name keyed by mnemonic
27             my %mne_nam; # hash of mnemonic keyed by name
28             my %aprox_mne; # hash of substitute list keyed by mnemonic
29             # if new() never called, transliterate() returns unchanged arg
30             my @transform = (0 .. 255);
31             my $verbose = 0;
32             1;
33              
34              
35             sub where_module {
36             my @xx = caller;
37 1     1   1458 use File::Basename;
  1         2  
  1         6696  
38             my $yy = dirname($xx[1]);
39             # adjust for Macintosh and Unix different return from dirname()
40             if ( ($^O ne "MacOS") && ($yy =~ /[^\/]$/) ) {
41             $yy .= "/";
42             }
43             return $yy;
44             } # end of sub where_module
45              
46              
47             sub new {
48 1     1 1 59 my ($bb, $cc, $dd, $ee, $gg, $ii, $jj, $kk, $line);
49 0         0 my ($mm, $pp, $qq, $uu, $vv, $ww, $xx, $yy, $zz);
50 0         0 my (@ch_tab, @thistab, @bitmap, @ff, @tt, @pp, @oo);
51 0         0 my (%duplcatfrom, %duplcatto);
52 1         3 my $this = shift;
53 1   33     9 my $class = ref($this) || $this;
54              
55 1         3 %nam_mne = ();
56 1         2 %mne_nam = ();
57 1         2 %aprox_mne = ();
58              
59             # syntax: new([FROM charset], TO charset, [verbose])
60 1         3 my @chset;
61 1 50       5 $chset[0] = ($#_ > 0) ? shift : "ascii"; # ascii is default FROM charset
62 1         3 $chset[1] = shift;
63 1         2 $verbose = shift;
64              
65             # typical rfc1345 table header:
66             # &charset CSA_Z243.4-1985-1
67             # &rem source: ECMA registry
68             # &alias iso-ir-121
69             # &g0esc x2877 &g1esc x2977 &g2esc x2a77 &g3esc x2b77
70             # &alias ISO646-CA
71             # &alias csa7-1
72             # &alias ca
73             # &code 0
74             # NU SH SX EX ET EQ AK BL BS HT LF VT FF CR SO SI
75              
76 1 50       5 load_dsf( $substi_fnam) or return undef;
77 1 50       5 if ($verbose) {print "Creating transliteration from \"$chset[0]\" to \"$chset[1]\"\n";}
  0         0  
78 1         7 for $ii ( 0 .. 1 ){
79             #traverse whole file twice for simplicity
80 1 50       109 unless (open(RFC, $rfc_fnam)) {print "Can't open $rfc_fnam: $!.\n"; return undef;}
  0         0  
  0         0  
81 1         41 while() { #skip to relevant section
82 2709 100       8809 if (/5.\s+CHARSET TABLES/) {
83 1         4 last;
84             }
85             }
86 1         4 @thistab = ();
87 1         8 while() { # read table lines
88 10         18 $line = $_;
89 10         12 chomp $line;
90 10 50       45 if ($line =~ /^ACKNOWLEDGEMENTS/) { # past relevant section
91 0 0       0 if (@thistab) { # was previous table collected?
92 0 0       0 if (relvnt_tab($chset[$ii], \@thistab) ) {
93 0         0 $ch_tab[$ii] = [ @thistab ];
94             }
95             }
96 0         0 last;
97             }
98 10 100       29 if ($line !~ /^ [^\s]/){ # distinguish table lines by 2 leading spaces
99 2         6 next;
100             }
101 8         24 $line =~ s/^\s*//; # trim leading spaces
102             # be sure of ending a table by encountering the next
103 8 100       20 if ($line !~ /^&charset\s/) {
104 6         11 push @thistab, $line;
105 6         13 next;
106             }
107 2 100       6 if (@thistab) { # was previous table collected?
108 1 0       8 if (relvnt_tab($chset[$ii], \@thistab)) {
109 0         0 $ch_tab[$ii] = [ @thistab ];
110 0         0 last;
111             }
112             }
113 1         20 @thistab = ($line); # start another table collection
114             }
115             }
116 0         0 close RFC;
117              
118             #examine charsets
119 0         0 $qq = 0;
120 0         0 for $ii ( 0 .. 1 ){
121 0 0       0 if(! $ch_tab[$ii][0]){
122 0         0 print STDERR "Couldn't find a character set for \"$chset[$ii]\".\n";
123 0         0 $qq = 1;
124             }
125             }
126 0 0       0 if ($qq) {return undef;}
  0         0  
127              
128 0         0 for $ii ( 0 .. 1 ){
129 0         0 for $jj ( 0 .. $#{$ch_tab[$ii]} ) {
  0         0  
130 0 0       0 if($ch_tab[$ii][$jj] =~ /^&bits\b\s*(\d*)/){
131 0 0 0     0 if ($1 != 0 || $1 != 8) {
132 0         0 print STDERR "Can't handle $1\-bit charsets like $chset[$ii].\n";
133 0         0 $qq = 1;
134 0         0 last;
135             }
136             }
137             }
138             }
139 0 0       0 if ($qq) {return undef;}
  0         0  
140              
141 0         0 for $ii ( 0 .. 1 ){
142 0         0 for $jj ( 0 .. $#{$ch_tab[$ii]} ) {
  0         0  
143 0 0       0 if($ch_tab[$ii][$jj] =~ /^&(code2|codex|comb2)\b/){
144 0         0 print STDERR "Can't handle $1 terms in charset $chset[$ii].\n";
145 0         0 $qq = 1;
146 0         0 last;
147             }
148             }
149             }
150 0 0       0 if ($qq) {return undef;}
  0         0  
151              
152             #create bit maps
153 0         0 for $ii ( 0 .. 1 ){
154 0 0       0 if ($verbose) {print "\nCharacter set $chset[$ii]:\n";}
  0         0  
155 0         0 $dd = -1; # code offset unless negative
156 0         0 for $jj ( 0 .. $#{$ch_tab[$ii]} ) {
  0         0  
157 0         0 $line = $ch_tab[$ii][$jj];
158 0 0       0 if($line =~ /^&code\b\s*(\d)/){
159 0         0 $dd = $1;
160 0         0 next;
161             }
162 0 0       0 if($line =~ /^&duplicate\s+([\da-fA-F]+)\s+([^\s]+)/){
163 0         0 $pp = $1; $mm = $2; # position already in base 10
  0         0  
164             # examples: "duplicate 91 AE", "duplicate 92 O/" (position, dup mnemonic)
165             # organize duplicate keeping differently for FROM and TO charsets
166 0 0       0 if ($ii == 0) {
167 0 0       0 if ($duplcatfrom{$pp} ) {
168             # FROM keyed by position
169 0         0 push @{$duplcatfrom{$pp}{Mnemonics}}, "$mm";
  0         0  
170             } else {
171 0         0 $duplcatfrom{$pp} = {Position => $pp, Mnemonics => [ "$mm" ],};
172             }
173             #print "FROM Duplicate for position $pp: $duplcatfrom{$pp}{Mnemonics}[-1]\n";
174             } else {
175 0         0 $duplcatto{$mm} = $pp; # TO keyed by mnemonic
176             #print "TO Duplicate for position $duplcatto{$mm}: $mm\n";
177             }
178             }
179             # flush control info (just use all defined keywords)
180 0 0       0 if($line =~
181             /^&(charset|alias|g\desc|bits|code|code2|codex|duplicate|rem|comb2)\b/) {
182 0         0 $dd = -1;
183 0         0 next;
184             }
185 0 0       0 if ($dd < 0) {
186 0 0       0 if ($verbose)
  0         0  
187             {print "Strange! You should examine the charset table: \"$line\"\n";}
188 0         0 next;
189             }
190 0         0 @pp = split(/\s+/, $line);
191 0         0 foreach $uu (@pp) {
192             # mnemonic "??" means position unused.
193             # mnemonic "__" means character set not completely defined here
194 0 0 0     0 if ( $uu eq "??" || $uu eq "__" ) {
195 0         0 $uu = "";
196             }
197 0 0 0     0 if (($uu) && (! $nam_mne{$uu})) {
198 0         0 printf "Warning: position %x (hex) has invalid mnemonic \"%s\".\n",
199             $dd, $uu;
200 0         0 $uu = "";
201             }
202 0         0 $bitmap[$ii][$dd++] = $uu;
203             }
204             # print ($#pp, $dd, (1 + $#{$bitmap[$ii]}), "\n");
205             }
206             # normalize char set length
207 0         0 $#{$bitmap[$ii]} += ((128 - ( (1 + $#{$bitmap[$ii]}) % 128)) % 128);
  0         0  
  0         0  
208             # print ("highest index is", $#{$bitmap[$ii]}, "\n" );
209 0         0 for $kk ( 0 .. $#{$bitmap[$ii]} ) {
  0         0  
210 0 0       0 if ($verbose) { print (length($bitmap[$ii][$kk])?"x":"_");}
  0 0       0  
211 0 0       0 if ( ! (($kk +1) % 16)) { if ($verbose) { print "\n";}}
  0 0       0  
  0         0  
212             }
213             }
214              
215             # explanation from RFC 1345:
216             # "&duplicate" has a special meaning indicating that a position
217             # is being used for more than one character. This is an ugly
218             # convention but it is a sad fact of life that same code in one
219             # coded character set can mean different characters.
220             # "&duplicate" takes two parameters - the first is the code to
221             # be duplicated, the other is the new mnemonic.
222             # &duplicate 91 AE
223             # &duplicate 92 O/
224              
225 0         0 $#transform = $#{$bitmap[0]}; # create transliterator table
  0         0  
226 0         0 PIERWSZY: for $jj ( 0 .. $#{$bitmap[0]} ) {
  0         0  
227 0         0 $transform[$jj] = -1; # initialize since 0 is valid value
228 0         0 @tt = ( $bitmap[0][$jj] ); # start list with one element
229 0 0       0 if ($duplcatfrom{$jj}) { # add any duplicates in the FROM charset
230 0         0 push @tt, @{$duplcatfrom{$jj}{Mnemonics}};
  0         0  
231             #print "BB $jj $#tt <<< @tt >>>\n";
232             }
233 0         0 @oo = ();
234 0         0 for $xx ( 0 .. $#tt ) { # refine list
235 0 0       0 if (length($tt[$xx]) > 0) {
236 0         0 push @oo, $tt[$xx];
237             }
238             }
239 0 0       0 if (! @oo) {
240 0         0 next;
241             }
242 0         0 for $bb ( 0 .. $#oo ) { # match any mnemonics for this position in FROM charset
243 0         0 $uu = $oo[$bb];
244 0         0 for $kk ( 0 .. $#{$bitmap[1]} ) { #search TO charset
  0         0  
245 0 0       0 if ($uu eq $bitmap[1][$kk]) {
246 0         0 $transform[$jj] = $kk;
247 0         0 next PIERWSZY;
248             }
249             }
250 0 0       0 if (exists $duplcatto{$uu}) { # search TO charset duplicates
251 0         0 $transform[$jj] = $duplcatto{$uu};
252 0         0 next PIERWSZY;
253             }
254             }
255             }
256              
257 0 0       0 if ($verbose) { print "\nTransliterator:\n";}
  0         0  
258 0         0 for $jj ( 0 .. $#transform ) {
259 0 0       0 if ($verbose) { print (($transform[$jj]>=0)?"x":"_");}
  0 0       0  
260 0 0       0 if ( ! (($jj+1) % 16)) { if ($verbose) { print "\n";}}
  0 0       0  
  0         0  
261             }
262              
263 0         0 $vv = $ww = 0;
264 0         0 for $jj ( 0 .. $#transform ) { # try to approximate using substitute lists
265 0 0       0 if ($transform[$jj] >= 0) {
266 0         0 next;
267             }
268 0 0       0 if (! ($gg = $bitmap[0][$jj]) ) { # undefined
269 0         0 next;
270             }
271 0         0 TRZECI: foreach $pp (@{$aprox_mne{$gg}}) {
  0         0  
272 0         0 for $cc ( 0 .. $#{$bitmap[1]} ) {
  0         0  
273 0         0 $zz = $bitmap[1][$cc];
274 0 0       0 if ($zz eq $pp) { # substitute must be in TO char set
275 0         0 $transform[$jj] = $cc; # success
276 0 0       0 if (!$ww) {
277 0         0 $ww = 1;
278 0 0       0 if ($verbose) {print "\nApproximate substitutes:\n";}
  0         0  
279             }
280 0 0       0 if ($verbose) { printf "%X==>%X\t%s==>%s\n", $jj, $cc,
  0         0  
281             $nam_mne{ $gg}, $nam_mne{ $zz};}
282 0         0 last TRZECI;
283             }
284             }
285             }
286 0         0 $vv = 1; # defined character but no substitute
287             }
288              
289 0 0       0 if ($ww) {
290 0 0       0 if ($verbose) {print "\nTransliterator with aproximate substitutions:\n";}
  0         0  
291 0         0 for $jj ( 0 .. $#transform ) {
292 0 0       0 if ($verbose) {print (($transform[$jj]>=0)?"x":"_");}
  0 0       0  
293 0 0       0 if ( ! (($jj+1) % 16)) { if ($verbose) {print "\n";}}
  0 0       0  
  0         0  
294             }
295             }
296              
297             # non-equivalent substitutions (like "RIGHT SQUARE BRACKET" for "CENT SIGN")
298 0 0       0 if ($vv) {
299 0         0 @ff = ();
300 0         0 for $jj ( 0 .. $#transform ) { # list those in need in FROM charset
301 0 0 0     0 if (($transform[$jj] < 0) && $bitmap[0][$jj] ) {
302 0         0 push @ff, $jj;
303             }
304             }
305 0 0       0 if (@ff) {
306 0         0 @tt = ();
307 0         0 PIATY: for $bb ( 0 .. $#{$bitmap[1]} ) { # list availables in TO charset
  0         0  
308             # start at usual second ascii graphic (hoping to maximize graphics)
309 0         0 $cc = (34 + $bb) % (1 + $#{$bitmap[1]} );
  0         0  
310 0 0       0 if ($zz = $bitmap[1][$cc]) {
311 0         0 for $ii ( 0 .. $#{$bitmap[0]} ) {
  0         0  
312 0 0       0 if ($zz eq $bitmap[0][$ii]) {
313 0         0 next PIATY;
314             }
315             }
316 0         0 push @tt, $cc;
317             }
318             }
319             }
320 0         0 $gg = $ee = 0;
321 0   0     0 SIODMY: while ( ($#ff >= 0) && ($#tt >= 0) ) {
322 0         0 for $jj ($gg .. $#ff) { # first try to match equal hex values
323 0         0 $ww = $ff[$jj];
324 0         0 for $kk (0 .. $#tt) {
325 0 0       0 if ( $ww == $tt[$kk]) {
326 0         0 $transform[ $ww] = $ww;
327 0 0       0 if (!$ee) {
328 0         0 $ee = 1;
329 0 0       0 if ($verbose) {print "\nNon-equivalent substitutes:\n";}
  0         0  
330             }
331 0 0       0 if ($verbose)
  0         0  
332             {printf "%X==>%X\t%s==>%s\n", $ww, $ww,
333             $nam_mne{$bitmap[0][ $ww]}, $nam_mne{$bitmap[1][ $ww]};}
334 0         0 splice @ff, $jj, 1;
335 0         0 splice @tt, $kk, 1;
336 0         0 $gg = $jj;
337 0         0 next SIODMY;
338             }
339             }
340             }
341 0         0 last;
342             }
343 0 0       0 $gg = ($#ff < $#tt)? $#ff : $#tt;
344 0         0 for $jj (0 .. $gg ) {
345 0         0 $transform[$ff[$jj]] = $tt[$jj];
346 0 0       0 if (!$ee) {
347 0         0 $ee = 1;
348 0 0       0 if ($verbose) {print "\nNon-equivalent substitutes:\n";}
  0         0  
349             }
350 0 0       0 if ($verbose)
  0         0  
351             {printf "%X==>%X\t%s==>%s\n", $ff[$jj], $tt[$jj],
352             $nam_mne{$bitmap[0][$ff[$jj]]}, $nam_mne{$bitmap[1][$tt[$jj]]};}
353             }
354             }
355              
356 0         0 $ee = $yy = 0; # anything left untranslated?
357 0         0 for $jj ( 0 .. $#transform ) {
358 0 0       0 if ($transform[$jj] < 0) {
359 0         0 $yy = 1;
360 0 0       0 if ($mm = $bitmap[0][$jj]) {
361 0 0       0 if (!$ee) {
362 0         0 $ee = 1;
363 0 0       0 if ($verbose) {print "\nNon-equivalent remnant:\n";}
  0         0  
364             }
365 0 0       0 if ($verbose) {printf "%X\t%s\n", $jj, $nam_mne{$mm};}
  0         0  
366             }
367             }
368             }
369              
370             # if non-equivalent remnant, then select a hopefully unique and graphic indicator
371 0 0       0 if ($ee) {
372 0         0 DRUGI: for $bb ( 0 .. $#{$bitmap[1]} ) {
  0         0  
373 0         0 $cc = (34 + $bb) % (1 + $#{$bitmap[1]} );
  0         0  
374 0 0       0 if ($xx = $bitmap[1][$cc]) {
375 0         0 for $dd ( 0 .. $#{$bitmap[0]} ) {
  0         0  
376 0 0       0 if ( $xx eq $bitmap[0][$dd]) { # reject if in FROM charset
377 0         0 next DRUGI;
378             }
379             }
380 0         0 last; # success
381             }
382             }
383 0 0       0 if ($verbose)
  0         0  
384             {printf "\nNon-equivalence indicator: %X\t%s\n", $cc, $nam_mne{$xx}};
385 0         0 for $jj ( 0 .. $#transform ) { # interpolate non-equiv character
386 0 0 0     0 if (($transform[$jj] < 0) && $bitmap[0][$jj]) {
387 0         0 $transform[$jj] = $cc;
388             }
389             }
390             }
391              
392             # if undefined remnant, then select a possibly undefined indicator
393 0 0       0 if ($yy) {
394 0         0 SZOSTY: for ($cc = $#{$bitmap[1]}; $cc >= 0; --$cc ) {
  0         0  
395 0 0       0 if (! ($xx = $bitmap[1][$cc]) ) {
396 0         0 last;
397             }
398 0         0 for $dd ( 0 .. $#{$bitmap[0]} ) {
  0         0  
399 0 0       0 if ($xx eq $bitmap[0][$dd] ) {
400 0         0 next SZOSTY;
401             }
402             }
403 0         0 last; # success
404             }
405 0 0       0 if ( ! ($yy = $nam_mne{$xx}) ) {$yy = "(undefined character)";}
  0         0  
406 0 0       0 if ($verbose) {printf "\nUndefined indicator: %X\t%s\n", $cc, $yy};
  0         0  
407 0         0 for $jj ( 0 .. $#transform ) { # interpolate non-equiv character
408 0 0       0 if ($transform[$jj] < 0) {
409 0         0 $transform[$jj] = $cc;
410             }
411             }
412             }
413              
414             # if FROM charset length < 256, assume repeated for upper 128 chars
415 0 0       0 if ($#transform < 128) {push @transform, @transform;}
  0         0  
416 0         0 $yy = "$chset[0]".".to."."$chset[1]";
417 0         0 my $self = {TRN_NAM=>"$yy", TRN_ARY=>[@transform]};
418 0         0 bless $self, $class;
419 0         0 return $self;
420             } # end of sub new
421              
422              
423             sub relvnt_tab { # true if this is the sought char table
424 1     1 0 2 my ($jj, $uu, $ww, $xx, $yy, $chch, $tabref);
425 1         3 $chch = shift; # $chset[0 or 1]
426 1         2 $tabref = shift; # reference to @thistab
427 1         3 for $jj ( 0 .. $#{@$tabref} ){
  1         185  
428 0         0 $ww = $$tabref[$jj];
429 0 0       0 if ($ww !~ /^&(charset|alias)\s+([^\s]*)/){
430 0         0 next;
431             }
432 0         0 $xx = $1; $yy = $2;
  0         0  
433 0 0       0 if ($chch =~ /^$yy$/i) {
434 0 0       0 if ($verbose) {print "Found $xx $yy";}
  0         0  
435 0 0       0 if ($xx eq "alias") {
436 0         0 $uu = $$tabref[0];
437 0         0 $uu =~ s/^&//;
438 0 0       0 if ($verbose) {print " for $uu";}
  0         0  
439             }
440 0 0       0 if ($verbose) {print "\n";}
  0         0  
441 0         0 return 1; # true
442             }
443             }
444 0         0 return undef; # false
445             } # end of sub relvnt_tab
446              
447              
448             sub load_dsf { # load code definitions and approximate substitutes
449 1     1 0 2 my ($mm, $nn, @ww);
450 1 50       91 unless (open(DSF, "$_[0]")) {print "Can't open $_[0]: $!.\n"; return undef;}
  0         0  
  0         0  
451 1         56 while () {
452 1         4 chomp;
453 1 50       8 if (/^hash mnemonic=name/) { # first group header
454 1         139 last;
455             }
456             }
457 1         16 while () {
458 1895         2358 chomp;
459 1895 100       4165 if (/^hash mnemonic=substitute list/) { # next group header
460 1         6 last;
461             }
462 1894         5980 ($mm, $nn) = split(/\t/);
463 1894         11970 $nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic
464 1894         7604 $mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name
465             }
466 1         12 while () {
467 985         1024 chomp;
468 985         7556 ($mm, @ww) = split(/\t/);
469 985         5334 $aprox_mne{$mm} = [@ww];
470             }
471 1         57 close DSF;
472 1         12 return 1;
473             } # end of sub load_dsf
474              
475              
476             sub build_substitutes {
477             # creates lists of approximate substitutes
478             # it takes about 90 minutes to recreate the file
479             # there should be no need to run this since its result file never needs changing
480 0     0 1   my ($xx, $aa, $yy, $bb, $ff, $hh, $pp, $jj, $kk, $gg, $rr, $mm, $nn, $start);
481 0           my @ww;
482 0           my $this = shift;
483 0   0       my $class = ref($this) || $this;
484              
485             #$start = time();
486 0           load_rfcdoc( $rfc_fnam);
487             # find approximate substitutes
488             # (mnemonic, hexcode, name) example:
489             # mnemonic: j+- feef ARABIC LETTER ALEF MAKSURA ISOLATED FORM
490             # substitute: j+ 0649 ARABIC LETTER ALEF MAKSURA
491             # substitute: a+: e022 ARABIC LETTER ALEF FINAL FORM COMPATIBILITY (IBM868 144)
492              
493             #print (time() - $start); print " delete words from right\n";
494 0           while (($xx, $aa) = each %mne_nam) { # (name, nmemonic)
495 0 0         if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character
496 0           next;
497             }
498 0           $yy = $xx;
499 0           while ($yy =~ /[^\s]\s+[^\s]+\s*$/) {
500 0           $yy =~ s/^(.*[^\s])\s+[^\s]+\s*$/$1/; # delete words from right
501 0 0         if ($bb = $mne_nam{$yy}) {
502 0           push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list
  0            
503             #print "$aa\t@{$aprox_mne{$aa}}\n";
504             }
505             }
506             }
507              
508             #print (time() - $start); print " delete words from left\n";
509 0           while (($xx, $aa) = each %mne_nam) { # (name, nmemonic)
510 0 0         if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character
511 0           next;
512             }
513 0           $yy = $xx;
514 0           while ($yy =~ /[^\s]\s+[^\s]+\s*$/) {
515 0           $yy =~ s/^[^\s]+\s+(.*)$/$1/; # delete words from left
516 0 0         if ($yy =~ /^WITH\s/i) { # avoid matching overly broad phrases
517 0           last;
518             }
519 0 0         if ($bb = $mne_nam{$yy}) {
520 0           push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list
  0            
521             #print "$aa\t@{$aprox_mne{$aa}}\n";
522             }
523             }
524             }
525              
526             # look for (string1 inside string2) or (string2 inside string1)
527             # also look fcr "DIGIT ONE", "NUMBER TWO" strings
528             # (disabled) also equate the 2 kinds of Japanese syllabic character sets
529             # since subs are assigned reciprocally, algorithm avoids redundancy: for example, if
530             # elements are (A, B, C, D, E), then comparisions will be E with (A, B, C, D),
531             # D with (A, B, C), C with (A, B), B with (A) and A with nothing.
532              
533             #print (time() - $start); print " look for string1 inside string2, number terms\n";
534 0           $ff = "DIGIT|NUMBER";
535             #$hh = "HIRAGANA|KATAKANA";
536             #$pp = "LETTER|LIGATURE|".$ff."|".$hh;
537 0           $pp = "LETTER|LIGATURE|".$ff;
538 0           $jj = scalar (keys %mne_nam);
539 0           foreach $xx ( reverse keys %mne_nam ) {
540 0 0         if ( ($kk = (--$jj)) <= 0) {
541 0           last;
542             }
543 0 0         if ($xx !~ /($pp)/i) { # only certain types
544 0           next;
545             }
546 0 0         if ($xx !~ /\b\w+\b\s+\b\w+/) { # at least 2 words
547 0           next;
548             }
549 0 0         if ($xx =~ /\b($ff) \b(\w+)\b/i) { # digit, numeral, et al. match
550 0           $gg = $2;
551             } else {
552 0           $gg = "";
553             }
554             # if ($xx =~ /(.*)\b($hh)\b(.*)/i) { # equivalent Japanese syllabic sets
555             # $xx = "$1"."HIRAGANA"."$2";
556             # $rr = "$1"."KATAKANA"."$2";
557             # } else {
558             # $rr = "";
559             # }
560 0           $aa = $mne_nam{$xx};
561 0           foreach $yy (keys %mne_nam) {
562 0 0         if ( ($kk--) <= 0) {
563 0           last;
564             }
565 0 0         if ($yy !~ /($pp)/i) {
566 0           next;
567             }
568 0 0         if ($yy !~ /\b\w+\b\s+\b\w+/) {
569 0           next;
570             }
571 0 0 0       if ( (($xx =~ /\b($yy)\b/i) || ($yy =~ /\b($xx)\b/i)) ||
      0        
      0        
      0        
572             # (($rr) && ($xx =~ /\b($yy)\b/i) || ($yy =~ /\b($xx)\b/i)) ||
573             (($gg) && ($yy !~ /FRACTION/i) && ($yy =~ /\b($ff)\s+\b$gg\b/i)) ) {
574 0           $bb = $mne_nam{$yy};
575 0           push @{ $aprox_mne{ $aa} }, "$bb"; # add to sub lists repciprocally
  0            
576 0           push @{ $aprox_mne{ $bb} }, "$aa";
  0            
577             #print "$aa\t@{$aprox_mne{ $aa}}\n";
578             #print "$bb\t@{$aprox_mne{ $bb}}\n";
579             }
580             }
581             }
582              
583             #print (time() - $start); print "\n";
584             #print "No substitutes found for these:\n";
585 0           foreach $aa ( keys %aprox_mne ) {
586 0 0         if ( $#{ $aprox_mne{$aa}} < 0) {
  0            
587             # print "$aa\t$nam_mne{$aa}\n";
588 0           delete $aprox_mne{$aa};
589             }
590             }
591              
592             #print (time() - $start); print " eliminate duplicate substitutions\n";
593 0           foreach $aa ( keys %aprox_mne ) { # eliminate duplicate substitutions
594 0           @ww = @{$aprox_mne{$aa}};
  0            
595 0           CZWARTY: for($gg = 0;;){
596 0           for ($jj = $#ww; $jj > 0; --$jj){
597 0           for ($kk = ($jj -1); $kk >= 0; --$kk){
598 0 0         if ("$ww[$kk]" eq "$ww[$jj]"){
599 0           splice @ww, $jj, 1;
600 0           $gg = 1;
601 0           next CZWARTY;
602             }
603             }
604             }
605 0           last;
606             }
607 0 0         if ($gg) {
608 0           $aprox_mne{$aa} = [@ww];
609             }
610             }
611              
612             #print (time() - $start); print "\n";
613             # for user's protection, save old file if any
614 0           $gg = "$substi_fnam".".bkp";
615 0 0         if (! -e $gg) {rename( $substi_fnam, "$gg")};
  0            
616             # contrived loop wherein second pass only when open or write failure
617 0           OSMY: for ($yy = 0; $yy == 0; $yy = 1) {
618 0 0         unless (open(DSF, ">$substi_fnam")) {next OSMY};
  0            
619 0 0         unless (print DSF "hash mnemonic=name\n") {next OSMY}; # header
  0            
620 0           foreach $mm (sort keys %nam_mne ) {
621             # mnemonic tab name newline
622 0 0         unless (print DSF "$mm\t$nam_mne{$mm}\n") {next OSMY};
  0            
623             }
624 0 0         unless (print DSF "hash mnemonic=substitute list\n") {next OSMY}; # header
  0            
625 0           foreach $mm (sort keys %aprox_mne ) {
626 0 0         unless (print DSF "$mm") {next OSMY}; # mnemonic
  0            
627 0           foreach $aa (0 .. $#{$aprox_mne{$mm}}) { # each substitute
  0            
628 0 0         unless (print DSF "\t$aprox_mne{$mm}[$aa]") {next OSMY};
  0            
629             }
630 0 0         unless (print DSF "\n") {next OSMY}; # newline
  0            
631             }
632 0           last OSMY;
633             }
634 0           close DSF;
635 0 0         if ($yy) {
636 0 0         if (! -e $substi_fnam) {rename( "$gg", $substi_fnam)};
  0            
637 0           print "Failed to create $substi_fnam: $!\n";
638 0           return undef;
639             }
640 0           return 1;
641             } # end of sub build_substitutes
642              
643              
644             sub load_rfcdoc { #load code definition list
645 0     0 0   my ($mm, $nn, $jj, $catch, $hh, $xx, $kk, $yy);
646 0           my $digits = \"ZERO|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE";
647 0 0         unless (open(RFC, $_[0])) {print "Can't open $_[0]: $!\n"; return undef;}
  0            
  0            
648 0           while() {
649 0           chomp;
650 0 0         if (/^3. CHARACTER MNEMONIC TABLE/) {
651 0           last;
652             }
653             }
654 0           $jj = 1;
655 0           while() {
656 0           chomp;
657 0 0         if (/^4. CHARSETS/) {
658 0           last;
659             }
660             # page head, foot, effectively empty lines
661 0 0 0       if (/^(Simonsen|RFC 1345) / || /^.{0,3}$/) {
662 0           next;
663             }
664 0 0         if (/SP\s+0020\s+SPACE/) { # first code def line
665 0           $catch = "1";
666             }
667 0 0         if (! $catch) {
668 0           next;
669             }
670 0           ++$jj;
671 0 0 0       if(/^\s*([^\s]+)\s+([\da-fA-F]{4})\s{4}\b(.+)\s*$/ ||
672             /^(\s+)(e000)\s{4}\b(.+)\s*$/) {
673 0           $mm = $1; $hh = $2; $xx = hex $2; $nn = $3;
  0            
  0            
  0            
674             # normalize unusual e000 format which indicates unfinished (Mnemonic)
675 0 0         if ($hh eq "e000") {
676 0           $mm = " "; # single space
677             }
678             # correct mistake in LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW
679 0 0         if ($hh eq "1e4b") {
680 0           $mm = "n->";
681             }
682             # correct mistake in LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE
683 0 0         if ($hh eq "1e69") {
684 0           $mm = "s.-.";
685             }
686 0 0 0       if ($yy && ($jj != ($kk +1)) && ($xx != ($yy +1))) {
      0        
687 0           print "Check sequence around this entry: $_\n";
688             }
689 0           $yy = $xx; $kk = $jj;
  0            
690 0 0         if ($nam_mne{$mm} ne "") { # already exists
691 0           print "Same mnemonic $mm for $nam_mne{$mm} and hex code $hh \n";
692             }
693 0 0         if ($mne_nam{$nn} ne "") { # already exists
694 0           print "Same mnemonic $mne_nam{$nn} for $nn and hex code $hh \n";
695             }
696             # normalize for more thorough substitution
697 0           $nn =~ s/\b(SUBSCRIPT|SUPERSCRIPT)\s+($$digits)\b/$1 DIGIT $2/;
698 0           $nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic
699 0           $mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name
700 0           $aprox_mne{$mm} = []; # list approx subs keyed by mnemonic
701 0           next;
702             }
703 0 0         if (/^[ ]{16}([^\s].*)/) { # continuation line
704 0           delete $mne_nam{$nn};
705 0           $nn = "$nn $1"; # append to name in prev line
706 0           $nam_mne{$mm} = "$nn";
707 0           $mne_nam{$nn} = "$mm";
708             }
709             }
710 0           close RFC;
711 0           return 1;
712             } # end of sub load_rfcdoc
713              
714              
715             sub transliterate {
716 0 0   0 1   my $self = ($#_ > 0) ? shift : 0;
717 0 0         my $arref = $self ? \@{$self->{TRN_ARY}} : \@transform;
  0            
718 0           my @xx = unpack "C*", $_[0];
719 0           my $yy = "";
720 0           foreach ( @xx ) {
721 0           $yy .= pack "C", @$arref[$_];
722             }
723 0           return $yy;
724             } # end of sub transliterate
725              
726             __END__