File Coverage

blib/lib/Convert/Translit.pm

Criterion	Covered	Total	%
statement	67	485	13.8
branch	17	272	6.2
condition	1	48	2.0
subroutine	7	10	70.0
pod	3	6	50.0
total	95	821	11.5

line	stmt	bran	cond	sub	pod	time	code
1							#!perl
2							# "Translit.pm" by Genji Schmeder 4 October 1996
3							# creates transliteration map between character sets defined in IETF RFC 1345
4							# with acknowledgements to Chris Leach, author of "EBCDIC.pm"
5							# and to Keld Simonsen, author of RFC 1345
6							# Copyright (c) 1997 Genji Schmeder. All rights reserved.
7							# This program is free software; you can redistribute it and/or modify
8							# it under the same terms as Perl itself.
9
10							package Convert::Translit;
11	1			1		765	use strict;
	1					2
	1					46
12	1			1		5	use vars qw($VERSION @ISA @EXPORT_OK);
	1					3
	1					130
13							require Exporter;
14							@ISA = qw(Exporter);
15							@EXPORT_OK = qw(transliterate build_substitutes);
16							$VERSION = '1.03'; # dated 5 November 1997
17	1			1		925	use integer;
	1					14
	1					6
18
19							if ($] < 5) {die "Perl version must be at least 5.\n";}
20
21							my $here = where_module();
22							my $rfc_fnam = $here."rfc1345";
23							my $substi_fnam = $here."substitutes";
24							undef &where_module;
25
26							my %nam_mne; # hash of name keyed by mnemonic
27							my %mne_nam; # hash of mnemonic keyed by name
28							my %aprox_mne; # hash of substitute list keyed by mnemonic
29							# if new() never called, transliterate() returns unchanged arg
30							my @transform = (0 .. 255);
31							my $verbose = 0;
32							1;
33
34
35							sub where_module {
36							my @xx = caller;
37	1			1		1458	use File::Basename;
	1					2
	1					6696
38							my $yy = dirname($xx[1]);
39							# adjust for Macintosh and Unix different return from dirname()
40							if ( ($^O ne "MacOS") && ($yy =~ /[^\/]$/) ) {
41							$yy .= "/";
42							}
43							return $yy;
44							} # end of sub where_module
45
46
47							sub new {
48	1			1	1	59	my ($bb, $cc, $dd, $ee, $gg, $ii, $jj, $kk, $line);
49	0					0	my ($mm, $pp, $qq, $uu, $vv, $ww, $xx, $yy, $zz);
50	0					0	my (@ch_tab, @thistab, @bitmap, @ff, @tt, @pp, @oo);
51	0					0	my (%duplcatfrom, %duplcatto);
52	1					3	my $this = shift;
53	1		33			9	my $class = ref($this) \|\| $this;
54
55	1					3	%nam_mne = ();
56	1					2	%mne_nam = ();
57	1					2	%aprox_mne = ();
58
59							# syntax: new([FROM charset], TO charset, [verbose])
60	1					3	my @chset;
61	1	50				5	$chset[0] = ($#_ > 0) ? shift : "ascii"; # ascii is default FROM charset
62	1					3	$chset[1] = shift;
63	1					2	$verbose = shift;
64
65							# typical rfc1345 table header:
66							# &charset CSA_Z243.4-1985-1
67							# &rem source: ECMA registry
68							# &alias iso-ir-121
69							# &g0esc x2877 &g1esc x2977 &g2esc x2a77 &g3esc x2b77
70							# &alias ISO646-CA
71							# &alias csa7-1
72							# &alias ca
73							# &code 0
74							# NU SH SX EX ET EQ AK BL BS HT LF VT FF CR SO SI
75
76	1	50				5	load_dsf( $substi_fnam) or return undef;
77	1	50				5	if ($verbose) {print "Creating transliteration from \"$chset[0]\" to \"$chset[1]\"\n";}
	0					0
78	1					7	for $ii ( 0 .. 1 ){
79							#traverse whole file twice for simplicity
80	1	50				109	unless (open(RFC, $rfc_fnam)) {print "Can't open $rfc_fnam: $!.\n"; return undef;}
	0					0
	0					0
81	1					41	while() { #skip to relevant section
82	2709	100				8809	if (/5.\s+CHARSET TABLES/) {
83	1					4	last;
84							}
85							}
86	1					4	@thistab = ();
87	1					8	while() { # read table lines
88	10					18	$line = $_;
89	10					12	chomp $line;
90	10	50				45	if ($line =~ /^ACKNOWLEDGEMENTS/) { # past relevant section
91	0	0				0	if (@thistab) { # was previous table collected?
92	0	0				0	if (relvnt_tab($chset[$ii], \@thistab) ) {
93	0					0	$ch_tab[$ii] = [ @thistab ];
94							}
95							}
96	0					0	last;
97							}
98	10	100				29	if ($line !~ /^ [^\s]/){ # distinguish table lines by 2 leading spaces
99	2					6	next;
100							}
101	8					24	$line =~ s/^\s*//; # trim leading spaces
102							# be sure of ending a table by encountering the next
103	8	100				20	if ($line !~ /^&charset\s/) {
104	6					11	push @thistab, $line;
105	6					13	next;
106							}
107	2	100				6	if (@thistab) { # was previous table collected?
108	1	0				8	if (relvnt_tab($chset[$ii], \@thistab)) {
109	0					0	$ch_tab[$ii] = [ @thistab ];
110	0					0	last;
111							}
112							}
113	1					20	@thistab = ($line); # start another table collection
114							}
115							}
116	0					0	close RFC;
117
118							#examine charsets
119	0					0	$qq = 0;
120	0					0	for $ii ( 0 .. 1 ){
121	0	0				0	if(! $ch_tab[$ii][0]){
122	0					0	print STDERR "Couldn't find a character set for \"$chset[$ii]\".\n";
123	0					0	$qq = 1;
124							}
125							}
126	0	0				0	if ($qq) {return undef;}
	0					0
127
128	0					0	for $ii ( 0 .. 1 ){
129	0					0	for $jj ( 0 .. $#{$ch_tab[$ii]} ) {
	0					0
130	0	0				0	if($ch_tab[$ii][$jj] =~ /^&bits\b\s(\d)/){
131	0	0	0			0	if ($1 != 0 \|\| $1 != 8) {
132	0					0	print STDERR "Can't handle $1\-bit charsets like $chset[$ii].\n";
133	0					0	$qq = 1;
134	0					0	last;
135							}
136							}
137							}
138							}
139	0	0				0	if ($qq) {return undef;}
	0					0
140
141	0					0	for $ii ( 0 .. 1 ){
142	0					0	for $jj ( 0 .. $#{$ch_tab[$ii]} ) {
	0					0
143	0	0				0	if($ch_tab[$ii][$jj] =~ /^&(code2\|codex\|comb2)\b/){
144	0					0	print STDERR "Can't handle $1 terms in charset $chset[$ii].\n";
145	0					0	$qq = 1;
146	0					0	last;
147							}
148							}
149							}
150	0	0				0	if ($qq) {return undef;}
	0					0
151
152							#create bit maps
153	0					0	for $ii ( 0 .. 1 ){
154	0	0				0	if ($verbose) {print "\nCharacter set $chset[$ii]:\n";}
	0					0
155	0					0	$dd = -1; # code offset unless negative
156	0					0	for $jj ( 0 .. $#{$ch_tab[$ii]} ) {
	0					0
157	0					0	$line = $ch_tab[$ii][$jj];
158	0	0				0	if($line =~ /^&code\b\s*(\d)/){
159	0					0	$dd = $1;
160	0					0	next;
161							}
162	0	0				0	if($line =~ /^&duplicate\s+([\da-fA-F]+)\s+([^\s]+)/){
163	0					0	$pp = $1; $mm = $2; # position already in base 10
	0					0
164							# examples: "duplicate 91 AE", "duplicate 92 O/" (position, dup mnemonic)
165							# organize duplicate keeping differently for FROM and TO charsets
166	0	0				0	if ($ii == 0) {
167	0	0				0	if ($duplcatfrom{$pp} ) {
168							# FROM keyed by position
169	0					0	push @{$duplcatfrom{$pp}{Mnemonics}}, "$mm";
	0					0
170							} else {
171	0					0	$duplcatfrom{$pp} = {Position => $pp, Mnemonics => [ "$mm" ],};
172							}
173							#print "FROM Duplicate for position $pp: $duplcatfrom{$pp}{Mnemonics}[-1]\n";
174							} else {
175	0					0	$duplcatto{$mm} = $pp; # TO keyed by mnemonic
176							#print "TO Duplicate for position $duplcatto{$mm}: $mm\n";
177							}
178							}
179							# flush control info (just use all defined keywords)
180	0	0				0	if($line =~
181							/^&(charset\|alias\|g\desc\|bits\|code\|code2\|codex\|duplicate\|rem\|comb2)\b/) {
182	0					0	$dd = -1;
183	0					0	next;
184							}
185	0	0				0	if ($dd < 0) {
186	0	0				0	if ($verbose)
	0					0
187							{print "Strange! You should examine the charset table: \"$line\"\n";}
188	0					0	next;
189							}
190	0					0	@pp = split(/\s+/, $line);
191	0					0	foreach $uu (@pp) {
192							# mnemonic "??" means position unused.
193							# mnemonic "__" means character set not completely defined here
194	0	0	0			0	if ( $uu eq "??" \|\| $uu eq "__" ) {
195	0					0	$uu = "";
196							}
197	0	0	0			0	if (($uu) && (! $nam_mne{$uu})) {
198	0					0	printf "Warning: position %x (hex) has invalid mnemonic \"%s\".\n",
199							$dd, $uu;
200	0					0	$uu = "";
201							}
202	0					0	$bitmap[$ii][$dd++] = $uu;
203							}
204							# print ($#pp, $dd, (1 + $#{$bitmap[$ii]}), "\n");
205							}
206							# normalize char set length
207	0					0	$#{$bitmap[$ii]} += ((128 - ( (1 + $#{$bitmap[$ii]}) % 128)) % 128);
	0					0
	0					0
208							# print ("highest index is", $#{$bitmap[$ii]}, "\n" );
209	0					0	for $kk ( 0 .. $#{$bitmap[$ii]} ) {
	0					0
210	0	0				0	if ($verbose) { print (length($bitmap[$ii][$kk])?"x":"_");}
	0	0				0
211	0	0				0	if ( ! (($kk +1) % 16)) { if ($verbose) { print "\n";}}
	0	0				0
	0					0
212							}
213							}
214
215							# explanation from RFC 1345:
216							# "&duplicate" has a special meaning indicating that a position
217							# is being used for more than one character. This is an ugly
218							# convention but it is a sad fact of life that same code in one
219							# coded character set can mean different characters.
220							# "&duplicate" takes two parameters - the first is the code to
221							# be duplicated, the other is the new mnemonic.
222							# &duplicate 91 AE
223							# &duplicate 92 O/
224
225	0					0	$#transform = $#{$bitmap[0]}; # create transliterator table
	0					0
226	0					0	PIERWSZY: for $jj ( 0 .. $#{$bitmap[0]} ) {
	0					0
227	0					0	$transform[$jj] = -1; # initialize since 0 is valid value
228	0					0	@tt = ( $bitmap[0][$jj] ); # start list with one element
229	0	0				0	if ($duplcatfrom{$jj}) { # add any duplicates in the FROM charset
230	0					0	push @tt, @{$duplcatfrom{$jj}{Mnemonics}};
	0					0
231							#print "BB $jj $#tt <<< @tt >>>\n";
232							}
233	0					0	@oo = ();
234	0					0	for $xx ( 0 .. $#tt ) { # refine list
235	0	0				0	if (length($tt[$xx]) > 0) {
236	0					0	push @oo, $tt[$xx];
237							}
238							}
239	0	0				0	if (! @oo) {
240	0					0	next;
241							}
242	0					0	for $bb ( 0 .. $#oo ) { # match any mnemonics for this position in FROM charset
243	0					0	$uu = $oo[$bb];
244	0					0	for $kk ( 0 .. $#{$bitmap[1]} ) { #search TO charset
	0					0
245	0	0				0	if ($uu eq $bitmap[1][$kk]) {
246	0					0	$transform[$jj] = $kk;
247	0					0	next PIERWSZY;
248							}
249							}
250	0	0				0	if (exists $duplcatto{$uu}) { # search TO charset duplicates
251	0					0	$transform[$jj] = $duplcatto{$uu};
252	0					0	next PIERWSZY;
253							}
254							}
255							}
256
257	0	0				0	if ($verbose) { print "\nTransliterator:\n";}
	0					0
258	0					0	for $jj ( 0 .. $#transform ) {
259	0	0				0	if ($verbose) { print (($transform[$jj]>=0)?"x":"_");}
	0	0				0
260	0	0				0	if ( ! (($jj+1) % 16)) { if ($verbose) { print "\n";}}
	0	0				0
	0					0
261							}
262
263	0					0	$vv = $ww = 0;
264	0					0	for $jj ( 0 .. $#transform ) { # try to approximate using substitute lists
265	0	0				0	if ($transform[$jj] >= 0) {
266	0					0	next;
267							}
268	0	0				0	if (! ($gg = $bitmap[0][$jj]) ) { # undefined
269	0					0	next;
270							}
271	0					0	TRZECI: foreach $pp (@{$aprox_mne{$gg}}) {
	0					0
272	0					0	for $cc ( 0 .. $#{$bitmap[1]} ) {
	0					0
273	0					0	$zz = $bitmap[1][$cc];
274	0	0				0	if ($zz eq $pp) { # substitute must be in TO char set
275	0					0	$transform[$jj] = $cc; # success
276	0	0				0	if (!$ww) {
277	0					0	$ww = 1;
278	0	0				0	if ($verbose) {print "\nApproximate substitutes:\n";}
	0					0
279							}
280	0	0				0	if ($verbose) { printf "%X==>%X\t%s==>%s\n", $jj, $cc,
	0					0
281							$nam_mne{ $gg}, $nam_mne{ $zz};}
282	0					0	last TRZECI;
283							}
284							}
285							}
286	0					0	$vv = 1; # defined character but no substitute
287							}
288
289	0	0				0	if ($ww) {
290	0	0				0	if ($verbose) {print "\nTransliterator with aproximate substitutions:\n";}
	0					0
291	0					0	for $jj ( 0 .. $#transform ) {
292	0	0				0	if ($verbose) {print (($transform[$jj]>=0)?"x":"_");}
	0	0				0
293	0	0				0	if ( ! (($jj+1) % 16)) { if ($verbose) {print "\n";}}
	0	0				0
	0					0
294							}
295							}
296
297							# non-equivalent substitutions (like "RIGHT SQUARE BRACKET" for "CENT SIGN")
298	0	0				0	if ($vv) {
299	0					0	@ff = ();
300	0					0	for $jj ( 0 .. $#transform ) { # list those in need in FROM charset
301	0	0	0			0	if (($transform[$jj] < 0) && $bitmap[0][$jj] ) {
302	0					0	push @ff, $jj;
303							}
304							}
305	0	0				0	if (@ff) {
306	0					0	@tt = ();
307	0					0	PIATY: for $bb ( 0 .. $#{$bitmap[1]} ) { # list availables in TO charset
	0					0
308							# start at usual second ascii graphic (hoping to maximize graphics)
309	0					0	$cc = (34 + $bb) % (1 + $#{$bitmap[1]} );
	0					0
310	0	0				0	if ($zz = $bitmap[1][$cc]) {
311	0					0	for $ii ( 0 .. $#{$bitmap[0]} ) {
	0					0
312	0	0				0	if ($zz eq $bitmap[0][$ii]) {
313	0					0	next PIATY;
314							}
315							}
316	0					0	push @tt, $cc;
317							}
318							}
319							}
320	0					0	$gg = $ee = 0;
321	0		0			0	SIODMY: while ( ($#ff >= 0) && ($#tt >= 0) ) {
322	0					0	for $jj ($gg .. $#ff) { # first try to match equal hex values
323	0					0	$ww = $ff[$jj];
324	0					0	for $kk (0 .. $#tt) {
325	0	0				0	if ( $ww == $tt[$kk]) {
326	0					0	$transform[ $ww] = $ww;
327	0	0				0	if (!$ee) {
328	0					0	$ee = 1;
329	0	0				0	if ($verbose) {print "\nNon-equivalent substitutes:\n";}
	0					0
330							}
331	0	0				0	if ($verbose)
	0					0
332							{printf "%X==>%X\t%s==>%s\n", $ww, $ww,
333							$nam_mne{$bitmap[0][ $ww]}, $nam_mne{$bitmap[1][ $ww]};}
334	0					0	splice @ff, $jj, 1;
335	0					0	splice @tt, $kk, 1;
336	0					0	$gg = $jj;
337	0					0	next SIODMY;
338							}
339							}
340							}
341	0					0	last;
342							}
343	0	0				0	$gg = ($#ff < $#tt)? $#ff : $#tt;
344	0					0	for $jj (0 .. $gg ) {
345	0					0	$transform[$ff[$jj]] = $tt[$jj];
346	0	0				0	if (!$ee) {
347	0					0	$ee = 1;
348	0	0				0	if ($verbose) {print "\nNon-equivalent substitutes:\n";}
	0					0
349							}
350	0	0				0	if ($verbose)
	0					0
351							{printf "%X==>%X\t%s==>%s\n", $ff[$jj], $tt[$jj],
352							$nam_mne{$bitmap[0][$ff[$jj]]}, $nam_mne{$bitmap[1][$tt[$jj]]};}
353							}
354							}
355
356	0					0	$ee = $yy = 0; # anything left untranslated?
357	0					0	for $jj ( 0 .. $#transform ) {
358	0	0				0	if ($transform[$jj] < 0) {
359	0					0	$yy = 1;
360	0	0				0	if ($mm = $bitmap[0][$jj]) {
361	0	0				0	if (!$ee) {
362	0					0	$ee = 1;
363	0	0				0	if ($verbose) {print "\nNon-equivalent remnant:\n";}
	0					0
364							}
365	0	0				0	if ($verbose) {printf "%X\t%s\n", $jj, $nam_mne{$mm};}
	0					0
366							}
367							}
368							}
369
370							# if non-equivalent remnant, then select a hopefully unique and graphic indicator
371	0	0				0	if ($ee) {
372	0					0	DRUGI: for $bb ( 0 .. $#{$bitmap[1]} ) {
	0					0
373	0					0	$cc = (34 + $bb) % (1 + $#{$bitmap[1]} );
	0					0
374	0	0				0	if ($xx = $bitmap[1][$cc]) {
375	0					0	for $dd ( 0 .. $#{$bitmap[0]} ) {
	0					0
376	0	0				0	if ( $xx eq $bitmap[0][$dd]) { # reject if in FROM charset
377	0					0	next DRUGI;
378							}
379							}
380	0					0	last; # success
381							}
382							}
383	0	0				0	if ($verbose)
	0					0
384							{printf "\nNon-equivalence indicator: %X\t%s\n", $cc, $nam_mne{$xx}};
385	0					0	for $jj ( 0 .. $#transform ) { # interpolate non-equiv character
386	0	0	0			0	if (($transform[$jj] < 0) && $bitmap[0][$jj]) {
387	0					0	$transform[$jj] = $cc;
388							}
389							}
390							}
391
392							# if undefined remnant, then select a possibly undefined indicator
393	0	0				0	if ($yy) {
394	0					0	SZOSTY: for ($cc = $#{$bitmap[1]}; $cc >= 0; --$cc ) {
	0					0
395	0	0				0	if (! ($xx = $bitmap[1][$cc]) ) {
396	0					0	last;
397							}
398	0					0	for $dd ( 0 .. $#{$bitmap[0]} ) {
	0					0
399	0	0				0	if ($xx eq $bitmap[0][$dd] ) {
400	0					0	next SZOSTY;
401							}
402							}
403	0					0	last; # success
404							}
405	0	0				0	if ( ! ($yy = $nam_mne{$xx}) ) {$yy = "(undefined character)";}
	0					0
406	0	0				0	if ($verbose) {printf "\nUndefined indicator: %X\t%s\n", $cc, $yy};
	0					0
407	0					0	for $jj ( 0 .. $#transform ) { # interpolate non-equiv character
408	0	0				0	if ($transform[$jj] < 0) {
409	0					0	$transform[$jj] = $cc;
410							}
411							}
412							}
413
414							# if FROM charset length < 256, assume repeated for upper 128 chars
415	0	0				0	if ($#transform < 128) {push @transform, @transform;}
	0					0
416	0					0	$yy = "$chset[0]".".to."."$chset[1]";
417	0					0	my $self = {TRN_NAM=>"$yy", TRN_ARY=>[@transform]};
418	0					0	bless $self, $class;
419	0					0	return $self;
420							} # end of sub new
421
422
423							sub relvnt_tab { # true if this is the sought char table
424	1			1	0	2	my ($jj, $uu, $ww, $xx, $yy, $chch, $tabref);
425	1					3	$chch = shift; # $chset[0 or 1]
426	1					2	$tabref = shift; # reference to @thistab
427	1					3	for $jj ( 0 .. $#{@$tabref} ){
	1					185
428	0					0	$ww = $$tabref[$jj];
429	0	0				0	if ($ww !~ /^&(charset\|alias)\s+([^\s]*)/){
430	0					0	next;
431							}
432	0					0	$xx = $1; $yy = $2;
	0					0
433	0	0				0	if ($chch =~ /^$yy$/i) {
434	0	0				0	if ($verbose) {print "Found $xx $yy";}
	0					0
435	0	0				0	if ($xx eq "alias") {
436	0					0	$uu = $$tabref[0];
437	0					0	$uu =~ s/^&//;
438	0	0				0	if ($verbose) {print " for $uu";}
	0					0
439							}
440	0	0				0	if ($verbose) {print "\n";}
	0					0
441	0					0	return 1; # true
442							}
443							}
444	0					0	return undef; # false
445							} # end of sub relvnt_tab
446
447
448							sub load_dsf { # load code definitions and approximate substitutes
449	1			1	0	2	my ($mm, $nn, @ww);
450	1	50				91	unless (open(DSF, "$_[0]")) {print "Can't open $_[0]: $!.\n"; return undef;}
	0					0
	0					0
451	1					56	while () {
452	1					4	chomp;
453	1	50				8	if (/^hash mnemonic=name/) { # first group header
454	1					139	last;
455							}
456							}
457	1					16	while () {
458	1895					2358	chomp;
459	1895	100				4165	if (/^hash mnemonic=substitute list/) { # next group header
460	1					6	last;
461							}
462	1894					5980	($mm, $nn) = split(/\t/);
463	1894					11970	$nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic
464	1894					7604	$mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name
465							}
466	1					12	while () {
467	985					1024	chomp;
468	985					7556	($mm, @ww) = split(/\t/);
469	985					5334	$aprox_mne{$mm} = [@ww];
470							}
471	1					57	close DSF;
472	1					12	return 1;
473							} # end of sub load_dsf
474
475
476							sub build_substitutes {
477							# creates lists of approximate substitutes
478							# it takes about 90 minutes to recreate the file
479							# there should be no need to run this since its result file never needs changing
480	0			0	1		my ($xx, $aa, $yy, $bb, $ff, $hh, $pp, $jj, $kk, $gg, $rr, $mm, $nn, $start);
481	0						my @ww;
482	0						my $this = shift;
483	0		0				my $class = ref($this) \|\| $this;
484
485							#$start = time();
486	0						load_rfcdoc( $rfc_fnam);
487							# find approximate substitutes
488							# (mnemonic, hexcode, name) example:
489							# mnemonic: j+- feef ARABIC LETTER ALEF MAKSURA ISOLATED FORM
490							# substitute: j+ 0649 ARABIC LETTER ALEF MAKSURA
491							# substitute: a+: e022 ARABIC LETTER ALEF FINAL FORM COMPATIBILITY (IBM868 144)
492
493							#print (time() - $start); print " delete words from right\n";
494	0						while (($xx, $aa) = each %mne_nam) { # (name, nmemonic)
495	0	0					if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character
496	0						next;
497							}
498	0						$yy = $xx;
499	0						while ($yy =~ /[^\s]\s+[^\s]+\s*$/) {
500	0						$yy =~ s/^(.[^\s])\s+[^\s]+\s$/$1/; # delete words from right
501	0	0					if ($bb = $mne_nam{$yy}) {
502	0						push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list
	0
503							#print "$aa\t@{$aprox_mne{$aa}}\n";
504							}
505							}
506							}
507
508							#print (time() - $start); print " delete words from left\n";
509	0						while (($xx, $aa) = each %mne_nam) { # (name, nmemonic)
510	0	0					if ($xx =~ /^DOT ABOVE\s/i) { # avoid this overly matchable character
511	0						next;
512							}
513	0						$yy = $xx;
514	0						while ($yy =~ /[^\s]\s+[^\s]+\s*$/) {
515	0						$yy =~ s/^[^\s]+\s+(.*)$/$1/; # delete words from left
516	0	0					if ($yy =~ /^WITH\s/i) { # avoid matching overly broad phrases
517	0						last;
518							}
519	0	0					if ($bb = $mne_nam{$yy}) {
520	0						push @{ $aprox_mne{$aa} }, "$bb"; # add to sub list
	0
521							#print "$aa\t@{$aprox_mne{$aa}}\n";
522							}
523							}
524							}
525
526							# look for (string1 inside string2) or (string2 inside string1)
527							# also look fcr "DIGIT ONE", "NUMBER TWO" strings
528							# (disabled) also equate the 2 kinds of Japanese syllabic character sets
529							# since subs are assigned reciprocally, algorithm avoids redundancy: for example, if
530							# elements are (A, B, C, D, E), then comparisions will be E with (A, B, C, D),
531							# D with (A, B, C), C with (A, B), B with (A) and A with nothing.
532
533							#print (time() - $start); print " look for string1 inside string2, number terms\n";
534	0						$ff = "DIGIT\|NUMBER";
535							#$hh = "HIRAGANA\|KATAKANA";
536							#$pp = "LETTER\|LIGATURE\|".$ff."\|".$hh;
537	0						$pp = "LETTER\|LIGATURE\|".$ff;
538	0						$jj = scalar (keys %mne_nam);
539	0						foreach $xx ( reverse keys %mne_nam ) {
540	0	0					if ( ($kk = (--$jj)) <= 0) {
541	0						last;
542							}
543	0	0					if ($xx !~ /($pp)/i) { # only certain types
544	0						next;
545							}
546	0	0					if ($xx !~ /\b\w+\b\s+\b\w+/) { # at least 2 words
547	0						next;
548							}
549	0	0					if ($xx =~ /\b($ff) \b(\w+)\b/i) { # digit, numeral, et al. match
550	0						$gg = $2;
551							} else {
552	0						$gg = "";
553							}
554							# if ($xx =~ /(.)\b($hh)\b(.)/i) { # equivalent Japanese syllabic sets
555							# $xx = "$1"."HIRAGANA"."$2";
556							# $rr = "$1"."KATAKANA"."$2";
557							# } else {
558							# $rr = "";
559							# }
560	0						$aa = $mne_nam{$xx};
561	0						foreach $yy (keys %mne_nam) {
562	0	0					if ( ($kk--) <= 0) {
563	0						last;
564							}
565	0	0					if ($yy !~ /($pp)/i) {
566	0						next;
567							}
568	0	0					if ($yy !~ /\b\w+\b\s+\b\w+/) {
569	0						next;
570							}
571	0	0	0				if ( (($xx =~ /\b($yy)\b/i) \|\| ($yy =~ /\b($xx)\b/i)) \|\|
			0
			0
			0
572							# (($rr) && ($xx =~ /\b($yy)\b/i) \|\| ($yy =~ /\b($xx)\b/i)) \|\|
573							(($gg) && ($yy !~ /FRACTION/i) && ($yy =~ /\b($ff)\s+\b$gg\b/i)) ) {
574	0						$bb = $mne_nam{$yy};
575	0						push @{ $aprox_mne{ $aa} }, "$bb"; # add to sub lists repciprocally
	0
576	0						push @{ $aprox_mne{ $bb} }, "$aa";
	0
577							#print "$aa\t@{$aprox_mne{ $aa}}\n";
578							#print "$bb\t@{$aprox_mne{ $bb}}\n";
579							}
580							}
581							}
582
583							#print (time() - $start); print "\n";
584							#print "No substitutes found for these:\n";
585	0						foreach $aa ( keys %aprox_mne ) {
586	0	0					if ( $#{ $aprox_mne{$aa}} < 0) {
	0
587							# print "$aa\t$nam_mne{$aa}\n";
588	0						delete $aprox_mne{$aa};
589							}
590							}
591
592							#print (time() - $start); print " eliminate duplicate substitutions\n";
593	0						foreach $aa ( keys %aprox_mne ) { # eliminate duplicate substitutions
594	0						@ww = @{$aprox_mne{$aa}};
	0
595	0						CZWARTY: for($gg = 0;;){
596	0						for ($jj = $#ww; $jj > 0; --$jj){
597	0						for ($kk = ($jj -1); $kk >= 0; --$kk){
598	0	0					if ("$ww[$kk]" eq "$ww[$jj]"){
599	0						splice @ww, $jj, 1;
600	0						$gg = 1;
601	0						next CZWARTY;
602							}
603							}
604							}
605	0						last;
606							}
607	0	0					if ($gg) {
608	0						$aprox_mne{$aa} = [@ww];
609							}
610							}
611
612							#print (time() - $start); print "\n";
613							# for user's protection, save old file if any
614	0						$gg = "$substi_fnam".".bkp";
615	0	0					if (! -e $gg) {rename( $substi_fnam, "$gg")};
	0
616							# contrived loop wherein second pass only when open or write failure
617	0						OSMY: for ($yy = 0; $yy == 0; $yy = 1) {
618	0	0					unless (open(DSF, ">$substi_fnam")) {next OSMY};
	0
619	0	0					unless (print DSF "hash mnemonic=name\n") {next OSMY}; # header
	0
620	0						foreach $mm (sort keys %nam_mne ) {
621							# mnemonic tab name newline
622	0	0					unless (print DSF "$mm\t$nam_mne{$mm}\n") {next OSMY};
	0
623							}
624	0	0					unless (print DSF "hash mnemonic=substitute list\n") {next OSMY}; # header
	0
625	0						foreach $mm (sort keys %aprox_mne ) {
626	0	0					unless (print DSF "$mm") {next OSMY}; # mnemonic
	0
627	0						foreach $aa (0 .. $#{$aprox_mne{$mm}}) { # each substitute
	0
628	0	0					unless (print DSF "\t$aprox_mne{$mm}[$aa]") {next OSMY};
	0
629							}
630	0	0					unless (print DSF "\n") {next OSMY}; # newline
	0
631							}
632	0						last OSMY;
633							}
634	0						close DSF;
635	0	0					if ($yy) {
636	0	0					if (! -e $substi_fnam) {rename( "$gg", $substi_fnam)};
	0
637	0						print "Failed to create $substi_fnam: $!\n";
638	0						return undef;
639							}
640	0						return 1;
641							} # end of sub build_substitutes
642
643
644							sub load_rfcdoc { #load code definition list
645	0			0	0		my ($mm, $nn, $jj, $catch, $hh, $xx, $kk, $yy);
646	0						my $digits = \"ZERO\|ONE\|TWO\|THREE\|FOUR\|FIVE\|SIX\|SEVEN\|EIGHT\|NINE";
647	0	0					unless (open(RFC, $_[0])) {print "Can't open $_[0]: $!\n"; return undef;}
	0
	0
648	0						while() {
649	0						chomp;
650	0	0					if (/^3. CHARACTER MNEMONIC TABLE/) {
651	0						last;
652							}
653							}
654	0						$jj = 1;
655	0						while() {
656	0						chomp;
657	0	0					if (/^4. CHARSETS/) {
658	0						last;
659							}
660							# page head, foot, effectively empty lines
661	0	0	0				if (/^(Simonsen\|RFC 1345) / \|\| /^.{0,3}$/) {
662	0						next;
663							}
664	0	0					if (/SP\s+0020\s+SPACE/) { # first code def line
665	0						$catch = "1";
666							}
667	0	0					if (! $catch) {
668	0						next;
669							}
670	0						++$jj;
671	0	0	0				if(/^\s([^\s]+)\s+([\da-fA-F]{4})\s{4}\b(.+)\s$/ \|\|
672							/^(\s+)(e000)\s{4}\b(.+)\s*$/) {
673	0						$mm = $1; $hh = $2; $xx = hex $2; $nn = $3;
	0
	0
	0
674							# normalize unusual e000 format which indicates unfinished (Mnemonic)
675	0	0					if ($hh eq "e000") {
676	0						$mm = " "; # single space
677							}
678							# correct mistake in LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW
679	0	0					if ($hh eq "1e4b") {
680	0						$mm = "n->";
681							}
682							# correct mistake in LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE
683	0	0					if ($hh eq "1e69") {
684	0						$mm = "s.-.";
685							}
686	0	0	0				if ($yy && ($jj != ($kk +1)) && ($xx != ($yy +1))) {
			0
687	0						print "Check sequence around this entry: $_\n";
688							}
689	0						$yy = $xx; $kk = $jj;
	0
690	0	0					if ($nam_mne{$mm} ne "") { # already exists
691	0						print "Same mnemonic $mm for $nam_mne{$mm} and hex code $hh \n";
692							}
693	0	0					if ($mne_nam{$nn} ne "") { # already exists
694	0						print "Same mnemonic $mne_nam{$nn} for $nn and hex code $hh \n";
695							}
696							# normalize for more thorough substitution
697	0						$nn =~ s/\b(SUBSCRIPT\|SUPERSCRIPT)\s+($$digits)\b/$1 DIGIT $2/;
698	0						$nam_mne{$mm} = "$nn"; # hash of name keyed by mnemonic
699	0						$mne_nam{$nn} = "$mm"; # hash of mnemonic keyed by name
700	0						$aprox_mne{$mm} = []; # list approx subs keyed by mnemonic
701	0						next;
702							}
703	0	0					if (/^[ ]{16}([^\s].*)/) { # continuation line
704	0						delete $mne_nam{$nn};
705	0						$nn = "$nn $1"; # append to name in prev line
706	0						$nam_mne{$mm} = "$nn";
707	0						$mne_nam{$nn} = "$mm";
708							}
709							}
710	0						close RFC;
711	0						return 1;
712							} # end of sub load_rfcdoc
713
714
715							sub transliterate {
716	0	0		0	1		my $self = ($#_ > 0) ? shift : 0;
717	0	0					my $arref = $self ? \@{$self->{TRN_ARY}} : \@transform;
	0
718	0						my @xx = unpack "C*", $_[0];
719	0						my $yy = "";
720	0						foreach ( @xx ) {
721	0						$yy .= pack "C", @$arref[$_];
722							}
723	0						return $yy;
724							} # end of sub transliterate
725
726							__END__