File Coverage

blib/lib/Biblio/bp/lib/bp-cs-tex.pl

Criterion	Covered	Total	%
statement	72	154	46.7
branch	31	64	48.4
condition	0	15	0.0
subroutine	2	3	66.6
pod			n/a
total	105	236	44.4

line	stmt	bran	cond	sub	time	code
1						#
2						# bibliography package for Perl
3						#
4						# TeX character set.
5						#
6						# Dana Jacobsen (dana@acm.org)
7						# 22 January 1995 (last modified on 14 March 1996)
8						#
9						# These routines have gone through a major update in November 1995.
10						#
11						# This is still in beta.
12						# There are many characters not implemented, and the underlying charset
13						# code is not solid yet.
14						#
15						# Some ugly convolutions are gone through to make it run at a decent
16						# speed. This code is _very_ timing sensitive. On a typical 1043 record
17						# run, the first implementation ran at 83 seconds for tocanon, 28 seconds
18						# for fromcanon. Two days of work brought this down to 1 second and 2
19						# seconds.
20						# Lesson:
21						# If you're not careful, you may find the charset code dominating
22						# your entire conversion time since it is run for every _field_, but
23						# with some careful profiling, it can be very fast.
24						#
25
26
27						####
28						#
29						# ToDo's identified by ptandler, 02-07-18
30						#
31						# - Unknown TeX characters in 'ACM SIG{\-}PLAN Notices' --> \- is an optional hyphen
32						# - Unknown TeX characters in '\{lopez,borning\}@cs' --> \{ and \} protect braces
33						# - braces are not removed ... in bibtex they are often needed to protect the case of words
34						# that should not be converted to lowercase in titles.
35						# - tex commands like \cite{....} in bibtex entries are treated as unknown tex characters
36						#
37						####
38
39						package bp_cs_tex;
40
41						######
42
43						$bib'charsets{'tex', 'i_name'} = 'tex';
44
45						$bib'charsets{'tex', 'tocanon'} = "bp_cs_tex'tocanon";
46						$bib'charsets{'tex', 'fromcanon'} = "bp_cs_tex'fromcanon";
47
48						$bib'charsets{'tex', 'toesc'} = "[\$\\\\]";
49						# XXXXX We have so many characters to protect, should we even bother?
50						$bib'charsets{'tex', 'fromesc'} = "[\\#\$\%\&{}_\\|><\^~\200-\377]\|${bib'cs_ext}\|${bib'cs_meta}";
51
52						######
53
54						$cs_init = 0;
55
56						# package variables for anyone to use
57						$mine = '';
58						$unicode = '';
59						$can = '';
60
61						######
62
63						sub init_cs {
64
65						# Thorn and eth are really nasty since they don't exist in the standard TeX
66						# fonts. This is what I came up with in r2b to fake it. Fortunately they
67						# aren't used often. Get the cmoer fonts if you want to do them right.
68						# My eth is pretty nice, but the thorn leaves a little to be desired.
69
70	2			2	245	%charmap = (
71						'00A1', "!'",
72						'00A2', '\leavevmode\hbox{\rm\rlap/c}',
73						'00A3', '{\pounds}',
74						'00A4', '$\spadesuit$',
75						'00A5', '\leavevmode\hbox{\rm\rlap=Y}',
76						'00A6', '\leavevmode
77						\hbox{\hskip.4ex\hbox{\ooalign{\vrule width.2ex height.5ex depth.4ex\crcr
78						\hfil\raise.8ex\hbox{\vrule width.2ex height.9ex depth0ex}\hfil}}}',
79						'00A7', '\S ',
80						'00A8', '{\"{ }}',
81						'00A9', '\leavevmode\hbox{\raise.6em\hbox{\copyright}}',
82						'00AA', '${}^{\b{\scriptsize a}}$',
83						'00AB', '$\scriptscriptstyle\ll$',
84						'00AC', '$\neg$',
85						'00AE', '\leavevmode\hbox{\raise.6em\hbox{\ooalign{{\mathhexbox20D}\crcr
86						\hfil\raise.07ex\hbox{r}\hfil}}}',
87						'00AF', '{\={ }}',
88						'00B0', '${}^\circ$',
89						'00B1', '$\pm$',
90						'00B2', '${}^2$',
91						'00B3', '${}^3$',
92						'00B4', '{\'{ }}',
93						'00B5', '$\mu$',
94						'00B6', '\P ',
95						'00B7', '$\cdot$',
96						'00B8', '{\c{ }}',
97						'00B9', '${}^1$',
98						'00BA', '${}^{\b{\scriptsize o}}$',
99						'00BB', '$\scriptscriptstyle\gg$',
100						'00BC', '$1\over4$',
101						'00BD', '$1\over2$',
102						'00BE', '$3\over4$',
103						'00BF', '?`',
104						'00C0', '{\`A}',
105						'00C1', q-{\'A}-,
106						'00C2', '{\^A}',
107						'00C3', '{\~A}',
108						'00C4', '{\"A}',
109						'00C5', '{\AA}',
110						'00C6', '{\AE}',
111						'00C7', '{\c{C}}',
112						'00C8', '{\`E}',
113						'00C9', q-{\'E}-,
114						'00CA', '{\^E}',
115						'00CB', '{\"E}',
116						'00CC', '{\`I}',
117						'00CD', q-{\'I}-,
118						'00CE', '{\^I}',
119						'00CF', '{\"I}',
120						'00D0', '\leavevmode\hbox{\ooalign{{D}\crcr
121						\hskip.2ex\raise.25ex\hbox{-}\hfil}}',
122						'00D1', '{\~N}',
123						'00D2', '{\`O}',
124						'00D3', q-{\'O}-,
125						'00D4', '{\^O}',
126						'00D5', '{\~O}',
127						'00D6', '{\"O}',
128						'00D7', '$\times$',
129						'00D8', '{\O}',
130						'00D9', '{\`U}',
131						'00DA', q-{\'U}-,
132						'00DB', '{\^U}',
133						'00DC', '{\"U}',
134						'00DD', q-{\'Y}-,
135						'00DE', '\leavevmode\hbox{I\hskip-.6ex\raise.5ex\hbox{$\scriptscriptstyle\supset$}}',
136						'00DF', '{\ss}',
137						'00E0', '{\`a}',
138						'00E1', q-{\'a}-,
139						'00E2', '{\^a}',
140						'00E3', '{\~a}',
141						'00E4', '{\"a}',
142						'00E5', '{\aa}',
143						'00E6', '{\ae}',
144						'00E7', '{\c{c}}',
145						'00E8', '{\`e}',
146						'00E9', q-{\'e}-,
147						'00EA', '{\^e}',
148						'00EB', '{\"e}',
149						'00EC', '{\`i}',
150						'00ED', q-{\'i}-,
151						'00EE', '{\^i}',
152						'00EF', '{\"i}',
153						'00F0', '\leavevmode\hbox{\ooalign{$\partial$\crcr\hskip.8ex\raise.7ex\hbox{-}\hfil}}',
154						'00F1', '{\~n}',
155						'00F2', '{\`o}',
156						'00F3', q-{\'o}-,
157						'00F4', '{\^o}',
158						'00F5', '{\~o}',
159						'00F6', '{\"o}',
160						'00F7', '$\div$',
161						'00F8', '{\o}',
162						'00F9', '{\`u}',
163						'00FA', q-{\'u}-,
164						'00FB', '{\^u}',
165						'00FC', '{\"u}',
166						'00FD', q-{\'y}-,
167						'00FE', '\leavevmode\hbox{{\lower.3ex\hbox{\large l}}\hskip-.52ex o}',
168						'00FF', '{\"y}',
169						'0107', q-{\'c}-,
170						'010C', '{\vC}',
171						'010D', '{\vc}',
172						'0159', '{\vr}',
173						'015F', '{\c{s}}',
174						'0160', '{\vS}',
175						'0161', '{\vs}',
176						'017A', q-{\'z}-,
177						'017E', '{\vz}',
178						# XXXXX
179						# Should these be surrounded by $ (math mode)?
180						# Also, what to do with \mu, which is listed twice?
181						'03B1', '\alpha',
182						'03B2', '\beta',
183						'03B3', '\gamma',
184						'03B4', '\delta',
185						'03B5', '\epsilon',
186						'03B6', '\zeta',
187						'03B7', '\eta',
188						'03B8', '\theta',
189						'03B9', '\iota',
190						'03BA', '\kappa',
191						'03BB', '\lambda',
192						'03BC', '\mu',
193						'03BD', '\nu',
194						'03BE', '\xi',
195						'03C0', '\pi',
196						'03C1', '\rho',
197						'03C2', '\varsigma',
198						'03C3', '\sigma',
199						'03C4', '\tau',
200						'03C5', '\upsilon',
201						'03C6', '\phi',
202						'03C7', '\chi',
203						'03C8', '\psi',
204						'03C9', '\omega',
205						'2007', '$\:$',
206						'2009', '$\,$',
207						'201C', '``',
208						'201D', '\'\'',
209						);
210
211						# This mapping is only used in the from section. We'll do these by hand
212						# in the to mapping.
213	2				14	%charmap2 = (
214						'00A0', '~',
215						'00AD', '-',
216						'2002', '\ ',
217						'2003', '\ \ ',
218						'2014', '---',
219						'03BF', 'o',
220						);
221
222						# Blah. TeX has such a non-uniform way of handling characters that this is
223						# really slow. I'm going to try some optimizations for the tocanon code
224						# since that will be heavily used. It makes this stuff less uniform though.
225						# Remember that we don't have a full TeX parser, or even a partial one.
226
227						# Build up a search string to do the reverse map.
228	2				5	$cmap_to_eval = '';
229	2				4	$cmap_from8_eval = '';
230	2				5	$cmap_to_eval_1 = '';
231	2				5	$cmap_to_eval_2 = '';
232	2				5	%rmap = ();
233	2				6	%accent = ();
234
235						# Step 1: Build a reverse map
236	2				26	while (($unicode, $mine) = each %charmap) {
237	262				899	$rmap{$mine} = $unicode;
238						}
239						# Step 2: walk through the keys in sorted order
240	2				5	local($mineE);
241	2				150	foreach $mine (sort keys %rmap) {
242	262				706	$can = &bib'unicode_to_canon( $rmap{$mine} );
243	262				429	$mineE = $mine;
244	262				1781	$mineE =~ s/(\W)/\\$1/g;
245						# The various maps for tocanon
246	262	100			1088	if ($mine =~ /^{\\([`'^"~])([\w])}$/) {
		100
		100
		100
		100
247	102				354	$accent{$1 . $2} = $can;
248						} elsif ($mine =~ /^{\\([vc])(\w)}$/) {
249	12				39	$accent{$1 . $2} = $can;
250						} elsif ($mine =~ /^{\\([vc]){(\w)}}$/) {
251	6				20	$accent{$1 . $2} = $can;
252						} elsif ($mine =~ /leavevmode/) {
253	18				56	$cmap_to_eval_1 .= "s/$mineE/$can/g;\n";
254						} elsif ($mine =~ /\$/) {
255	40				97	$cmap_to_eval_2 .= "s/$mineE/$can/g;\n";
256						} else {
257	84				176	$cmap_to_eval .= "s/$mineE/$can/g;\n";
258						}
259	262	100			567	if ( length($can) == 1 ) {
260	188				434	$cmap_from8_eval .= "s/$can/$mineE/g;\n";
261						}
262						}
263	2				19	$cmap_from8_eval .= "s/\\240/\\~/g;\ns/\\255/-/g;";
264						# leave rmap
265
266						#%map_diac = (
267						#'tilde', '\~{}',
268						#'circ', '\^{}',
269						#'lcub', '$\lbrace$',
270						#'rcub', '$\rbrace$',
271						#'bsol', '$\backslash$',
272						#);
273
274						# Careful. This is from only.
275	2				47	%metamap = (
276						'3100', '{', # Begin protection
277						'3110', '}', # End protection
278						# fonts
279						'0101', '{\rm ',
280						'0102', '{\it ',
281						'0103', '{\bf ',
282						'0111', '}',
283						'0112', '}',
284						'0113', '}',
285						'0110', '}', # previous font. We don't need a font stack to handle it.
286						'2102', '{\em ',
287						'2112', '}',
288						);
289
290	2				10	$cs_init = 1;
291						}
292
293						######
294
295						sub tocanon {
296	84			84	191	local($_, $protect) = @_;
297
298						# unprotect the TeX characters
299	84	50			188	if ($protect) {
300						# input is assumed to be in TeX format, before _any_ canon processing.
301						# output is TeX format, but with raw magic characters.
302	84				187	s/\$>\$/>/g;
303	84				139	s/\$<\$/
304	84				145	s/\$\\|\$/\\|/g;
305	84				114	s/\\_/_/g;
306	84				129	s/\$\\rbrace\$/}/g;
307	84				107	s/\$\\lbrace\$/{/g;
308	84				119	s/\\\&/\&/g;
309	84				113	s/\\\%/\%/g;
310	84				105	s/\\\$/\$/g;
311	84				176	s/\\#/#/g;
312						}
313
314	84	100			218	if (/-/) {
315	18				41	s/\$-\$/${bib'cs_ext}2212/go;
316	18				34	s/\b---\b/${bib'cs_ext}2014/go;
317	18				36	s/\b--\b/${bib'cs_ext}2013/go;
318						# leave -
319						}
320	84	100			269	if (/~/) {
321	8				35	1 while s/([^\\])~/$1\240/g;
322						}
323	84				122	s/\\ \\ /${bib'cs_ext}2003/go;
324	84				123	s/\\ /${bib'cs_ext}2002/go;
325
326						# Can we go now?
327	84	100			395	return $_ unless /\\/;
328
329	38	100			97	&init_cs unless $cs_init;
330
331	38	100			172	if (/\\[`'^"~vc][{ ]?[\w]/) {
332						# ISO -- we try {\"{c}}, {\"c}, \"{c}, \"c
333						# ^^^^^
334						# preferred
335						#
336						# XXXXX What do we do about all the other ways they can try?
337						# mgnet.bib uses {\" u} a lot. (got this way now)
338
339	30				128	while (/{\\([`'^"~vc])( ?)([\w])}/) {
340	32				142	$can = $accent{$1 . $3};
341	32				112	$mine = "{\\$1$2$3}";
342	32	50			72	if (!defined $can) {
343	0				0	&bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
344	0				0	$can = '';
345						}
346	32				337	$mine =~ s/(\W)/\\$1/g;
347	32				573	s/$mine/$can/g;
348						}
349	30				96	while (/{\\([`'^"~vc]){([\w])}}/) {
350	2				11	$can = $accent{$1 . $2};
351	2				8	$mine = "{\\$1\{$2\}}";
352	2	50			11	if (!defined $can) {
353	0				0	&bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
354	0				0	$can = '';
355						}
356	2				18	$mine =~ s/(\W)/\\$1/g;
357	2				25	s/$mine/$can/g;
358						}
359	30				93	while (/\\([`'^"~vc]){([\w])}/) {
360	0				0	$can = $accent{$1 . $2};
361	0				0	$mine = "\\$1\{$2\}";
362	0	0			0	if (!defined $can) {
363	0				0	&bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
364	0				0	$can = '';
365						}
366	0				0	$mine =~ s/(\W)/\\$1/g;
367	0				0	s/$mine/$can/g;
368						}
369	30				91	while (/\\([`'^"~])( ?)([\w])/) {
370	0				0	$can = $accent{$1 . $3};
371	0				0	$mine = "\\$1$2$3";
372	0	0			0	if (!defined $can) {
373	0				0	&bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
374	0				0	$can = '';
375						}
376	0				0	$mine =~ s/(\W)/\\$1/g;
377	0				0	s/$mine/$can/g;
378						}
379
380						# This unfortunately matches \cr and \circ. We aren't doing a loop
381						# any more, so it's not even necessary anymore. Let the standard
382						# routine try to match and give the normal error message on failure.
383						#while (s/(\\[`'^"~vc][{ ]?[\w])//) {
384						# &bib'gotwarn("Couldn't parse TeX accented character: $1!");
385						#}
386
387	30	100			231	return $_ unless /\\/;
388						} # end of standard accented characters
389
390						# XXXXX What about the v, c, and other accents? Do we need another
391						# section for those, or can we fit them in above?
392
393	12	100			43	if (/leavevmode/) {
394	2				417	eval $cmap_to_eval_1;
395						}
396	12	50			46	if (/\$/) {
397	0				0	eval $cmap_to_eval_2;
398						}
399	12				6691	eval $cmap_to_eval;
400
401	12				44	s/\\\^{}/\^/g;
402	12				55	s/\\~{\s?}/~/g;
403
404						# hopefully we're done by now
405	12	50			108	return $_ unless /\\/;
406
407						# font changes
408						# This doesn't work all that well, but most bibliographies are simple
409	0					s/\{\\rm ([^{}]*)\}/${bib'cs_meta}0101$1${bib'cs_meta}0110/g;
410	0					s/\{\\it ([^{}]*)\}/${bib'cs_meta}0102$1${bib'cs_meta}0110/g;
411	0					s/\{\\bf ([^{}]*)\}/${bib'cs_meta}0103$1${bib'cs_meta}0110/g;
412	0					s/\{\\em ([^{}]*)\}/${bib'cs_meta}2102$1${bib'cs_meta}2112/g;
413	0	0				$_ = &bib'font_check($_) if /${bib'cs_meta}01/o;
414						# done with font changing
415
416	0	0				return $_ unless /\\/;
417
418	0					s/\$\\backslash\$/$bib'cs_temp/g;
419	0	0				if (!/\\/) {
420	0					s/$bib'cs_temp/\\/go;
421	0					return $_;
422						}
423	0					s/$bib'cs_temp/\\/go;
424
425						# I give up.
426						# XXXXX We really ought to remove the escape and meta characters we have
427						# converted when we give them this warning.
428	0					&bib'gotwarn("Unknown TeX characters in '$_'");
429	0					$_;
430						}
431
432						######
433
434						sub fromcanon {
435	0			0		local($_, $protect) = @_;
436	0					local($repl);
437						# We no longer check for font matching here, as that should be done by a
438						# call to bib'font_check in the tocanon code.
439
440	0	0				if ($protect) {
441	0					s/\\/$bib'cs_temp/go;
442	0					s/#/\\#/g;
443	0					s/\$/\\\$/g;
444	0					s/\%/\\\%/g;
445	0					s/\&/\\\&/g;
446	0					s/{/\$\\lbrace\$/g;
447	0					s/}/\$\\rbrace\$/g;
448	0					s/_/\\_/g;
449	0					s/\\|/\$\\|\$/g;
450	0					s/>/\$>\$/g;
451	0					s/
452	0					s/\^/\\^{}/g;
453	0					s/~/\\~{}/g;
454	0					s/$bib'cs_temp/\$\\backslash\$/go;
455						}
456
457	0					while (/([\200-\237])/) {
458	0					$repl = $1;
459	0					$unicode = &bib'canon_to_unicode($repl);
460	0					&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
461	0					s/$repl//g;
462						}
463
464	0	0				&init_cs unless $cs_init;
465
466						#if (/[\240-\377]/) {
467						# eval $cmap_from8_eval;
468						#}
469	0					s/\240/~/g;
470	0					s/\255/-/g;
471	0					while (/([\240-\377])/) {
472	0					$repl = $1;
473	0					$unicode = &bib'canon_to_unicode($repl);
474	0					s/$repl/$charmap{$unicode}/g;
475						}
476
477						# Maybe we can go now?
478	0	0				return $_ unless /$bib'cs_escape/o;
479
480	0					while (/${bib'cs_ext}(....)/) {
481	0					$unicode = $1;
482	0	0				if ($unicode =~ /^00[0-7]/) {
483	0					1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
	0
484	0					next;
485						}
486	0	0	0			defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
487						&& next;
488	0	0	0			defined $charmap2{$unicode} && s/${bib'cs_ext}$unicode/$charmap2{$unicode}/g
489						&& next;
490
491	0					$can = &bib'unicode_approx($unicode);
492	0	0	0			defined $can && s/$bib'cs_ext$unicode/$can/g && next;
493
494	0					&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
495	0					s/${bib'cs_ext}$unicode//g;
496						}
497
498	0					while (/${bib'cs_meta}(....)/) {
499	0					$repl = $1;
500	0	0	0			defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
501						&& next;
502
503	0					$can = &bib'meta_approx($repl);
504	0	0	0			defined $can && s/$bib'cs_meta$repl/$can/g && next;
505
506	0					&bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to TeX");
507	0					s/${bib'cs_meta}$repl//g;
508						}
509
510	0					$_;
511						}
512
513						######
514
515
516						#######################
517						# end of package
518						#######################
519
520						1;