File Coverage

blib/lib/LaTeX/ToUnicode.pm

Criterion	Covered	Total	%
statement	132	187	70.5
branch	19	50	38.0
condition	0	3	0.0
subroutine	18	21	85.7
pod	2	3	66.6
total	171	264	64.7

line	stmt	bran	cond	sub	pod	time	code
1	1			1		99056	use strict;
	1					1
	1					28
2	1			1		4	use warnings;
	1					1
	1					45
3							package LaTeX::ToUnicode;
4							BEGIN {
5	1			1		61	$LaTeX::ToUnicode::VERSION = '1.95';
6							}
7							#ABSTRACT: Convert LaTeX commands to Unicode (simplistically)
8
9							require Exporter;
10							our @ISA = qw(Exporter);
11							our @EXPORT_OK = qw( convert debuglevel $endcw );
12
13	1			1		4	use utf8;
	1					0
	1					5
14	1			1		444	use Encode;
	1					13129
	1					73
15	1			1		477	use LaTeX::ToUnicode::Tables;
	1					3
	1					2387
16
17							# Terminating a control word (not symbol) the way TeX does: at the
18							# boundary between a letter (lookbehind) and a nonletter (lookahead),
19							# and then ignore any following whitespace.
20							our $endcw = qr/(?<=[a-zA-Z])(?=[^a-zA-Z]\|$)\s*/;
21
22							# Debugging output on or off; it's pretty random what gets output.
23							# Add more as needed. There is also more debugging output as warn
24							# statements, commented out, too voluminous to enable here.
25							my $debug = 0;
26
27	0			0	1	0	sub debuglevel { $debug = shift; }
28							sub _debug {
29	612	50		612		1307	return unless $debug;
30							# The backtrace info is split between caller(0) and caller(1), sigh.
31							# We don't need the package name, it's included in $subr in practice.
32	0					0	my (undef,$filename,$line,undef) = caller(0);
33	0					0	my (undef,undef,undef,$subr) = caller(1);
34	0					0	warn @_, " at $filename:$line ($subr)\n";
35							}
36
37							# The main conversion function.
38							#
39							sub convert {
40	102			102	1	233568	my ($string, %options) = @_;
41							#warn debug_hash_as_string("starting with: $string", %options);
42
43							# First, remove leading and trailing horizontal whitespace
44							# on each line of the possibly-multiline string we're given.
45	102					565	$string =~ s/^[ \t]*//m;
46	102					564	$string =~ s/[ \t]*$//m;
47
48							# For HTML output, must convert special characters that were in the
49							# TeX text (&<>) to their entities to avoid misparsing. We want to
50							# do this first, because conversion of the markup commands might
51							# output HTML tags like `, and we don't want to convert those <>.`
52							# Although <tt> works, better to keep the output HTML as
53							# human-readable as we can.
54							#
55	102	50				364	if ($options{html}) {
56	0					0	$string =~ s/([^\\]\|^)&/$1&/g;
57	0					0	$string =~ s/
58	0					0	$string =~ s/>/>/g;
59							}
60
61	102					209	my $user_hook = $options{hook};
62	102	50				270	if ($user_hook) {
63	0					0	_debug("before user hook: $string");
64	0					0	$string = &$user_hook($string, \%options);
65	0					0	_debug("after user hook: $string");
66							}
67
68							# Convert general commands that take arguments, since (1) they might
69							# insert TeX commands that need to be converted, and (2) because
70							# their arguments could well contain constructs that will map to a
71							# Perl string \x{nnnn} for Unicode character nnnn; those Perl braces
72							# for the \x will confuse further parsing of the TeX.
73							#
74	102					260	$string = _convert_commands_with_arg($string);
75	102					411	_debug("after commands with arg: $string");
76
77							# Convert markups (\texttt, etc.); they have the same brace-parsing issue.
78	102					332	$string = _convert_markups($string, \%options);
79	102					311	_debug("after markups: $string");
80
81							# And urls, a special case of commands with arguments.
82	102					263	$string = _convert_urls($string, \%options);
83	102					263	_debug("after urls: $string");
84
85	102					198	$string = _convert_control_words($string);
86	102					412	_debug("after control words: $string");
87
88	102					262	$string = _convert_control_symbols($string);
89	102					310	_debug("after control symbols: $string");
90
91	102					236	$string = _convert_accents($string);
92	102	100				303	$string = _convert_german($string) if $options{german};
93	102					219	$string = _convert_symbols($string);
94	102					302	$string = _convert_ligatures($string);
95	102					230	$string = _convert_glue_etc_primitives($string);
96	102					398	_debug("after the rest: $string");
97
98							# Let's handle ties here, after all the other conversions, since
99							# they don't fit well with any of the tables. We don't handle TeX's
100							# other special characters: $ & # ^ _.
101							#
102							# /~, or ~ at the beginning of a line, is probably part of a url or
103							# path, not a tie. Otherwise, consider it a space, since we can't
104							# distinguish true no-break spots (Donald~E. Knuth) from ties that
105							# are only relevant to a particular line width.
106							#
107	102					249	$string =~ s,([^/])~,$1 ,g;
108
109							# After all the conversions, $string contains \x{....} constructs
110							# (Perl Unicode characters) where translations have happened. Change
111							# those to the desired output format. Thus we assume that the
112							# Unicode \x{....}'s are not themselves involved in further
113							# translations, which is, so far, true.
114							#
115	102	50				302	if (! $options{entities}) {
		0
116							# Convert our \x strings from Tables.pm to the binary characters.
117
118							# As an extra-special case, we want to preserve the translation of
119							# \{ and \} as 007[bd] entities even if the --entities option is
120							# not give; otherwise they'd get eliminated like all other braces.
121							# Use a temporary cs \xx to keep them marked, and don't use braces
122							# to delimit the argument since they'll get deleted.
123	102					205	$string =~ s/\\x\{(007[bd])\}/\\xx($1)/g;
124
125							# Convert all other characters to characters.
126							# Assume exactly four hex digits, since we wrote Tables.pm that way.
127	102					555	$string =~ s/\\x\{(....)\}/ pack('U*', hex($1))/eg;
	86					749
128
129							} elsif ($options{entities}) {
130							# Convert the XML special characters that appeared in the input,
131							# e.g., from a TeX \&. Unless we're generating HTML output, in
132							# which case they have already been converted.
133	0	0				0	if (! $options{html}) {
134	0					0	$string =~ s/&/&/g;
135	0					0	$string =~ s/
136	0					0	$string =~ s/>/>/g;
137							}
138
139							# Our values in Tables.pm are simple ASCII strings \x{....},
140							# so we can replace them with hex entities with no trouble.
141							# Fortunately TeX does not have a standard \x control sequence.
142	0					0	$string =~ s/\\x\{(....)\}/&#x$1;/g;
143
144							# The rest of the job is about binary Unicode characters in the
145							# input. We want to transform them into entities also. As always
146							# in Perl, there's more than one way to do it, and several are
147							# described here, just for the fun of it.
148	0					0	my $ret = "";
149							#
150							# decode_utf8 is described in https://perldoc.perl.org/Encode.
151							# Without the decode_utf8, all of these methods output each byte
152							# separately; apparently $string is a byte string at this point,
153							# not a Unicode string. I don't know why that is.
154	0					0	$ret = decode_utf8($string);
155							#
156							# Transform everything that's not printable ASCII or newline into
157							# entities.
158	0					0	$ret =~ s/([^ -~\n])/ sprintf("&#x%04x;", ord($1)) /eg;
	0					0
159							#
160							# This method leaves control characters as literal; doesn't matter
161							# for XML output, since control characters aren't allowed, but
162							# let's use the regexp method anyway.
163							#$ret = encode("ascii", decode_utf8($string), Encode::FB_XMLCREF);
164							#
165							# The nice_string function from perluniintro also works.
166							#
167							# This fails, just outputs numbers (that is, ord values):
168							# foreach my $c (unpack("U*", $ret)) {
169							#
170							# Without the decode_utf8, outputs each byte separately.
171							# With the decode_utf8, works, but the above seems cleaner.
172							#foreach my $c (split(//, $ret)) {
173							# if (ord($c) <= 31 \|\| ord($c) >= 128) {
174							# $ret .= sprintf("&#x%04x;", ord($c));
175							# } else {
176							# $ret .= $c;
177							# }
178							#}
179							#
180	0					0	$string = $ret; # assigned from above.
181							}
182
183	102	50				273	if ($string =~ /\\x\{/) {
184	0					0	warn "LaTeX::ToUnicode::convert: untranslated \\x remains: $string\n";
185	0					0	warn "LaTeX::ToUnicode::convert: please report as bug.\n";
186							}
187
188							# Drop all remaining braces.
189	102					314	$string =~ s/[{}]//g;
190
191	102	50				279	if (! $options{entities}) {
192							# With all the other braces gone, now we can convert the preserved
193							# brace entities from \{ and \} to actual braces.
194	102					196	$string =~ s/\\xx$(007[bd])$/ pack('U*', hex($1))/eg;
	2					6
195							}
196
197							# Backslashes might remain. Don't remove them, as it makes for a
198							# useful way to find unhandled commands.
199
200							# leave newlines alone, but trim spaces and tabs.
201	102					267	$string =~ s/^[ \t]+//s; # remove leading whitespace
202	102					250	$string =~ s/[ \t]+$//s; # remove trailing whitespace
203	102					185	$string =~ s/[ \t]+/ /gs; # collapse all remaining whitespace to one space
204
205	102					689	$string;
206							}
207
208							# Convert commands that take a single braced argument. The table
209							# defines text we're supposed to insert before and after the argument.
210							# We let future processing handle conversion of both the inserted text
211							# and the argument.
212							#
213							sub _convert_commands_with_arg {
214	102			102		209	my $string = shift;
215
216	102					529	foreach my $cmd ( keys %LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS ) {
217	612					1134	my $repl = $LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS{$cmd};
218	612					903	my $lft = $repl->[0]; # ref to two-element list
219	612					728	my $rht = $repl->[1];
220							# \cmd{foo} -> LFT foo RHT
221	612					17840	$string =~ s/\\$cmd${endcw}\{(.*?)\}/$lft$1$rht/g;
222							#warn "replaced arg $cmd, yielding $string\n";
223							}
224
225	102					327	$string;
226							}
227
228							# Convert url commands in STRING. This is a special case of commands
229							# with arguments: \url{u} and \href{u}{desc text}. The HTML output
230							# (generated if $OPTIONS{html} is set) is just too special to be handled
231							# in a table; further, \href is the only two-argument command we are
232							# currently handling.
233							#
234							sub _convert_urls {
235	102			102		198	my ($string,$options) = @_;
236
237	102	50				282	if ($options->{html}) {
238							# HTML output.
239							# \url{URL} -> URL
240	0					0	$string =~ s,\\url$endcw\{([^}]*)\}
241							,$1,gx;
242							#
243							# \href{URL}{TEXT} -> TEXT
244							#warn "html href: $string\n" if $string =~ /href/;
245	0					0	$string =~ s,\\href$endcw\{([^}])\}\s\{([^}]*)\}
246							,$2,gx;
247
248							} else {
249							# plain text output.
250							# \url{URL} -> URL
251	102					391	$string =~ s/\\url$endcw\{([^}]*)\}/$1/g;
252							#
253							# \href{URL}{TEXT} -> TEXT (URL)
254							# but, as a special case, if URL ends with TEXT, just output URL,
255							# as in:
256							# \href{https://doi.org/10/fjzzc8}{10/fjzzc8}
257							# ->
258							# https://doi.org/10/fjzzc8
259							#
260							# Yet more specialness: the TEXT might have extra braces, as in
261							# \href{https://doi.org/10/fjzzc8}{{10/fjzzc8}}
262							# left over from previous markup commands (\path) which got
263							# removed. We want to accept and ignore such extra braces,
264							# hence the \{+ ... \}+ in recognizing TEXT.
265							#
266							#warn "txt url: starting with $string\n" if $string =~ /href/;
267	102	50				479	if ($string =~ m/\\href$endcw\{([^}])\}\s\{+([^}]*)\}+/) {
268	0					0	my $url = $1;
269	0					0	my $text = $2;
270							#warn " url: $url\n";
271							#warn " text: $text\n";
272	0	0				0	my $repl = ($url =~ m!$text$!) ? $url : "$text ($url)";
273							#warn " repl: $repl\n";
274	0					0	$string =~ s/\\href$endcw\{([^}])\}\s\{+([^}]*)\}+/$repl/;
275							#warn " result: $string\n";
276							}
277							}
278
279	102					188	$string;
280							}
281
282							# Convert control words (not symbols), that is, a backslash and an
283							# alphabetic sequence of characters terminated by a non-alphabetic
284							# character. Following whitespace is ignored.
285							#
286							sub _convert_control_words {
287	102			102		173	my $string = shift;
288
289	102					1455	foreach my $command ( keys %LaTeX::ToUnicode::Tables::CONTROL_WORDS ) {
290	10812					18303	my $repl = $LaTeX::ToUnicode::Tables::CONTROL_WORDS{$command};
291							# replace {\CMD}, whitespace ignored after \CMD.
292	10812					220536	$string =~ s/\{\\$command$endcw\}/$repl/g;
293
294							# replace \CMD, preceded by not-consumed non-backslash.
295	10812					230193	$string =~ s/(?<=[^\\])\\$command$endcw/$repl/g;
296
297							# replace \CMD at beginning of whole string, which otherwise
298							# wouldn't be matched. Two separate regexps to avoid
299							# variable-length lookbehind.
300	10812					218878	$string =~ s/^\\$command$endcw/$repl/g;
301							}
302
303	102					841	$string;
304							}
305
306							# Convert control symbols, other than accents. Much simpler than
307							# control words, since are self-delimiting, don't take arguments, and
308							# don't consume any following text.
309							#
310							sub _convert_control_symbols {
311	102			102		216	my $string = shift;
312
313	102					939	foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS ) {
314	2754					3835	my $repl = $LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS{$symbol};
315
316							# because these are not alphabetic, we can quotemeta them,
317							# and we need to because "\" is one of the symbols.
318	2754					3122	my $rx = quotemeta($symbol);
319
320							# the preceding character must not be a backslash, else "\\ "
321							# could have the "\ " seen first as a control space, leaving
322							# a spurious \ behind. Don't consume the preceding.
323							# Or it could be at the beginning of a line.
324							#
325	2754					26853	$string =~ s/(^\|(?<=[^\\]))\\$rx/$repl/g;
326							#warn "after sym $symbol (\\$rx -> $repl), have: $string\n";
327							}
328
329	102					340	$string;
330							}
331
332							# Convert accents.
333							#
334							sub _convert_accents {
335	102			102		197	my $string = shift;
336
337							# first the non-alphabetic accent commands, like \".
338	102					932	my %tbl = %LaTeX::ToUnicode::Tables::ACCENT_SYMBOLS;
339	102	100				437	$string =~ s/(\{\\(.)\s*\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} \|\| $1/eg; #{\"{a}}
	29					289
340	102	100				661	$string =~ s/(\{\\(.)\s*(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # {\"a}
	47					444
341	102	50				428	$string =~ s/(\\(.)\s*(\\?\w{1,1}))/ $tbl{$2}{$3} \|\| $1/eg; # \"a
	6					33
342	102	100				340	$string =~ s/(\\(.)\s*\{(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # \"{a}
	20					95
343
344							# second the alphabetic commands, like \c. They have to be handled
345							# differently because \cc is not \c{c}! The only difference in the
346							# regular expressions is using $endcw instead of just \s*.
347							#
348	102					850	%tbl = %LaTeX::ToUnicode::Tables::ACCENT_LETTERS;
349	102	50				633	$string =~ s/(\{\\(.)$endcw\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} \|\| $1/eg; #{\"{a}}
	19					96
350	102	0				442	$string =~ s/(\{\\(.)$endcw(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # {\"a}
	0					0
351	102	0				610	$string =~ s/(\\(.)$endcw(\\?\w{1,1}))/ $tbl{$2}{$3} \|\| $1/eg; # \"a
	0					0
352	102	0				502	$string =~ s/(\\(.)$endcw\{(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # \"{a}
	0					0
353
354
355							# The argument is just one \w character for the \"a case, not two,
356							# because otherwise we might consume a following character that is
357							# not part of the accent, e.g., a backslash (\"a\'e).
358							#
359							# Others can be two because of the \t tie-after accent. Even {\t oo} is ok.
360							#
361							# Allow whitespace after the \CMD in all cases, e.g., "\c c". Even
362							# for the control symbols, it turns out spaces are ignored there
363							# (as in \" o), unlike the usual syntax.
364							#
365							# Some non-word constituents would work, but in practice we hope
366							# everyone just uses letters.
367
368	102					318	$string;
369							}
370
371							# For the [n]german package.
372							sub _convert_german {
373	3			3		4	my $string = shift;
374
375	3					16	foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::GERMAN ) {
376	87					350	$string =~ s/\Q$symbol\E/$LaTeX::ToUnicode::Tables::GERMAN{$symbol}/g;
377							}
378	3					10	$string;
379							}
380
381							# Control words that produce printed symbols (and letters in languages
382							# other than English), that is.
383							#
384							sub _convert_symbols {
385	102			102		156	my $string = shift;
386
387	102					536	foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::SYMBOLS ) {
388	2652					4245	my $repl = $LaTeX::ToUnicode::Tables::SYMBOLS{$symbol};
389							# preceded by a (non-consumed) non-backslash,
390							# usual termination for a control word.
391							# These commands don't take arguments.
392	2652					56882	$string =~ s/(?<=[^\\])\\$symbol$endcw/$repl/g;
393
394							# or the beginning of the whole string:
395	2652					53355	$string =~ s/^\\$symbol$endcw/$repl/g;
396							}
397	102					476	$string;
398							}
399
400							# Special character sequences, not \commands. They aren't all
401							# technically ligatures, but no matter.
402							#
403							sub _convert_ligatures {
404	102			102		177	my $string = shift;
405
406							# have to convert these in order specified.
407	102					588	my @ligs = @LaTeX::ToUnicode::Tables::LIGATURES;
408	102					368	for (my $i = 0; $i < @ligs; $i+=2) {
409	816					1009	my $in = $ligs[$i];
410	816					998	my $out = $ligs[$i+1];
411	816					4817	$string =~ s/\Q$in\E/$out/g;
412							}
413	102					297	$string;
414							}
415
416
417							# Remove primitives like \kern, \hskip, \penalty, etc.
418							#
419							sub _convert_glue_etc_primitives {
420	102			102		218	my ($string) = @_;
421
422							# Remove kerns. Clearly needs generalizing/sharpening to recognize
423							# dimens better, and plenty of other commands could use it.
424							# Here, we only handle literal dimensions ("+1.3pt"), not dimens
425							# referring to control sequences, with or without factors
426							# ("1.1\baselineskip").
427							#_debug("before kern: $string");
428	102					401	my $dimen_re = qr/[-+]?[0-9., ]+[a-z][a-z]\s*/;
429	102					481	$string =~ s!\\kern${endcw}${dimen_re}!!g;
430
431							# Let's do \hfuzz and \vfuzz too. They come up pretty often and it's
432							# practically the same thing (just also ignore optional =)..
433	102					420	$string =~ s!\\[hv]fuzz${endcw}=?\s*${dimen_re}!!g;
434
435							# \hskip and \vskip are dimens possibly preceded by "plus" or "minus".
436							# In contrast to the above, output a space as the replacement,
437							# to avoid words running together.
438	102					448	my $glue_re = qr/${dimen_re}((plus\|minus)${dimen_re})*/;
439	102					447	$string =~ s!\\[hv]skip${endcw}\s*${glue_re}! !g;
440
441							# And here is \penalty. natbib outputs \penalty0 sometimes.
442							# Similar with $dimen_re, we only handle literal decimal
443							# integers here, not things like "0 or `A.
444	102					227	my $number_re = qr/[-+]?[0-9]+\s*/;
445	102					361	$string =~ s!\\penalty${endcw}\s*${number_re}!!g;
446
447							# \looseness is almost the same as \penalty; also accept an optional =.
448	102					334	$string =~ s!\\looseness${endcw}\s=?\s${number_re}!!g;
449
450
451	102					338	return $string;
452							}
453
454							#
455							# Convert LaTeX markup commands in STRING like \textbf{...} and
456							# {\bfshape ...} and {\bf ...}.
457							#
458							# If we're aiming for plain text output, they are just cleared away (the
459							# braces are not removed).
460							#
461							# If we're generating HTML output ("html" key is set in $OPTIONS hash
462							# ref), we use the value in the hash, so that \textbf{foo} becomes
463							# foo. Nested markup doesn't work.
464							#
465							sub _convert_markups {
466	102			102		271	my ($string, $options) = @_;
467
468							# HTML is different.
469	102	50				259	return _convert_markups_html($string) if $options->{html};
470
471							# Not HTML, so here we'll "convert" to plain text by removing the
472							# markup commands.
473
474							# we can do all the markup commands at once.
475	102					710	my $markups = join('\|', keys %LaTeX::ToUnicode::Tables::MARKUPS);
476
477							#warn "_convert_markups: markups = $markups\n";
478							#warn "_convert_markups plain text: starting with $string\n";
479							# Remove \textMARKUP{...}, leaving just the {...}
480	102					649	$string =~ s/\\text($markups)$endcw//g;
481							#warn " after \text: $string\n";
482
483							# Similarly remove \MARKUPshape, plus remove \upshape.
484	102					482	$string =~ s/\\($markups\|up)shape$endcw//g;
485							#warn " after \...shape: $string\n";
486
487							# Remove braces and \command in: {... \MARKUP ...}
488							# where neither ... can contain braces.
489	102					543	$string =~ s/(\{[^{}]+)\\(?:$markups)$endcw([^{}]+\})/$1$2/g;
490							#warn " after ...\\markup...: $string\n";
491
492							# Remove braces and \command in: {\MARKUP ...}
493	102					873	$string =~ s/\{\\(?:$markups)$endcw([^{}]*)\}/$1/g;
494							#warn " after {\\markup...}: $string\n";
495
496							# Remove: {\MARKUP
497							# Although this will leave unmatched } chars behind, there's no
498							# alternative without full parsing, since the bib entry will often
499							# look like: {\em {The TeX{}book}}. Also might, in principle, be
500							# at the end of a line.
501	102					534	$string =~ s/\{\\(?:$markups)$endcw//g;
502							#warn " after {\\markup: $string\n";
503
504							# Remove braces and \command in: \MARKUP ...}
505							# We might have previously removed some braces, and no longer have
506							# the matching {. Happens with {\sl ... \bf ...}, e.g., Wermuth bbls.
507	102					580	$string =~ s/\\(?:$markups)$endcw(.*)\}/$1/g;
508							#warn " after \\markup...}: $string\n";
509
510							# Ultimately we remove all braces in ltx2crossrefxml SanitizeText fns,
511							# so the unmatched braces don't matter ... that code should be moved here.
512
513	102					235	$string;
514							}
515
516							# Convert \markup in STRING to html. We can't always figure out where to
517							# put the end tag, but we always put it somewhere. We don't even attempt
518							# to handle nested markup.
519							#
520							sub _convert_markups_html {
521	0			0			my ($string) = @_;
522
523	0						my %MARKUPS = %LaTeX::ToUnicode::Tables::MARKUPS;
524							# have to consider each markup \command separately.
525	0						for my $markup (keys %MARKUPS) {
526	0						my $hcmd = $MARKUPS{$markup}; # some TeX commands don't translate
527	0	0					my $tag = $hcmd ? "<$hcmd>" : "";
528	0	0					my $end_tag = $hcmd ? "" : "";
529
530							# The easy one: \textMARKUP{...}
531	0						$string =~ s/\\text$markup$endcw\{(.*?)\}/$tag$1$end_tag/g;
532
533							# {x\MARKUP(shape) y} -> xy (leave out braces)
534	0						$string =~ s/\{([^{}]+)\\$markup(shape)?$endcw([^{}]+)\}
535							/$1$tag$3$end_tag/gx;
536
537							# {\MARKUP(shape) y} -> y
538							# or
539							# \MARKUP(shape) y} -> y (without the {).
540							# Same as previous but without the x part. Could do it in one
541							# regex but this seems clearer. The { might be missing due to
542							# previous brace removals, as above.
543	0						$string =~ s/\{?\s*\\$markup(shape)?$endcw([^{}]+)\}
544							/$tag$2$end_tag/gx;
545
546							# for {\MARKUP(shape) ... with no matching right brace, we don't know
547							# where to put the end tag, so seems best to do nothing.
548							}
549
550	0						$string;
551							}
552
553
554							##############################################################
555							# debug_hash_as_string($LABEL, HASH)
556							#
557							# Return LABEL followed by HASH elements, followed by a newline, as a
558							# single string. If HASH is a reference, it is followed (but no recursive
559							# derefencing).
560							###############################################################
561							sub debug_hash_as_string {
562	0			0	0		my ($label) = shift;
563	0	0	0				my (%hash) = (ref $_[0] && $_[0] =~ /.HASH./) ? %{$_[0]} : @_;
	0
564
565	0						my $str = "$label: {";
566	0						my @items = ();
567	0						for my $key (sort keys %hash) {
568	0						my $val = $hash{$key};
569	0	0					$val = ".undef" if ! defined $val;
570	0						$key =~ s/\n/\\n/g;
571	0						$val =~ s/\n/\\n/g;
572	0						push (@items, "$key:$val");
573							}
574	0						$str .= join (",", @items);
575	0						$str .= "}";
576
577	0						return "$str\n";
578							}
579
580							1;
581
582							__END__