File Coverage

blib/lib/LaTeX/ToUnicode.pm
Criterion Covered Total %
statement 132 187 70.5
branch 19 50 38.0
condition 0 3 0.0
subroutine 18 21 85.7
pod 2 3 66.6
total 171 264 64.7


line stmt bran cond sub pod time code
1 1     1   99056 use strict;
  1         1  
  1         28  
2 1     1   4 use warnings;
  1         1  
  1         45  
3             package LaTeX::ToUnicode;
4             BEGIN {
5 1     1   61 $LaTeX::ToUnicode::VERSION = '1.95';
6             }
7             #ABSTRACT: Convert LaTeX commands to Unicode (simplistically)
8              
9             require Exporter;
10             our @ISA = qw(Exporter);
11             our @EXPORT_OK = qw( convert debuglevel $endcw );
12              
13 1     1   4 use utf8;
  1         0  
  1         5  
14 1     1   444 use Encode;
  1         13129  
  1         73  
15 1     1   477 use LaTeX::ToUnicode::Tables;
  1         3  
  1         2387  
16              
17             # Terminating a control word (not symbol) the way TeX does: at the
18             # boundary between a letter (lookbehind) and a nonletter (lookahead),
19             # and then ignore any following whitespace.
20             our $endcw = qr/(?<=[a-zA-Z])(?=[^a-zA-Z]|$)\s*/;
21              
22             # Debugging output on or off; it's pretty random what gets output.
23             # Add more as needed. There is also more debugging output as warn
24             # statements, commented out, too voluminous to enable here.
25             my $debug = 0;
26              
27 0     0 1 0 sub debuglevel { $debug = shift; }
28             sub _debug {
29 612 50   612   1307 return unless $debug;
30             # The backtrace info is split between caller(0) and caller(1), sigh.
31             # We don't need the package name, it's included in $subr in practice.
32 0         0 my (undef,$filename,$line,undef) = caller(0);
33 0         0 my (undef,undef,undef,$subr) = caller(1);
34 0         0 warn @_, " at $filename:$line ($subr)\n";
35             }
36              
37             # The main conversion function.
38             #
39             sub convert {
40 102     102 1 233568 my ($string, %options) = @_;
41             #warn debug_hash_as_string("starting with: $string", %options);
42              
43             # First, remove leading and trailing horizontal whitespace
44             # on each line of the possibly-multiline string we're given.
45 102         565 $string =~ s/^[ \t]*//m;
46 102         564 $string =~ s/[ \t]*$//m;
47            
48             # For HTML output, must convert special characters that were in the
49             # TeX text (&<>) to their entities to avoid misparsing. We want to
50             # do this first, because conversion of the markup commands might
51             # output HTML tags like , and we don't want to convert those <>.
52             # Although <tt> works, better to keep the output HTML as
53             # human-readable as we can.
54             #
55 102 50       364 if ($options{html}) {
56 0         0 $string =~ s/([^\\]|^)&/$1&/g;
57 0         0 $string =~ s/
58 0         0 $string =~ s/>/>/g;
59             }
60            
61 102         209 my $user_hook = $options{hook};
62 102 50       270 if ($user_hook) {
63 0         0 _debug("before user hook: $string");
64 0         0 $string = &$user_hook($string, \%options);
65 0         0 _debug("after user hook: $string");
66             }
67            
68             # Convert general commands that take arguments, since (1) they might
69             # insert TeX commands that need to be converted, and (2) because
70             # their arguments could well contain constructs that will map to a
71             # Perl string \x{nnnn} for Unicode character nnnn; those Perl braces
72             # for the \x will confuse further parsing of the TeX.
73             #
74 102         260 $string = _convert_commands_with_arg($string);
75 102         411 _debug("after commands with arg: $string");
76            
77             # Convert markups (\texttt, etc.); they have the same brace-parsing issue.
78 102         332 $string = _convert_markups($string, \%options);
79 102         311 _debug("after markups: $string");
80            
81             # And urls, a special case of commands with arguments.
82 102         263 $string = _convert_urls($string, \%options);
83 102         263 _debug("after urls: $string");
84              
85 102         198 $string = _convert_control_words($string);
86 102         412 _debug("after control words: $string");
87              
88 102         262 $string = _convert_control_symbols($string);
89 102         310 _debug("after control symbols: $string");
90              
91 102         236 $string = _convert_accents($string);
92 102 100       303 $string = _convert_german($string) if $options{german};
93 102         219 $string = _convert_symbols($string);
94 102         302 $string = _convert_ligatures($string);
95 102         230 $string = _convert_glue_etc_primitives($string);
96 102         398 _debug("after the rest: $string");
97            
98             # Let's handle ties here, after all the other conversions, since
99             # they don't fit well with any of the tables. We don't handle TeX's
100             # other special characters: $ & # ^ _.
101             #
102             # /~, or ~ at the beginning of a line, is probably part of a url or
103             # path, not a tie. Otherwise, consider it a space, since we can't
104             # distinguish true no-break spots (Donald~E. Knuth) from ties that
105             # are only relevant to a particular line width.
106             #
107 102         249 $string =~ s,([^/])~,$1 ,g;
108            
109             # After all the conversions, $string contains \x{....} constructs
110             # (Perl Unicode characters) where translations have happened. Change
111             # those to the desired output format. Thus we assume that the
112             # Unicode \x{....}'s are not themselves involved in further
113             # translations, which is, so far, true.
114             #
115 102 50       302 if (! $options{entities}) {
    0          
116             # Convert our \x strings from Tables.pm to the binary characters.
117            
118             # As an extra-special case, we want to preserve the translation of
119             # \{ and \} as 007[bd] entities even if the --entities option is
120             # not give; otherwise they'd get eliminated like all other braces.
121             # Use a temporary cs \xx to keep them marked, and don't use braces
122             # to delimit the argument since they'll get deleted.
123 102         205 $string =~ s/\\x\{(007[bd])\}/\\xx($1)/g;
124            
125             # Convert all other characters to characters.
126             # Assume exactly four hex digits, since we wrote Tables.pm that way.
127 102         555 $string =~ s/\\x\{(....)\}/ pack('U*', hex($1))/eg;
  86         749  
128              
129             } elsif ($options{entities}) {
130             # Convert the XML special characters that appeared in the input,
131             # e.g., from a TeX \&. Unless we're generating HTML output, in
132             # which case they have already been converted.
133 0 0       0 if (! $options{html}) {
134 0         0 $string =~ s/&/&/g;
135 0         0 $string =~ s/
136 0         0 $string =~ s/>/>/g;
137             }
138            
139             # Our values in Tables.pm are simple ASCII strings \x{....},
140             # so we can replace them with hex entities with no trouble.
141             # Fortunately TeX does not have a standard \x control sequence.
142 0         0 $string =~ s/\\x\{(....)\}/&#x$1;/g;
143            
144             # The rest of the job is about binary Unicode characters in the
145             # input. We want to transform them into entities also. As always
146             # in Perl, there's more than one way to do it, and several are
147             # described here, just for the fun of it.
148 0         0 my $ret = "";
149             #
150             # decode_utf8 is described in https://perldoc.perl.org/Encode.
151             # Without the decode_utf8, all of these methods output each byte
152             # separately; apparently $string is a byte string at this point,
153             # not a Unicode string. I don't know why that is.
154 0         0 $ret = decode_utf8($string);
155             #
156             # Transform everything that's not printable ASCII or newline into
157             # entities.
158 0         0 $ret =~ s/([^ -~\n])/ sprintf("&#x%04x;", ord($1)) /eg;
  0         0  
159             #
160             # This method leaves control characters as literal; doesn't matter
161             # for XML output, since control characters aren't allowed, but
162             # let's use the regexp method anyway.
163             #$ret = encode("ascii", decode_utf8($string), Encode::FB_XMLCREF);
164             #
165             # The nice_string function from perluniintro also works.
166             #
167             # This fails, just outputs numbers (that is, ord values):
168             # foreach my $c (unpack("U*", $ret)) {
169             #
170             # Without the decode_utf8, outputs each byte separately.
171             # With the decode_utf8, works, but the above seems cleaner.
172             #foreach my $c (split(//, $ret)) {
173             # if (ord($c) <= 31 || ord($c) >= 128) {
174             # $ret .= sprintf("&#x%04x;", ord($c));
175             # } else {
176             # $ret .= $c;
177             # }
178             #}
179             #
180 0         0 $string = $ret; # assigned from above.
181             }
182              
183 102 50       273 if ($string =~ /\\x\{/) {
184 0         0 warn "LaTeX::ToUnicode::convert: untranslated \\x remains: $string\n";
185 0         0 warn "LaTeX::ToUnicode::convert: please report as bug.\n";
186             }
187            
188             # Drop all remaining braces.
189 102         314 $string =~ s/[{}]//g;
190            
191 102 50       279 if (! $options{entities}) {
192             # With all the other braces gone, now we can convert the preserved
193             # brace entities from \{ and \} to actual braces.
194 102         196 $string =~ s/\\xx\((007[bd])\)/ pack('U*', hex($1))/eg;
  2         6  
195             }
196              
197             # Backslashes might remain. Don't remove them, as it makes for a
198             # useful way to find unhandled commands.
199              
200             # leave newlines alone, but trim spaces and tabs.
201 102         267 $string =~ s/^[ \t]+//s; # remove leading whitespace
202 102         250 $string =~ s/[ \t]+$//s; # remove trailing whitespace
203 102         185 $string =~ s/[ \t]+/ /gs; # collapse all remaining whitespace to one space
204            
205 102         689 $string;
206             }
207              
208             # Convert commands that take a single braced argument. The table
209             # defines text we're supposed to insert before and after the argument.
210             # We let future processing handle conversion of both the inserted text
211             # and the argument.
212             #
213             sub _convert_commands_with_arg {
214 102     102   209 my $string = shift;
215              
216 102         529 foreach my $cmd ( keys %LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS ) {
217 612         1134 my $repl = $LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS{$cmd};
218 612         903 my $lft = $repl->[0]; # ref to two-element list
219 612         728 my $rht = $repl->[1];
220             # \cmd{foo} -> LFT foo RHT
221 612         17840 $string =~ s/\\$cmd${endcw}\{(.*?)\}/$lft$1$rht/g;
222             #warn "replaced arg $cmd, yielding $string\n";
223             }
224            
225 102         327 $string;
226             }
227              
228             # Convert url commands in STRING. This is a special case of commands
229             # with arguments: \url{u} and \href{u}{desc text}. The HTML output
230             # (generated if $OPTIONS{html} is set) is just too special to be handled
231             # in a table; further, \href is the only two-argument command we are
232             # currently handling.
233             #
234             sub _convert_urls {
235 102     102   198 my ($string,$options) = @_;
236              
237 102 50       282 if ($options->{html}) {
238             # HTML output.
239             # \url{URL} -> URL
240 0         0 $string =~ s,\\url$endcw\{([^}]*)\}
241             ,$1,gx;
242             #
243             # \href{URL}{TEXT} -> TEXT
244             #warn "html href: $string\n" if $string =~ /href/;
245 0         0 $string =~ s,\\href$endcw\{([^}]*)\}\s*\{([^}]*)\}
246             ,$2,gx;
247              
248             } else {
249             # plain text output.
250             # \url{URL} -> URL
251 102         391 $string =~ s/\\url$endcw\{([^}]*)\}/$1/g;
252             #
253             # \href{URL}{TEXT} -> TEXT (URL)
254             # but, as a special case, if URL ends with TEXT, just output URL,
255             # as in:
256             # \href{https://doi.org/10/fjzzc8}{10/fjzzc8}
257             # ->
258             # https://doi.org/10/fjzzc8
259             #
260             # Yet more specialness: the TEXT might have extra braces, as in
261             # \href{https://doi.org/10/fjzzc8}{{10/fjzzc8}}
262             # left over from previous markup commands (\path) which got
263             # removed. We want to accept and ignore such extra braces,
264             # hence the \{+ ... \}+ in recognizing TEXT.
265             #
266             #warn "txt url: starting with $string\n" if $string =~ /href/;
267 102 50       479 if ($string =~ m/\\href$endcw\{([^}]*)\}\s*\{+([^}]*)\}+/) {
268 0         0 my $url = $1;
269 0         0 my $text = $2;
270             #warn " url: $url\n";
271             #warn " text: $text\n";
272 0 0       0 my $repl = ($url =~ m!$text$!) ? $url : "$text ($url)";
273             #warn " repl: $repl\n";
274 0         0 $string =~ s/\\href$endcw\{([^}]*)\}\s*\{+([^}]*)\}+/$repl/;
275             #warn " result: $string\n";
276             }
277             }
278            
279 102         188 $string;
280             }
281              
282             # Convert control words (not symbols), that is, a backslash and an
283             # alphabetic sequence of characters terminated by a non-alphabetic
284             # character. Following whitespace is ignored.
285             #
286             sub _convert_control_words {
287 102     102   173 my $string = shift;
288              
289 102         1455 foreach my $command ( keys %LaTeX::ToUnicode::Tables::CONTROL_WORDS ) {
290 10812         18303 my $repl = $LaTeX::ToUnicode::Tables::CONTROL_WORDS{$command};
291             # replace {\CMD}, whitespace ignored after \CMD.
292 10812         220536 $string =~ s/\{\\$command$endcw\}/$repl/g;
293            
294             # replace \CMD, preceded by not-consumed non-backslash.
295 10812         230193 $string =~ s/(?<=[^\\])\\$command$endcw/$repl/g;
296            
297             # replace \CMD at beginning of whole string, which otherwise
298             # wouldn't be matched. Two separate regexps to avoid
299             # variable-length lookbehind.
300 10812         218878 $string =~ s/^\\$command$endcw/$repl/g;
301             }
302              
303 102         841 $string;
304             }
305              
306             # Convert control symbols, other than accents. Much simpler than
307             # control words, since are self-delimiting, don't take arguments, and
308             # don't consume any following text.
309             #
310             sub _convert_control_symbols {
311 102     102   216 my $string = shift;
312              
313 102         939 foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS ) {
314 2754         3835 my $repl = $LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS{$symbol};
315              
316             # because these are not alphabetic, we can quotemeta them,
317             # and we need to because "\" is one of the symbols.
318 2754         3122 my $rx = quotemeta($symbol);
319            
320             # the preceding character must not be a backslash, else "\\ "
321             # could have the "\ " seen first as a control space, leaving
322             # a spurious \ behind. Don't consume the preceding.
323             # Or it could be at the beginning of a line.
324             #
325 2754         26853 $string =~ s/(^|(?<=[^\\]))\\$rx/$repl/g;
326             #warn "after sym $symbol (\\$rx -> $repl), have: $string\n";
327             }
328              
329 102         340 $string;
330             }
331              
332             # Convert accents.
333             #
334             sub _convert_accents {
335 102     102   197 my $string = shift;
336            
337             # first the non-alphabetic accent commands, like \".
338 102         932 my %tbl = %LaTeX::ToUnicode::Tables::ACCENT_SYMBOLS;
339 102 100       437 $string =~ s/(\{\\(.)\s*\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} || $1/eg; #{\"{a}}
  29         289  
340 102 100       661 $string =~ s/(\{\\(.)\s*(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # {\"a}
  47         444  
341 102 50       428 $string =~ s/(\\(.)\s*(\\?\w{1,1}))/ $tbl{$2}{$3} || $1/eg; # \"a
  6         33  
342 102 100       340 $string =~ s/(\\(.)\s*\{(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # \"{a}
  20         95  
343            
344             # second the alphabetic commands, like \c. They have to be handled
345             # differently because \cc is not \c{c}! The only difference in the
346             # regular expressions is using $endcw instead of just \s*.
347             #
348 102         850 %tbl = %LaTeX::ToUnicode::Tables::ACCENT_LETTERS;
349 102 50       633 $string =~ s/(\{\\(.)$endcw\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} || $1/eg; #{\"{a}}
  19         96  
350 102 0       442 $string =~ s/(\{\\(.)$endcw(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # {\"a}
  0         0  
351 102 0       610 $string =~ s/(\\(.)$endcw(\\?\w{1,1}))/ $tbl{$2}{$3} || $1/eg; # \"a
  0         0  
352 102 0       502 $string =~ s/(\\(.)$endcw\{(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # \"{a}
  0         0  
353            
354            
355             # The argument is just one \w character for the \"a case, not two,
356             # because otherwise we might consume a following character that is
357             # not part of the accent, e.g., a backslash (\"a\'e).
358             #
359             # Others can be two because of the \t tie-after accent. Even {\t oo} is ok.
360             #
361             # Allow whitespace after the \CMD in all cases, e.g., "\c c". Even
362             # for the control symbols, it turns out spaces are ignored there
363             # (as in \" o), unlike the usual syntax.
364             #
365             # Some non-word constituents would work, but in practice we hope
366             # everyone just uses letters.
367              
368 102         318 $string;
369             }
370              
371             # For the [n]german package.
372             sub _convert_german {
373 3     3   4 my $string = shift;
374              
375 3         16 foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::GERMAN ) {
376 87         350 $string =~ s/\Q$symbol\E/$LaTeX::ToUnicode::Tables::GERMAN{$symbol}/g;
377             }
378 3         10 $string;
379             }
380              
381             # Control words that produce printed symbols (and letters in languages
382             # other than English), that is.
383             #
384             sub _convert_symbols {
385 102     102   156 my $string = shift;
386              
387 102         536 foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::SYMBOLS ) {
388 2652         4245 my $repl = $LaTeX::ToUnicode::Tables::SYMBOLS{$symbol};
389             # preceded by a (non-consumed) non-backslash,
390             # usual termination for a control word.
391             # These commands don't take arguments.
392 2652         56882 $string =~ s/(?<=[^\\])\\$symbol$endcw/$repl/g;
393            
394             # or the beginning of the whole string:
395 2652         53355 $string =~ s/^\\$symbol$endcw/$repl/g;
396             }
397 102         476 $string;
398             }
399              
400             # Special character sequences, not \commands. They aren't all
401             # technically ligatures, but no matter.
402             #
403             sub _convert_ligatures {
404 102     102   177 my $string = shift;
405              
406             # have to convert these in order specified.
407 102         588 my @ligs = @LaTeX::ToUnicode::Tables::LIGATURES;
408 102         368 for (my $i = 0; $i < @ligs; $i+=2) {
409 816         1009 my $in = $ligs[$i];
410 816         998 my $out = $ligs[$i+1];
411 816         4817 $string =~ s/\Q$in\E/$out/g;
412             }
413 102         297 $string;
414             }
415              
416            
417             # Remove primitives like \kern, \hskip, \penalty, etc.
418             #
419             sub _convert_glue_etc_primitives {
420 102     102   218 my ($string) = @_;
421            
422             # Remove kerns. Clearly needs generalizing/sharpening to recognize
423             # dimens better, and plenty of other commands could use it.
424             # Here, we only handle literal dimensions ("+1.3pt"), not dimens
425             # referring to control sequences, with or without factors
426             # ("1.1\baselineskip").
427             #_debug("before kern: $string");
428 102         401 my $dimen_re = qr/[-+]?[0-9., ]+[a-z][a-z]\s*/;
429 102         481 $string =~ s!\\kern${endcw}${dimen_re}!!g;
430              
431             # Let's do \hfuzz and \vfuzz too. They come up pretty often and it's
432             # practically the same thing (just also ignore optional =)..
433 102         420 $string =~ s!\\[hv]fuzz${endcw}=?\s*${dimen_re}!!g;
434              
435             # \hskip and \vskip are dimens possibly preceded by "plus" or "minus".
436             # In contrast to the above, output a space as the replacement,
437             # to avoid words running together.
438 102         448 my $glue_re = qr/${dimen_re}((plus|minus)${dimen_re})*/;
439 102         447 $string =~ s!\\[hv]skip${endcw}\s*${glue_re}! !g;
440            
441             # And here is \penalty. natbib outputs \penalty0 sometimes.
442             # Similar with $dimen_re, we only handle literal decimal
443             # integers here, not things like "0 or `A.
444 102         227 my $number_re = qr/[-+]?[0-9]+\s*/;
445 102         361 $string =~ s!\\penalty${endcw}\s*${number_re}!!g;
446            
447             # \looseness is almost the same as \penalty; also accept an optional =.
448 102         334 $string =~ s!\\looseness${endcw}\s*=?\s*${number_re}!!g;
449              
450              
451 102         338 return $string;
452             }
453              
454             #
455             # Convert LaTeX markup commands in STRING like \textbf{...} and
456             # {\bfshape ...} and {\bf ...}.
457             #
458             # If we're aiming for plain text output, they are just cleared away (the
459             # braces are not removed).
460             #
461             # If we're generating HTML output ("html" key is set in $OPTIONS hash
462             # ref), we use the value in the hash, so that \textbf{foo} becomes
463             # foo. Nested markup doesn't work.
464             #
465             sub _convert_markups {
466 102     102   271 my ($string, $options) = @_;
467            
468             # HTML is different.
469 102 50       259 return _convert_markups_html($string) if $options->{html};
470            
471             # Not HTML, so here we'll "convert" to plain text by removing the
472             # markup commands.
473              
474             # we can do all the markup commands at once.
475 102         710 my $markups = join('|', keys %LaTeX::ToUnicode::Tables::MARKUPS);
476            
477             #warn "_convert_markups: markups = $markups\n";
478             #warn "_convert_markups plain text: starting with $string\n";
479             # Remove \textMARKUP{...}, leaving just the {...}
480 102         649 $string =~ s/\\text($markups)$endcw//g;
481             #warn " after \text: $string\n";
482              
483             # Similarly remove \MARKUPshape, plus remove \upshape.
484 102         482 $string =~ s/\\($markups|up)shape$endcw//g;
485             #warn " after \...shape: $string\n";
486              
487             # Remove braces and \command in: {... \MARKUP ...}
488             # where neither ... can contain braces.
489 102         543 $string =~ s/(\{[^{}]+)\\(?:$markups)$endcw([^{}]+\})/$1$2/g;
490             #warn " after ...\\markup...: $string\n";
491              
492             # Remove braces and \command in: {\MARKUP ...}
493 102         873 $string =~ s/\{\\(?:$markups)$endcw([^{}]*)\}/$1/g;
494             #warn " after {\\markup...}: $string\n";
495              
496             # Remove: {\MARKUP
497             # Although this will leave unmatched } chars behind, there's no
498             # alternative without full parsing, since the bib entry will often
499             # look like: {\em {The TeX{}book}}. Also might, in principle, be
500             # at the end of a line.
501 102         534 $string =~ s/\{\\(?:$markups)$endcw//g;
502             #warn " after {\\markup: $string\n";
503              
504             # Remove braces and \command in: \MARKUP ...}
505             # We might have previously removed some braces, and no longer have
506             # the matching {. Happens with {\sl ... \bf ...}, e.g., Wermuth bbls.
507 102         580 $string =~ s/\\(?:$markups)$endcw(.*)\}/$1/g;
508             #warn " after \\markup...}: $string\n";
509              
510             # Ultimately we remove all braces in ltx2crossrefxml SanitizeText fns,
511             # so the unmatched braces don't matter ... that code should be moved here.
512              
513 102         235 $string;
514             }
515              
516             # Convert \markup in STRING to html. We can't always figure out where to
517             # put the end tag, but we always put it somewhere. We don't even attempt
518             # to handle nested markup.
519             #
520             sub _convert_markups_html {
521 0     0     my ($string) = @_;
522            
523 0           my %MARKUPS = %LaTeX::ToUnicode::Tables::MARKUPS;
524             # have to consider each markup \command separately.
525 0           for my $markup (keys %MARKUPS) {
526 0           my $hcmd = $MARKUPS{$markup}; # some TeX commands don't translate
527 0 0         my $tag = $hcmd ? "<$hcmd>" : "";
528 0 0         my $end_tag = $hcmd ? "" : "";
529            
530             # The easy one: \textMARKUP{...}
531 0           $string =~ s/\\text$markup$endcw\{(.*?)\}/$tag$1$end_tag/g;
532              
533             # {x\MARKUP(shape) y} -> xy (leave out braces)
534 0           $string =~ s/\{([^{}]+)\\$markup(shape)?$endcw([^{}]+)\}
535             /$1$tag$3$end_tag/gx;
536              
537             # {\MARKUP(shape) y} -> y
538             # or
539             # \MARKUP(shape) y} -> y (without the {).
540             # Same as previous but without the x part. Could do it in one
541             # regex but this seems clearer. The { might be missing due to
542             # previous brace removals, as above.
543 0           $string =~ s/\{?\s*\\$markup(shape)?$endcw([^{}]+)\}
544             /$tag$2$end_tag/gx;
545              
546             # for {\MARKUP(shape) ... with no matching right brace, we don't know
547             # where to put the end tag, so seems best to do nothing.
548             }
549            
550 0           $string;
551             }
552              
553            
554             ##############################################################
555             # debug_hash_as_string($LABEL, HASH)
556             #
557             # Return LABEL followed by HASH elements, followed by a newline, as a
558             # single string. If HASH is a reference, it is followed (but no recursive
559             # derefencing).
560             ###############################################################
561             sub debug_hash_as_string {
562 0     0 0   my ($label) = shift;
563 0 0 0       my (%hash) = (ref $_[0] && $_[0] =~ /.*HASH.*/) ? %{$_[0]} : @_;
  0            
564              
565 0           my $str = "$label: {";
566 0           my @items = ();
567 0           for my $key (sort keys %hash) {
568 0           my $val = $hash{$key};
569 0 0         $val = ".undef" if ! defined $val;
570 0           $key =~ s/\n/\\n/g;
571 0           $val =~ s/\n/\\n/g;
572 0           push (@items, "$key:$val");
573             }
574 0           $str .= join (",", @items);
575 0           $str .= "}";
576              
577 0           return "$str\n";
578             }
579              
580             1;
581              
582             __END__