line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
69972
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
2
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
41
|
|
3
|
|
|
|
|
|
|
package LaTeX::ToUnicode; |
4
|
|
|
|
|
|
|
BEGIN { |
5
|
1
|
|
|
1
|
|
59
|
$LaTeX::ToUnicode::VERSION = '0.53'; |
6
|
|
|
|
|
|
|
} |
7
|
|
|
|
|
|
|
#ABSTRACT: Convert LaTeX commands to Unicode (simplistically) |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require Exporter; |
10
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
11
|
|
|
|
|
|
|
our @EXPORT_OK = qw( convert debuglevel $endcw ); |
12
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
6
|
use utf8; |
|
1
|
|
|
|
|
10
|
|
|
1
|
|
|
|
|
6
|
|
14
|
1
|
|
|
1
|
|
593
|
use Encode; |
|
1
|
|
|
|
|
10561
|
|
|
1
|
|
|
|
|
70
|
|
15
|
1
|
|
|
1
|
|
476
|
use LaTeX::ToUnicode::Tables; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
2605
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Terminating a control word (not symbol) the way TeX does: at the |
18
|
|
|
|
|
|
|
# boundary between a letter (lookbehind) and a nonletter (lookahead), |
19
|
|
|
|
|
|
|
# and then ignore any following whitespace. |
20
|
|
|
|
|
|
|
our $endcw = qr/(?<=[a-zA-Z])(?=[^a-zA-Z]|$)\s*/; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# all we need for is debugging being on and off. And it's pretty random |
23
|
|
|
|
|
|
|
# what gets output. |
24
|
|
|
|
|
|
|
my $debug = 0; |
25
|
0
|
|
|
0
|
1
|
0
|
sub debuglevel { $debug = shift; } |
26
|
|
|
|
|
|
|
sub _debug { |
27
|
510
|
50
|
|
510
|
|
1066
|
return unless $debug; |
28
|
0
|
|
|
|
|
0
|
my ($pkgname,$filename,$line,$subr) = caller(1); |
29
|
0
|
|
|
|
|
0
|
warn @_, " at $filename:$line (${pkgname}::$subr)\n"; |
30
|
|
|
|
|
|
|
} |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# The main conversion function. |
33
|
|
|
|
|
|
|
# |
34
|
|
|
|
|
|
|
sub convert { |
35
|
102
|
|
|
102
|
1
|
57375
|
my ($string, %options) = @_; |
36
|
|
|
|
|
|
|
#warn debug_hash_as_string("starting with: $string", %options); |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# First, remove leading and trailing horizontal whitespace |
39
|
|
|
|
|
|
|
# on each line of the possibly-multiline string we're given. |
40
|
102
|
|
|
|
|
558
|
$string =~ s/^[ \t]*//m; |
41
|
102
|
|
|
|
|
520
|
$string =~ s/[ \t]*$//m; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# For HTML output, must convert special characters that were in the |
44
|
|
|
|
|
|
|
# TeX text (&<>) to their entities to avoid misparsing. We want to |
45
|
|
|
|
|
|
|
# do this first, because conversion of the markup commands might |
46
|
|
|
|
|
|
|
# output HTML tags like , and we don't want to convert those <>. |
47
|
|
|
|
|
|
|
# Although <tt> works, better to keep the output HTML as |
48
|
|
|
|
|
|
|
# human-readable as we can. |
49
|
|
|
|
|
|
|
# |
50
|
102
|
50
|
|
|
|
313
|
if ($options{html}) { |
51
|
0
|
|
|
|
|
0
|
$string =~ s/([^\\]|^)&/$1&/g; |
52
|
0
|
|
|
|
|
0
|
$string =~ s/</g; |
53
|
0
|
|
|
|
|
0
|
$string =~ s/>/>/g; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
102
|
|
|
|
|
167
|
my $user_hook = $options{hook}; |
57
|
102
|
50
|
|
|
|
187
|
if ($user_hook) { |
58
|
0
|
|
|
|
|
0
|
$string = &$user_hook($string, \%options); |
59
|
0
|
|
|
|
|
0
|
_debug("after user hook: $string"); |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Convert general commands that take arguments, since (1) they might |
63
|
|
|
|
|
|
|
# insert TeX commands that need to be converted, and (2) because |
64
|
|
|
|
|
|
|
# their arguments could well contain constructs that will map to a |
65
|
|
|
|
|
|
|
# Perl string \x{nnnn} for Unicode character nnnn; those Perl braces |
66
|
|
|
|
|
|
|
# for the \x will confuse further parsing of the TeX. |
67
|
|
|
|
|
|
|
# |
68
|
102
|
|
|
|
|
187
|
$string = _convert_commands_with_arg($string); |
69
|
102
|
|
|
|
|
354
|
_debug("after commands with arg: $string"); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Convert markups (\texttt, etc.); they have the same brace-parsing issue. |
72
|
102
|
|
|
|
|
248
|
$string = _convert_markups($string, \%options); |
73
|
102
|
|
|
|
|
320
|
_debug("after markups: $string"); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# And urls, a special case of commands with arguments. |
76
|
102
|
|
|
|
|
211
|
$string = _convert_urls($string, \%options); |
77
|
102
|
|
|
|
|
288
|
_debug("after urls: $string"); |
78
|
|
|
|
|
|
|
|
79
|
102
|
|
|
|
|
173
|
$string = _convert_control_words($string); |
80
|
102
|
|
|
|
|
360
|
_debug("after control words: $string"); |
81
|
|
|
|
|
|
|
|
82
|
102
|
|
|
|
|
183
|
$string = _convert_control_symbols($string); |
83
|
102
|
|
|
|
|
311
|
_debug("after control symbols: $string"); |
84
|
|
|
|
|
|
|
|
85
|
102
|
|
|
|
|
187
|
$string = _convert_accents($string); |
86
|
102
|
100
|
|
|
|
279
|
$string = _convert_german($string) if $options{german}; |
87
|
102
|
|
|
|
|
209
|
$string = _convert_symbols($string); |
88
|
102
|
|
|
|
|
209
|
$string = _convert_ligatures($string); |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# Let's handle ties here, after all the other conversions, since |
91
|
|
|
|
|
|
|
# they don't fit well with any of the tables. |
92
|
|
|
|
|
|
|
# |
93
|
|
|
|
|
|
|
# /~, or ~ at the beginning of a line, is probably part of a url or |
94
|
|
|
|
|
|
|
# path, not a tie. Otherwise, consider it a space, since a no-break |
95
|
|
|
|
|
|
|
# spot in TeX is most likely fine to break in text or HTML. |
96
|
|
|
|
|
|
|
# |
97
|
102
|
|
|
|
|
186
|
$string =~ s,([^/])~,$1 ,g; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Remove kerns. Clearly needs generalizing/sharpening to recognize |
100
|
|
|
|
|
|
|
# dimens better, and plenty of other commands could use it. |
101
|
|
|
|
|
|
|
#_debug("before kern: $string"); |
102
|
102
|
|
|
|
|
301
|
my $dimen_re = qr/[-+]?[0-9., ]+[a-z][a-z]\s*/; |
103
|
102
|
|
|
|
|
369
|
$string =~ s!\\kern${endcw}${dimen_re}!!g; |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# What the heck, let's do \hfuzz and \vfuzz too. They come up pretty |
106
|
|
|
|
|
|
|
# often and practically the same thing (plus ignore optional =).. |
107
|
102
|
|
|
|
|
344
|
$string =~ s!\\[hv]fuzz${endcw}=?\s*${dimen_re}!!g; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# After all the conversions, $string contains \x{....} constructs |
110
|
|
|
|
|
|
|
# (Perl Unicode characters) where translations have happened. Change |
111
|
|
|
|
|
|
|
# those to the desired output format. Thus we assume that the |
112
|
|
|
|
|
|
|
# Unicode \x{....}'s are not themselves involved in further |
113
|
|
|
|
|
|
|
# translations, which is, so far, true. |
114
|
|
|
|
|
|
|
# |
115
|
102
|
50
|
|
|
|
290
|
if (! $options{entities}) { |
|
|
0
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# Convert our \x strings from Tables.pm to the binary characters. |
117
|
|
|
|
|
|
|
# Assume no more than four hex digits. |
118
|
102
|
|
|
|
|
531
|
$string =~ s/\\x\{(.{1,4})\}/ pack('U*', hex($1))/eg; |
|
85
|
|
|
|
|
577
|
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
} elsif ($options{entities}) { |
121
|
|
|
|
|
|
|
# Convert the XML special characters that appeared in the input, |
122
|
|
|
|
|
|
|
# e.g., from a TeX \&. Unless we're generating HTML output, in |
123
|
|
|
|
|
|
|
# which case they have already been converted. |
124
|
0
|
0
|
|
|
|
0
|
if (! $options{html}) { |
125
|
0
|
|
|
|
|
0
|
$string =~ s/&/&/g; |
126
|
0
|
|
|
|
|
0
|
$string =~ s/</g; |
127
|
0
|
|
|
|
|
0
|
$string =~ s/>/>/g; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# Our values in Tables.pm are simple ASCII strings \x{....}, |
131
|
|
|
|
|
|
|
# so we can replace them with hex entities with no trouble. |
132
|
|
|
|
|
|
|
# Fortunately TeX does not have a standard \x control sequence. |
133
|
0
|
|
|
|
|
0
|
$string =~ s/\\x\{(....)\}/$1;/g; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# The rest of the job is about binary Unicode characters in the |
136
|
|
|
|
|
|
|
# input. We want to transform them into entities also. As always |
137
|
|
|
|
|
|
|
# in Perl, there's more than one way to do it, and several are |
138
|
|
|
|
|
|
|
# described here, just for the fun of it. |
139
|
0
|
|
|
|
|
0
|
my $ret = ""; |
140
|
|
|
|
|
|
|
# |
141
|
|
|
|
|
|
|
# decode_utf8 is described in https://perldoc.perl.org/Encode. |
142
|
|
|
|
|
|
|
# Without the decode_utf8, all of these methods output each byte |
143
|
|
|
|
|
|
|
# separately; apparently $string is a byte string at this point, |
144
|
|
|
|
|
|
|
# not a Unicode string. I don't know why that is. |
145
|
0
|
|
|
|
|
0
|
$ret = decode_utf8($string); |
146
|
|
|
|
|
|
|
# |
147
|
|
|
|
|
|
|
# Transform everything that's not printable ASCII or newline into |
148
|
|
|
|
|
|
|
# entities. |
149
|
0
|
|
|
|
|
0
|
$ret =~ s/([^ -~\n])/ sprintf("%04x;", ord($1)) /eg; |
|
0
|
|
|
|
|
0
|
|
150
|
|
|
|
|
|
|
# |
151
|
|
|
|
|
|
|
# This method leaves control characters as literal; doesn't matter |
152
|
|
|
|
|
|
|
# for XML output, since control characters aren't allowed, but |
153
|
|
|
|
|
|
|
# let's use the regexp method anyway. |
154
|
|
|
|
|
|
|
#$ret = encode("ascii", decode_utf8($string), Encode::FB_XMLCREF); |
155
|
|
|
|
|
|
|
# |
156
|
|
|
|
|
|
|
# The nice_string function from perluniintro also works. |
157
|
|
|
|
|
|
|
# |
158
|
|
|
|
|
|
|
# This fails, just outputs numbers (that is, ord values): |
159
|
|
|
|
|
|
|
# foreach my $c (unpack("U*", $ret)) { |
160
|
|
|
|
|
|
|
# |
161
|
|
|
|
|
|
|
# Without the decode_utf8, outputs each byte separately. |
162
|
|
|
|
|
|
|
# With the decode_utf8, works, but the above seems cleaner. |
163
|
|
|
|
|
|
|
#foreach my $c (split(//, $ret)) { |
164
|
|
|
|
|
|
|
# if (ord($c) <= 31 || ord($c) >= 128) { |
165
|
|
|
|
|
|
|
# $ret .= sprintf("%04x;", ord($c)); |
166
|
|
|
|
|
|
|
# } else { |
167
|
|
|
|
|
|
|
# $ret .= $c; |
168
|
|
|
|
|
|
|
# } |
169
|
|
|
|
|
|
|
#} |
170
|
|
|
|
|
|
|
# |
171
|
0
|
|
|
|
|
0
|
$string = $ret; # assigned from above. |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
102
|
50
|
|
|
|
271
|
if ($string =~ /\\x\{/) { |
175
|
0
|
|
|
|
|
0
|
warn "LaTeX::ToUnicode::convert: untranslated \\x remains: $string\n"; |
176
|
0
|
|
|
|
|
0
|
warn "LaTeX::ToUnicode::convert: please report as bug.\n"; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# Drop all braces. |
180
|
102
|
|
|
|
|
290
|
$string =~ s/[{}]//g; |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Backslashes might remain. Don't remove them, as it makes for a |
183
|
|
|
|
|
|
|
# useful way to find unhandled commands. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# leave newlines alone, but trim spaces and tabs. |
186
|
102
|
|
|
|
|
223
|
$string =~ s/^[ \t]+//s; # remove leading whitespace |
187
|
102
|
|
|
|
|
201
|
$string =~ s/[ \t]+$//s; # remove trailing whitespace |
188
|
102
|
|
|
|
|
163
|
$string =~ s/[ \t]+/ /gs; # collapse all remaining whitespace to one space |
189
|
|
|
|
|
|
|
|
190
|
102
|
|
|
|
|
641
|
$string; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Convert commands that take a single braced argument. The table |
194
|
|
|
|
|
|
|
# defines text we're supposed to insert before and after the argument. |
195
|
|
|
|
|
|
|
# We let future processing handle conversion of both the inserted text |
196
|
|
|
|
|
|
|
# and the argument. |
197
|
|
|
|
|
|
|
# |
198
|
|
|
|
|
|
|
sub _convert_commands_with_arg { |
199
|
102
|
|
|
102
|
|
174
|
my $string = shift; |
200
|
|
|
|
|
|
|
|
201
|
102
|
|
|
|
|
298
|
foreach my $cmd ( keys %LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS ) { |
202
|
306
|
|
|
|
|
643
|
my $repl = $LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS{$cmd}; |
203
|
306
|
|
|
|
|
445
|
my $lft = $repl->[0]; # ref to two-element list |
204
|
306
|
|
|
|
|
393
|
my $rht = $repl->[1]; |
205
|
|
|
|
|
|
|
# \cmd{foo} -> LFT foo RHT |
206
|
306
|
|
|
|
|
6360
|
$string =~ s/\\$cmd${endcw}\{(.*?)\}/$lft$1$rht/g; |
207
|
|
|
|
|
|
|
#warn "replaced arg $cmd, yielding $string\n"; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
102
|
|
|
|
|
302
|
$string; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# Convert url commands in STRING. This is a special case of commands |
214
|
|
|
|
|
|
|
# with arguments: \url{u} and \href{u}{desc text}. The HTML output |
215
|
|
|
|
|
|
|
# (generated if $OPTIONS{html} is set) is just too special to be handled |
216
|
|
|
|
|
|
|
# in a table; further, \href is the only two-argument command we are |
217
|
|
|
|
|
|
|
# currently handling. |
218
|
|
|
|
|
|
|
# |
219
|
|
|
|
|
|
|
sub _convert_urls { |
220
|
102
|
|
|
102
|
|
186
|
my ($string,$options) = @_; |
221
|
|
|
|
|
|
|
|
222
|
102
|
50
|
|
|
|
223
|
if ($options->{html}) { |
223
|
|
|
|
|
|
|
# HTML output. |
224
|
|
|
|
|
|
|
# \url{URL} -> URL |
225
|
0
|
|
|
|
|
0
|
$string =~ s,\\url$endcw\{([^}]*)\} |
226
|
|
|
|
|
|
|
,$1,gx; |
227
|
|
|
|
|
|
|
# |
228
|
|
|
|
|
|
|
# \href{URL}{TEXT} -> TEXT |
229
|
0
|
|
|
|
|
0
|
$string =~ s,\\href$endcw\{([^}]*)\}\s*\{([^}]*)\} |
230
|
|
|
|
|
|
|
,$2,gx; |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
} else { |
233
|
|
|
|
|
|
|
# plain text output. |
234
|
|
|
|
|
|
|
# \url{URL} -> URL |
235
|
102
|
|
|
|
|
309
|
$string =~ s/\\url$endcw\{([^}]*)\}/$1/g; |
236
|
|
|
|
|
|
|
# |
237
|
|
|
|
|
|
|
# \href{URL}{TEXT} -> TEXT (URL) |
238
|
|
|
|
|
|
|
# but, as a special case, if URL ends with TEXT, just output URL, |
239
|
|
|
|
|
|
|
# as in: |
240
|
|
|
|
|
|
|
# \href{https://doi.org/10/fjzzc8}{10/fjzzc8} |
241
|
|
|
|
|
|
|
# -> |
242
|
|
|
|
|
|
|
# https://doi.org/10/fjzzc8 |
243
|
|
|
|
|
|
|
# |
244
|
|
|
|
|
|
|
# Yet more specialness: the TEXT might have extra braces, as in |
245
|
|
|
|
|
|
|
# \href{https://doi.org/10/fjzzc8}{{10/fjzzc8}} |
246
|
|
|
|
|
|
|
# left over from previous markup commands (\path) which got |
247
|
|
|
|
|
|
|
# removed. We want to accept and ignore such extra braces, |
248
|
|
|
|
|
|
|
# hence the \{+ ... \}+ in recognizing TEXT. |
249
|
|
|
|
|
|
|
# |
250
|
|
|
|
|
|
|
#warn "txt url: starting with $string\n"; |
251
|
102
|
50
|
|
|
|
388
|
if ($string =~ m/\\href$endcw\{([^}]*)\}\s*\{+([^}]*)\}+/) { |
252
|
0
|
|
|
|
|
0
|
my $url = $1; |
253
|
0
|
|
|
|
|
0
|
my $text = $2; |
254
|
|
|
|
|
|
|
#warn " url: $url\n"; |
255
|
|
|
|
|
|
|
#warn " text: $text\n"; |
256
|
0
|
0
|
|
|
|
0
|
my $repl = ($url =~ m!$text$!) ? $url : "$text ($url)"; |
257
|
|
|
|
|
|
|
#warn " repl: $repl\n"; |
258
|
0
|
|
|
|
|
0
|
$string =~ s/\\href$endcw\{([^}]*)\}\s*\{+([^}]*)\}+/$repl/; |
259
|
|
|
|
|
|
|
#warn " str: $string\n"; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
102
|
|
|
|
|
203
|
$string; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Convert control words (not symbols), that is, a backslash and an |
267
|
|
|
|
|
|
|
# alphabetic sequence of characters terminated by a non-alphabetic |
268
|
|
|
|
|
|
|
# character. Following whitespace is ignored. |
269
|
|
|
|
|
|
|
# |
270
|
|
|
|
|
|
|
sub _convert_control_words { |
271
|
102
|
|
|
102
|
|
159
|
my $string = shift; |
272
|
|
|
|
|
|
|
|
273
|
102
|
|
|
|
|
692
|
foreach my $command ( keys %LaTeX::ToUnicode::Tables::CONTROL_WORDS ) { |
274
|
5814
|
|
|
|
|
12500
|
my $repl = $LaTeX::ToUnicode::Tables::CONTROL_WORDS{$command}; |
275
|
|
|
|
|
|
|
# replace {\CMD}, whitespace ignored after \CMD. |
276
|
5814
|
|
|
|
|
82552
|
$string =~ s/\{\\$command$endcw\}/$repl/g; |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# replace \CMD, preceded by not-consumed non-backslash. |
279
|
5814
|
|
|
|
|
99539
|
$string =~ s/(?<=[^\\])\\$command$endcw/$repl/g; |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# replace \CMD at beginning of whole string, which otherwise |
282
|
|
|
|
|
|
|
# wouldn't be matched. Two separate regexps to avoid |
283
|
|
|
|
|
|
|
# variable-length lookbehind. |
284
|
5814
|
|
|
|
|
87222
|
$string =~ s/^\\$command$endcw/$repl/g; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
102
|
|
|
|
|
521
|
$string; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# Convert control symbols, other than accents. Much simpler than |
291
|
|
|
|
|
|
|
# control words, since are self-delimiting, don't take arguments, and |
292
|
|
|
|
|
|
|
# don't consume any following text. |
293
|
|
|
|
|
|
|
# |
294
|
|
|
|
|
|
|
sub _convert_control_symbols { |
295
|
102
|
|
|
102
|
|
167
|
my $string = shift; |
296
|
|
|
|
|
|
|
|
297
|
102
|
|
|
|
|
557
|
foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS ) { |
298
|
2754
|
|
|
|
|
5840
|
my $repl = $LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS{$symbol}; |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# because these are not alphabetic, we can quotemeta them, |
301
|
|
|
|
|
|
|
# and we need to because "\" is one of the symbols. |
302
|
2754
|
|
|
|
|
3669
|
my $rx = quotemeta($symbol); |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
# the preceding character must not be a backslash, else "\\ " |
305
|
|
|
|
|
|
|
# could have the "\ " seen first as a control space, leaving |
306
|
|
|
|
|
|
|
# a spurious \ behind. Don't consume the preceding. |
307
|
|
|
|
|
|
|
# Or it could be at the beginning of a line. |
308
|
|
|
|
|
|
|
# |
309
|
2754
|
|
|
|
|
34846
|
$string =~ s/(^|(?<=[^\\]))\\$rx/$repl/g; |
310
|
|
|
|
|
|
|
#warn "after sym $symbol (\\$rx -> $repl), have: $string\n"; |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
102
|
|
|
|
|
382
|
$string; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# Convert accents. |
317
|
|
|
|
|
|
|
# |
318
|
|
|
|
|
|
|
sub _convert_accents { |
319
|
102
|
|
|
102
|
|
161
|
my $string = shift; |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
# first the non-alphabetic accent commands, like \". |
322
|
102
|
|
|
|
|
490
|
my %tbl = %LaTeX::ToUnicode::Tables::ACCENT_SYMBOLS; |
323
|
102
|
100
|
|
|
|
394
|
$string =~ s/(\{\\(.)\s*\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} || $1/eg; #{\"{a}} |
|
29
|
|
|
|
|
234
|
|
324
|
102
|
100
|
|
|
|
533
|
$string =~ s/(\{\\(.)\s*(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # {\"a} |
|
47
|
|
|
|
|
365
|
|
325
|
102
|
50
|
|
|
|
360
|
$string =~ s/(\\(.)\s*(\\?\w{1,1}))/ $tbl{$2}{$3} || $1/eg; # \"a |
|
6
|
|
|
|
|
40
|
|
326
|
102
|
100
|
|
|
|
293
|
$string =~ s/(\\(.)\s*\{(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # \"{a} |
|
20
|
|
|
|
|
111
|
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# second the alphabetic commands, like \c. They have be handled |
329
|
|
|
|
|
|
|
# differently because \cc is not \c{c}! The only difference in the |
330
|
|
|
|
|
|
|
# regular expressions is using $endcw instead of just \s*. |
331
|
|
|
|
|
|
|
# |
332
|
102
|
|
|
|
|
898
|
%tbl = %LaTeX::ToUnicode::Tables::ACCENT_LETTERS; |
333
|
102
|
50
|
|
|
|
490
|
$string =~ s/(\{\\(.)$endcw\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} || $1/eg; #{\"{a}} |
|
19
|
|
|
|
|
114
|
|
334
|
102
|
0
|
|
|
|
347
|
$string =~ s/(\{\\(.)$endcw(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # {\"a} |
|
0
|
|
|
|
|
0
|
|
335
|
102
|
0
|
|
|
|
443
|
$string =~ s/(\\(.)$endcw(\\?\w{1,1}))/ $tbl{$2}{$3} || $1/eg; # \"a |
|
0
|
|
|
|
|
0
|
|
336
|
102
|
0
|
|
|
|
404
|
$string =~ s/(\\(.)$endcw\{(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # \"{a} |
|
0
|
|
|
|
|
0
|
|
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# The argument is just one \w character for the \"a case, not two, |
340
|
|
|
|
|
|
|
# because otherwise we might consume a following character that is |
341
|
|
|
|
|
|
|
# not part of the accent, e.g., a backslash (\"a\'e). |
342
|
|
|
|
|
|
|
# |
343
|
|
|
|
|
|
|
# Others can be two because of the \t tie-after accent. Even {\t oo} is ok. |
344
|
|
|
|
|
|
|
# |
345
|
|
|
|
|
|
|
# Allow whitespace after the \CMD in all cases, e.g., "\c c". Even |
346
|
|
|
|
|
|
|
# for the control symbols, it turns out spaces are ignored there |
347
|
|
|
|
|
|
|
# (as in \" o), unlike the usual syntax. |
348
|
|
|
|
|
|
|
# |
349
|
|
|
|
|
|
|
# Some non-word constituents would work, but in practice we hope |
350
|
|
|
|
|
|
|
# everyone just uses letters. |
351
|
|
|
|
|
|
|
|
352
|
102
|
|
|
|
|
375
|
$string; |
353
|
|
|
|
|
|
|
} |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# For the [n]german package. |
356
|
|
|
|
|
|
|
sub _convert_german { |
357
|
3
|
|
|
3
|
|
6
|
my $string = shift; |
358
|
|
|
|
|
|
|
|
359
|
3
|
|
|
|
|
18
|
foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::GERMAN ) { |
360
|
87
|
|
|
|
|
542
|
$string =~ s/\Q$symbol\E/$LaTeX::ToUnicode::Tables::GERMAN{$symbol}/g; |
361
|
|
|
|
|
|
|
} |
362
|
3
|
|
|
|
|
10
|
$string; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# Control words that produce printed symbols (and letters in languages |
366
|
|
|
|
|
|
|
# other than English), that is. |
367
|
|
|
|
|
|
|
# |
368
|
|
|
|
|
|
|
sub _convert_symbols { |
369
|
102
|
|
|
102
|
|
170
|
my $string = shift; |
370
|
|
|
|
|
|
|
|
371
|
102
|
|
|
|
|
437
|
foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::SYMBOLS ) { |
372
|
2550
|
|
|
|
|
5424
|
my $repl = $LaTeX::ToUnicode::Tables::SYMBOLS{$symbol}; |
373
|
|
|
|
|
|
|
# preceded by a (non-consumed) non-backslash, |
374
|
|
|
|
|
|
|
# usual termination for a control word. |
375
|
|
|
|
|
|
|
# These commands don't take arguments. |
376
|
2550
|
|
|
|
|
44337
|
$string =~ s/(?<=[^\\])\\$symbol$endcw/$repl/g; |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
# or the beginning of the whole string: |
379
|
2550
|
|
|
|
|
39948
|
$string =~ s/^\\$symbol$endcw/$repl/g; |
380
|
|
|
|
|
|
|
} |
381
|
102
|
|
|
|
|
395
|
$string; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Special character sequences, not \commands. They aren't all |
385
|
|
|
|
|
|
|
# technically ligatures, but no matter. |
386
|
|
|
|
|
|
|
# |
387
|
|
|
|
|
|
|
sub _convert_ligatures { |
388
|
102
|
|
|
102
|
|
169
|
my $string = shift; |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
# have to convert these in order specified. |
391
|
102
|
|
|
|
|
397
|
my @ligs = @LaTeX::ToUnicode::Tables::LIGATURES; |
392
|
102
|
|
|
|
|
304
|
for (my $i = 0; $i < @ligs; $i+=2) { |
393
|
816
|
|
|
|
|
1449
|
my $in = $ligs[$i]; |
394
|
816
|
|
|
|
|
1143
|
my $out = $ligs[$i+1]; |
395
|
816
|
|
|
|
|
5880
|
$string =~ s/\Q$in\E/$out/g; |
396
|
|
|
|
|
|
|
} |
397
|
102
|
|
|
|
|
311
|
$string; |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# |
401
|
|
|
|
|
|
|
# Convert LaTeX markup commands in STRING like \textbf{...} and |
402
|
|
|
|
|
|
|
# {\bfshape ...} and {\bf ...}. |
403
|
|
|
|
|
|
|
# |
404
|
|
|
|
|
|
|
# If we're aiming for plain text output, they are just cleared away (the |
405
|
|
|
|
|
|
|
# braces are not removed). |
406
|
|
|
|
|
|
|
# |
407
|
|
|
|
|
|
|
# If we're generating HTML output ("html" key is set in $OPTIONS hash |
408
|
|
|
|
|
|
|
# ref), we use the value in the hash, so that \textbf{foo} becomes |
409
|
|
|
|
|
|
|
# foo. Nested markup doesn't work. |
410
|
|
|
|
|
|
|
# |
411
|
|
|
|
|
|
|
sub _convert_markups { |
412
|
102
|
|
|
102
|
|
222
|
my ($string, $options) = @_; |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# HTML is different. |
415
|
102
|
50
|
|
|
|
221
|
return _convert_markups_html($string) if $options->{html}; |
416
|
|
|
|
|
|
|
# Ok, we'll "convert" to plain text by removing the markup commands. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
# we can do all markup commands at once. |
419
|
102
|
|
|
|
|
478
|
my $markups = join('|', keys %LaTeX::ToUnicode::Tables::MARKUPS); |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
# Remove \textMARKUP{...}, leaving just the {...} |
422
|
102
|
|
|
|
|
461
|
$string =~ s/\\text($markups)$endcw//g; |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# Similarly remove \MARKUPshape. |
425
|
102
|
|
|
|
|
365
|
$string =~ s/\\($markups)shape$endcw//g; |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# Remove braces and \command in: {... \MARKUP ...} |
428
|
102
|
|
|
|
|
375
|
$string =~ s/(\{[^{}]+)\\(?:$markups)$endcw([^{}]+\})/$1$2/g; |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Remove braces and \command in: {\MARKUP ...} |
431
|
102
|
|
|
|
|
624
|
$string =~ s/\{\\(?:$markups)$endcw([^{}]*)\}/$1/g; |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Remove: {\MARKUP |
434
|
|
|
|
|
|
|
# Although this will leave unmatched } chars behind, there's no |
435
|
|
|
|
|
|
|
# alternative without full parsing, since the bib entry will often |
436
|
|
|
|
|
|
|
# look like: {\em {The TeX{}book}}. Also might, in principle, be |
437
|
|
|
|
|
|
|
# at the end of a line. |
438
|
102
|
|
|
|
|
406
|
$string =~ s/\{\\(?:$markups)$endcw//g; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# Ultimately we remove all braces in ltx2crossrefxml SanitizeText fns, |
441
|
|
|
|
|
|
|
# so the unmatched braces don't matter ... that code should be moved. |
442
|
|
|
|
|
|
|
|
443
|
102
|
|
|
|
|
332
|
$string; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
# Convert \markup in STRING to html. We can't always figure out where to |
447
|
|
|
|
|
|
|
# put the end tag, but we always put it somewhere. We don't even attempt |
448
|
|
|
|
|
|
|
# to handle nested markup. |
449
|
|
|
|
|
|
|
# |
450
|
|
|
|
|
|
|
sub _convert_markups_html { |
451
|
0
|
|
|
0
|
|
|
my ($string) = @_; |
452
|
|
|
|
|
|
|
|
453
|
0
|
|
|
|
|
|
my %MARKUPS = %LaTeX::ToUnicode::Tables::MARKUPS; |
454
|
|
|
|
|
|
|
# have to consider each markup \command separately. |
455
|
0
|
|
|
|
|
|
for my $markup (keys %MARKUPS) { |
456
|
0
|
|
|
|
|
|
my $hcmd = $MARKUPS{$markup}; # some TeX commands don't translate |
457
|
0
|
0
|
|
|
|
|
my $tag = $hcmd ? "<$hcmd>" : ""; |
458
|
0
|
0
|
|
|
|
|
my $end_tag = $hcmd ? "$hcmd>" : ""; |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# The easy one: \textMARKUP{...} |
461
|
0
|
|
|
|
|
|
$string =~ s/\\text$markup$endcw\{(.*?)\}/$tag$1$end_tag/g; |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# {x\MARKUP(shape) y} -> xy (leave out braces) |
464
|
0
|
|
|
|
|
|
$string =~ s/\{([^{}]+)\\$markup(shape)?$endcw([^{}]+)\} |
465
|
|
|
|
|
|
|
/$1$tag$3$end_tag/gx; |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
# {\MARKUP(shape) y} -> y. Same as previous but without |
468
|
|
|
|
|
|
|
# the x part. Could do it in one regex but this seems clearer. |
469
|
0
|
|
|
|
|
|
$string =~ s/\{\\$markup(shape)?$endcw([^{}]+)\} |
470
|
|
|
|
|
|
|
/$tag$2$end_tag/gx; |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
# for {\MARKUP(shape) ... with no matching brace, we don't know |
473
|
|
|
|
|
|
|
# where to put the end tag, so seems best to do nothing. |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
0
|
|
|
|
|
|
$string; |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
############################################################## |
481
|
|
|
|
|
|
|
# debug_hash_as_string($LABEL, HASH) |
482
|
|
|
|
|
|
|
# |
483
|
|
|
|
|
|
|
# Return LABEL followed by HASH elements, followed by a newline, as a |
484
|
|
|
|
|
|
|
# single string. If HASH is a reference, it is followed (but no recursive |
485
|
|
|
|
|
|
|
# derefencing). |
486
|
|
|
|
|
|
|
############################################################### |
487
|
|
|
|
|
|
|
sub debug_hash_as_string { |
488
|
0
|
|
|
0
|
0
|
|
my ($label) = shift; |
489
|
0
|
0
|
0
|
|
|
|
my (%hash) = (ref $_[0] && $_[0] =~ /.*HASH.*/) ? %{$_[0]} : @_; |
|
0
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
491
|
0
|
|
|
|
|
|
my $str = "$label: {"; |
492
|
0
|
|
|
|
|
|
my @items = (); |
493
|
0
|
|
|
|
|
|
for my $key (sort keys %hash) { |
494
|
0
|
|
|
|
|
|
my $val = $hash{$key}; |
495
|
0
|
0
|
|
|
|
|
$val = ".undef" if ! defined $val; |
496
|
0
|
|
|
|
|
|
$key =~ s/\n/\\n/g; |
497
|
0
|
|
|
|
|
|
$val =~ s/\n/\\n/g; |
498
|
0
|
|
|
|
|
|
push (@items, "$key:$val"); |
499
|
|
|
|
|
|
|
} |
500
|
0
|
|
|
|
|
|
$str .= join (",", @items); |
501
|
0
|
|
|
|
|
|
$str .= "}"; |
502
|
|
|
|
|
|
|
|
503
|
0
|
|
|
|
|
|
return "$str\n"; |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
1; |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
__END__ |