line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
############################################################################### |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# LaTeX::TOM (TeX Object Model) |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Version 1.05 |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------- |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# originally written by Aaron Krowne (akrowne@vt.edu) |
10
|
|
|
|
|
|
|
# July 2002 |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# Virginia Polytechnic Institute and State University |
13
|
|
|
|
|
|
|
# Department of Computer Science |
14
|
|
|
|
|
|
|
# Digital Libraries Research Laboratory |
15
|
|
|
|
|
|
|
# |
16
|
|
|
|
|
|
|
# now maintained by Steven Schubiger (schubiger@cpan.org) |
17
|
|
|
|
|
|
|
# April 2008 |
18
|
|
|
|
|
|
|
# |
19
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------- |
20
|
|
|
|
|
|
|
# |
21
|
|
|
|
|
|
|
# This module provides some decent semantic handling of LaTeX documents. It is |
22
|
|
|
|
|
|
|
# inspired by XML::DOM, so users of that module should be able to acclimate |
23
|
|
|
|
|
|
|
# themselves to this one quickly. Basically the subroutines in this package |
24
|
|
|
|
|
|
|
# allow you to parse a LaTeX document into its logical structure, including |
25
|
|
|
|
|
|
|
# groupings, commands, environments, and comments. These all go into a tree |
26
|
|
|
|
|
|
|
# which is built as arrays of Perl hashes. |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
############################################################################### |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
package LaTeX::TOM; |
31
|
|
|
|
|
|
|
|
32
|
10
|
|
|
10
|
|
85050
|
use strict; |
|
10
|
|
|
|
|
73
|
|
|
10
|
|
|
|
|
307
|
|
33
|
10
|
|
|
10
|
|
54
|
use base qw(LaTeX::TOM::Parser); |
|
10
|
|
|
|
|
16
|
|
|
10
|
|
|
|
|
5988
|
|
34
|
10
|
|
|
10
|
|
79
|
use constant true => 1; |
|
10
|
|
|
|
|
23
|
|
|
10
|
|
|
|
|
4880
|
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
our $VERSION = '1.05'; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
our (%INNERCMDS, %MATHENVS, %MATHBRACKETS, |
39
|
|
|
|
|
|
|
%BRACELESS, %TEXTENVS, $PARSE_ERRORS_FATAL, |
40
|
|
|
|
|
|
|
$DEBUG); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# BEGIN CONFIG SECTION ######################################################## |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Commands that can be "embedded" within a grouping to alter the environment
# of that grouping, for instance {\bf text}. A command name must be listed
# here to be recognised in that position; otherwise the parser treats such a
# sequence as plain text.
#
# The hash is used as a set: every listed name maps to a true value.
#
%INNERCMDS = map { $_ => true } (
    # font series / shape / family switches
    'bf',
    'md',
    'em',
    'up',
    'sl',
    'sc',
    'sf',
    'rm',
    'it',
    'tt',
    'noindent',
    'mathtt',
    'mathbf',
    # font size switches
    'tiny',
    'scriptsize',
    'footnotesize',
    'small',
    'normalsize',
    'large',
    'Large',
    'LARGE',
    'huge',
    'Huge',
    'HUGE',
);
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Commands, environments and delimiters that put their contents into math
# mode.
#
# The hash is used as a set: every listed name maps to a true value. The
# delimiter entries are single-quoted on purpose so that '\[' and '\(' keep
# their literal backslash.
#
%MATHENVS = map { $_ => true } (
    'align',
    'equation',
    'eqnarray',
    'displaymath',
    'ensuremath',
    'math',
    '$$',
    '$',
    '\[',
    '\(',
);
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# Commands and environments that put their children in text (non-math) mode.
#
# The hash is used as a set: every listed name maps to a true value.
#
%TEXTENVS = map { $_ => true } (
    # font size switches
    'tiny',
    'scriptsize',
    'footnotesize',
    'small',
    'normalsize',
    'large',
    'Large',
    'LARGE',
    'huge',
    'Huge',
    'HUGE',
    # text font commands and boxes
    'text',
    'textbf',
    'textmd',
    'textsc',
    'textsf',
    'textrm',
    'textsl',
    'textup',
    'texttt',
    'mbox',
    'fbox',
    # sectioning
    'section',
    'subsection',
    'subsubsection',
    # emphasis
    'em',
    'bf',
    'emph',
    'it',
    # lists
    'enumerate',
    'description',
    'itemize',
    'trivlist',
    'list',
    # theorem-like environments
    'proof',
    'theorem',
    'lemma',
    'thm',
    'prop',
    'lem',
    # tables, captions, footnotes and alignment
    'table',
    'tabular',
    'tabbing',
    'caption',
    'footnote',
    'center',
    'flushright',
    # document structure and front matter
    'document',
    'article',
    'titlepage',
    'title',
    'author',
    'titlerunninghead',
    'authorrunninghead',
    'affil',
    'email',
    'abstract',
    'thanks',
    # algorithms and bibliography
    'algorithm',
    'nonumalgorithm',
    'references',
    'thebibliography',
    'bibitem',
    # verbatim and quotations
    'verbatim',
    'verbatimtab',
    'quotation',
    'quote',
);
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Simple math-mode delimiter pairs, mapping each opening delimiter to its
# closing delimiter.
#
%MATHBRACKETS = (
    '$$' => '$$',
    '$'  => '$',
#   '\[' => '\]',    # these are problematic and handled separately now
#   '\(' => '\)',
);
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Commands that require no braces; their parameter is simply the "word"
# following the command declaration.
#
# The hash is used as a set: every listed name maps to a true value.
#
%BRACELESS = map { $_ => true } (
    'oddsidemargin',
    'evensidemargin',
    'topmargin',
    'headheight',
    'headsep',
    'textwidth',
    'textheight',
    'input',
);
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# Default value controlling how fatal parse errors are.
#
#  0 = warn, 1 = die, 2 = silent
#
$PARSE_ERRORS_FATAL = 0;

# Debugging mode (internal use).
#
#  0 = off, 1 = messages, 2 = messages and code
#
$DEBUG = 0;
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# END CONFIG SECTION ########################################################## |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# Constructor: LaTeX::TOM->new(...).
#
# Drops the invocant and forwards every remaining argument, unchanged, to
# LaTeX::TOM::Parser->_new. The return value is whatever _new returns; the
# invoking class name is not used.
#
sub new {
    my $class = shift;    # invocant is removed from @_ and not used further

    return LaTeX::TOM::Parser->_new(@_);
}
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
1; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=head1 NAME |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
LaTeX::TOM - A module for parsing, analyzing, and manipulating LaTeX documents. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head1 SYNOPSIS |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
use LaTeX::TOM; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
$parser = LaTeX::TOM->new; |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
$document = $parser->parseFile('mypaper.tex'); |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
$latex = $document->toLaTeX; |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
$specialnodes = $document->getNodesByCondition(sub { |
222
|
|
|
|
|
|
|
my $node = shift; |
223
|
|
|
|
|
|
|
return ( |
224
|
|
|
|
|
|
|
$node->getNodeType eq 'TEXT' |
225
|
|
|
|
|
|
|
&& $node->getNodeText =~ /magic string/ |
226
|
|
|
|
|
|
|
); |
227
|
|
|
|
|
|
|
}); |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
$sections = $document->getNodesByCondition(sub { |
230
|
|
|
|
|
|
|
my $node = shift; |
231
|
|
|
|
|
|
|
return ( |
232
|
|
|
|
|
|
|
$node->getNodeType eq 'COMMAND' |
233
|
|
|
|
|
|
|
&& $node->getCommandName =~ /section$/ |
234
|
|
|
|
|
|
|
); |
235
|
|
|
|
|
|
|
}); |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
$indexme = $document->getIndexableText; |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
$document->print; |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 DESCRIPTION |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
This module provides a parser which parses and interprets (though not fully) |
244
|
|
|
|
|
|
|
LaTeX documents and returns a tree-based representation of what it finds. |
245
|
|
|
|
|
|
|
This tree is a C<LaTeX::TOM::Tree>. The tree contains C<LaTeX::TOM::Node> nodes. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
This module should be especially useful to anyone who wants to do processing |
248
|
|
|
|
|
|
|
of LaTeX documents that requires extraction of plain-text information, or |
249
|
|
|
|
|
|
|
altering of the plain-text components (or alternatively, the math-text |
250
|
|
|
|
|
|
|
components). |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=head1 COMPONENTS |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 LaTeX::TOM::Parser |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
The parser recognizes 3 parameters upon creation by C<< LaTeX::TOM->new >>. |
257
|
|
|
|
|
|
|
The parameters, in order, are |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=over 4 |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=item parse error handling (= B<0> || 1 || 2) |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Determines what happens when a parse error is encountered. C<0> results in a |
264
|
|
|
|
|
|
|
warning. C<1> results in a die. C<2> results in silence. Note that particular |
265
|
|
|
|
|
|
|
groupings in LaTeX (i.e. newcommands and the like) contain invalid TeX or |
266
|
|
|
|
|
|
|
LaTeX, so you nearly always need this parameter to be C<0> or C<2> to completely |
267
|
|
|
|
|
|
|
parse the document. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=item read inputs flag (= 0 || B<1>) |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
This flag determines whether a scan for C<\input> and C<\input-like> commands is |
272
|
|
|
|
|
|
|
performed, and the resulting called files parsed and added to the parent |
273
|
|
|
|
|
|
|
parse tree. C<0> means no, C<1> means do it. Note that this will happen recursively |
274
|
|
|
|
|
|
|
if it is turned on. Also, bibliographies (F<.bbl> files) are detected and |
275
|
|
|
|
|
|
|
included. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=item apply mappings flag (= 0 || B<1>) |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
This flag determines whether (most) user-defined mappings are applied. This |
280
|
|
|
|
|
|
|
means C<\defs>, C<\newcommands>, and C<\newenvironments>. This is critical for |
281
|
|
|
|
|
|
|
properly analyzing the content of the document, as this must be phrased in terms |
282
|
|
|
|
|
|
|
of the semantics of the original TeX and LaTeX commands, not ad hoc user macros. |
283
|
|
|
|
|
|
|
So, for instance, do not expect plain-text extraction to work properly with this |
284
|
|
|
|
|
|
|
option off. |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=back |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
The parser returns a C<LaTeX::TOM::Tree> ($document in the SYNOPSIS). |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=head2 LaTeX::TOM::Node |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Nodes may be of the following types: |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=over 4 |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=item TEXT |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
C<TEXT> nodes can be thought of as representing the plain-text portions of the |
299
|
|
|
|
|
|
|
LaTeX document. This includes math and anything else that is not a recognized |
300
|
|
|
|
|
|
|
TeX or LaTeX command, or user-defined command. In reality, C<TEXT> nodes contain |
301
|
|
|
|
|
|
|
commands that this parser does not yet recognize the semantics of. |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
=item COMMAND |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
A C<COMMAND> node represents a TeX command. It always has child nodes in a tree, |
306
|
|
|
|
|
|
|
though the tree might be empty if the command operates on zero parameters. An |
307
|
|
|
|
|
|
|
example of a command is |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
\textbf{blah} |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
This would parse into a C<COMMAND> node for C<textbf>, which would have a subtree |
312
|
|
|
|
|
|
|
containing the C<TEXT> node with text ``blah.'' |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=item ENVIRONMENT |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Similarly, TeX environments parse into C<ENVIRONMENT> nodes, which have metadata |
317
|
|
|
|
|
|
|
about the environment, along with a subtree representing what is contained in |
318
|
|
|
|
|
|
|
the environment. For example, |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
\begin{equation} |
321
|
|
|
|
|
|
|
r = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} |
322
|
|
|
|
|
|
|
\end{equation} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
Would parse into an C<ENVIRONMENT> node of the class ``equation'' with a child |
325
|
|
|
|
|
|
|
tree containing the result of parsing C<``r = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}.''> |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=item GROUP |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
A C<GROUP> is like an anonymous C<COMMAND>. Since you can put whatever you want in |
330
|
|
|
|
|
|
|
curly-braces (C<{}>) in TeX in order to make semantically isolated regions, this |
331
|
|
|
|
|
|
|
separation is preserved by the parser. A C<GROUP> is just the subtree of the |
332
|
|
|
|
|
|
|
parsed contents of plain curly-braces. |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
It is important to note that currently only the first C<GROUP> in a series of |
335
|
|
|
|
|
|
|
C<GROUP>s following a LaTeX command will actually be parsed into a C<COMMAND> node. |
336
|
|
|
|
|
|
|
The reason is that, for the initial purposes of this module, it was not |
337
|
|
|
|
|
|
|
necessary to recognize additional C<GROUP>s as additional parameters to the |
338
|
|
|
|
|
|
|
C<COMMAND>. However, this is something that this module really should do |
339
|
|
|
|
|
|
|
eventually. Currently if you want all the parameters to a multi-parametered |
340
|
|
|
|
|
|
|
command, you'll need to pick out all the following C<GROUP> nodes yourself. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
Eventually this will become something like a list which is stored in the |
343
|
|
|
|
|
|
|
C<COMMAND> node, much like L<XML::DOM>'s treatment of attributes. These are, in a |
344
|
|
|
|
|
|
|
sense, apart from the rest of the document tree. Then C<GROUP> nodes will become |
345
|
|
|
|
|
|
|
much more rare. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=item COMMENT |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
A C<COMMENT> node is very similar to a C<TEXT> node, except it is specifically for |
350
|
|
|
|
|
|
|
lines beginning with C<``%''> (the TeX comment delimiter) or the right-hand |
351
|
|
|
|
|
|
|
portion of a line that has C<``%''> at some internal point. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=back |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head2 LaTeX::TOM::Trees |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
As mentioned before, the Tree is the return result of a parse. |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
The tree is nothing more than an arrayref of Nodes, some of which may contain |
360
|
|
|
|
|
|
|
their own trees. This is useful knowledge at this point, since the user isn't |
361
|
|
|
|
|
|
|
provided with a full suite of convenient tree-modification methods. However, |
362
|
|
|
|
|
|
|
Trees do already have some very convenient methods, described in the next |
363
|
|
|
|
|
|
|
section. |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=head1 METHODS |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=head2 LaTeX::TOM |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=head3 new |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=over 4 |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=item C<> |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Instantiate a new parser object. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=back |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
In this section all of the methods for each of the components are listed and |
380
|
|
|
|
|
|
|
described. |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=head2 LaTeX::TOM::Parser |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
The methods for the parser are: |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head3 parseFile (filename) |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=over 4 |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=item C<> |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
Read in the contents of I<filename> and parse them, returning a C<LaTeX::TOM::Tree>. |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=back |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=head3 parse (string) |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=over 4 |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=item C<> |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
Parse the string I<string> and return a C<LaTeX::TOM::Tree>. |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=back |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=head2 LaTeX::TOM::Tree |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
This section contains methods for the Trees returned by the parser. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head3 copy |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=over 4 |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item C<> |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Duplicate a tree into new memory. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=back |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head3 print |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=over 4 |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item C<> |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
A debug print of the structure of the tree. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=back |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head3 plainText |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=over 4 |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item C<> |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Returns an arrayref which is a list of strings representing the text of all |
437
|
|
|
|
|
|
|
C<getNodePlainTextFlag = 1> C<TEXT> nodes, in an inorder traversal. |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=back |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=head3 indexableText |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=over 4 |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=item C<> |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
A method like the above but which goes one step further; it cleans all of the |
448
|
|
|
|
|
|
|
returned text and concatenates it into a single string which one could consider |
449
|
|
|
|
|
|
|
having all of the standard information retrieval value for the document, |
450
|
|
|
|
|
|
|
making it useful for indexing. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=back |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=head3 toLaTeX |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=over 4 |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=item C<> |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
Return a string representing the LaTeX encoded by the tree. This is especially |
461
|
|
|
|
|
|
|
useful to get a normal document again, after modifying nodes of the tree. |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
=back |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=head3 getTopLevelNodes |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=over 4 |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
=item C<> |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
Return a list of C<LaTeX::TOM::Node>s at the top level of the Tree. |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=back |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head3 getAllNodes |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=over 4 |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=item C<> |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
Return an arrayref with B<all> nodes of the tree. This "flattens" the tree. |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=back |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=head3 getCommandNodesByName (name) |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=over 4 |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=item C<> |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
Return an arrayref with all C<COMMAND> nodes in the tree which have a name |
492
|
|
|
|
|
|
|
matching I<name>. |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
=back |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=head3 getEnvironmentsByName (name) |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=over 4 |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=item C<> |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
Return an arrayref with all C<ENVIRONMENT> nodes in the tree which have a class |
503
|
|
|
|
|
|
|
matching I<name>. |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=back |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=head3 getNodesByCondition (code reference) |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=over 4 |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=item C<> |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
This is a catch-all search method which can be used to pull out nodes that |
514
|
|
|
|
|
|
|
match pretty much any perl expression, without manually having to traverse the |
515
|
|
|
|
|
|
|
tree. I<code reference> is a perl code reference which receives as its first |
516
|
|
|
|
|
|
|
argument the node of the tree that is currently scrutinized and is expected to |
517
|
|
|
|
|
|
|
return a boolean value. See the SYNOPSIS for examples. |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=back |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
=head3 getFirstNode |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=over 4 |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=item C<> |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
Returns the first node of the tree. This is useful if you want to walk the tree |
528
|
|
|
|
|
|
|
yourself, starting with the first node. |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=back |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=head2 LaTeX::TOM::Node |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
This section contains the methods for nodes of the parsed Trees. |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=head3 getNodeType |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=over 4 |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=item C<> |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
Returns the type, one of C<TEXT>, C<COMMAND>, C<ENVIRONMENT>, C<GROUP>, or C<COMMENT>, |
543
|
|
|
|
|
|
|
as described above. |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=back |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=head3 getNodeText |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=over 4 |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=item C<> |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Applicable for C<TEXT> or C<COMMENT> nodes; this returns the document text they contain. |
554
|
|
|
|
|
|
|
This is undef for other node types. |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=back |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=head3 setNodeText |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=over 4 |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=item C<> |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
Set the node text, also for C<TEXT> and C<COMMENT> nodes. |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=back |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=head3 getNodeStartingPosition |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=over 4 |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=item C<> |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
Get the starting character position in the document of this node. For C<TEXT> |
575
|
|
|
|
|
|
|
and C<COMMENT> nodes, this will be where the text begins. For C<ENVIRONMENT>, |
576
|
|
|
|
|
|
|
C<COMMAND>, or C<GROUP> nodes, this will be the position of the I<last> character of |
577
|
|
|
|
|
|
|
the opening identifier. |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=back |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head3 getNodeEndingPosition |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
=over 4 |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
=item C<> |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
Same as above, but for last character. For C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> |
588
|
|
|
|
|
|
|
nodes, this will be the I<first> character of the closing identifier. |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=back |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=head3 getNodeOuterStartingPosition |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=over 4 |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=item C<> |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
Same as getNodeStartingPosition, but for C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> nodes, |
599
|
|
|
|
|
|
|
this returns the I<first> character of the opening identifier. |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
=back |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
=head3 getNodeOuterEndingPosition |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=over 4 |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=item C<> |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
Same as getNodeEndingPosition, but for C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> nodes, |
610
|
|
|
|
|
|
|
this returns the I<last> character of the closing identifier. |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=back |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
=head3 getNodeMathFlag |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=over 4 |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=item C<> |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
This applies to any node type. It is C<1> if the node sets, or is contained |
621
|
|
|
|
|
|
|
within, a math mode region. C<0> otherwise. C<TEXT> nodes which have this flag as C<1> |
622
|
|
|
|
|
|
|
can be assumed to be the actual mathematics contained in the document. |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=back |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
=head3 getNodePlainTextFlag |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
=over 4 |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
=item C<> |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
This applies only to C<TEXT> nodes. It is C<1> if the node is non-math B<and> is |
633
|
|
|
|
|
|
|
visible (in other words, will end up being a part of the output document). One |
634
|
|
|
|
|
|
|
would only want to index C<TEXT> nodes with this property, for information |
635
|
|
|
|
|
|
|
retrieval purposes. |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
=back |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=head3 getEnvironmentClass |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=over 4 |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
=item C<> |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
This applies only to C<ENVIRONMENT> nodes. Returns what class of environment the |
646
|
|
|
|
|
|
|
node represents (the C<X> in C<\begin{X}> and C<\end{X}>). |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=back |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head3 getCommandName |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=over 4 |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=item C<> |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
This applies only to C<COMMAND> nodes. Returns the name of the command (the C<X> in |
657
|
|
|
|
|
|
|
C<\X{...}>). |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
=back |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=head3 getChildTree |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=over 4 |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
=item C<> |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
This applies only to C<COMMAND>, C<ENVIRONMENT>, and C<GROUP> nodes: it returns the |
668
|
|
|
|
|
|
|
C<LaTeX::TOM::Tree> which is ``under'' the calling node. |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=back |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=head3 getFirstChild |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
=over 4 |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item C<> |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
This applies only to C<COMMAND>, C<ENVIRONMENT>, and C<GROUP> nodes: it returns the |
679
|
|
|
|
|
|
|
first node from the first level of the child subtree. |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=back |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=head3 getLastChild |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
=over 4 |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=item C<> |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
Same as above, but for the last node of the first level. |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
=back |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=head3 getPreviousSibling |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=over 4 |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
=item C<> |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
Return the prior node on the same level of the tree. |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
=back |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
=head3 getNextSibling |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
=over 4 |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
=item C<> |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
Same as above, but for following node. |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
=back |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
=head3 getParent |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
=over 4 |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=item C<> |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
Get the parent node of this node in the tree. |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
=back |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=head3 getNextGroupNode |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
=over 4 |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=item C<> |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
This is an interesting function, and kind of a hack because of the way the |
730
|
|
|
|
|
|
|
parser makes the current tree. Basically it will give you the next sibling |
731
|
|
|
|
|
|
|
that is a C<GROUP> node, until it either hits the end of the tree level, a C<TEXT> |
732
|
|
|
|
|
|
|
node which doesn't match C</^\s*$/>, or a C<COMMAND> node. |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
This is useful for finding all C<GROUP>ed parameters after a C<COMMAND> node (see |
735
|
|
|
|
|
|
|
comments for C<GROUP> in the C<COMPONENTS> / C<LaTeX::TOM::Node> section). You |
736
|
|
|
|
|
|
|
can just have a while loop that calls this method until it gets C<undef>, and |
737
|
|
|
|
|
|
|
you'll know you've found all the parameters to a command. |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
Note: this may be bad, but C<TEXT> Nodes matching C</^\s*\[[0-9]+\]$/> (optional |
740
|
|
|
|
|
|
|
parameter groups) are treated as if they were 'blank'. |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=back |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=head1 CAVEATS |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
Due to the lack of tree-modification methods, currently this module is |
747
|
|
|
|
|
|
|
mostly useful for minor modifications to the parsed document, for instance, |
748
|
|
|
|
|
|
|
altering the text of C<TEXT> nodes but not deleting the nodes. Of course, the |
749
|
|
|
|
|
|
|
user can still do this by breaking abstraction and directly modifying the Tree. |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
Also note that the parsing is not complete. This module was not written with |
752
|
|
|
|
|
|
|
the intention of being able to produce output documents the way ``latex'' does. |
753
|
|
|
|
|
|
|
The intent was instead to be able to analyze and modify the document on a |
754
|
|
|
|
|
|
|
logical level with regards to the content; it doesn't care about the document |
755
|
|
|
|
|
|
|
formatting and outputting side of TeX/LaTeX. |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
There is much work still to be done. See the F<TODO> list in the F<TOM.pm> source. |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
=head1 BUGS |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
Probably plenty. However, this module has performed fairly well on a set of |
762
|
|
|
|
|
|
|
~1000 research publications from the Computing Research Repository, so I |
763
|
|
|
|
|
|
|
deemed it ``good enough'' to use for purposes similar to mine. |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
Please let the maintainer know of parser errors if you discover any. |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
=head1 CREDITS |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
Thanks to the following people (in order of appearance), who have contributed valuable suggestions and patches: |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
Otakar Smrz |
772
|
|
|
|
|
|
|
Moritz Lenz |
773
|
|
|
|
|
|
|
James Bowlin |
774
|
|
|
|
|
|
|
Jesse S. Bangs |
775
|
|
|
|
|
|
|
Cord Merrell |
776
|
|
|
|
|
|
|
Debian Perl Group |
777
|
|
|
|
|
|
|
Eli Billauer |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=head1 AUTHORS |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
Written by Aaron Krowne E<lt>akrowne@vt.eduE<gt> |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
Maintained by Steven Schubiger E<lt>schubiger@cpan.orgE<gt> |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
=head1 LICENSE |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
This program is free software; you may redistribute it and/or |
788
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
See L<http://dev.perl.org/licenses/> |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=cut |