line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Amuse::Preprocessor; |
2
|
|
|
|
|
|
|
|
3
|
9
|
|
|
9
|
|
477135
|
use strict; |
|
9
|
|
|
|
|
83
|
|
|
9
|
|
|
|
|
304
|
|
4
|
9
|
|
|
9
|
|
47
|
use warnings; |
|
9
|
|
|
|
|
19
|
|
|
9
|
|
|
|
|
267
|
|
5
|
|
|
|
|
|
|
|
6
|
9
|
|
|
9
|
|
4094
|
use Text::Amuse::Preprocessor::HTML; |
|
9
|
|
|
|
|
33
|
|
|
9
|
|
|
|
|
574
|
|
7
|
9
|
|
|
9
|
|
3972
|
use Text::Amuse::Preprocessor::Parser; |
|
9
|
|
|
|
|
26
|
|
|
9
|
|
|
|
|
356
|
|
8
|
9
|
|
|
9
|
|
3739
|
use Text::Amuse::Preprocessor::Typography qw/get_typography_filter/; |
|
9
|
|
|
|
|
28
|
|
|
9
|
|
|
|
|
732
|
|
9
|
9
|
|
|
9
|
|
4375
|
use Text::Amuse::Preprocessor::Footnotes; |
|
9
|
|
|
|
|
36
|
|
|
9
|
|
|
|
|
346
|
|
10
|
9
|
|
|
9
|
|
4229
|
use Text::Amuse::Functions; |
|
9
|
|
|
|
|
345264
|
|
|
9
|
|
|
|
|
595
|
|
11
|
9
|
|
|
9
|
|
82
|
use File::Spec; |
|
9
|
|
|
|
|
24
|
|
|
9
|
|
|
|
|
209
|
|
12
|
9
|
|
|
9
|
|
53
|
use File::Temp qw(); |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
134
|
|
13
|
9
|
|
|
9
|
|
50
|
use File::Copy qw(); |
|
9
|
|
|
|
|
21
|
|
|
9
|
|
|
|
|
138
|
|
14
|
9
|
|
|
9
|
|
46
|
use Data::Dumper; |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
18941
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Text::Amuse::Preprocessor - Helpers for Text::Amuse document formatting. |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 VERSION |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Version 0.65 |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=cut |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our $VERSION = '0.65'; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 SYNOPSIS |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
use Text::Amuse::Preprocessor; |
32
|
|
|
|
|
|
|
my $pp = Text::Amuse::Preprocessor->new( |
33
|
|
|
|
|
|
|
input => $infile, |
34
|
|
|
|
|
|
|
output => $outfile, |
35
|
|
|
|
|
|
|
html => 1, |
36
|
|
|
|
|
|
|
fix_links => 1, |
37
|
|
|
|
|
|
|
fix_typography => 1, |
38
|
|
|
|
|
|
|
fix_nbsp => 1, |
39
|
|
|
|
|
|
|
fix_footnotes => 1 |
40
|
|
|
|
|
|
|
); |
41
|
|
|
|
|
|
|
$pp->process; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This module provides a solution to apply some common fixes to muse |
46
|
|
|
|
|
|
|
files. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Without any option save for C and C |
49
|
|
|
|
|
|
|
mandatory), the only things the module does is to remove carriage |
50
|
|
|
|
|
|
|
returns, replace character ligatures or characters which shouldn't |
51
|
|
|
|
|
|
|
enter at all and expand the tabs to 4 spaces (no smart expanding). |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 LANGUAGE SUPPORT |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
The following languages are supported |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=over 4 |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item english |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
smart quotes, dashes, and the common superscripts (like 11th) |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=item russian |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
smart quotes, dashes and non-breaking spaces |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item spanish |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
smart quotes and dashes |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item finnish |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
smart quotes and dashes |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=item swedish |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
smart quotes and dashes |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=item serbian |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
smart quotes and dashes |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=item croatian |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
smart quotes and dashes |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=item italian |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
smart quotes and dashes |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=item macedonian |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
smart quotes and dashes |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=item german |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
smart quotes and dashes |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=back |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=head1 ACCESSORS |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
The following values are read-only and must be passed to the constructor. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head2 Mandatory |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head3 input |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
Can be a string (with the input file path) or a reference to a scalar |
110
|
|
|
|
|
|
|
with the text to process). |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head3 output |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Can be a string (with the output file path) or a reference to a scalar |
115
|
|
|
|
|
|
|
with the processed text. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head2 Optional |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head3 html |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Before doing anything, convert the HTML input into a muse file. Even |
122
|
|
|
|
|
|
|
if possible, you're discouraged to do the html import and the fixing |
123
|
|
|
|
|
|
|
in the same processing. Instead, create two objects, then first do the |
124
|
|
|
|
|
|
|
HTML to muse convert, save the result somewhere, add the headers, then |
125
|
|
|
|
|
|
|
reprocess it with the required fixes above. |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
Notably, the output will be without an header, so the language will |
128
|
|
|
|
|
|
|
not be detected. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Default to false. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head3 fix_links |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
Find the links and add the markup if needed. Default to false. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head3 fix_typography |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Apply the typographical fixes. Default to false. This add the "smart |
139
|
|
|
|
|
|
|
quotes" feature. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head3 remove_nbsp |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Remove all the non-break spaces in the document, unconditionally. This |
144
|
|
|
|
|
|
|
options does not conflict with the following. If both are provided, |
145
|
|
|
|
|
|
|
first the non-break spaces are removed, then reinserted. |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head3 fix_nbsp |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Add non-break spaces where appropriate (whatever this means). |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head3 show_nbsp |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Make the non-break spaces visible and explicit as ~~ (available on |
154
|
|
|
|
|
|
|
Text::Amuse since version 0.94). |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head3 fix_footnotes |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
Rearrange the footnotes if needed. Default to false. |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=head3 debug |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Don't unlink the temporary files and be verbose |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 METHODS |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 new(%options) |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
Constructor. Accepts the above options. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=cut |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub new { |
173
|
83
|
|
|
83
|
1
|
49909
|
my ($class, %options) = @_; |
174
|
83
|
|
|
|
|
560
|
my $self = { |
175
|
|
|
|
|
|
|
html => 0, |
176
|
|
|
|
|
|
|
fix_links => 0, |
177
|
|
|
|
|
|
|
fix_typography => 0, |
178
|
|
|
|
|
|
|
fix_footnotes => 0, |
179
|
|
|
|
|
|
|
remove_nbsp => 0, |
180
|
|
|
|
|
|
|
show_nbsp => 0, |
181
|
|
|
|
|
|
|
fix_nbsp => 0, |
182
|
|
|
|
|
|
|
debug => 0, |
183
|
|
|
|
|
|
|
input => undef, |
184
|
|
|
|
|
|
|
output => undef, |
185
|
|
|
|
|
|
|
}; |
186
|
83
|
|
|
|
|
394
|
foreach my $k (keys %$self) { |
187
|
830
|
100
|
|
|
|
1615
|
if (exists $options{$k}) { |
188
|
522
|
|
|
|
|
1005
|
$self->{$k} = delete $options{$k}; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
} |
191
|
83
|
|
|
|
|
245
|
$self->{_error} = ''; |
192
|
83
|
|
|
|
|
206
|
$self->{_verbatim_pieces} = {}; |
193
|
83
|
|
|
|
|
167
|
$self->{_unique_counter} = 0; |
194
|
83
|
50
|
|
|
|
194
|
die "Unrecognized option: " . join(' ', keys %options) . "\n" if %options; |
195
|
83
|
50
|
|
|
|
226
|
die "Missing input" unless defined $self->{input}; |
196
|
83
|
50
|
|
|
|
188
|
die "Missing output" unless defined $self->{output}; |
197
|
83
|
|
|
|
|
329
|
bless $self, $class; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
sub _get_unique_counter { |
201
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
202
|
0
|
|
|
|
|
0
|
my $counter = ++$self->{_unique_counter}; |
203
|
0
|
|
|
|
|
0
|
return $counter; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub _verbatim_pieces { |
207
|
0
|
|
|
0
|
|
0
|
return shift->{_verbatim_pieces}; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
sub html { |
211
|
83
|
|
|
83
|
1
|
264
|
return shift->{html}; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
sub fix_links { |
215
|
83
|
|
|
83
|
1
|
251
|
return shift->{fix_links}; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub fix_typography { |
219
|
83
|
|
|
83
|
1
|
144
|
return shift->{fix_typography}; |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub remove_nbsp { |
223
|
83
|
|
|
83
|
1
|
159
|
return shift->{remove_nbsp}; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub show_nbsp { |
227
|
83
|
|
|
83
|
1
|
191
|
return shift->{show_nbsp}; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
sub fix_nbsp { |
231
|
58
|
|
|
58
|
1
|
211
|
return shift->{fix_nbsp}; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
sub fix_footnotes { |
235
|
83
|
|
|
83
|
1
|
294
|
return shift->{fix_footnotes}; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
sub debug { |
239
|
166
|
|
|
166
|
1
|
670
|
return shift->{debug}; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
sub input { |
243
|
83
|
|
|
83
|
1
|
201
|
return shift->{input}; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
sub output { |
247
|
78
|
|
|
78
|
1
|
192
|
return shift->{output}; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=head2 process |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Process C according to the options passed and write into |
253
|
|
|
|
|
|
|
C |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=cut |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
sub _infile { |
258
|
252
|
|
|
252
|
|
1665
|
my ($self, $arg) = @_; |
259
|
252
|
100
|
|
|
|
527
|
if ($arg) { |
260
|
83
|
50
|
|
|
|
244
|
die "Infile already set" if $self->{_infile}; |
261
|
83
|
|
|
|
|
193
|
$self->{_infile} = $arg; |
262
|
|
|
|
|
|
|
} |
263
|
252
|
|
|
|
|
655
|
return $self->{_infile}; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# temporary file for output |
267
|
|
|
|
|
|
|
sub _outfile { |
268
|
83
|
|
|
83
|
|
141
|
my $self = shift; |
269
|
83
|
|
|
|
|
193
|
return File::Spec->catfile($self->tmpdir, 'output.muse'); |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
sub _fn_outfile { |
273
|
24
|
|
|
24
|
|
43
|
my $self = shift; |
274
|
24
|
|
|
|
|
63
|
return File::Spec->catfile($self->tmpdir, 'fn-out.muse'); |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub process { |
278
|
83
|
|
|
83
|
1
|
1400
|
my $self = shift; |
279
|
83
|
|
|
|
|
246
|
my $debug = $self->debug; |
280
|
|
|
|
|
|
|
|
281
|
83
|
|
|
|
|
208
|
my $wd = $self->tmpdir; |
282
|
83
|
100
|
|
|
|
1261
|
print "# Using $wd to store temporary files\n" if $debug; |
283
|
83
|
|
|
|
|
256
|
my $infile = $self->_set_infile; |
284
|
83
|
50
|
|
|
|
1175
|
die "Something went wrong" unless -f $infile; |
285
|
|
|
|
|
|
|
|
286
|
83
|
50
|
|
|
|
359
|
if ($self->html) { |
287
|
0
|
|
|
|
|
0
|
$self->_process_html; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# then try to get the language |
291
|
83
|
|
|
|
|
188
|
my ($filter, $specific_filter, $nbsp_filter); |
292
|
83
|
|
|
|
|
179
|
my $fixlinks = $self->fix_links; |
293
|
83
|
|
|
|
|
190
|
my $fixtypo = $self->fix_typography; |
294
|
83
|
|
|
|
|
202
|
my $remove_nbsp = $self->remove_nbsp; |
295
|
83
|
|
|
|
|
200
|
my $show_nbsp = $self->show_nbsp; |
296
|
83
|
|
|
|
|
248
|
my $lang = $self->_get_lang; |
297
|
|
|
|
|
|
|
|
298
|
83
|
100
|
100
|
|
|
314
|
if ($lang && $fixtypo) { |
299
|
55
|
|
|
|
|
215
|
$filter = |
300
|
|
|
|
|
|
|
Text::Amuse::Preprocessor::TypographyFilters::filter($lang); |
301
|
55
|
|
|
|
|
179
|
$specific_filter = |
302
|
|
|
|
|
|
|
Text::Amuse::Preprocessor::TypographyFilters::specific_filter($lang); |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
83
|
100
|
100
|
|
|
309
|
if ($lang && $self->fix_nbsp) { |
306
|
14
|
|
|
|
|
47
|
$nbsp_filter = |
307
|
|
|
|
|
|
|
Text::Amuse::Preprocessor::TypographyFilters::nbsp_filter($lang); |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
83
|
|
|
|
|
224
|
my $outfile = $self->_outfile; |
311
|
83
|
|
|
|
|
1396
|
my $line; |
312
|
83
|
|
|
|
|
238
|
my @body = Text::Amuse::Preprocessor::Parser::parse_text($self->_read_file($infile)); |
313
|
|
|
|
|
|
|
# print Dumper(\@body); |
314
|
|
|
|
|
|
|
CHUNK: |
315
|
83
|
|
|
|
|
241
|
foreach my $piece (@body) { |
316
|
5451
|
100
|
|
|
|
10839
|
next CHUNK if $piece->{type} ne 'text'; |
317
|
|
|
|
|
|
|
# print "Processing $piece->{type} $piece->{string}\n"; |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# do the job |
320
|
3726
|
|
|
|
|
5709
|
$line = $piece->{string}; |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# some bad things we want to filter anyway |
323
|
|
|
|
|
|
|
# $line =~ s/─/—/g; # they look the same, but they are not |
324
|
3726
|
|
|
|
|
6558
|
$line =~ s/\x{2500}/\x{2014}/g; |
325
|
|
|
|
|
|
|
# ligatures, totally lame to have in input file |
326
|
3726
|
|
|
|
|
6082
|
$line =~ s/\x{fb00}/ff/g; |
327
|
3726
|
|
|
|
|
5791
|
$line =~ s/\x{fb01}/fi/g; |
328
|
3726
|
|
|
|
|
6069
|
$line =~ s/\x{fb02}/fl/g; |
329
|
3726
|
|
|
|
|
5710
|
$line =~ s/\x{fb03}/ffi/g; |
330
|
3726
|
|
|
|
|
5710
|
$line =~ s/\x{fb04}/ffl/g; |
331
|
|
|
|
|
|
|
# remove soft-hyphens + space. They are invisible in browsers |
332
|
|
|
|
|
|
|
# and sometimes even on the console |
333
|
3726
|
|
|
|
|
5711
|
$line =~ s/\x{ad}\s*//g; |
334
|
3726
|
100
|
|
|
|
6487
|
if ($remove_nbsp) { |
335
|
226
|
|
|
|
|
490
|
$line =~ s/\x{a0}/ /g; |
336
|
226
|
|
|
|
|
423
|
$line =~ s/~~/ /g; |
337
|
|
|
|
|
|
|
} |
338
|
3726
|
100
|
|
|
|
6640
|
if ($fixtypo) { |
339
|
2152
|
|
|
|
|
5490
|
$line =~ s/(?<=\.) (?=\.)//g; # collapse the dots |
340
|
|
|
|
|
|
|
} |
341
|
3726
|
100
|
|
|
|
6367
|
if ($fixlinks) { |
342
|
2030
|
|
|
|
|
4732
|
$line = Text::Amuse::Preprocessor::TypographyFilters::linkify($line); |
343
|
|
|
|
|
|
|
} |
344
|
3726
|
100
|
|
|
|
6827
|
if ($filter) { |
345
|
2148
|
|
|
|
|
4194
|
$line = $filter->($line); |
346
|
|
|
|
|
|
|
} |
347
|
3726
|
100
|
|
|
|
6980
|
if ($specific_filter) { |
348
|
226
|
|
|
|
|
462
|
$line = $specific_filter->($line); |
349
|
|
|
|
|
|
|
} |
350
|
3726
|
100
|
|
|
|
6449
|
if ($nbsp_filter) { |
351
|
172
|
|
|
|
|
370
|
$line = $nbsp_filter->($line); |
352
|
|
|
|
|
|
|
} |
353
|
3726
|
100
|
|
|
|
6204
|
if ($show_nbsp) { |
354
|
87
|
|
|
|
|
280
|
$line =~ s/\x{a0}/~~/g; |
355
|
|
|
|
|
|
|
} |
356
|
3726
|
|
|
|
|
6843
|
$piece->{string} = $line; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
# write out |
359
|
83
|
|
|
|
|
224
|
$self->_write_file($outfile, join('', map { $_->{string} } @body)); |
|
5451
|
|
|
|
|
9566
|
|
360
|
|
|
|
|
|
|
|
361
|
83
|
100
|
|
|
|
699
|
if ($self->fix_footnotes) { |
362
|
24
|
|
|
|
|
65
|
my $fn_auxfile = $self->_fn_outfile; |
363
|
24
|
|
|
|
|
537
|
my $fnfixer = Text::Amuse::Preprocessor::Footnotes |
364
|
|
|
|
|
|
|
->new(input => $outfile, |
365
|
|
|
|
|
|
|
output => $fn_auxfile); |
366
|
|
|
|
|
|
|
# print "$outfile $fn_auxfile\n"; |
367
|
24
|
100
|
|
|
|
88
|
if ($fnfixer->process) { |
368
|
|
|
|
|
|
|
# replace the outfile |
369
|
19
|
|
|
|
|
103
|
$outfile = $fn_auxfile; |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
else { |
372
|
|
|
|
|
|
|
# set the error |
373
|
5
|
|
|
|
|
29
|
$self->_set_error({ %{ $fnfixer->error } }); |
|
5
|
|
|
|
|
17
|
|
374
|
5
|
|
|
|
|
56
|
return; |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
78
|
|
|
|
|
661
|
my $output = $self->output; |
379
|
78
|
100
|
|
|
|
254
|
if (my $ref = ref($output)) { |
380
|
39
|
50
|
|
|
|
115
|
if ($ref eq 'SCALAR') { |
381
|
39
|
|
|
|
|
100
|
$$output = $self->_read_file($outfile); |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
else { |
384
|
0
|
|
|
|
|
0
|
die "Output is not a scalar ref!"; |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
} |
387
|
|
|
|
|
|
|
else { |
388
|
39
|
50
|
|
|
|
216
|
File::Copy::move($outfile, $output) |
389
|
|
|
|
|
|
|
or die "Cannot move $outfile to $output, $!"; |
390
|
|
|
|
|
|
|
} |
391
|
78
|
|
|
|
|
5962
|
return $output; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
sub _process_html { |
395
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
396
|
|
|
|
|
|
|
# read the infile, process, overwrite. Doc states that it's just lame. |
397
|
0
|
|
|
|
|
0
|
my $body = $self->_read_file($self->_infile); |
398
|
0
|
|
|
|
|
0
|
my $html = Text::Amuse::Preprocessor::HTML::html_to_muse($body); |
399
|
0
|
|
|
|
|
0
|
$self->_write_file($self->_infile, $html); |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
sub _write_file { |
403
|
113
|
|
|
113
|
|
51730
|
my ($self, $file, $body) = @_; |
404
|
113
|
50
|
33
|
|
|
560
|
die unless $file && $body; |
405
|
113
|
50
|
|
|
|
8211
|
open (my $fh, '>:encoding(UTF-8)', $file) or die "opening $file $!"; |
406
|
113
|
|
|
|
|
12605
|
print $fh $body; |
407
|
113
|
50
|
|
|
|
4912
|
close $fh or die "closing $file: $!"; |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
sub _read_file { |
412
|
209
|
|
|
209
|
|
49919
|
my ($self, $file) = @_; |
413
|
209
|
50
|
|
|
|
483
|
die unless $file; |
414
|
209
|
50
|
|
|
|
8210
|
open (my $fh, '<:encoding(UTF-8)', $file) or die "$file: $!"; |
415
|
209
|
|
|
|
|
14882
|
local $/ = undef; |
416
|
209
|
|
|
|
|
6547
|
my $body = <$fh>; |
417
|
209
|
|
|
|
|
8232
|
close $fh; |
418
|
209
|
|
|
|
|
2001
|
return $body; |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub _set_infile { |
424
|
83
|
|
|
83
|
|
157
|
my $self = shift; |
425
|
83
|
|
|
|
|
202
|
my $input = $self->input; |
426
|
83
|
|
|
|
|
201
|
my $infile = File::Spec->catfile($self->tmpdir, 'input.txt'); |
427
|
83
|
100
|
|
|
|
1222
|
if (my $ref = ref($input)) { |
428
|
40
|
50
|
|
|
|
116
|
if ($ref eq 'SCALAR') { |
429
|
40
|
50
|
|
3
|
|
2459
|
open (my $fh, '>:encoding(UTF-8)', $infile) or die "$infile: $!"; |
|
3
|
|
|
|
|
23
|
|
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
64
|
|
430
|
40
|
|
|
|
|
8590
|
print $fh $$input; |
431
|
40
|
50
|
|
|
|
1708
|
close $fh or die "closing $infile $!"; |
432
|
40
|
|
|
|
|
236
|
$self->_infile($infile); |
433
|
|
|
|
|
|
|
} |
434
|
|
|
|
|
|
|
else { |
435
|
0
|
|
|
|
|
0
|
die Dumper($ref) . " is not a scalar ref!"; |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
else { |
439
|
43
|
50
|
|
|
|
191
|
File::Copy::copy($input, $infile) or die "Couldn't copy $input to $infile $!"; |
440
|
43
|
|
|
|
|
14027
|
$self->_infile($infile); |
441
|
|
|
|
|
|
|
} |
442
|
83
|
|
|
|
|
218
|
return $self->_infile; |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 html_to_muse |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Can be called on the class and will invoke the |
449
|
|
|
|
|
|
|
L's C function on the |
450
|
|
|
|
|
|
|
argument returning the converted chunk. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=cut |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
sub html_to_muse { |
455
|
2
|
|
|
2
|
1
|
872
|
my ($self, $text) = @_; |
456
|
2
|
50
|
|
|
|
7
|
return unless defined $text; |
457
|
2
|
|
|
|
|
7
|
return Text::Amuse::Preprocessor::HTML::html_to_muse($text); |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=head2 error |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
This is set only when processing footnotes. See |
463
|
|
|
|
|
|
|
L documentation for the hashref |
464
|
|
|
|
|
|
|
returned when an error has been detected. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=cut |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
sub error { |
469
|
17
|
|
|
17
|
1
|
11198
|
return shift->{_error}; |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
sub _set_error { |
473
|
5
|
|
|
5
|
|
13
|
my ($self, $error) = @_; |
474
|
5
|
50
|
|
|
|
17
|
$self->{_error} = $error if $error; |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=head2 tmpdir |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Return the directory name used internally to hold the temporary files. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=cut |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
sub tmpdir { |
484
|
273
|
|
|
273
|
1
|
449
|
my $self = shift; |
485
|
273
|
100
|
|
|
|
980
|
unless ($self->{_tmpdir}) { |
486
|
83
|
|
|
|
|
186
|
$self->{_tmpdir} = File::Temp->newdir(CLEANUP => !$self->debug); |
487
|
|
|
|
|
|
|
} |
488
|
273
|
|
|
|
|
36444
|
return $self->{_tmpdir}->dirname; |
489
|
|
|
|
|
|
|
} |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
sub _get_lang { |
492
|
83
|
|
|
83
|
|
135
|
my $self = shift; |
493
|
83
|
|
|
|
|
160
|
my $infile = $self->_infile; |
494
|
|
|
|
|
|
|
# shouldn't happen |
495
|
83
|
50
|
33
|
|
|
1104
|
die unless $infile && -f $infile; |
496
|
83
|
|
|
|
|
209
|
my $info; |
497
|
83
|
|
|
|
|
163
|
eval { |
498
|
83
|
|
|
|
|
311
|
$info = Text::Amuse::Functions::muse_fast_scan_header($infile); |
499
|
|
|
|
|
|
|
}; |
500
|
83
|
100
|
66
|
|
|
29095
|
if ($info && $info->{lang}) { |
501
|
58
|
50
|
|
|
|
326
|
if ($info->{lang} =~ m/^\s*([a-z]{2,3})\s*$/s) { |
502
|
58
|
|
|
|
|
290
|
return $1; |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
} |
505
|
25
|
|
|
|
|
85
|
return; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=head1 AUTHOR |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Marco Pessotto, C<< >> |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
=head1 BUGS |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
Please report any bugs or feature requests to the author's email. If |
516
|
|
|
|
|
|
|
you find a bug, please provide a minimal muse file which reproduces |
517
|
|
|
|
|
|
|
the problem (so I can add it to the test suite). |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=head1 SUPPORT |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
perldoc Text::Amuse::Preprocessor |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Repository available at GitHub: |
526
|
|
|
|
|
|
|
L |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=head1 SEE ALSO |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
The original documentation for the Emacs Muse markup can be found at: |
531
|
|
|
|
|
|
|
L |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
The parser itself is L. |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
This distribution ships the following executables |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=over 4 |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=item * html-to-muse.pl (HTML to muse converter) |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=item * muse-check-footnotes.pl (footnote checker) |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
=item * muse-rearrange-footnotes.pl (fix footnote numbering) |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=item * pod-to-muse.pl (POD to muse converter) |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=item * muse-preprocessor.pl (script which uses this module) |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=back |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
See the manpage or pass --help to the scripts for usage. |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=head1 LICENSE |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
556
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
557
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
See L for more information. |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=cut |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
1; # End of Text::Amuse::Preprocessor |