line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
437685
|
use Renard::Incunabula::Common::Setup; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8
|
|
2
|
|
|
|
|
|
|
package Renard::Incunabula::MuPDF::mutool; |
3
|
|
|
|
|
|
|
# ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool |
4
|
|
|
|
|
|
|
$Renard::Incunabula::MuPDF::mutool::VERSION = '0.003'; |
5
|
1
|
|
|
1
|
|
10929
|
use Capture::Tiny qw(capture); |
|
1
|
|
|
|
|
48235
|
|
|
1
|
|
|
|
|
103
|
|
6
|
1
|
|
|
1
|
|
1058
|
use XML::Simple; |
|
1
|
|
|
|
|
12463
|
|
|
1
|
|
|
|
|
15
|
|
7
|
1
|
|
|
1
|
|
1008
|
use Alien::MuPDF 0.007; |
|
1
|
|
|
|
|
17564
|
|
|
1
|
|
|
|
|
15
|
|
8
|
1
|
|
|
1
|
|
32805
|
use Path::Tiny; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
107
|
|
9
|
1
|
|
|
1
|
|
9
|
use Function::Parameters; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
14
|
|
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
1577
|
use Log::Any qw($log); |
|
1
|
|
|
|
|
10998
|
|
|
1
|
|
|
|
|
9
|
|
12
|
1
|
|
|
1
|
|
3338
|
use constant MUPDF_DEFAULT_RESOLUTION => 72; # dpi |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
142
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
BEGIN {
    # Ask Alien::MuPDF for the location of the mutool executable while the
    # module is being compiled, and keep it in the package variable
    # $MUTOOL_PATH for later use when building command lines.
    our $MUTOOL_PATH;
    $MUTOOL_PATH = Alien::MuPDF->mutool_path;
}
17
|
|
|
|
|
|
|
|
18
|
0
|
|
|
0
|
|
|
# Run mutool with the given arguments and return what it wrote to its output.
#
# Parameters:
#   @mutool_args - arguments placed after the mutool executable path
#                  ($Renard::Incunabula::MuPDF::mutool::MUTOOL_PATH).
#
# Returns the captured STDOUT of the call. On Windows the output is routed
# through a temporary file and the raw contents of that file are returned.
#
# Dies with "Unexpected mutool exit: <status>" when system() reports a
# non-zero status.
fun _call_mutool( @mutool_args ) {
    my @args = ( $Renard::Incunabula::MuPDF::mutool::MUTOOL_PATH, @mutool_args );
    my ($stdout, $exit);

    # Note: The code below is marked as uncoverable because it only applies
    # on Windows and we are currently only automatically checking coverage
    # on Linux via Travis-CI.
    # uncoverable branch true
    if( $^O eq 'MSWin32' ) {
        # Need to redirect to a file for two reasons:
        # - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>.
        # - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>.

        # Load File::Temp explicitly instead of relying on another module
        # having pulled it in already.
        require File::Temp; # uncoverable statement
        my $temp_fh = File::Temp->new; # uncoverable statement
        close $temp_fh; # to avoid Windows file locking # uncoverable statement

        # If the caller asked for output on STDOUT via "-o -", point mutool
        # at the temporary file instead.
        my $output_param = 0; # uncoverable statement
        for my $idx (1..@args-2) { # uncoverable statement
            # uncoverable branch true
            if( $args[$idx] eq '-o' # uncoverable statement
                    && $args[$idx+1] eq '-' ) {
                $args[$idx+1] = $temp_fh->filename; # uncoverable statement
                $output_param = 1; # uncoverable statement
            }
        }

        # uncoverable branch true
        if( not $output_param ) { # uncoverable statement
            # redirect into a temp file
            #
            # NOTE(review): only arguments containing whitespace are quoted;
            # embedded double quotes or shell metacharacters are passed to
            # the shell unescaped. Confirm callers never pass such values.
            my $cmd = join " ", # uncoverable statement
                map { $_ =~ /\s/ ? "\"$_\"" : $_ } # uncoverable statement
                @args; # uncoverable statement
            my $redir = $temp_fh->filename; # uncoverable statement
            @args = ("$cmd > \"$redir\""); # uncoverable statement
        }

        $log->infof("running mutool: %s", \@args); # uncoverable statement
        # Take the status straight from system() so that nothing executed
        # between the call and the check can disturb it.
        $exit = system( @args ); # uncoverable statement
        $stdout = path( $temp_fh->filename )->slurp_raw; # uncoverable statement
    } else {
        # capture() returns STDOUT, STDERR and the block's return value;
        # STDERR is discarded and the value of system() becomes $exit.
        ($stdout, undef, $exit) = capture {
            $log->infof("running mutool: %s", \@args);
            system( @args );
        };
    }

    die "Unexpected mutool exit: $exit" if $exit;

    return $stdout;
}
68
|
|
|
|
|
|
|
|
69
|
0
|
0
|
|
0
|
1
|
|
# Render one page of a PDF as PNG data by running "mutool draw".
#
# Parameters:
#   $pdf_filename - PDF file handed to mutool.
#   $pdf_page_no  - page number handed to mutool.
#   $zoom_level   - multiplier applied to MUPDF_DEFAULT_RESOLUTION to obtain
#                   the value for mutool's -r option.
#
# Returns whatever _call_mutool returns for the draw command.
fun get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) {
    # calculate the resolution
    my $resolution = $zoom_level * MUPDF_DEFAULT_RESOLUTION;

    my @draw_args = (
        'draw',
        '-r' => $resolution,
        '-F' => 'png',
        '-o' => '-',
        $pdf_filename,
        $pdf_page_no,
    );

    return _call_mutool( @draw_args );
}
81
|
|
|
|
|
|
|
|
82
|
0
|
0
|
|
0
|
1
|
|
# Fetch the structured text ("stext") of one PDF page as produced by
# "mutool draw -F stext".
#
# Parameters:
#   $pdf_filename - PDF file handed to mutool.
#   $pdf_page_no  - page number handed to mutool.
#
# Returns whatever _call_mutool returns for the draw command.
fun get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) {
    my @draw_args = (
        'draw',
        '-F' => 'stext',
        '-o' => '-',
        $pdf_filename,
        $pdf_page_no,
    );

    return _call_mutool( @draw_args );
}
93
|
|
|
|
|
|
|
|
94
|
0
|
0
|
|
0
|
1
|
|
# Fetch the structured text of one PDF page and parse it with XMLin.
#
# Parameters:
#   $pdf_filename - PDF file handed to get_mutool_text_stext_raw.
#   $pdf_page_no  - page number handed to get_mutool_text_stext_raw.
#
# Returns the data structure built by XMLin.
fun get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) {
    my $raw_xml = get_mutool_text_stext_raw( $pdf_filename, $pdf_page_no );

    # Elements passed to XMLin's ForceArray option, so they are parsed as
    # arrays even when an element occurs only once.
    my @list_elements = qw(page block line span char);

    my $parsed = XMLin( $raw_xml, ForceArray => \@list_elements );

    return $parsed;
}
105
|
|
|
|
|
|
|
|
106
|
0
|
0
|
|
0
|
1
|
|
# Run "mutool pages" on a PDF and return its output wrapped in a single
# <document> root element.
#
# Parameters:
#   $pdf_filename - PDF file handed to mutool.
#
# Returns a string of the form "<document>...</document>".
fun get_mutool_page_info_raw($pdf_filename) {
    my $pages_output = _call_mutool( 'pages', $pdf_filename );

    # Strip the first line (everything up to and including the first
    # newline); the text is left untouched when it contains no newline.
    $pages_output =~ s/^[^\n]*\n//s;

    # Give the remaining data a root node.
    return '<document>' . $pages_output . '</document>';
}
118
|
|
|
|
|
|
|
|
119
|
0
|
0
|
|
0
|
1
|
|
# Fetch the page information of a PDF and parse it with XMLin.
#
# Parameters:
#   $pdf_filename - PDF file handed to get_mutool_page_info_raw.
#
# Returns the data structure built by XMLin, with "page" elements always
# parsed as arrays.
fun get_mutool_page_info_xml($pdf_filename) {
    my $raw_xml = get_mutool_page_info_raw( $pdf_filename );

    my $parsed = XMLin( $raw_xml, ForceArray => [ 'page' ] );

    return $parsed;
}
127
|
|
|
|
|
|
|
|
128
|
0
|
0
|
|
0
|
1
|
|
# Read the outline of a PDF via "mutool show <file> outline" and turn each
# output line into a simple record.
#
# Parameters:
#   $pdf_filename - PDF file handed to mutool.
#
# Returns an ArrayRef of HashRefs, one per recognised outline line, each with
# the keys:
#   level - number of tab characters at the start of the line
#   text  - the text between the indentation and the tab preceding "#<page>"
#   page  - the digits following "#"
# The optional ",<dx>,<dy>" suffix is matched but not stored.
#
# Lines that do not have the expected shape are skipped with a logged
# warning, so they do not produce records with undefined fields.
#
# Dies if the in-memory handle over the mutool output cannot be opened.
fun get_mutool_outline_simple($pdf_filename) {
    my $outline_text = _call_mutool(
        qw(show),
        $pdf_filename,
        qw(outline)
    );

    my @outline_items = ();
    open my $outline_fh, '<:encoding(UTF-8):crlf', \$outline_text
        or die "Unable to read mutool outline output: $!";
    while( defined( my $line = <$outline_fh> ) ) {
        my $is_item = $line =~ /^(?<indent>\t*)(?<text>.*)\t#(?<page>\d+)(,(?<dx>\d+),(?<dy>\d+))?$/;
        if( not $is_item ) {
            # Without a successful match %+ holds nothing usable for this line.
            $log->warnf("skipping unrecognised mutool outline line: %s", $line);
            next;
        }

        # not storing the offsets yet and not every line has offsets
        push @outline_items, {
            level => length $+{indent},
            text  => $+{text},
            page  => $+{page},
        };
    }
    close $outline_fh;

    return \@outline_items;
}
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
1; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
__END__ |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=pod |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=encoding UTF-8 |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 NAME |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Renard::Incunabula::MuPDF::mutool - Retrieve PDF image and text data via MuPDF's mutool |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 VERSION |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
version 0.003 |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head1 FUNCTIONS |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 _call_mutool |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
_call_mutool( @args ) |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
Helper function which calls C<mutool> with the contents of the C<@args> array. |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Returns the captured C<STDOUT> of the call. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
This function dies if C<mutool> unsuccessfully exits. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 get_mutool_pdf_page_as_png |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
This function returns a PNG stream that renders page number C<$pdf_page_no> of |
184
|
|
|
|
|
|
|
the PDF file C<$pdf_filename>. The page is rendered at C<$zoom_level> times the default resolution of 72 dpi. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head2 get_mutool_text_stext_raw |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
This function returns an XML string that contains structured text from page |
191
|
|
|
|
|
|
|
number C<$pdf_page_no> of the PDF file C<$pdf_filename>. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
The XML format is defined by the output of C<mutool> and looks like this (for page |
194
|
|
|
|
|
|
|
23 of the C<pdf_reference_1-7.pdf> file): |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
<document name="test-data/test-data/PDF/Adobe/pdf_reference_1-7.pdf"> |
197
|
|
|
|
|
|
|
<page width="531" height="666"> |
198
|
|
|
|
|
|
|
<block bbox="261.18 616.16394 269.77765 625.2532"> |
199
|
|
|
|
|
|
|
<line bbox="261.18 616.16394 269.77765 625.2532"> |
200
|
|
|
|
|
|
|
<span bbox="261.18 616.16394 269.77765 625.2532" font="MyriadPro-Semibold" size="7.98"> |
201
|
|
|
|
|
|
|
<char bbox="261.18 616.16394 265.50037 625.2532" x="261.18" y="623.2582" c="2"/> |
202
|
|
|
|
|
|
|
<char bbox="265.50037 616.16394 269.77765 625.2532" x="265.50037" y="623.2582" c="3"/> |
203
|
|
|
|
|
|
|
</span> |
204
|
|
|
|
|
|
|
</line> |
205
|
|
|
|
|
|
|
</block> |
206
|
|
|
|
|
|
|
<block bbox="225.78 88.20229 305.18158 117.93829"> |
207
|
|
|
|
|
|
|
<line bbox="225.78 88.20229 305.18158 117.93829"> |
208
|
|
|
|
|
|
|
<span bbox="225.78 88.20229 305.18158 117.93829" font="MyriadPro-Bold" size="24"> |
209
|
|
|
|
|
|
|
<char bbox="225.78 88.20229 239.5176 117.93829" x="225.78" y="111.93829" c="P"/> |
210
|
|
|
|
|
|
|
<char bbox="239.5176 88.20229 248.4552 117.93829" x="239.5176" y="111.93829" c="r"/> |
211
|
|
|
|
|
|
|
<char bbox="248.4552 88.20229 261.1128 117.93829" x="248.4552" y="111.93829" c="e"/> |
212
|
|
|
|
|
|
|
<char bbox="261.1128 88.20229 269.28238 117.93829" x="261.1128" y="111.93829" c="f"/> |
213
|
|
|
|
|
|
|
<char bbox="269.28238 88.20229 281.93997 117.93829" x="269.28238" y="111.93829" c="a"/> |
214
|
|
|
|
|
|
|
<char bbox="281.93997 88.20229 292.50958 117.93829" x="281.93997" y="111.93829" c="c"/> |
215
|
|
|
|
|
|
|
<char bbox="292.50958 88.20229 305.18158 117.93829" x="292.50958" y="111.93829" c="e"/> |
216
|
|
|
|
|
|
|
</span> |
217
|
|
|
|
|
|
|
</line> |
218
|
|
|
|
|
|
|
</block> |
219
|
|
|
|
|
|
|
</page> |
220
|
|
|
|
|
|
|
</document> |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
Simplified, the high-level structure looks like: |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
<page> -> [list of blocks] |
225
|
|
|
|
|
|
|
<block> -> [list of blocks] |
226
|
|
|
|
|
|
|
a block is either: |
227
|
|
|
|
|
|
|
- stext |
228
|
|
|
|
|
|
|
<line> -> [list of lines] (all have same baseline) |
229
|
|
|
|
|
|
|
<span> -> [list of spans] (horizontal spaces over a line) |
230
|
|
|
|
|
|
|
<char> -> [list of chars] |
231
|
|
|
|
|
|
|
- image |
232
|
|
|
|
|
|
|
# TODO document the image data from mutool |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 get_mutool_text_stext_xml |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Returns a HashRef of the structured text from page |
239
|
|
|
|
|
|
|
number C<$pdf_page_no> of the PDF file C<$pdf_filename>. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for |
242
|
|
|
|
|
|
|
details on the structure of this data. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=head2 get_mutool_page_info_raw |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
get_mutool_page_info_raw($pdf_filename) |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
The data is in the form: |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
<document> |
253
|
|
|
|
|
|
|
<page pagenum="1"> |
254
|
|
|
|
|
|
|
<MediaBox l="0" b="0" r="531" t="666" /> |
255
|
|
|
|
|
|
|
<CropBox l="0" b="0" r="531" t="666" /> |
256
|
|
|
|
|
|
|
<Rotate v="0" /> |
257
|
|
|
|
|
|
|
</page> |
258
|
|
|
|
|
|
|
<page pagenum="2"> |
259
|
|
|
|
|
|
|
... |
260
|
|
|
|
|
|
|
</page> |
261
|
|
|
|
|
|
|
</document> |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 get_mutool_page_info_xml |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
get_mutool_page_info_xml($pdf_filename) |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Returns a HashRef containing the page bounding boxes of PDF file |
268
|
|
|
|
|
|
|
C<$pdf_filename>. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for |
271
|
|
|
|
|
|
|
information on the structure of the data. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head2 get_mutool_outline_simple |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
get_mutool_outline_simple($pdf_filename) |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Returns the outline of the PDF file C<$pdf_filename> as an |
278
|
|
|
|
|
|
|
C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of |
279
|
|
|
|
|
|
|
L<Renard::Incunabula::Outline>. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head1 SEE ALSO |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
L<Repository information|http://project-renard.github.io/doc/development/repo/p5-Renard-Incunabula-MuPDF-mutool/> |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=head1 AUTHOR |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Project Renard |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
This software is copyright (c) 2017 by Project Renard. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
294
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=cut |