line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
3
|
|
|
3
|
|
858
|
use Renard::Curie::Setup; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
28
|
|
2
|
|
|
|
|
|
|
package Renard::Curie::Data::PDF; |
3
|
|
|
|
|
|
|
# ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool |
4
|
|
|
|
|
|
|
$Renard::Curie::Data::PDF::VERSION = '0.002'; |
5
|
3
|
|
|
3
|
|
1441
|
use Capture::Tiny qw(capture); |
|
3
|
|
|
|
|
37685
|
|
|
3
|
|
|
|
|
164
|
|
6
|
3
|
|
|
3
|
|
5189
|
use XML::Simple; |
|
3
|
|
|
|
|
21251
|
|
|
3
|
|
|
|
|
23
|
|
7
|
3
|
|
|
3
|
|
2506
|
use Alien::MuPDF 0.007; |
|
3
|
|
|
|
|
33302
|
|
|
3
|
|
|
|
|
26
|
|
8
|
3
|
|
|
3
|
|
55720
|
use Path::Tiny; |
|
3
|
|
|
|
|
33
|
|
|
3
|
|
|
|
|
180
|
|
9
|
3
|
|
|
3
|
|
18
|
use Function::Parameters; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
23
|
|
10
|
|
|
|
|
|
|
|
11
|
3
|
|
|
3
|
|
3950
|
use Log::Any qw($log); |
|
3
|
|
|
|
|
21555
|
|
|
3
|
|
|
|
|
15
|
|
12
|
3
|
|
|
3
|
|
6870
|
use constant MUPDF_DEFAULT_RESOLUTION => 72; # dpi |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
301
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
BEGIN { |
15
|
3
|
|
|
3
|
|
28
|
our $MUTOOL_PATH = Alien::MuPDF->mutool_path; |
16
|
|
|
|
|
|
|
} |
17
|
|
|
|
|
|
|
|
18
|
0
|
|
|
0
|
|
|
fun _call_mutool( @mutool_args ) { |
|
0
|
|
|
|
|
|
|
19
|
0
|
|
|
|
|
|
my @args = ( $Renard::Curie::Data::PDF::MUTOOL_PATH, @mutool_args ); |
20
|
0
|
|
|
|
|
|
my ($stdout, $exit); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Note: The code below is marked as uncoverable because it only applies |
23
|
|
|
|
|
|
|
# on Windows and we are currently only automatically checking coverage |
24
|
|
|
|
|
|
|
# on Linux via Travis-CI. |
25
|
|
|
|
|
|
|
# uncoverable branch true |
26
|
0
|
0
|
|
|
|
|
if( $^O eq 'MSWin32' ) { |
27
|
|
|
|
|
|
|
# Need to redirect to a file for two reasons: |
28
|
|
|
|
|
|
|
# - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>. |
29
|
|
|
|
|
|
|
# - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>. |
30
|
0
|
|
|
|
|
|
my $temp_fh = File::Temp->new; # uncoverable statement |
31
|
0
|
|
|
|
|
|
close $temp_fh; # to avoid Windows file locking # uncoverable statement |
32
|
|
|
|
|
|
|
|
33
|
0
|
|
|
|
|
|
my $output_param = 0; # uncoverable statement |
34
|
0
|
|
|
|
|
|
for my $idx (1..@args-2) { # uncoverable statement |
35
|
|
|
|
|
|
|
# uncoverable branch true |
36
|
0
|
0
|
0
|
|
|
|
if( $args[$idx] eq '-o' # uncoverable statement |
37
|
|
|
|
|
|
|
&& $args[$idx+1] eq '-' ) { |
38
|
0
|
|
|
|
|
|
$args[$idx+1] = $temp_fh->filename; # uncoverable statement |
39
|
0
|
|
|
|
|
|
$output_param = 1; # uncoverable statement |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# uncoverable branch true |
44
|
0
|
0
|
|
|
|
|
if( not $output_param ) { # uncoverable statement |
45
|
|
|
|
|
|
|
# redirect into a temp file |
46
|
|
|
|
|
|
|
my $cmd = join " ", # uncoverable statement |
47
|
0
|
0
|
|
|
|
|
map { $_ =~ /\s/ ? "\"$_\"" : $_ } # uncoverable statement |
|
0
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
@args; # uncoverable statement |
49
|
0
|
|
|
|
|
|
my $redir = $temp_fh->filename; # uncoverable statement |
50
|
0
|
|
|
|
|
|
@args = ("$cmd > \"$redir\""); # uncoverable statement |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
0
|
|
|
|
|
|
$log->infof("running mutool: %s", \@args); # uncoverable statement |
54
|
0
|
|
|
|
|
|
system( @args ); # uncoverable statement |
55
|
0
|
|
|
|
|
|
$stdout = path( $temp_fh->filename )->slurp_raw; # uncoverable statement |
56
|
0
|
|
|
|
|
|
$exit = $?; # uncoverable statement |
57
|
|
|
|
|
|
|
} else { |
58
|
|
|
|
|
|
|
($stdout, undef, $exit) = capture { |
59
|
0
|
|
|
0
|
|
|
$log->infof("running mutool: %s", \@args); |
60
|
0
|
|
|
|
|
|
system( @args ); |
61
|
0
|
|
|
|
|
|
}; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
0
|
0
|
|
|
|
|
die "Unexpected mutool exit: $exit" if $exit; |
65
|
|
|
|
|
|
|
|
66
|
0
|
|
|
|
|
|
return $stdout; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
0
|
0
|
|
0
|
1
|
|
fun get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
70
|
0
|
|
|
|
|
|
my $stdout = _call_mutool( |
71
|
|
|
|
|
|
|
qw(draw), |
72
|
|
|
|
|
|
|
qw( -r ), ($zoom_level * MUPDF_DEFAULT_RESOLUTION), # calculate the resolution |
73
|
|
|
|
|
|
|
qw( -F png ), |
74
|
|
|
|
|
|
|
qw( -o -), |
75
|
|
|
|
|
|
|
$pdf_filename, |
76
|
|
|
|
|
|
|
$pdf_page_no, |
77
|
|
|
|
|
|
|
); |
78
|
|
|
|
|
|
|
|
79
|
0
|
|
|
|
|
|
return $stdout; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
0
|
0
|
|
0
|
1
|
|
fun get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
83
|
0
|
|
|
|
|
|
my $stdout = _call_mutool( |
84
|
|
|
|
|
|
|
qw(draw), |
85
|
|
|
|
|
|
|
qw(-F stext), |
86
|
|
|
|
|
|
|
qw(-o -), |
87
|
|
|
|
|
|
|
$pdf_filename, |
88
|
|
|
|
|
|
|
$pdf_page_no, |
89
|
|
|
|
|
|
|
); |
90
|
|
|
|
|
|
|
|
91
|
0
|
|
|
|
|
|
return $stdout; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
0
|
0
|
|
0
|
1
|
|
fun get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
95
|
0
|
|
|
|
|
|
my $stext_xml = get_mutool_text_stext_raw( |
96
|
|
|
|
|
|
|
$pdf_filename, |
97
|
|
|
|
|
|
|
$pdf_page_no, |
98
|
|
|
|
|
|
|
); |
99
|
|
|
|
|
|
|
# page -> [list of blocks] |
100
|
|
|
|
|
|
|
# block -> [list of blocks] |
101
|
|
|
|
|
|
|
# block is either: |
102
|
|
|
|
|
|
|
# - stext |
103
|
|
|
|
|
|
|
# line -> [list of lines] (all have same baseline) |
104
|
|
|
|
|
|
|
# span -> [list of spans] (horizontal spaces over a line) |
105
|
|
|
|
|
|
|
# char -> [list of chars] |
106
|
|
|
|
|
|
|
# - image |
107
|
|
|
|
|
|
|
# TODO |
108
|
|
|
|
|
|
|
|
109
|
0
|
|
|
|
|
|
my $stext = XMLin( $stext_xml, |
110
|
|
|
|
|
|
|
ForceArray => [ qw(page block line span char) ] ); |
111
|
|
|
|
|
|
|
|
112
|
0
|
|
|
|
|
|
return $stext; |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
|
115
|
0
|
0
|
|
0
|
1
|
|
fun get_mutool_page_info_raw($pdf_filename) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
|
my $stdout = _call_mutool( |
117
|
|
|
|
|
|
|
qw(pages), |
118
|
|
|
|
|
|
|
$pdf_filename |
119
|
|
|
|
|
|
|
); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# remove the first line |
122
|
0
|
|
|
|
|
|
$stdout =~ s/^[^\n]*\n//s; |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# wraps the data with a root node |
125
|
0
|
|
|
|
|
|
return "<document>$stdout</document>" |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
0
|
0
|
|
0
|
1
|
|
fun get_mutool_page_info_xml($pdf_filename) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
129
|
0
|
|
|
|
|
|
my $page_info_xml = get_mutool_page_info_raw( $pdf_filename ); |
130
|
|
|
|
|
|
|
|
131
|
0
|
|
|
|
|
|
my $page_info = XMLin( $page_info_xml, |
132
|
|
|
|
|
|
|
ForceArray => [ qw(page) ] ); |
133
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
return $page_info; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
0
|
0
|
|
0
|
1
|
|
fun get_mutool_outline_simple($pdf_filename) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
138
|
0
|
|
|
|
|
|
my $outline_text = _call_mutool( |
139
|
|
|
|
|
|
|
qw(show), |
140
|
|
|
|
|
|
|
$pdf_filename, |
141
|
|
|
|
|
|
|
qw(outline) |
142
|
|
|
|
|
|
|
); |
143
|
|
|
|
|
|
|
|
144
|
0
|
|
|
|
|
|
my @outline_items = (); |
145
|
0
|
|
|
|
|
|
open my $outline_fh, '<:encoding(UTF-8):crlf', \$outline_text; |
146
|
0
|
|
|
|
|
|
while( defined( my $line = <$outline_fh> ) ) { |
147
|
0
|
|
|
|
|
|
$line =~ /^(?<indent>\t*)(?<text>.*)\t#(?<page>\d+)(,(?<dx>\d+),(?<dy>\d+))?$/; |
148
|
3
|
|
|
3
|
|
7566
|
my %copy = %+; |
|
3
|
|
|
|
|
1019
|
|
|
3
|
|
|
|
|
286
|
|
|
0
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
|
$copy{level} = length $copy{indent}; |
150
|
0
|
|
|
|
|
|
delete $copy{indent}; |
151
|
|
|
|
|
|
|
# not storing the offsets yet and not every line has offsets |
152
|
0
|
|
|
|
|
|
delete @copy{qw(dx dy)}; |
153
|
0
|
|
|
|
|
|
push @outline_items, \%copy; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
0
|
|
|
|
|
|
return \@outline_items; |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
1; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
__END__ |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=pod |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=encoding UTF-8 |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head1 NAME |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Renard::Curie::Data::PDF - Retrieve PDF image and text data via MuPDF's mutool |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 VERSION |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
version 0.002 |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 FUNCTIONS |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head2 _call_mutool |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
_call_mutool( @args ) |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
Helper function which calls C<mutool> with the contents of the C<@args> array. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Returns the captured C<STDOUT> of the call. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
This function dies if C<mutool> unsuccessfully exits. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=head2 get_mutool_pdf_page_as_png |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no) |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
This function returns a PNG stream that renders page number C<$pdf_page_no> of |
193
|
|
|
|
|
|
|
the PDF file C<$pdf_filename>. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 get_mutool_text_stext_raw |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This function returns an XML string that contains structured text from page |
200
|
|
|
|
|
|
|
number C<$pdf_page_no> of the PDF file C<$pdf_filename>. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
The XML format is defined by the output of C<mutool> looks like this (for page |
203
|
|
|
|
|
|
|
23 of the C<pdf_reference_1-7.pdf> file): |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
<document name="test-data/test-data/PDF/Adobe/pdf_reference_1-7.pdf"> |
206
|
|
|
|
|
|
|
<page width="531" height="666"> |
207
|
|
|
|
|
|
|
<block bbox="261.18 616.16394 269.77765 625.2532"> |
208
|
|
|
|
|
|
|
<line bbox="261.18 616.16394 269.77765 625.2532"> |
209
|
|
|
|
|
|
|
<span bbox="261.18 616.16394 269.77765 625.2532" font="MyriadPro-Semibold" size="7.98"> |
210
|
|
|
|
|
|
|
<char bbox="261.18 616.16394 265.50037 625.2532" x="261.18" y="623.2582" c="2"/> |
211
|
|
|
|
|
|
|
<char bbox="265.50037 616.16394 269.77765 625.2532" x="265.50037" y="623.2582" c="3"/> |
212
|
|
|
|
|
|
|
</span> |
213
|
|
|
|
|
|
|
</line> |
214
|
|
|
|
|
|
|
</block> |
215
|
|
|
|
|
|
|
<block bbox="225.78 88.20229 305.18158 117.93829"> |
216
|
|
|
|
|
|
|
<line bbox="225.78 88.20229 305.18158 117.93829"> |
217
|
|
|
|
|
|
|
<span bbox="225.78 88.20229 305.18158 117.93829" font="MyriadPro-Bold" size="24"> |
218
|
|
|
|
|
|
|
<char bbox="225.78 88.20229 239.5176 117.93829" x="225.78" y="111.93829" c="P"/> |
219
|
|
|
|
|
|
|
<char bbox="239.5176 88.20229 248.4552 117.93829" x="239.5176" y="111.93829" c="r"/> |
220
|
|
|
|
|
|
|
<char bbox="248.4552 88.20229 261.1128 117.93829" x="248.4552" y="111.93829" c="e"/> |
221
|
|
|
|
|
|
|
<char bbox="261.1128 88.20229 269.28238 117.93829" x="261.1128" y="111.93829" c="f"/> |
222
|
|
|
|
|
|
|
<char bbox="269.28238 88.20229 281.93997 117.93829" x="269.28238" y="111.93829" c="a"/> |
223
|
|
|
|
|
|
|
<char bbox="281.93997 88.20229 292.50958 117.93829" x="281.93997" y="111.93829" c="c"/> |
224
|
|
|
|
|
|
|
<char bbox="292.50958 88.20229 305.18158 117.93829" x="292.50958" y="111.93829" c="e"/> |
225
|
|
|
|
|
|
|
</span> |
226
|
|
|
|
|
|
|
</line> |
227
|
|
|
|
|
|
|
</block> |
228
|
|
|
|
|
|
|
</page> |
229
|
|
|
|
|
|
|
</document> |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Simplified, the high-level structure looks like: |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
<page> -> [list of blocks] |
234
|
|
|
|
|
|
|
<block> -> [list of blocks] |
235
|
|
|
|
|
|
|
a block is either: |
236
|
|
|
|
|
|
|
- stext |
237
|
|
|
|
|
|
|
<line> -> [list of lines] (all have same baseline) |
238
|
|
|
|
|
|
|
<span> -> [list of spans] (horizontal spaces over a line) |
239
|
|
|
|
|
|
|
<char> -> [list of chars] |
240
|
|
|
|
|
|
|
- image |
241
|
|
|
|
|
|
|
TODO |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=head2 get_mutool_text_stext_xml |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Returns a HashRef of the structured text from from page |
248
|
|
|
|
|
|
|
number C<$pdf_page_no> of the PDF file C<$pdf_filename>. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for |
251
|
|
|
|
|
|
|
details on the structure of this data. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=head2 get_mutool_page_info_raw |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
get_mutool_page_info_raw($pdf_filename) |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
The data is in the form: |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
<document> |
262
|
|
|
|
|
|
|
<page pagenum="1"> |
263
|
|
|
|
|
|
|
<MediaBox l="0" b="0" r="531" t="666" /> |
264
|
|
|
|
|
|
|
<CropBox l="0" b="0" r="531" t="666" /> |
265
|
|
|
|
|
|
|
<Rotate v="0" /> |
266
|
|
|
|
|
|
|
</page> |
267
|
|
|
|
|
|
|
<page pagenum="2"> |
268
|
|
|
|
|
|
|
... |
269
|
|
|
|
|
|
|
</page> |
270
|
|
|
|
|
|
|
</document> |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=head2 get_mutool_page_info_xml |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
get_mutool_page_info_xml($pdf_filename) |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Returns a HashRef containing the page bounding boxes of PDF file |
277
|
|
|
|
|
|
|
C<$pdf_filename>. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for |
280
|
|
|
|
|
|
|
information on the structure of the data. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head2 get_mutool_outline_simple |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
fun get_mutool_outline_simple($pdf_filename) |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
Returns an array of the outline of the PDF file C<$pdf_filename> as an |
287
|
|
|
|
|
|
|
C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of |
288
|
|
|
|
|
|
|
L<Renard::Curie::Model::Outline>. |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=head1 AUTHOR |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Project Renard |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
This software is copyright (c) 2016 by Project Renard. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
299
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=cut |