| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package PDF::Make::Linearization; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
182212
|
use strict; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
117
|
|
|
4
|
3
|
|
|
3
|
|
14
|
use warnings; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
148
|
|
|
5
|
3
|
|
|
3
|
|
699
|
use PDF::Make; |
|
|
3
|
|
|
|
|
9
|
|
|
|
3
|
|
|
|
|
191
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.03'; |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 NAME |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
PDF::Make::Linearization - PDF Linearization (Fast Web View) support |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use PDF::Make; |
|
16
|
|
|
|
|
|
|
use PDF::Make::Linearization; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Check if a PDF is linearized |
|
19
|
|
|
|
|
|
|
my $doc = PDF::Make->open('document.pdf'); |
|
20
|
|
|
|
|
|
|
if ($doc->is_linearized) { |
|
21
|
|
|
|
|
|
|
my $params = $doc->linear_params; |
|
22
|
|
|
|
|
|
|
say "Fast Web View: Yes"; |
|
23
|
|
|
|
|
|
|
say "Pages: $params->{page_count}"; |
|
24
|
|
|
|
|
|
|
say "First page ends at byte: $params->{first_page_end}"; |
|
25
|
|
|
|
|
|
|
} |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Create a linearized PDF |
|
28
|
|
|
|
|
|
|
my $pdf = PDF::Make->new; |
|
29
|
|
|
|
|
|
|
$pdf->page->text(100, 700, "Page 1"); |
|
30
|
|
|
|
|
|
|
$pdf->page->text(100, 700, "Page 2"); |
|
31
|
|
|
|
|
|
|
$pdf->finalize; |
|
32
|
|
|
|
|
|
|
$pdf->write_linearized('optimized.pdf'); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Streaming reader for HTTP byte-range requests |
|
35
|
|
|
|
|
|
|
my $reader = PDF::Make::StreamReader->new( |
|
36
|
|
|
|
|
|
|
fetch => sub { |
|
37
|
|
|
|
|
|
|
my ($offset, $length) = @_; |
|
38
|
|
|
|
|
|
|
return http_range_request($url, $offset, $length); |
|
39
|
|
|
|
|
|
|
} |
|
40
|
|
|
|
|
|
|
); |
|
41
|
|
|
|
|
|
|
$reader->read_header; |
|
42
|
|
|
|
|
|
|
say "Pages: ", $reader->page_count; |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Load pages on demand |
|
45
|
|
|
|
|
|
|
$reader->read_page(0); # First page (usually pre-loaded) |
|
46
|
|
|
|
|
|
|
$reader->read_page(5); # Triggers fetch for page 6 |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
This module provides PDF linearization support, enabling "Fast Web View" |
|
51
|
|
|
|
|
|
|
functionality per Annex F of ISO 32000-2:2020. |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Linearization reorganizes a PDF file so that: |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=over 4 |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=item * The first page can display before the entire file downloads |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item * Subsequent pages load on demand via HTTP byte-range requests |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=item * Hint tables enable efficient page offset calculation |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=back |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head1 METHODS ADDED TO PDF::Make |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head2 is_linearized |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my $bool = $doc->is_linearized; |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Returns true if the document is linearized (has Fast Web View). |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=head2 linear_params |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
my $params = $doc->linear_params; |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
Returns a hashref with linearization parameters: |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
{ |
|
80
|
|
|
|
|
|
|
version => 1, # Linearized version |
|
81
|
|
|
|
|
|
|
file_length => 123456, # Total file size |
|
82
|
|
|
|
|
|
|
hint_offset => 1234, # Hint stream offset |
|
83
|
|
|
|
|
|
|
hint_length => 567, # Hint stream length |
|
84
|
|
|
|
|
|
|
first_page_obj => 7, # First page object number |
|
85
|
|
|
|
|
|
|
first_page_end => 12345, # End of first page section |
|
86
|
|
|
|
|
|
|
page_count => 10, # Number of pages |
|
87
|
|
|
|
|
|
|
main_xref_offset => 98765, # Main xref table offset |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Returns undef if document is not linearized. |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=head2 linearize |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
$doc->linearize; |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Prepares the document for linearized output. This analyzes page dependencies |
|
97
|
|
|
|
|
|
|
and computes the optimal object ordering. |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head2 write_linearized |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
$doc->write_linearized($path); |
|
102
|
|
|
|
|
|
|
my $bytes = $doc->write_linearized; |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Writes the document in linearized format. If a path is provided, writes to |
|
105
|
|
|
|
|
|
|
that file. Otherwise returns the PDF bytes. |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=cut |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Storage for linearization state (inside-out pattern for XS objects) |
|
110
|
|
|
|
|
|
|
my %_linearize_state; |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# Add methods to PDF::Make::Document - XS provides these when available |
|
113
|
|
|
|
|
|
|
{ |
|
114
|
3
|
|
|
3
|
|
17
|
no warnings 'redefine'; |
|
|
3
|
|
|
|
|
7
|
|
|
|
3
|
|
|
|
|
1040
|
|
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# is_linearized - wraps XS _xs_is_linearized |
|
117
|
|
|
|
|
|
|
unless (defined &PDF::Make::Document::is_linearized) { |
|
118
|
7
|
|
|
7
|
|
390065
|
*PDF::Make::Document::is_linearized = sub { $_[0]->_xs_is_linearized }; |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# linear_params - wraps XS _xs_linear_params; returns undef when not linearized |
|
122
|
|
|
|
|
|
|
unless (defined &PDF::Make::Document::linear_params) { |
|
123
|
|
|
|
|
|
|
*PDF::Make::Document::linear_params = sub { |
|
124
|
4
|
|
|
4
|
|
475
|
my ($self) = @_; |
|
125
|
4
|
50
|
|
|
|
10
|
return undef unless $self->is_linearized; |
|
126
|
0
|
|
|
|
|
0
|
return $self->_xs_linear_params; |
|
127
|
|
|
|
|
|
|
}; |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# linearize - wraps XS _xs_linearize, records state for write_linearized |
|
131
|
|
|
|
|
|
|
unless (defined &PDF::Make::Document::linearize) { |
|
132
|
|
|
|
|
|
|
*PDF::Make::Document::linearize = sub { |
|
133
|
4
|
|
|
4
|
|
2457
|
my ($self) = @_; |
|
134
|
4
|
|
|
|
|
47
|
$self->_xs_linearize; |
|
135
|
4
|
|
|
|
|
20
|
$_linearize_state{"$self"} = 1; |
|
136
|
4
|
|
|
|
|
10
|
return $self; |
|
137
|
|
|
|
|
|
|
}; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# write_linearized - wraps XS _xs_write_linearized_to_path, or returns bytes |
|
141
|
|
|
|
|
|
|
unless (defined &PDF::Make::Document::write_linearized) { |
|
142
|
|
|
|
|
|
|
*PDF::Make::Document::write_linearized = sub { |
|
143
|
6
|
|
|
6
|
|
2645
|
my ($self, $path) = @_; |
|
144
|
6
|
|
|
|
|
29
|
$_linearize_state{"$self"} = 1; |
|
145
|
6
|
100
|
|
|
|
39
|
return $self->_xs_write_linearized_to_path($path) if defined $path; |
|
146
|
4
|
|
|
|
|
12
|
return $self->_write_linearized_bytes; |
|
147
|
|
|
|
|
|
|
}; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# _write_linearized_bytes - use LinearContext to produce linearized output |
|
151
|
|
|
|
|
|
|
unless (defined &PDF::Make::Document::_write_linearized_bytes) { |
|
152
|
|
|
|
|
|
|
*PDF::Make::Document::_write_linearized_bytes = sub { |
|
153
|
4
|
|
|
4
|
|
8
|
my ($self) = @_; |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Use LinearContext pipeline |
|
156
|
4
|
|
|
|
|
122
|
my $ctx = PDF::Make::LinearContext->_new($self); |
|
157
|
4
|
|
|
|
|
22
|
$ctx->analyze; |
|
158
|
4
|
|
|
|
|
12
|
$ctx->build_hints; |
|
159
|
4
|
|
|
|
|
186
|
return $ctx->write; |
|
160
|
|
|
|
|
|
|
}; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 PDF::Make::StreamReader |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Streaming reader for linearized PDFs, enabling page-on-demand loading. |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head2 new |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
my $reader = PDF::Make::StreamReader->new( |
|
171
|
|
|
|
|
|
|
fetch => sub { |
|
172
|
|
|
|
|
|
|
my ($offset, $length) = @_; |
|
173
|
|
|
|
|
|
|
# Return $length bytes starting at $offset |
|
174
|
|
|
|
|
|
|
return $data; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
); |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Creates a new streaming reader with the given fetch callback. |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head2 read_header |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
$reader->read_header; |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Reads and parses the PDF header and linearization dictionary. |
|
185
|
|
|
|
|
|
|
This is the first operation to perform. |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head2 is_linearized |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
if ($reader->is_linearized) { ... } |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Returns true if the PDF is linearized. |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head2 page_count |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
my $count = $reader->page_count; |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Returns the total number of pages. Available after C. |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head2 page_available |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
if ($reader->page_available($page_num)) { ... } |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Returns true if the given page (0-based) is loaded. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head2 read_page |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
$reader->read_page($page_num); |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Fetches and parses the given page's data. May trigger HTTP range request. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head2 page_range |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
my ($offset, $length) = $reader->page_range($page_num); |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
Returns the byte offset and length for the given page. |
|
216
|
|
|
|
|
|
|
Useful for HTTP Range header construction. |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
package PDF::Make::StreamReader; |
|
221
|
|
|
|
|
|
|
|
|
222
|
3
|
|
|
3
|
|
17
|
use strict; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
101
|
|
|
223
|
3
|
|
|
3
|
|
12
|
use warnings; |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
126
|
|
|
224
|
3
|
|
|
3
|
|
24
|
use Carp qw(croak); |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
4083
|
|
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub new { |
|
227
|
14
|
|
|
14
|
|
11157
|
my ($class, %args) = @_; |
|
228
|
|
|
|
|
|
|
|
|
229
|
14
|
100
|
|
|
|
320
|
croak "fetch callback required" unless $args{fetch}; |
|
230
|
12
|
100
|
|
|
|
242
|
croak "fetch must be a code reference" unless ref($args{fetch}) eq 'CODE'; |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
my $self = bless { |
|
233
|
|
|
|
|
|
|
fetch => $args{fetch}, |
|
234
|
10
|
|
|
|
|
54
|
is_linearized => 0, |
|
235
|
|
|
|
|
|
|
page_count => 0, |
|
236
|
|
|
|
|
|
|
params => {}, |
|
237
|
|
|
|
|
|
|
hints_loaded => 0, |
|
238
|
|
|
|
|
|
|
page_hints => [], |
|
239
|
|
|
|
|
|
|
shared_hints => [], |
|
240
|
|
|
|
|
|
|
page_loaded => {}, # page_num => 1 |
|
241
|
|
|
|
|
|
|
header_data => undef, |
|
242
|
|
|
|
|
|
|
_doc => undef, |
|
243
|
|
|
|
|
|
|
}, $class; |
|
244
|
|
|
|
|
|
|
|
|
245
|
10
|
|
|
|
|
29
|
return $self; |
|
246
|
|
|
|
|
|
|
} |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
sub read_header { |
|
249
|
8
|
|
|
8
|
|
853
|
my ($self) = @_; |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
# Fetch first 4KB |
|
252
|
8
|
|
|
|
|
11
|
my $header_size = 4096; |
|
253
|
8
|
|
|
|
|
21
|
my $data = $self->{fetch}->(0, $header_size); |
|
254
|
|
|
|
|
|
|
|
|
255
|
8
|
50
|
33
|
|
|
122
|
croak "Failed to fetch header" unless defined $data && length($data) > 0; |
|
256
|
|
|
|
|
|
|
|
|
257
|
8
|
|
|
|
|
11
|
$self->{header_data} = $data; |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
# Check for linearization |
|
260
|
8
|
100
|
|
|
|
42
|
if ($data =~ m{/Linearized\s+(\d+)}s) { |
|
261
|
6
|
|
|
|
|
24
|
$self->{is_linearized} = 1; |
|
262
|
6
|
|
|
|
|
18
|
$self->{params}{version} = $1; |
|
263
|
|
|
|
|
|
|
} |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Extract linearization parameters |
|
266
|
8
|
100
|
|
|
|
18
|
if ($self->{is_linearized}) { |
|
267
|
|
|
|
|
|
|
# /L - file length |
|
268
|
6
|
50
|
|
|
|
24
|
if ($data =~ m{/L\s+(\d+)}s) { |
|
269
|
6
|
|
|
|
|
14
|
$self->{params}{file_length} = $1; |
|
270
|
|
|
|
|
|
|
} |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# /N - page count |
|
273
|
6
|
50
|
|
|
|
17
|
if ($data =~ m{/N\s+(\d+)}s) { |
|
274
|
6
|
|
|
|
|
12
|
$self->{page_count} = $1; |
|
275
|
6
|
|
|
|
|
13
|
$self->{params}{page_count} = $1; |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# /O - first page object |
|
279
|
6
|
100
|
|
|
|
17
|
if ($data =~ m{/O\s+(\d+)}s) { |
|
280
|
5
|
|
|
|
|
18
|
$self->{params}{first_page_obj} = $1; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# /E - end of first page |
|
284
|
6
|
100
|
|
|
|
16
|
if ($data =~ m{/E\s+(\d+)}s) { |
|
285
|
5
|
|
|
|
|
10
|
$self->{params}{first_page_end} = $1; |
|
286
|
|
|
|
|
|
|
} |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# /H - hint stream [offset length] |
|
289
|
6
|
100
|
|
|
|
25
|
if ($data =~ m{/H\s*\[\s*(\d+)\s+(\d+)\s*(?:(\d+)\s+(\d+)\s*)?\]}s) { |
|
290
|
5
|
|
|
|
|
16
|
$self->{params}{hint_offset} = $1; |
|
291
|
5
|
|
|
|
|
14
|
$self->{params}{hint_length} = $2; |
|
292
|
5
|
50
|
|
|
|
12
|
$self->{params}{overflow_offset} = $3 if defined $3; |
|
293
|
5
|
50
|
|
|
|
24
|
$self->{params}{overflow_length} = $4 if defined $4; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# /T - main xref offset |
|
297
|
6
|
100
|
|
|
|
17
|
if ($data =~ m{/T\s+(\d+)}s) { |
|
298
|
5
|
|
|
|
|
9
|
$self->{params}{main_xref_offset} = $1; |
|
299
|
|
|
|
|
|
|
} |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Mark first page as loaded (it's in the header section) |
|
302
|
6
|
|
|
|
|
11
|
$self->{page_loaded}{0} = 1; |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
|
|
305
|
8
|
|
|
|
|
15
|
return $self; |
|
306
|
|
|
|
|
|
|
} |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
sub is_linearized { |
|
309
|
5
|
|
|
5
|
|
485
|
my ($self) = @_; |
|
310
|
5
|
|
|
|
|
21
|
return $self->{is_linearized}; |
|
311
|
|
|
|
|
|
|
} |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
sub page_count { |
|
314
|
4
|
|
|
4
|
|
9
|
my ($self) = @_; |
|
315
|
4
|
|
|
|
|
15
|
return $self->{page_count}; |
|
316
|
|
|
|
|
|
|
} |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
sub params { |
|
319
|
2
|
|
|
2
|
|
4
|
my ($self) = @_; |
|
320
|
2
|
|
|
|
|
3
|
return { %{$self->{params}} }; |
|
|
2
|
|
|
|
|
14
|
|
|
321
|
|
|
|
|
|
|
} |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
sub page_available { |
|
324
|
7
|
|
|
7
|
|
1676
|
my ($self, $page_num) = @_; |
|
325
|
7
|
100
|
|
|
|
31
|
return $self->{page_loaded}{$page_num} ? 1 : 0; |
|
326
|
|
|
|
|
|
|
} |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
sub load_hints { |
|
329
|
4
|
|
|
4
|
|
15
|
my ($self) = @_; |
|
330
|
|
|
|
|
|
|
|
|
331
|
4
|
100
|
|
|
|
13
|
return if $self->{hints_loaded}; |
|
332
|
3
|
100
|
|
|
|
91
|
croak "Not linearized" unless $self->{is_linearized}; |
|
333
|
|
|
|
|
|
|
|
|
334
|
2
|
|
|
|
|
4
|
my $offset = $self->{params}{hint_offset}; |
|
335
|
2
|
|
|
|
|
4
|
my $length = $self->{params}{hint_length}; |
|
336
|
|
|
|
|
|
|
|
|
337
|
2
|
50
|
33
|
|
|
9
|
croak "Hint offset/length not available" |
|
338
|
|
|
|
|
|
|
unless defined $offset && defined $length; |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# Fetch hint stream |
|
341
|
2
|
|
|
|
|
5
|
my $hint_data = $self->{fetch}->($offset, $length); |
|
342
|
2
|
50
|
33
|
|
|
13
|
croak "Failed to fetch hint stream" |
|
343
|
|
|
|
|
|
|
unless defined $hint_data && length($hint_data) >= $length; |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
# Parse hint stream |
|
346
|
2
|
|
|
|
|
9
|
$self->_parse_hint_stream($hint_data); |
|
347
|
|
|
|
|
|
|
|
|
348
|
2
|
|
|
|
|
3
|
$self->{hints_loaded} = 1; |
|
349
|
|
|
|
|
|
|
|
|
350
|
2
|
|
|
|
|
7
|
return $self; |
|
351
|
|
|
|
|
|
|
} |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
sub _parse_hint_stream { |
|
354
|
2
|
|
|
2
|
|
3
|
my ($self, $data) = @_; |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# Find stream content (skip object header and dictionary) |
|
357
|
2
|
|
|
|
|
26
|
my $stream_start = index($data, "stream"); |
|
358
|
2
|
50
|
|
|
|
8
|
return unless $stream_start >= 0; |
|
359
|
2
|
|
|
|
|
5
|
$stream_start += 6; # Skip "stream" |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# Skip newline after "stream" |
|
362
|
2
|
50
|
|
|
|
9
|
$stream_start++ if substr($data, $stream_start, 1) eq "\r"; |
|
363
|
2
|
50
|
|
|
|
6
|
$stream_start++ if substr($data, $stream_start, 1) eq "\n"; |
|
364
|
|
|
|
|
|
|
|
|
365
|
2
|
|
|
|
|
4
|
my $stream_end = rindex($data, "endstream"); |
|
366
|
2
|
50
|
|
|
|
24
|
return unless $stream_end > $stream_start; |
|
367
|
|
|
|
|
|
|
|
|
368
|
2
|
|
|
|
|
7
|
my $stream_content = substr($data, $stream_start, $stream_end - $stream_start); |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Parse page offset hint table header (§F.4.2) |
|
371
|
|
|
|
|
|
|
# First 40 bytes contain header fields |
|
372
|
2
|
50
|
|
|
|
17
|
return unless length($stream_content) >= 40; |
|
373
|
|
|
|
|
|
|
|
|
374
|
2
|
|
|
|
|
14
|
my @bytes = unpack("C*", $stream_content); |
|
375
|
2
|
|
|
|
|
15
|
my $pos = 0; |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# Item 1: Min objects per page (4 bytes) |
|
378
|
2
|
|
|
|
|
9
|
my $min_obj = ($bytes[$pos] << 24) | ($bytes[$pos+1] << 16) | |
|
379
|
|
|
|
|
|
|
($bytes[$pos+2] << 8) | $bytes[$pos+3]; |
|
380
|
2
|
|
|
|
|
4
|
$pos += 4; |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# Item 2: First page location (4 bytes) |
|
383
|
2
|
|
|
|
|
4
|
my $first_loc = ($bytes[$pos] << 24) | ($bytes[$pos+1] << 16) | |
|
384
|
|
|
|
|
|
|
($bytes[$pos+2] << 8) | $bytes[$pos+3]; |
|
385
|
2
|
|
|
|
|
3
|
$pos += 4; |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# Item 3: Bits for obj count (2 bytes) |
|
388
|
2
|
|
|
|
|
4
|
my $bits_obj = ($bytes[$pos] << 8) | $bytes[$pos+1]; |
|
389
|
2
|
|
|
|
|
3
|
$pos += 2; |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# Item 4: Min page length (4 bytes) |
|
392
|
2
|
|
|
|
|
4
|
my $min_len = ($bytes[$pos] << 24) | ($bytes[$pos+1] << 16) | |
|
393
|
|
|
|
|
|
|
($bytes[$pos+2] << 8) | $bytes[$pos+3]; |
|
394
|
2
|
|
|
|
|
2
|
$pos += 4; |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# Item 5: Bits for page length (2 bytes) |
|
397
|
2
|
|
|
|
|
5
|
my $bits_len = ($bytes[$pos] << 8) | $bytes[$pos+1]; |
|
398
|
2
|
|
|
|
|
3
|
$pos += 2; |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# Store parsed values |
|
401
|
|
|
|
|
|
|
$self->{hint_header} = { |
|
402
|
2
|
|
|
|
|
12
|
min_obj_count => $min_obj, |
|
403
|
|
|
|
|
|
|
first_page_loc => $first_loc, |
|
404
|
|
|
|
|
|
|
bits_obj_count => $bits_obj, |
|
405
|
|
|
|
|
|
|
min_page_length => $min_len, |
|
406
|
|
|
|
|
|
|
bits_page_len => $bits_len, |
|
407
|
|
|
|
|
|
|
}; |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# Continue parsing per-page data... |
|
410
|
|
|
|
|
|
|
# (Simplified for now) |
|
411
|
|
|
|
|
|
|
|
|
412
|
2
|
|
|
|
|
5
|
return 1; |
|
413
|
|
|
|
|
|
|
} |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
sub read_page { |
|
416
|
5
|
|
|
5
|
|
2198
|
my ($self, $page_num) = @_; |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
croak "Invalid page number" |
|
419
|
5
|
100
|
100
|
|
|
328
|
if $page_num < 0 || $page_num >= $self->{page_count}; |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
# Already loaded? |
|
422
|
2
|
100
|
|
|
|
6
|
return $self if $self->{page_loaded}{$page_num}; |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# Need hints for page ranges |
|
425
|
1
|
50
|
|
|
|
4
|
$self->load_hints unless $self->{hints_loaded}; |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# Get page byte range |
|
428
|
1
|
|
|
|
|
3
|
my ($offset, $length) = $self->page_range($page_num); |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Fetch page data |
|
431
|
1
|
|
|
|
|
3
|
my $page_data = $self->{fetch}->($offset, $length); |
|
432
|
1
|
50
|
33
|
|
|
13
|
croak "Failed to fetch page $page_num" |
|
433
|
|
|
|
|
|
|
unless defined $page_data && length($page_data) > 0; |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# Parse page objects |
|
436
|
|
|
|
|
|
|
# (In real implementation, would parse and add to document) |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Mark page as loaded |
|
439
|
1
|
|
|
|
|
2
|
$self->{page_loaded}{$page_num} = 1; |
|
440
|
|
|
|
|
|
|
|
|
441
|
1
|
|
|
|
|
2
|
return $self; |
|
442
|
|
|
|
|
|
|
} |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
sub page_range { |
|
445
|
4
|
|
|
4
|
|
988
|
my ($self, $page_num) = @_; |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
croak "Invalid page number" |
|
448
|
4
|
100
|
66
|
|
|
105
|
if $page_num < 0 || $page_num >= $self->{page_count}; |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# Load hints if needed |
|
451
|
3
|
50
|
|
|
|
7
|
$self->load_hints unless $self->{hints_loaded}; |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# Calculate offset from hint data |
|
454
|
3
|
|
|
|
|
3
|
my $header = $self->{hint_header}; |
|
455
|
3
|
50
|
|
|
|
4
|
return (0, 0) unless $header; |
|
456
|
|
|
|
|
|
|
|
|
457
|
3
|
|
|
|
|
5
|
my $offset = $header->{first_page_loc}; |
|
458
|
3
|
|
|
|
|
3
|
my $length = $header->{min_page_length}; |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# Add deltas for pages before this one |
|
461
|
3
|
|
|
|
|
7
|
for my $i (0 .. $page_num - 1) { |
|
462
|
3
|
|
|
|
|
4
|
my $hint = $self->{page_hints}[$i]; |
|
463
|
3
|
50
|
|
|
|
18
|
if ($hint) { |
|
464
|
0
|
|
|
|
|
0
|
$offset += $hint->{page_length}; |
|
465
|
|
|
|
|
|
|
} else { |
|
466
|
3
|
|
|
|
|
3
|
$offset += $length; # Use min length as estimate |
|
467
|
|
|
|
|
|
|
} |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
# Get this page's length |
|
471
|
3
|
|
|
|
|
4
|
my $page_hint = $self->{page_hints}[$page_num]; |
|
472
|
3
|
50
|
|
|
|
6
|
if ($page_hint) { |
|
473
|
0
|
|
|
|
|
0
|
$length = $page_hint->{page_length}; |
|
474
|
|
|
|
|
|
|
} |
|
475
|
|
|
|
|
|
|
|
|
476
|
3
|
|
|
|
|
7
|
return ($offset, $length); |
|
477
|
|
|
|
|
|
|
} |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
sub doc { |
|
480
|
1
|
|
|
1
|
|
2
|
my ($self) = @_; |
|
481
|
1
|
|
|
|
|
3
|
return $self->{_doc}; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=head1 LINEARIZATION STRUCTURE |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
A linearized PDF has this structure: |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
┌─────────────────────────────────────┐ |
|
489
|
|
|
|
|
|
|
│ Header (%PDF-2.0) │ |
|
490
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
491
|
|
|
|
|
|
|
│ Linearization dictionary (obj 1) │ |
|
492
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
493
|
|
|
|
|
|
|
│ First page xref (partial) │ |
|
494
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
495
|
|
|
|
|
|
|
│ Document catalog, pages tree root │ |
|
496
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
497
|
|
|
|
|
|
|
│ First page objects │ |
|
498
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
499
|
|
|
|
|
|
|
│ Hint stream │ |
|
500
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
501
|
|
|
|
|
|
|
│ Remaining pages (2..N) │ |
|
502
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
503
|
|
|
|
|
|
|
│ Shared objects │ |
|
504
|
|
|
|
|
|
|
├─────────────────────────────────────┤ |
|
505
|
|
|
|
|
|
|
│ Main xref + trailer │ |
|
506
|
|
|
|
|
|
|
└─────────────────────────────────────┘ |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
L, ISO 32000-2:2020 Annex F (Linearized PDF) |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=cut |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
1; |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
__END__ |