File Coverage

blib/lib/PDF/Make/Linearization.pm
Criterion Covered Total %
statement 138 141 97.8
branch 49 68 72.0
condition 9 18 50.0
subroutine 23 23 100.0
pod n/a
total 219 250 87.6


line stmt bran cond sub pod time code
1             package PDF::Make::Linearization;
2              
3 3     3   182212 use strict;
  3         3  
  3         117  
4 3     3   14 use warnings;
  3         4  
  3         148  
5 3     3   699 use PDF::Make;
  3         9  
  3         191  
6              
7             our $VERSION = '0.03';
8              
9             =head1 NAME
10              
11             PDF::Make::Linearization - PDF Linearization (Fast Web View) support
12              
13             =head1 SYNOPSIS
14              
15             use PDF::Make;
16             use PDF::Make::Linearization;
17              
18             # Check if a PDF is linearized
19             my $doc = PDF::Make->open('document.pdf');
20             if ($doc->is_linearized) {
21             my $params = $doc->linear_params;
22             say "Fast Web View: Yes";
23             say "Pages: $params->{page_count}";
24             say "First page ends at byte: $params->{first_page_end}";
25             }
26              
27             # Create a linearized PDF
28             my $pdf = PDF::Make->new;
29             $pdf->page->text(100, 700, "Page 1");
30             $pdf->page->text(100, 700, "Page 2");
31             $pdf->finalize;
32             $pdf->write_linearized('optimized.pdf');
33              
34             # Streaming reader for HTTP byte-range requests
35             my $reader = PDF::Make::StreamReader->new(
36             fetch => sub {
37             my ($offset, $length) = @_;
38             return http_range_request($url, $offset, $length);
39             }
40             );
41             $reader->read_header;
42             say "Pages: ", $reader->page_count;
43              
44             # Load pages on demand
45             $reader->read_page(0); # First page (usually pre-loaded)
46             $reader->read_page(5); # Triggers fetch for page 6
47              
48             =head1 DESCRIPTION
49              
50             This module provides PDF linearization support, enabling "Fast Web View"
51             functionality per Annex F of ISO 32000-2:2020.
52              
53             Linearization reorganizes a PDF file so that:
54              
55             =over 4
56              
57             =item * The first page can display before the entire file downloads
58              
59             =item * Subsequent pages load on demand via HTTP byte-range requests
60              
61             =item * Hint tables enable efficient page offset calculation
62              
63             =back
64              
65             =head1 METHODS ADDED TO PDF::Make
66              
67             =head2 is_linearized
68              
69             my $bool = $doc->is_linearized;
70              
71             Returns true if the document is linearized (has Fast Web View).
72              
73             =head2 linear_params
74              
75             my $params = $doc->linear_params;
76              
77             Returns a hashref with linearization parameters:
78              
79             {
80             version => 1, # Linearized version
81             file_length => 123456, # Total file size
82             hint_offset => 1234, # Hint stream offset
83             hint_length => 567, # Hint stream length
84             first_page_obj => 7, # First page object number
85             first_page_end => 12345, # End of first page section
86             page_count => 10, # Number of pages
87             main_xref_offset => 98765, # Main xref table offset
88             }
89              
90             Returns undef if document is not linearized.
91              
92             =head2 linearize
93              
94             $doc->linearize;
95              
96             Prepares the document for linearized output. This analyzes page dependencies
97             and computes the optimal object ordering.
98              
99             =head2 write_linearized
100              
101             $doc->write_linearized($path);
102             my $bytes = $doc->write_linearized;
103              
104             Writes the document in linearized format. If a path is provided, writes to
105             that file. Otherwise returns the PDF bytes.
106              
107             =cut
108              
109             # Storage for linearization state (inside-out pattern for XS objects)
110             my %_linearize_state;
111              
112             # Add methods to PDF::Make::Document - XS provides these when available
113             {
114 3     3   17 no warnings 'redefine';
  3         7  
  3         1040  
115            
116             # is_linearized - wraps XS _xs_is_linearized
117             unless (defined &PDF::Make::Document::is_linearized) {
118 7     7   390065 *PDF::Make::Document::is_linearized = sub { $_[0]->_xs_is_linearized };
119             }
120              
121             # linear_params - wraps XS _xs_linear_params; returns undef when not linearized
122             unless (defined &PDF::Make::Document::linear_params) {
123             *PDF::Make::Document::linear_params = sub {
124 4     4   475 my ($self) = @_;
125 4 50       10 return undef unless $self->is_linearized;
126 0         0 return $self->_xs_linear_params;
127             };
128             }
129              
130             # linearize - wraps XS _xs_linearize, records state for write_linearized
131             unless (defined &PDF::Make::Document::linearize) {
132             *PDF::Make::Document::linearize = sub {
133 4     4   2457 my ($self) = @_;
134 4         47 $self->_xs_linearize;
135 4         20 $_linearize_state{"$self"} = 1;
136 4         10 return $self;
137             };
138             }
139              
140             # write_linearized - wraps XS _xs_write_linearized_to_path, or returns bytes
141             unless (defined &PDF::Make::Document::write_linearized) {
142             *PDF::Make::Document::write_linearized = sub {
143 6     6   2645 my ($self, $path) = @_;
144 6         29 $_linearize_state{"$self"} = 1;
145 6 100       39 return $self->_xs_write_linearized_to_path($path) if defined $path;
146 4         12 return $self->_write_linearized_bytes;
147             };
148             }
149            
150             # _write_linearized_bytes - use LinearContext to produce linearized output
151             unless (defined &PDF::Make::Document::_write_linearized_bytes) {
152             *PDF::Make::Document::_write_linearized_bytes = sub {
153 4     4   8 my ($self) = @_;
154              
155             # Use LinearContext pipeline
156 4         122 my $ctx = PDF::Make::LinearContext->_new($self);
157 4         22 $ctx->analyze;
158 4         12 $ctx->build_hints;
159 4         186 return $ctx->write;
160             };
161             }
162             }
163              
164             =head1 PDF::Make::StreamReader
165              
166             Streaming reader for linearized PDFs, enabling page-on-demand loading.
167              
168             =head2 new
169              
170             my $reader = PDF::Make::StreamReader->new(
171             fetch => sub {
172             my ($offset, $length) = @_;
173             # Return $length bytes starting at $offset
174             return $data;
175             }
176             );
177              
178             Creates a new streaming reader with the given fetch callback.
179              
180             =head2 read_header
181              
182             $reader->read_header;
183              
184             Reads and parses the PDF header and linearization dictionary.
185             This is the first operation to perform.
186              
187             =head2 is_linearized
188              
189             if ($reader->is_linearized) { ... }
190              
191             Returns true if the PDF is linearized.
192              
193             =head2 page_count
194              
195             my $count = $reader->page_count;
196              
197             Returns the total number of pages. Available after C<read_header>.
198              
199             =head2 page_available
200              
201             if ($reader->page_available($page_num)) { ... }
202              
203             Returns true if the given page (0-based) is loaded.
204              
205             =head2 read_page
206              
207             $reader->read_page($page_num);
208              
209             Fetches and parses the given page's data. May trigger an HTTP range request.
210              
211             =head2 page_range
212              
213             my ($offset, $length) = $reader->page_range($page_num);
214              
215             Returns the byte offset and length for the given page.
216             Useful for HTTP Range header construction.
217              
218             =cut
219              
220             package PDF::Make::StreamReader;
221              
222 3     3   17 use strict;
  3         4  
  3         101  
223 3     3   12 use warnings;
  3         5  
  3         126  
224 3     3   24 use Carp qw(croak);
  3         3  
  3         4083  
225              
226             sub new {
227 14     14   11157 my ($class, %args) = @_;
228            
229 14 100       320 croak "fetch callback required" unless $args{fetch};
230 12 100       242 croak "fetch must be a code reference" unless ref($args{fetch}) eq 'CODE';
231            
232             my $self = bless {
233             fetch => $args{fetch},
234 10         54 is_linearized => 0,
235             page_count => 0,
236             params => {},
237             hints_loaded => 0,
238             page_hints => [],
239             shared_hints => [],
240             page_loaded => {}, # page_num => 1
241             header_data => undef,
242             _doc => undef,
243             }, $class;
244            
245 10         29 return $self;
246             }
247              
248             sub read_header {
249 8     8   853 my ($self) = @_;
250            
251             # Fetch first 4KB
252 8         11 my $header_size = 4096;
253 8         21 my $data = $self->{fetch}->(0, $header_size);
254            
255 8 50 33     122 croak "Failed to fetch header" unless defined $data && length($data) > 0;
256            
257 8         11 $self->{header_data} = $data;
258            
259             # Check for linearization
260 8 100       42 if ($data =~ m{/Linearized\s+(\d+)}s) {
261 6         24 $self->{is_linearized} = 1;
262 6         18 $self->{params}{version} = $1;
263             }
264            
265             # Extract linearization parameters
266 8 100       18 if ($self->{is_linearized}) {
267             # /L - file length
268 6 50       24 if ($data =~ m{/L\s+(\d+)}s) {
269 6         14 $self->{params}{file_length} = $1;
270             }
271            
272             # /N - page count
273 6 50       17 if ($data =~ m{/N\s+(\d+)}s) {
274 6         12 $self->{page_count} = $1;
275 6         13 $self->{params}{page_count} = $1;
276             }
277            
278             # /O - first page object
279 6 100       17 if ($data =~ m{/O\s+(\d+)}s) {
280 5         18 $self->{params}{first_page_obj} = $1;
281             }
282            
283             # /E - end of first page
284 6 100       16 if ($data =~ m{/E\s+(\d+)}s) {
285 5         10 $self->{params}{first_page_end} = $1;
286             }
287            
288             # /H - hint stream [offset length]
289 6 100       25 if ($data =~ m{/H\s*\[\s*(\d+)\s+(\d+)\s*(?:(\d+)\s+(\d+)\s*)?\]}s) {
290 5         16 $self->{params}{hint_offset} = $1;
291 5         14 $self->{params}{hint_length} = $2;
292 5 50       12 $self->{params}{overflow_offset} = $3 if defined $3;
293 5 50       24 $self->{params}{overflow_length} = $4 if defined $4;
294             }
295            
296             # /T - main xref offset
297 6 100       17 if ($data =~ m{/T\s+(\d+)}s) {
298 5         9 $self->{params}{main_xref_offset} = $1;
299             }
300            
301             # Mark first page as loaded (it's in the header section)
302 6         11 $self->{page_loaded}{0} = 1;
303             }
304            
305 8         15 return $self;
306             }
307              
308             sub is_linearized {
309 5     5   485 my ($self) = @_;
310 5         21 return $self->{is_linearized};
311             }
312              
313             sub page_count {
314 4     4   9 my ($self) = @_;
315 4         15 return $self->{page_count};
316             }
317              
318             sub params {
319 2     2   4 my ($self) = @_;
320 2         3 return { %{$self->{params}} };
  2         14  
321             }
322              
323             sub page_available {
324 7     7   1676 my ($self, $page_num) = @_;
325 7 100       31 return $self->{page_loaded}{$page_num} ? 1 : 0;
326             }
327              
328             sub load_hints {
329 4     4   15 my ($self) = @_;
330            
331 4 100       13 return if $self->{hints_loaded};
332 3 100       91 croak "Not linearized" unless $self->{is_linearized};
333            
334 2         4 my $offset = $self->{params}{hint_offset};
335 2         4 my $length = $self->{params}{hint_length};
336            
337 2 50 33     9 croak "Hint offset/length not available"
338             unless defined $offset && defined $length;
339            
340             # Fetch hint stream
341 2         5 my $hint_data = $self->{fetch}->($offset, $length);
342 2 50 33     13 croak "Failed to fetch hint stream"
343             unless defined $hint_data && length($hint_data) >= $length;
344            
345             # Parse hint stream
346 2         9 $self->_parse_hint_stream($hint_data);
347            
348 2         3 $self->{hints_loaded} = 1;
349            
350 2         7 return $self;
351             }
352              
353             sub _parse_hint_stream {
354 2     2   3 my ($self, $data) = @_;
355            
356             # Find stream content (skip object header and dictionary)
357 2         26 my $stream_start = index($data, "stream");
358 2 50       8 return unless $stream_start >= 0;
359 2         5 $stream_start += 6; # Skip "stream"
360            
361             # Skip newline after "stream"
362 2 50       9 $stream_start++ if substr($data, $stream_start, 1) eq "\r";
363 2 50       6 $stream_start++ if substr($data, $stream_start, 1) eq "\n";
364            
365 2         4 my $stream_end = rindex($data, "endstream");
366 2 50       24 return unless $stream_end > $stream_start;
367            
368 2         7 my $stream_content = substr($data, $stream_start, $stream_end - $stream_start);
369            
370             # Parse page offset hint table header (§F.4.2)
371             # First 40 bytes contain header fields
372 2 50       17 return unless length($stream_content) >= 40;
373            
374 2         14 my @bytes = unpack("C*", $stream_content);
375 2         15 my $pos = 0;
376            
377             # Item 1: Min objects per page (4 bytes)
378 2         9 my $min_obj = ($bytes[$pos] << 24) | ($bytes[$pos+1] << 16) |
379             ($bytes[$pos+2] << 8) | $bytes[$pos+3];
380 2         4 $pos += 4;
381            
382             # Item 2: First page location (4 bytes)
383 2         4 my $first_loc = ($bytes[$pos] << 24) | ($bytes[$pos+1] << 16) |
384             ($bytes[$pos+2] << 8) | $bytes[$pos+3];
385 2         3 $pos += 4;
386            
387             # Item 3: Bits for obj count (2 bytes)
388 2         4 my $bits_obj = ($bytes[$pos] << 8) | $bytes[$pos+1];
389 2         3 $pos += 2;
390            
391             # Item 4: Min page length (4 bytes)
392 2         4 my $min_len = ($bytes[$pos] << 24) | ($bytes[$pos+1] << 16) |
393             ($bytes[$pos+2] << 8) | $bytes[$pos+3];
394 2         2 $pos += 4;
395            
396             # Item 5: Bits for page length (2 bytes)
397 2         5 my $bits_len = ($bytes[$pos] << 8) | $bytes[$pos+1];
398 2         3 $pos += 2;
399            
400             # Store parsed values
401             $self->{hint_header} = {
402 2         12 min_obj_count => $min_obj,
403             first_page_loc => $first_loc,
404             bits_obj_count => $bits_obj,
405             min_page_length => $min_len,
406             bits_page_len => $bits_len,
407             };
408            
409             # Continue parsing per-page data...
410             # (Simplified for now)
411            
412 2         5 return 1;
413             }
414              
415             sub read_page {
416 5     5   2198 my ($self, $page_num) = @_;
417            
418             croak "Invalid page number"
419 5 100 100     328 if $page_num < 0 || $page_num >= $self->{page_count};
420            
421             # Already loaded?
422 2 100       6 return $self if $self->{page_loaded}{$page_num};
423            
424             # Need hints for page ranges
425 1 50       4 $self->load_hints unless $self->{hints_loaded};
426            
427             # Get page byte range
428 1         3 my ($offset, $length) = $self->page_range($page_num);
429            
430             # Fetch page data
431 1         3 my $page_data = $self->{fetch}->($offset, $length);
432 1 50 33     13 croak "Failed to fetch page $page_num"
433             unless defined $page_data && length($page_data) > 0;
434            
435             # Parse page objects
436             # (In real implementation, would parse and add to document)
437            
438             # Mark page as loaded
439 1         2 $self->{page_loaded}{$page_num} = 1;
440            
441 1         2 return $self;
442             }
443              
444             sub page_range {
445 4     4   988 my ($self, $page_num) = @_;
446            
447             croak "Invalid page number"
448 4 100 66     105 if $page_num < 0 || $page_num >= $self->{page_count};
449            
450             # Load hints if needed
451 3 50       7 $self->load_hints unless $self->{hints_loaded};
452            
453             # Calculate offset from hint data
454 3         3 my $header = $self->{hint_header};
455 3 50       4 return (0, 0) unless $header;
456            
457 3         5 my $offset = $header->{first_page_loc};
458 3         3 my $length = $header->{min_page_length};
459            
460             # Add deltas for pages before this one
461 3         7 for my $i (0 .. $page_num - 1) {
462 3         4 my $hint = $self->{page_hints}[$i];
463 3 50       18 if ($hint) {
464 0         0 $offset += $hint->{page_length};
465             } else {
466 3         3 $offset += $length; # Use min length as estimate
467             }
468             }
469            
470             # Get this page's length
471 3         4 my $page_hint = $self->{page_hints}[$page_num];
472 3 50       6 if ($page_hint) {
473 0         0 $length = $page_hint->{page_length};
474             }
475            
476 3         7 return ($offset, $length);
477             }
478              
479             sub doc {
480 1     1   2 my ($self) = @_;
481 1         3 return $self->{_doc};
482             }
483              
484             =head1 LINEARIZATION STRUCTURE
485              
486             A linearized PDF has this structure:
487              
488             ┌─────────────────────────────────────┐
489             │ Header (%PDF-2.0) │
490             ├─────────────────────────────────────┤
491             │ Linearization dictionary (obj 1) │
492             ├─────────────────────────────────────┤
493             │ First page xref (partial) │
494             ├─────────────────────────────────────┤
495             │ Document catalog, pages tree root │
496             ├─────────────────────────────────────┤
497             │ First page objects │
498             ├─────────────────────────────────────┤
499             │ Hint stream │
500             ├─────────────────────────────────────┤
501             │ Remaining pages (2..N) │
502             ├─────────────────────────────────────┤
503             │ Shared objects │
504             ├─────────────────────────────────────┤
505             │ Main xref + trailer │
506             └─────────────────────────────────────┘
507              
508             =head1 SEE ALSO
509              
510             L<PDF::Make>, ISO 32000-2:2020 Annex F (Linearized PDF)
511              
512             =cut
513              
514             1;
515              
516             __END__