File Coverage

blib/lib/PDF/API2/Basic/PDF/File.pm
Criterion Covered Total %
statement 545 721 75.5
branch 188 324 58.0
condition 41 90 45.5
subroutine 45 48 93.7
pod 27 27 100.0
total 846 1210 69.9


line stmt bran cond sub pod time code
1             # Code in the PDF::API2::Basic::PDF namespace was originally copied from the
2             # Text::PDF distribution.
3             #
4             # Copyright Martin Hosken
5             #
6             # Martin Hosken's code may be used under the terms of the MIT license.
7             # Subsequent versions of the code have the same license as PDF::API2.
8              
9             package PDF::API2::Basic::PDF::File;
10              
11 40     40   101720 use strict;
  40         91  
  40         3468  
12              
13             our $VERSION = '2.048'; # VERSION
14              
15             =head1 NAME
16              
17             PDF::API2::Basic::PDF::File - Low-level PDF file access
18              
19             =head1 SYNOPSIS
20              
21             $p = PDF::API2::Basic::PDF::File->open("filename.pdf", 1);
22             $p->new_obj($obj_ref);
23             $p->free_obj($obj_ref);
24             $p->append_file;
25             $p->close_file;
26             $p->release; # IMPORTANT!
27              
28             =head1 DESCRIPTION
29              
30             This class keeps track of the directory aspects of a PDF file. There are two
31             parts to the directory: the main directory object which is the parent to all
32             other objects and a chain of cross-reference tables and corresponding trailer
33             dictionaries starting with the main directory object.
34              
35             =head1 INSTANCE VARIABLES
36              
37             Within this class hierarchy, rather than making everything visible via methods,
38             which would be a lot of work, there are various instance variables which are
39             accessible via associative array referencing. To distinguish instance variables
40             from content variables (which may come from the PDF content itself), each such
41             variable will start with a space.
42              
43             Variables which do not start with a space directly reflect elements in a PDF
44             dictionary. In the case of a PDF::API2::Basic::PDF::File, the elements reflect those in the
45             trailer dictionary.
46              
47             Since some variables are not designed for class users to access, variables are
48             marked in the documentation with (R) to indicate that such an entry should only
49             be used as read-only information. (P) indicates that the information is private
50             and not designed for user use at all, but is included in the documentation for
51             completeness and to ensure that nobody else tries to use it.
52              
53             =over
54              
55             =item newroot
56              
57             This variable allows the user to create a new root entry to occur in the trailer
58             dictionary which is output when the file is written or appended. If you wish to
59             over-ride the root element in the dictionary you have, use this entry to indicate
60             that without losing the current Root entry. Notice that newroot should point to
61             a PDF level object and not just to a dictionary which does not have object status.
62              
63             =item INFILE (R)
64              
65             Contains the filehandle used to read this information into this PDF directory. Is
66             an IO object.
67              
68             =item fname (R)
69              
70             This is the filename which is reflected by INFILE, or the original IO object passed
71             in.
72              
73             =item update (R)
74              
75             This indicates that the read file has been opened for update and that at some
76             point, $p->appendfile() can be called to update the file with the changes that
77             have been made to the memory representation.
78              
79             =item maxobj (R)
80              
81             Contains the first usable object number above any that have already appeared
82             in the file so far.
83              
84             =item outlist (P)
85              
86             This is a list of Objind which are to be output when the next appendfile or outfile
87             occurs.
88              
89             =item firstfree (P)
90              
91             Contains the first free object in the free object list. Free objects are removed
92             from the front of the list and added to the end.
93              
94             =item lastfree (P)
95              
96             Contains the last free object in the free list. It may be the same as the firstfree
97             if there is only one free object.
98              
99             =item objcache (P)
100              
101             All objects are held in the cache to ensure that a system only has one occurrence of
102             each object. In effect, the objind class acts as a container type class to hold the
103             PDF object structure and it would be unfortunate if there were two identical
104             place-holders floating around a system.
105              
106             =item epos (P)
107              
108             The end location of the read-file.
109              
110             =back
111              
112             Each trailer dictionary contains a number of private instance variables which
113             hold the chain together.
114              
115             =over
116              
117             =item loc (P)
118              
119             Contains the location of the start of the cross-reference table preceding the
120             trailer.
121              
122             =item xref (P)
123              
124             Contains an anonymous array of each cross-reference table entry.
125              
126             =item prev (P)
127              
128             A reference to the previous table. Note this differs from the Prev entry which
129             is in PDF which contains the location of the previous cross-reference table.
130              
131             =back
132              
133             =head1 METHODS
134              
135             =cut
136              
137 40     40   260 use Scalar::Util qw(blessed weaken);
  40         76  
  40         2689  
138              
139 40     40   233 use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types);
  40         76  
  40         8382  
140              
141             $ws_char = '[ \t\r\n\f\0]';
142             $delim_char = '[][<>{}()/%]';
143             $reg_char = '[^][<>{}()/% \t\r\n\f\0]';
144             $irreg_char = '[][<>{}()/% \t\r\n\f\0]';
145             $cr = '\s*(?:\015|\012|(?:\015\012))';
146              
147             my $re_comment = qr/(?:\%[^\r\n]*)/;
148             my $re_whitespace = qr/(?:[ \t\r\n\f\0]|$re_comment)/;
149              
150             %types = (
151             'Page' => 'PDF::API2::Basic::PDF::Page',
152             'Pages' => 'PDF::API2::Basic::PDF::Pages',
153             );
154              
155             my $readDebug = 0;
156              
157 40     40   325 use Carp;
  40         79  
  40         5877  
158 40     40   653 use IO::File;
  40         7449  
  40         6499  
159              
160             # Now for the basic PDF types
161 40     40   737 use PDF::API2::Basic::PDF::Utils;
  40         79  
  40         3341  
162              
163 40     40   228 use PDF::API2::Basic::PDF::Array;
  40         76  
  40         1193  
164 40     40   195 use PDF::API2::Basic::PDF::Bool;
  40         94  
  40         955  
165 40     40   179 use PDF::API2::Basic::PDF::Dict;
  40         78  
  40         954  
166 40     40   173 use PDF::API2::Basic::PDF::Name;
  40         72  
  40         917  
167 40     40   166 use PDF::API2::Basic::PDF::Number;
  40         70  
  40         934  
168 40     40   166 use PDF::API2::Basic::PDF::Objind;
  40         76  
  40         1304  
169 40     40   179 use PDF::API2::Basic::PDF::String;
  40         76  
  40         877  
170 40     40   21006 use PDF::API2::Basic::PDF::Page;
  40         131  
  40         1531  
171 40     40   250 use PDF::API2::Basic::PDF::Pages;
  40         90  
  40         919  
172 40     40   190 use PDF::API2::Basic::PDF::Null;
  40         79  
  40         1208  
173 40     40   203 use POSIX qw(ceil floor);
  40         75  
  40         372  
174              
175 40     40   3852 no warnings qw[ deprecated recursion uninitialized ];
  40         103  
  40         55751  
176              
177              
178             =head2 PDF::API2::Basic::PDF::File->new
179              
180             Creates a new, empty file object which can act as the host to other PDF objects.
181             Since there is no file associated with this object, it is assumed that the
182             object is created in readiness for creating a new PDF file.
183              
184             =cut
185              
186             sub new {
187 164     164 1 489 my ($class, $root) = @_;
188 164         852 my $self = $class->_new();
189              
190 164 50       712 unless ($root) {
191 164         711 $root = PDFDict();
192 164         617 $root->{'Type'} = PDFName('Catalog');
193             }
194 164         825 $self->new_obj($root);
195 164         443 $self->{'Root'} = $root;
196 164         758 return $self;
197             }
198              
199              
200             =head2 $p = PDF::API2::Basic::PDF::File->open($filename, $update)
201              
202             Opens the file and reads all the trailers and cross reference tables to build
203             a complete directory of objects.
204              
205             $update specifies whether this file is being opened for updating and editing,
206             or simply to be read.
207              
208             $filename may be an IO object
209              
210             =cut
211              
212             sub open {
213 16     16 1 56 my ($class, $filename, $update) = @_;
214 16         49 my ($fh, $buffer);
215              
216 16         79 my $self = $class->_new();
217 16 100       69 if (ref $filename) {
218 8         31 $self->{' INFILE'} = $filename;
219 8 50       24 if ($update) {
220 8         24 $self->{' update'} = 1;
221 8         22 $self->{' OUTFILE'} = $filename;
222             }
223 8         147 $fh = $filename;
224             }
225             else {
226 8 50       90 die "File '$filename' does not exist" unless -f $filename;
227 8 50       95 die "File '$filename' is not readable" unless -r $filename;
228 8 50       40 if ($update) {
229 8 50       79 die "File '$filename' is not writable" unless -w $filename;
230             }
231 8   50     120 $fh = IO::File->new(($update ? '+' : '') . "<$filename")
232             || die "Error opening '$filename': $!";
233 8         1100 $self->{' INFILE'} = $fh;
234 8 50       26 if ($update) {
235 8         26 $self->{' update'} = 1;
236 8         22 $self->{' OUTFILE'} = $fh;
237 8         25 $self->{' fname'} = $filename;
238             }
239             }
240 16         89 binmode $fh, ':raw';
241 16         198 $fh->seek(0, 0); # go to start of file
242 16         218 $fh->read($buffer, 255);
243 16 50       1630 unless ($buffer =~ /^\%PDF\-([12]\.\d+)\s*$cr/m) {
244 0         0 croak "$filename does not appear to be a valid PDF";
245             }
246 16         94 $self->{' version'} = $1;
247              
248 16         76 $fh->seek(0, 2); # go to end of file
249 16         255 my $end = $fh->tell();
250 16         134 $self->{' epos'} = $end;
251 16         73 foreach my $offset (1..64) {
252 32         151 $fh->seek($end - 16 * $offset, 0);
253 32         276 $fh->read($buffer, 16 * $offset);
254 32 100       4702 last if $buffer =~ m/startxref($cr|\s*)\d+($cr|\s*)\%\%eof.*?/i;
255             }
256 16 50       896 unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr|\s*)\%\%eof.*?/i) {
257 0         0 die "Malformed PDF file $filename";
258             }
259 16         75 my $xpos = $1;
260 16         62 $self->{' xref_position'} = $xpos;
261              
262 16         97 my $tdict = $self->readxrtr($xpos, $self);
263 16         108 foreach my $key (keys %$tdict) {
264 121         282 $self->{$key} = $tdict->{$key};
265             }
266 16         186 return $self;
267             }
268              
269             =head2 $p->version($version)
270              
271             Gets/sets the PDF version (e.g. 1.4)
272              
273             =cut
274              
275             sub version {
276 9     9 1 12 my $self = shift();
277              
278 9 100       13 if (@_) {
279 2         3 my $version = shift();
280 2 50       12 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
281 2         5 $self->header_version($version);
282 2 50       6 if ($version >= 1.4) {
283 2         4 $self->trailer_version($version);
284             }
285             else {
286 0         0 delete $self->{'Root'}->{'Version'};
287 0         0 $self->out_obj($self->{'Root'});
288             }
289 2         5 return $version;
290             }
291              
292 7         11 my $header_version = $self->header_version();
293 7         12 my $trailer_version = $self->trailer_version();
294 7 100       41 return $trailer_version if $trailer_version > $header_version;
295 3         9 return $header_version;
296             }
297              
298             =head2 $version = $p->header_version($version)
299              
300             Gets/sets the PDF version stored in the file header.
301              
302             =cut
303              
304             sub header_version {
305 14     14 1 29 my $self = shift();
306              
307 14 100       20 if (@_) {
308 5         8 my $version = shift();
309 5 50       22 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
310 5         10 $self->{' version'} = $version;
311             }
312              
313 14         26 return $self->{' version'};
314             }
315              
316             =head2 $version = $p->trailer_version($version)
317              
318             Gets/sets the PDF version stored in the document catalog.
319              
320             =cut
321              
322             sub trailer_version {
323 12     12 1 21 my $self = shift();
324              
325 12 100       18 if (@_) {
326 4         5 my $version = shift();
327 4 50       13 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
328 4         11 $self->{'Root'}->{'Version'} = PDFName($version);
329 4         10 $self->out_obj($self->{'Root'});
330 4         5 return $version;
331             }
332              
333 8 50       14 return unless $self->{'Root'}->{'Version'};
334 8         26 $self->{'Root'}->{'Version'}->realise();
335 8         19 return $self->{'Root'}->{'Version'}->val();
336             }
337              
338             =head2 $prev_version = $p->require_version($version)
339              
340             Ensures that the PDF version is at least C<$version>.
341              
342             =cut
343              
344             sub require_version {
345 3     3 1 1041 my ($self, $min_version) = @_;
346 3         6 my $current_version = $self->version();
347 3 100       10 $self->version($min_version) if $current_version < $min_version;
348 3         5 return $current_version;
349             }
350              
351             =head2 $p->release()
352              
353             Releases ALL of the memory used by the PDF document and all of its
354             component objects. After calling this method, do B expect to
355             have anything left in the C object (so if
356             you need to save, be sure to do it before calling this method).
357              
358             B, that it is important that you call this method on any
359             C object when you wish to destruct it and
360             free up its memory. Internally, PDF files have an enormous number of
361             cross-references and this causes circular references within the
362             internal data structures. Calling 'C' forces a brute-force
363             cleanup of the data structures, freeing up all of the memory. Once
364             you've called this method, though, don't expect to be able to do
365             anything else with the C object; it'll
366             have B internal state whatsoever.
367              
368             =cut
369              
370             # Maintainer's Question: Couldn't this be handled by a DESTROY method
371             # instead of requiring an explicit call to release()?
372             sub release {
373 622     622 1 916 my $self = shift();
374              
375 622 50       1273 return $self unless ref($self);
376 622         1429 my @tofree = values %$self;
377              
378 622         1573 foreach my $key (keys %$self) {
379 2271         3025 $self->{$key} = undef;
380 2271         3271 delete $self->{$key};
381             }
382              
383             # PDFs with highly-interconnected page trees or outlines can hit Perl's
384             # recursion limit pretty easily, so disable the warning for this specific
385             # loop.
386 40     40   350 no warnings 'recursion';
  40         90  
  40         385092  
387              
388 622         2388 while (my $item = shift @tofree) {
389 7113 100 100     21432 if (blessed($item) and $item->can('release')) {
    100          
    100          
390 2117         3961 $item->release(1);
391             }
392             elsif (ref($item) eq 'ARRAY') {
393 1555         4217 push @tofree, @$item;
394             }
395             elsif (ref($item) eq 'HASH') {
396 879         2162 push @tofree, values %$item;
397 879         2765 foreach my $key (keys %$item) {
398 3832         4919 $item->{$key} = undef;
399 3832         6930 delete $item->{$key};
400             }
401             }
402             else {
403 2562         5905 $item = undef;
404             }
405             }
406             }
407              
408             =head2 $p->append_file()
409              
410             Appends the objects for output to the read file and then appends the appropriate table.
411              
412             =cut
413              
414             sub append_file {
415 3     3 1 5 my $self = shift();
416 3 50       9 return unless $self->{' update'};
417              
418 3         7 my $fh = $self->{' INFILE'};
419              
420 3         11 my $tdict = PDFDict();
421 3         8 $tdict->{'Prev'} = PDFNum($self->{' loc'});
422 3         6 $tdict->{'Info'} = $self->{'Info'};
423 3 50       7 if (defined $self->{' newroot'}) {
424 0         0 $tdict->{'Root'} = $self->{' newroot'};
425             }
426             else {
427 3         6 $tdict->{'Root'} = $self->{'Root'};
428             }
429 3         6 $tdict->{'Size'} = $self->{'Size'};
430              
431 3         12 foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) {
  54         76  
432 9 50       17 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
433             }
434              
435 3         18 $fh->seek($self->{' epos'}, 0);
436 3         23 $self->out_trailer($tdict, $self->{' update'});
437 3         40 close $self->{' OUTFILE'};
438             }
439              
440              
441             =head2 $p->out_file($fname)
442              
443             Writes a PDF file to a file of the given filename based on the current list of
444             objects to be output. It creates the trailer dictionary based on information
445             in $self.
446              
447             $fname may be an IO object;
448              
449             =cut
450              
451             sub out_file {
452 140     140 1 381 my ($self, $fname) = @_;
453              
454 140         561 $self->create_file($fname);
455 140         594 $self->close_file();
456 140         394 return $self;
457             }
458              
459              
460             =head2 $p->create_file($fname)
461              
462             Creates a new output file (no check is made of an existing open file) of
463             the given filename or IO object. Note, make sure that $p->{' version'} is set
464             correctly before calling this function.
465              
466             =cut
467              
468             sub create_file {
469 140     140 1 344 my ($self, $filename) = @_;
470 140         247 my $fh;
471              
472 140         375 $self->{' fname'} = $filename;
473 140 50       442 if (ref $filename) {
474 140         284 $fh = $filename;
475             }
476             else {
477 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
478 0         0 binmode($fh,':raw');
479             }
480              
481 140         587 $self->{' OUTFILE'} = $fh;
482 140   50     997 $fh->print('%PDF-' . ($self->{' version'} // '1.4') . "\n");
483 140         3500 $fh->print("%\xC6\xCD\xCD\xB5\n"); # and some binary stuff in a comment
484 140         818 return $self;
485             }
486              
487              
488             =head2 $p->clone_file($fname)
489              
490             Creates a copy of the input file at the specified filename and sets it as the
491             output file for future writes. A file handle may be passed instead of a
492             filename.
493              
494             =cut
495              
496             sub clone_file {
497 4     4 1 12 my ($self, $filename) = @_;
498 4         8 my $fh;
499              
500 4         13 $self->{' fname'} = $filename;
501 4 50       12 if (ref $filename) {
502 4         9 $fh = $filename;
503             }
504             else {
505 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
506 0         0 binmode($fh,':raw');
507             }
508              
509 4         12 $self->{' OUTFILE'} = $fh;
510              
511 4         9 my $in = $self->{' INFILE'};
512 4         19 $in->seek(0, 0);
513 4         52 my $data;
514 4         18 while (not $in->eof()) {
515 4         79 $in->read($data, 1024 * 1024);
516 4         166 $fh->print($data);
517             }
518 4         74 return $self;
519             }
520              
521             =head2 $p->close_file
522              
523             Closes up the open file for output by outputting the trailer etc.
524              
525             =cut
526              
527             sub close_file {
528 145     145 1 287 my $self = shift();
529              
530 145         833 my $tdict = PDFDict();
531 145 50       2990 $tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'};
532 145 50 33     735 $tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'};
533              
534             # remove all freed objects from the outlist, AND the outlist_cache if not updating
535             # NO! Don't do that thing! In fact, let out_trailer do the opposite!
536              
537 145   66     730 $tdict->{'Size'} = $self->{'Size'} || PDFNum(1);
538 145 100       487 $tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'};
539 145 100       440 if ($self->{' update'}) {
540 5         34 foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) {
  97         186  
541 16 50       38 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
542             }
543              
544 5         15 my $fh = $self->{' INFILE'};
545 5         20 $fh->seek($self->{' epos'}, 0);
546             }
547              
548 145         1032 $self->out_trailer($tdict, $self->{' update'});
549 145         1609 close($self->{' OUTFILE'});
550 145 50 33     760 if ($^O eq 'MacOS' and not ref($self->{' fname'})) {
551 0         0 MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'});
552             }
553              
554 145         875 return $self;
555             }
556              
557             =head2 ($value, $str) = $p->readval($str, %opts)
558              
559             Reads a PDF value from the current position in the file. If $str is too short
560             then read some more from the current location in the file until the whole object
561             is read. This is a recursive call which may slurp in a whole big stream (unprocessed).
562              
563             Returns the recursive data structure read and also the current $str that has been
564             read from the file.
565              
566             =cut
567              
568             sub readval {
569 842     842 1 291009 my ($self, $str, %opts) = @_;
570 842         1497 my $fh = $self->{' INFILE'};
571 842         1216 my ($result, $value);
572              
573 842 100       1704 my $update = defined($opts{update}) ? $opts{update} : 1;
574 842 100       1967 $str = update($fh, $str) if $update;
575              
576 842         2959 $str =~ s/^$ws_char+//; # Ignore initial white space
577 842         2734 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
578              
579             # Dictionary
580 842 100       12631 if ($str =~ m/^<
    100          
    100          
    100          
    100          
    50          
    100          
    50          
    50          
    0          
581 146         361 $str = substr ($str, 2);
582 146 100       428 $str = update($fh, $str) if $update;
583 146         552 $result = PDFDict();
584              
585 146         502 while ($str !~ m/^>>/) {
586 398         1595 $str =~ s/^$ws_char+//; # Ignore initial white space
587 398         1457 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
588              
589 398 50       2263 if ($str =~ s|^/($reg_char+)||) {
    0          
    0          
590 398         1101 my $key = PDF::API2::Basic::PDF::Name::name_to_string($1, $self);
591 398         1503 ($value, $str) = $self->readval($str, %opts);
592 398 50 50     1316 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
593 398         1093 $result->{$key} = $value;
594             }
595             }
596             elsif ($str =~ s|^/$ws_char+||) {
597             # fixes a broken key problem of acrobat. -- fredo
598 0         0 ($value, $str) = $self->readval($str, %opts);
599 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
600 0         0 $result->{'null'} = $value;
601             }
602             }
603             elsif ($str =~ s|^//|/|) {
604             # fixes again a broken key problem of illustrator/enfocus. -- fredo
605 0         0 ($value, $str) = $self->readval($str, %opts);
606 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
607 0         0 $result->{'null'} = $value;
608             }
609             }
610             else {
611 0         0 die "Invalid dictionary key";
612             }
613 398 100       1019 $str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk
614             }
615 146         388 $str =~ s/^>>//;
616 146 100       390 $str = update($fh, $str) if $update;
617             # streams can't be followed by a lone carriage-return.
618             # fredo: yes they can !!! -- use the MacOS Luke.
619 146 100 66     538 if (($str =~ s/^stream(?:(?:\015\012)|\012|\015)//) and ($result->{'Length'}->val != 0)) { # stream
620 10         39 my $length = $result->{'Length'}->val() + 0;
621 10         43 $result->{' streamsrc'} = $fh;
622 10         67 $result->{' streamloc'} = $fh->tell - length($str);
623              
624 10 50       154 unless ($opts{'nostreams'}) {
625 10 50       59 if ($length > length($str)) {
626 0         0 $value = $str;
627 0         0 $length -= length($str);
628 0         0 read $fh, $str, $length + 11; # slurp the whole stream!
629             }
630             else {
631 10         33 $value = '';
632             }
633 10         70 $value .= substr($str, 0, $length);
634 10         70 $result->{' stream'} = $value;
635 10         38 $result->{' nofilt'} = 1;
636 10 50       48 $str = update($fh, $str, 1) if $update; # tell update we are in-stream and only need an endstream
637 10         57 $str = substr($str, index($str, 'endstream') + 9);
638             }
639             }
640              
641 146 100 100     628 if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val}) {
642 34         107 bless $result, $types{$result->{'Type'}->val};
643             }
644             # gdj: FIXME: if any of the ws chars were crs, then the whole
645             # string might not have been read.
646             }
647              
648             # Indirect Object
649             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R/s) {
650 152         474 my $num = $1 + 0;
651 152         289 $value = $2 + 0;
652 152         1903 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R//s;
653 152 100       433 unless ($result = $self->test_obj($num, $value)) {
654 119         466 $result = PDF::API2::Basic::PDF::Objind->new();
655 119         359 $result->{' objnum'} = $num;
656 119         268 $result->{' objgen'} = $value;
657 119         296 $self->add_obj($result, $num, $value);
658             }
659 152         314 $result->{' parent'} = $self;
660 152         349 weaken $result->{' parent'};
661              
662             # Removed to address changes being lost when an indirect object is realised twice
663             # $result->{' realised'} = 0;
664              
665             # gdj: FIXME: if any of the ws chars were crs, then the whole
666             # string might not have been read.
667             }
668              
669             # Object
670             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj/s) {
671 95         172 my $obj;
672 95         306 my $num = $1 + 0;
673 95         187 $value = $2 + 0;
674 95         1702 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj//s;
675 95         3447 ($obj, $str) = $self->readval($str, %opts);
676 95 100       314 if ($result = $self->test_obj($num, $value)) {
677 81         308 $result->merge($obj);
678             }
679             else {
680 14         27 $result = $obj;
681 14         77 $self->add_obj($result, $num, $value);
682 14         39 $result->{' realised'} = 1;
683             }
684 95 100       299 $str = update($fh, $str) if $update; # thanks to kundrat@kundrat.sk
685 95         596 $str =~ s/^endobj//;
686             }
687              
688             # Name
689             elsif ($str =~ m|^/($reg_char*)|s) {
690 287         667 $value = $1;
691 287         4367 $str =~ s|^/($reg_char*)||s;
692 287         1023 $result = PDF::API2::Basic::PDF::Name->from_pdf($value, $self);
693             }
694              
695             # Literal String
696             elsif ($str =~ m/^\(/) {
697             # We now need to find an unbalanced, unescaped right-paren.
698             # This can't be done with a regex.
699 2         5 my $value = '(';
700 2         7 $str = substr($str, 1);
701              
702 2         5 my $nested_level = 1;
703 2         4 while (1) {
704             # Ignore everything up to the first escaped or parenthesis character
705 2 50       9 if ($str =~ /^([^\\()]+)(.*)/s) {
706 2         7 $value .= $1;
707 2         5 $str = $2;
708             }
709              
710             # Ignore escaped parentheses
711 2 50       14 if ($str =~ /^(\\[()])/) {
    50          
    50          
    0          
712 0         0 $value .= $1;
713 0         0 $str = substr($str, 2);
714             }
715              
716             # Left parenthesis: increase nesting
717             elsif ($str =~ /^\(/) {
718 0         0 $value .= '(';
719 0         0 $str = substr($str, 1);
720 0         0 $nested_level++;
721             }
722              
723             # Right parenthesis: decrease nesting
724             elsif ($str =~ /^\)/) {
725 2         5 $value .= ')';
726 2         5 $str = substr($str, 1);
727 2         3 $nested_level--;
728 2 50       8 last unless $nested_level;
729             }
730              
731             # Other escaped character
732             elsif ($str =~ /^(\\[^()])/) {
733 0         0 $value .= $1;
734 0         0 $str = substr($str, 2);
735             }
736              
737             # If there wasn't an escaped or parenthesis character,
738             # read some more.
739             else {
740             # We don't use update because we don't want to remove
741             # whitespace or comments.
742 0 0       0 $fh->read($str, 255, length($str)) or die 'Unterminated string.';
743             }
744             }
745              
746 2         8 $result = PDF::API2::Basic::PDF::String->from_pdf($value);
747             }
748              
749             # Hex String
750             elsif ($str =~ m/^
751 0         0 $str =~ s/^
752 0         0 $fh->read($str, 255, length($str)) while (0 > index($str, '>'));
753 0         0 ($value, $str) = ($str =~ /^(.*?)>(.*)/s);
754 0         0 $result = PDF::API2::Basic::PDF::String->from_pdf('<' . $value . '>');
755             }
756              
757             # Array
758             elsif ($str =~ m/^\[/) {
759 74         253 $str =~ s/^\[//;
760 74 50       252 $str = update($fh, $str) if $update;
761 74         354 $result = PDFArray();
762 74         353 while ($str !~ m/^\]/) {
763 224         1079 $str =~ s/^$ws_char+//; # Ignore initial white space
764 224         1002 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
765              
766 224         748 ($value, $str) = $self->readval($str, %opts);
767 224         862 $result->add_elements($value);
768 224 50       590 $str = update($fh, $str) if $update; # str might just be exhausted!
769             }
770 74         192 $str =~ s/^\]//;
771             }
772              
773             # Boolean
774             elsif ($str =~ m/^(true|false)($irreg_char|$)/) {
775 0         0 $value = $1;
776 0         0 $str =~ s/^(?:true|false)//;
777 0         0 $result = PDF::API2::Basic::PDF::Bool->from_pdf($value);
778             }
779              
780             # Number
781             elsif ($str =~ m/^([+-.0-9]+)($irreg_char|$)/) {
782 86         281 $value = $1;
783 86         289 $str =~ s/^([+-.0-9]+)//;
784              
785             # If $str only consists of whitespace (or is empty), call update to
786             # see if this is the beginning of an indirect object or reference
787 86 100 100     2631 if ($update and ($str =~ /^$re_whitespace*$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace*$/s)) {
      100        
788 6         83 $str =~ s/^$re_whitespace+/ /s;
789 6         80 $str =~ s/$re_whitespace+$/ /s;
790 6         18 $str = update($fh, $str);
791 6 100       158 if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R|obj)/s) {
792 4         22 return $self->readval("$value $str", %opts);
793             }
794             }
795              
796 82         730 $result = PDF::API2::Basic::PDF::Number->from_pdf($value);
797             }
798              
799             # Null
800             elsif ($str =~ m/^null($irreg_char|$)/) {
801 0         0 $str =~ s/^null//;
802 0         0 $result = PDF::API2::Basic::PDF::Null->new;
803             }
804              
805             else {
806 0         0 die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . ".";
807             }
808              
809 838         13221 $str =~ s/^$ws_char+//s;
810 838         2432 return ($result, $str);
811             }
812              
813              
814             =head2 $ref = $p->read_obj($objind, %opts)
815              
816             Given an indirect object reference, locate it and read the object returning
817             the read in object.
818              
819             =cut
820              
821             sub read_obj {
822 77     77 1 194 my ($self, $objind, %opts) = @_;
823              
824 77   50     243 my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) || return;
825 77 50       252 $objind->merge($res) unless $objind eq $res;
826 77         301 return $objind;
827             }
828              
829              
830             =head2 $ref = $p->read_objnum($num, $gen, %opts)
831              
832             Returns a fully read object of given number and generation in this file
833              
834             =cut
835              
836             sub read_objnum {
837 85     85 1 3744 my ($self, $num, $gen, %opts) = @_;
838 85 50       239 croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num;
839 85 50       174 croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen;
840 85 50       412 croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/;
841 85 50       269 croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/;
842              
843 85   50     242 my $object_location = $self->locate_obj($num, $gen) || return;
844 85         147 my $object;
845              
846             # Compressed object
847 85 100       203 if (ref($object_location)) {
848 4         15 my ($object_stream_num, $object_stream_pos) = @{$object_location};
  4         13  
849              
850 4         46 my $object_stream = $self->read_objnum($object_stream_num, 0, %opts);
851 4 50       21 die 'Cannot find the compressed object stream' unless $object_stream;
852              
853 4 50       40 $object_stream->read_stream() if $object_stream->{' nofilt'};
854              
855             # An object stream starts with pairs of integers containing object numbers and
856             # stream offsets relative to the First key
857 4         11 my $fh;
858             my $pairs;
859 4 50       16 unless ($object_stream->{' streamfile'}) {
860 4         21 $pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val);
861             }
862             else {
863 0         0 CORE::open($fh, '<', $object_stream->{' streamfile'});
864 0         0 read($fh, $pairs, $object_stream->{'First'}->val());
865             }
866 4         22 my @map = split /\s+/, $pairs;
867              
868             # Find the offset of the object in the stream
869 4         10 my $index = $object_stream_pos * 2;
870 4 50       21 die "Objind $num does not exist at index $index" unless $map[$index] == $num;
871 4         13 my $start = $map[$index + 1];
872              
873             # Unless this is the last object in the stream, its length is determined by the
874             # offset of the next object
875 4         10 my $last_object_in_stream = $map[-2];
876 4         8 my $length;
877 4 100       20 if ($last_object_in_stream == $num) {
878 2 50       10 if ($object_stream->{' stream'}) {
879 2         10 $length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start;
880             }
881             else {
882 0         0 $length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start;
883             }
884             }
885             else {
886 2         8 my $next_start = $map[$index + 3];
887 2         6 $length = $next_start - $start;
888             }
889              
890             # Read the object from the stream
891 4         14 my $stream = "$num 0 obj ";
892 4 50       17 unless ($object_stream->{' streamfile'}) {
893 4         18 $stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length);
894             }
895             else {
896 0         0 seek($fh, $object_stream->{'First'}->val() + $start, 0);
897 0         0 read($fh, $stream, $length, length($stream));
898 0         0 close $fh;
899             }
900              
901 4         36 ($object) = $self->readval($stream, %opts, update => 0);
902 4         29 return $object;
903             }
904              
905 81         365 my $current_location = $self->{' INFILE'}->tell;
906 81         758 $self->{' INFILE'}->seek($object_location, 0);
907 81         737 ($object) = $self->readval('', %opts);
908 81         478 $self->{' INFILE'}->seek($current_location, 0);
909 81         976 return $object;
910             }
911              
912              
913             =head2 $objind = $p->new_obj($obj)
914              
915             Creates a new, free object reference based on free space in the cross reference chain.
916             If nothing free then thinks up a new number. If $obj then turns that object into this
917             new object rather than returning a new object.
918              
919             =cut
920              
921             sub new_obj {
922 1072     1072 1 2198 my ($self, $base) = @_;
923 1072         1590 my $res;
924              
925 1072 50 66     3492 if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) {
  15         66  
926 0         0 $res = shift(@{$self->{' free'}});
  0         0  
927 0 0       0 if (defined $base) {
928 0         0 my ($num, $gen) = @{$self->{' objects'}{$res->uid}};
  0         0  
929 0         0 $self->remove_obj($res);
930 0         0 $self->add_obj($base, $num, $gen);
931 0         0 return $self->out_obj($base);
932             }
933             else {
934 0         0 $self->{' objects'}{$res->uid}[2] = 0;
935 0         0 return $res;
936             }
937             }
938              
939 1072         1687 my $tdict = $self;
940 1072         1629 my $i;
941 1072         2481 while (defined $tdict) {
942 1073 50       3783 $i = $tdict->{' xref'}{defined($i) ? $i : ''}[0];
943 1073   33     3484 while (defined $i and $i != 0) {
944 0         0 my ($ni, $ng) = @{$tdict->{' xref'}{$i}};
  0         0  
945 0 0       0 unless (defined $self->locate_obj($i, $ng)) {
946 0 0       0 if (defined $base) {
947 0         0 $self->add_obj($base, $i, $ng);
948 0         0 return $base;
949             }
950             else {
951 0   0     0 $res = $self->test_obj($i, $ng) || $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, $ng);
952 0         0 $self->out_obj($res);
953 0         0 return $res;
954             }
955             }
956 0         0 $i = $ni;
957             }
958 1073         2368 $tdict = $tdict->{' prev'};
959             }
960              
961 1072         2052 $i = $self->{' maxobj'}++;
962 1072 50       2069 if (defined $base) {
963 1072         3647 $self->add_obj($base, $i, 0);
964 1072         2939 $self->out_obj($base);
965 1072         2321 return $base;
966             }
967             else {
968 0         0 $res = $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, 0);
969 0         0 $self->out_obj($res);
970 0         0 return $res;
971             }
972             }
973              
974              
975             =head2 $p->out_obj($objind)
976              
977             Indicates that the given object reference should appear in the output xref
978             table whether with data or freed.
979              
980             =cut
981              
982             sub out_obj {
983 2676     2676 1 4615 my ($self, $obj) = @_;
984              
985             # This is why we've been keeping the outlist CACHE around; to speed
986             # up this method by orders of magnitude (it saves up from having to
987             # grep the full outlist each time through as we'll just do a lookup
988             # in the hash) (which is super-fast).
989 2676 100       8378 unless (exists $self->{' outlist_cache'}{$obj}) {
990 1087         1780 push @{$self->{' outlist'}}, $obj;
  1087         2613  
991             # weaken $self->{' outlist'}->[-1];
992 1087         3312 $self->{' outlist_cache'}{$obj} = 1;
993             }
994 2676         4917 return $obj;
995             }
996              
997              
998             =head2 $p->free_obj($objind)
999              
1000             Marks an object reference for output as being freed.
1001              
1002             =cut
1003              
1004             sub free_obj {
1005 0     0 1 0 my ($self, $obj) = @_;
1006              
1007 0         0 push @{$self->{' free'}}, $obj;
  0         0  
1008 0         0 $self->{' objects'}{$obj->uid()}[2] = 1;
1009 0         0 $self->out_obj($obj);
1010             }
1011              
1012              
1013             =head2 $p->remove_obj($objind)
1014              
1015             Removes the object from all places where we might remember it
1016              
1017             =cut
1018              
1019             sub remove_obj {
1020 0     0 1 0 my ($self, $objind) = @_;
1021              
1022             # who says it has to be fast
1023 0         0 delete $self->{' objects'}{$objind->uid()};
1024 0         0 delete $self->{' outlist_cache'}{$objind};
1025 0         0 delete $self->{' printed_cache'}{$objind};
1026 0         0 @{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}};
  0         0  
  0         0  
  0         0  
1027 0         0 @{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}};
  0         0  
  0         0  
  0         0  
1028             $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef
1029 0 0       0 if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind;
1030 0         0 return $self;
1031             }
1032              
1033              
1034             =head2 $p->ship_out(@objects)
1035              
1036             Ships the given objects (or all objects for output if @objects is empty) to
1037             the currently open output file (assuming there is one). Freed objects are not
1038             shipped, and once an object is shipped it is switched such that this file
1039             becomes its source and it will not be shipped again unless out_obj is called
1040             again. Notice that a shipped out object can be re-output or even freed, but
1041             that it will not cause the data already output to be changed.
1042              
1043             =cut
1044              
1045             sub ship_out {
1046 154     154 1 394 my ($self, @objs) = @_;
1047              
1048 154 50       465 die "No output file specified" unless defined $self->{' OUTFILE'};
1049 154         292 my $fh = $self->{' OUTFILE'};
1050 154         440 seek($fh, 0, 2); # go to the end of the file
1051              
1052 154 50       496 @objs = @{$self->{' outlist'}} unless scalar @objs > 0;
  154         535  
1053 154         423 foreach my $objind (@objs) {
1054 890 50       2768 next unless $objind->is_obj($self);
1055 890         1464 my $j = -1;
1056 890         1693 for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) {
  890         2296  
1057 890 50       2623 if ($self->{' outlist'}[$i] eq $objind) {
1058 890         1436 $j = $i;
1059 890         1747 last;
1060             }
1061             }
1062 890 50       1789 next if $j < 0;
1063 890         1289 splice(@{$self->{' outlist'}}, $j, 1);
  890         2057  
1064 890         2437 delete $self->{' outlist_cache'}{$objind};
1065 890 50       1147 next if grep { $_ eq $objind } @{$self->{' free'}};
  0         0  
  890         2348  
1066              
1067 890 50       1894 map { $fh->print("\% $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'};
  0         0  
1068 890         2726 $self->{' locs'}{$objind->uid()} = $fh->tell();
1069 890         2264 my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1];
  890         1802  
1070 890         2716 $fh->printf('%d %d obj ', $objnum, $objgen);
1071 890         8947 $objind->outobjdeep($fh, $self);
1072 890         3475 $fh->print(" endobj\n");
1073              
1074             # Note that we've output this obj, not forgetting to update
1075             # the cache of whats printed.
1076 890 50       5858 unless (exists $self->{' printed_cache'}{$objind}) {
1077 890         1275 push @{$self->{' printed'}}, $objind;
  890         2230  
1078 890         3387 $self->{' printed_cache'}{$objind}++;
1079             }
1080             }
1081 154         486 return $self;
1082             }
1083              
1084             =head2 $p->copy($outpdf, \&filter)
1085              
1086             Iterates over every object in the file reading the object, calling filter with the object
1087             and outputting the result. if filter is not defined, then just copies input to output.
1088              
1089             =cut
1090              
1091             sub copy {
1092 0     0 1 0 my ($self, $out, $filter) = @_;
1093 0         0 my ($obj, $minl, $mini, $ming);
1094              
1095 0         0 foreach my $key (grep { not m/^[\s\-]/ } keys %$self) {
  0         0  
1096 0 0       0 $out->{$key} = $self->{$key} unless defined $out->{$key};
1097             }
1098              
1099 0         0 my $tdict = $self;
1100 0         0 while (defined $tdict) {
1101 0         0 foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) {
  0         0  
  0         0  
1102 0         0 my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}};
  0         0  
1103 0 0       0 next unless $nt eq 'n';
1104              
1105 0 0 0     0 if ($nl < $minl or $mini == 0) {
1106 0         0 $mini = $i;
1107 0         0 $ming = $ng;
1108 0         0 $minl = $nl;
1109             }
1110 0 0       0 unless ($obj = $self->test_obj($i, $ng)) {
1111 0         0 $obj = PDF::API2::Basic::PDF::Objind->new();
1112 0         0 $obj->{' objnum'} = $i;
1113 0         0 $obj->{' objgen'} = $ng;
1114 0         0 $self->add_obj($obj, $i, $ng);
1115 0         0 $obj->{' parent'} = $self;
1116 0         0 weaken $obj->{' parent'};
1117 0         0 $obj->{' realised'} = 0;
1118             }
1119 0         0 $obj->realise;
1120 0 0       0 my $res = defined $filter ? &{$filter}($obj) : $obj;
  0         0  
1121 0 0 0     0 $out->new_obj($res) unless (!$res || $res->is_obj($out));
1122             }
1123 0         0 $tdict = $tdict->{' prev'};
1124             }
1125              
1126             # test for linearized and remove it from output
1127 0         0 $obj = $self->test_obj($mini, $ming);
1128 0 0 0     0 if ($obj->isa('PDF::API2::Basic::PDF::Dict') && $obj->{'Linearized'}) {
1129 0         0 $out->free_obj($obj);
1130             }
1131              
1132 0         0 return $self;
1133             }
1134              
1135              
1136             =head1 PRIVATE METHODS & FUNCTIONS
1137              
1138             The following methods and functions are considered private to this class. This
1139             does not mean you cannot use them if you have a need, just that they aren't really
1140             designed for users of this class.
1141              
1142             =head2 $offset = $p->locate_obj($num, $gen)
1143              
1144             Returns a file offset to the object asked for by following the chain of cross
1145             reference tables until it finds the one you want.
1146              
1147             =cut
1148              
1149             sub locate_obj {
1150 85     85 1 245 my ($self, $num, $gen) = @_;
1151              
1152 85         154 my $tdict = $self;
1153 85         170 my $seen = {};
1154 85         209 while (defined $tdict) {
1155 94         273 $seen->{$tdict->{' loc'}} = 1;
1156 94 100       314 if (ref $tdict->{' xref'}{$num}) {
1157 85         155 my $ref = $tdict->{' xref'}{$num};
1158 85 100       228 return $ref unless scalar(@$ref) == 3;
1159              
1160 81 50       219 if ($ref->[1] == $gen) {
1161 81 50       461 return $ref->[0] if $ref->[2] eq 'n';
1162 0         0 return; # if $ref->[2] eq 'f';
1163             }
1164             }
1165 9         65 $tdict = $tdict->{' prev'};
1166 9 50       34 if ($seen->{$tdict->{' loc'}}) {
1167 0         0 die 'Malformed PDF: trailer contains a loop or repeated object ID';
1168             }
1169             }
1170 0         0 return;
1171             }
1172              
1173              
1174             =head2 update($fh, $str, $instream)
1175              
1176             Keeps reading $fh for more data to ensure that $str has at least a line full
1177             for C to work on. At this point we also take the opportunity to ignore
1178             comments.
1179              
1180             =cut
1181              
1182             sub update {
1183 1940     1940 1 3448 my ($fh, $str, $instream) = @_;
1184 1940 50       4728 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1185 1940 100       3319 if ($instream) {
1186             # we are inside a (possible binary) stream
1187             # so we fetch data till we see an 'endstream'
1188             # -- fredo/2004-09-03
1189 10   33     78 while ($str !~ m/endstream/ and not $fh->eof()) {
1190 0 0       0 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1191 0         0 $fh->read($str, 314, length($str));
1192             }
1193             }
1194             else {
1195 1930         8649 $str =~ s/^$ws_char*//;
1196 1930   100     19762 while ($str !~ m/$cr/ and not $fh->eof()) {
1197 107 50       1302 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1198 107         459 $fh->read($str, 314, length($str));
1199 107         3083 $str =~ s/^$ws_char*//so;
1200             }
1201 1930         7839 while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23
1202 1 50       6 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1203 1   33     63 $fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof());
1204 1         75 $str =~ s/^\%[^\015\012]*$ws_char*//so; # fixed for reportlab -- fredo
1205             }
1206             }
1207              
1208 1940         4316 return $str;
1209             }
1210              
1211             =head2 $objind = $p->test_obj($num, $gen)
1212              
1213             Tests the cache to see whether an object reference (which may or may not have
1214             been getobj()ed) has been cached. Returns it if it has.
1215              
1216             =cut
1217              
1218             sub test_obj {
1219 247     247 1 515 my ($self, $num, $gen) = @_;
1220 247         1200 return $self->{' objcache'}{$num, $gen};
1221             }
1222              
1223              
1224             =head2 $p->add_obj($objind)
1225              
1226             Adds the given object to the internal object cache.
1227              
1228             =cut
1229              
1230             sub add_obj {
1231 1205     1205 1 3042 my ($self, $obj, $num, $gen) = @_;
1232              
1233 1205         5577 $self->{' objcache'}{$num, $gen} = $obj;
1234 1205         5586 $self->{' objects'}{$obj->uid()} = [$num, $gen];
1235             # weaken $self->{' objcache'}{$num, $gen};
1236 1205         2412 return $obj;
1237             }
1238              
1239              
1240             =head2 $tdict = $p->readxrtr($xpos)
1241              
1242             Recursive function which reads each of the cross-reference and trailer tables
1243             in turn until there are no more.
1244              
1245             Returns a dictionary corresponding to the trailer chain. Each trailer also
1246             includes the corresponding cross-reference table.
1247              
1248             The structure of the xref private element in a trailer dictionary is of an
1249             anonymous hash of cross reference elements by object number. Each element
1250             consists of an array of 3 elements corresponding to the three elements read
1251             in [location, generation number, free or used]. See the PDF specification
1252             for details.
1253              
1254             =cut
1255              
1256             sub _unpack_xref_stream {
1257 78     78   152 my ($self, $width, $data) = @_;
1258              
1259 78 100       184 return unpack('C', $data) if $width == 1;
1260 52 50       180 return unpack('n', $data) if $width == 2;
1261 0 0       0 return unpack('N', "\x00$data") if $width == 3;
1262 0 0       0 return unpack('N', $data) if $width == 4;
1263 0 0       0 return unpack('Q>', $data) if $width == 8;
1264              
1265 0         0 die "Unsupported xref stream entry width: $width";
1266             }
1267              
1268             sub readxrtr {
1269 19     19 1 60 my ($self, $xpos) = @_;
1270 19         69 my ($tdict, $buf, $xmin, $xnum, $xdiff);
1271              
1272 19         51 my $fh = $self->{' INFILE'};
1273 19         91 $fh->seek($xpos, 0);
1274 19         212 $fh->read($buf, 22);
1275 19         238 $buf = update($fh, $buf); # fix for broken JAWS xref calculation.
1276              
1277 19         59 my $xlist = {};
1278              
1279             ## seams that some products calculate wrong prev entries (short)
1280             ## so we seek ahead to find one -- fredo; save for now
1281             #while($buf !~ m/^xref$cr/i && !eof($fh))
1282             #{
1283             # $buf =~ s/^(\s+|\S+|.)//i;
1284             # $buf=update($fh,$buf);
1285             #}
1286              
1287 19 100       518 if ($buf =~ s/^xref$cr//i) {
    50          
1288             # Plain XRef tables.
1289 16         942 while ($buf =~ m/^$ws_char*([0-9]+)$ws_char+([0-9]+)$ws_char*$cr(.*?)$/s) {
1290 21         60 my $old_buf = $buf;
1291 21         64 $xmin = $1;
1292 21         51 $xnum = $2;
1293 21         55 $buf = $3;
1294 21 50       499 unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) {
1295             # See PDF 1.7 section 7.5.4: Cross-Reference Table
1296 0         0 warn q{Malformed xref in PDF file: subsection shall begin with a line containing two numbers separated by a SPACE (20h)};
1297             }
1298 21         51 $xdiff = length($buf);
1299              
1300 21         110 $fh->read($buf, 20 * $xnum - $xdiff + 15, $xdiff);
1301 21   66     1028 while ($xnum-- > 0 and $buf =~ s/^0*([0-9]*)$ws_char+0*([0-9]+)$ws_char+([nf])$cr//) {
1302 118 50       609 $xlist->{$xmin} = [$1, $2, $3] unless exists $xlist->{$xmin};
1303 118         1033 $xmin++;
1304             }
1305             }
1306              
1307 16 50       119 if ($buf !~ /^\s*trailer\b/i) {
1308 0         0 die "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell - length($buf));
1309             }
1310              
1311 16         66 $buf =~ s/^\s*trailer\b//i;
1312              
1313 16         84 ($tdict, $buf) = $self->readval($buf);
1314             }
1315             elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) {
1316 3         43 my ($xref_obj, $xref_gen) = ($1, $2);
1317              
1318             # XRef streams.
1319 3         24 ($tdict, $buf) = $self->readval($buf);
1320              
1321 3 50       18 unless ($tdict->{' stream'}) {
1322 0         0 die "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}";
1323             }
1324 3         29 $tdict->read_stream(1);
1325              
1326 3         13 my $stream = $tdict->{' stream'};
1327 3         9 my @widths = map { $_->val } @{$tdict->{W}->val};
  9         29  
  3         21  
1328              
1329 3         7 my $start = 0;
1330 3         12 my $last;
1331              
1332             my @index;
1333 3 100       22 if (defined $tdict->{Index}) {
1334 1         2 @index = map { $_->val() } @{$tdict->{Index}->val};
  2         6  
  1         5  
1335             }
1336             else {
1337 2         14 @index = (0, $tdict->{Size}->val);
1338             }
1339              
1340 3         14 while (scalar @index) {
1341 3         8 $start = shift(@index);
1342 3         13 $last = $start + shift(@index) - 1;
1343              
1344 3         16 for my $i ($start...$last) {
1345             # Replaced "for $xmin" because it creates a loop-specific local variable, and we
1346             # need $xmin to be correct for maxobj below.
1347 26         46 $xmin = $i;
1348              
1349 26         37 my @cols;
1350              
1351 26         43 for my $w (@widths) {
1352 78         144 my $data;
1353 78 50       231 $data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w;
1354              
1355 78         146 push @cols, $data;
1356             }
1357              
1358 26 100       73 $cols[0] = 1 unless defined $cols[0];
1359 26 50       56 if ($cols[0] > 2) {
1360 0         0 die "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj";
1361             }
1362              
1363 26 50       64 next if exists $xlist->{$xmin};
1364              
1365 26 50       102 my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535));
    100          
1366 26 100       75 push @objind, ($cols[0] == 0 ? 'f' : 'n') if $cols[0] < 2;
    100          
1367              
1368 26         101 $xlist->{$xmin} = \@objind;
1369             }
1370             }
1371             }
1372             else {
1373 0         0 die "Malformed xref in PDF file $self->{' fname'}";
1374             }
1375              
1376 19         82 $tdict->{' loc'} = $xpos;
1377 19         59 $tdict->{' xref'} = $xlist;
1378 19 100       109 $self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'};
1379             $tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val)
1380 19 100 66     108 if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val != 0);
1381 19 100       84 delete $tdict->{' prev'} unless defined $tdict->{' prev'};
1382 19         77 return $tdict;
1383             }
1384              
1385              
1386             =head2 $p->out_trailer($tdict)
1387              
1388             Outputs the body and trailer for a PDF file by outputting all the objects in
1389             the ' outlist' and then outputting a xref table for those objects and any
1390             freed ones. It then outputs the trailing dictionary and the trailer code.
1391              
1392             =cut
1393              
1394             sub out_trailer {
1395 148     148 1 620 my ($self, $tdict, $update) = @_;
1396 148         326 my $fh = $self->{' OUTFILE'};
1397              
1398 148         409 while (@{$self->{' outlist'}}) {
  302         967  
1399 154         658 $self->ship_out();
1400             }
1401              
1402             # When writing new trailers, most dictionary entries get copied from the
1403             # previous trailer, but entries related to cross-reference streams should
1404             # get removed (and possibly recreated below).
1405 148         1213 delete $tdict->{$_} for (# Entries common to streams
1406             qw(Length Filter DecodeParms F FFilter FDecodeParms DL),
1407              
1408             # Entries specific to cross-reference streams
1409             qw(Index W XRefStm));
1410              
1411 148         603 $tdict->{'Size'} = PDFNum($self->{' maxobj'});
1412              
1413 148         481 my $tloc = $fh->tell();
1414 148         760 my @out;
1415              
1416 148 100       270 my @xreflist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } (@{$self->{' printed'} || []}, @{$self->{' free'} || []});
  1334 100       2976  
  148         565  
  148         789  
1417              
1418 148         552 my ($i, $j, $k);
1419 148 100       427 unless ($update) {
1420 140         241 $i = 1;
1421 140         512 for ($j = 0; $j < @xreflist; $j++) {
1422 868         1148 my @inserts;
1423 868         1205 $k = $xreflist[$j];
1424 868         2049 while ($i < $self->{' objects'}{$k->uid}[0]) {
1425 0         0 my ($n) = PDF::API2::Basic::PDF::Objind->new();
1426 0         0 $self->add_obj($n, $i, 0);
1427 0         0 $self->free_obj($n);
1428 0         0 push(@inserts, $n);
1429 0         0 $i++;
1430             }
1431 868         1421 splice(@xreflist, $j, 0, @inserts);
1432 868         1928 $j += @inserts;
1433 868         1710 $i++;
1434             }
1435             }
1436              
1437 148 100       356 my @freelist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } @{$self->{' free'} || []};
  0         0  
  148         560  
1438              
1439 148         257 $j = 0; my $first = -1; $k = 0;
  148         293  
  148         259  
1440 148         537 for ($i = 0; $i <= $#xreflist + 1; $i++) {
1441 1038 100 100     3074 if ($i > $#xreflist || $self->{' objects'}{$xreflist[$i]->uid}[0] != $j + 1) {
1442 160 100       1015 push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid}[0] ") . ($i - $first) . "\n";
1443 160 100       441 if ($first == -1) {
1444 148 50       780 push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid}[0] : 0);
1445 148         264 $first = 0;
1446             }
1447 160         493 for ($j = $first; $j < $i; $j++) {
1448 890         1282 my $xref = $xreflist[$j];
1449 890 50 33     2077 if (defined($freelist[$k]) and defined($xref) and "$freelist[$k]" eq "$xref") {
      33        
1450 0         0 $k++;
1451             push @out, pack("A10AA5A4",
1452             sprintf("%010d", (defined $freelist[$k] ?
1453             $self->{' objects'}{$freelist[$k]->uid}[0] : 0)), " ",
1454 0 0       0 sprintf("%05d", $self->{' objects'}{$xref->uid}[1] + 1),
1455             " f \n");
1456             }
1457             else {
1458             push @out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid}), " ",
1459 890         2047 sprintf("%05d", $self->{' objects'}{$xref->uid}[1]),
1460             " n \n");
1461             }
1462             }
1463 160         272 $first = $i;
1464 160 100       789 $j = $self->{' objects'}{$xreflist[$i]->uid}[0] if ($i < scalar @xreflist);
1465             }
1466             else {
1467 878         1764 $j++;
1468             }
1469             }
1470 148 50 33     622 if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') {
1471 0         0 my (@index, @stream);
1472 0         0 for (@out) {
1473 0         0 my @a = split;
1474 0 0       0 @a == 2 ? push @index, @a : push @stream, \@a;
1475             }
1476 0         0 my $i = $self->{' maxobj'}++;
1477 0         0 $self->add_obj($tdict, $i, 0);
1478 0         0 $self->out_obj($tdict );
1479              
1480 0         0 push @index, $i, 1;
1481 0         0 push @stream, [$tloc, 0, 'n'];
1482              
1483 0 0       0 my $len = $tloc > 0xFFFF ? 4 : 2; # don't expect files > 4 Gb
1484 0 0       0 my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC'; # don't expect gennum > 255, it's absurd.
1485             # Adobe doesn't use them anymore anyway
1486 0         0 my $stream = '';
1487 0         0 my @prev = (0) x ($len + 2);
1488 0         0 for (@stream) {
1489 0 0 0     0 $_->[1] = 0 if $_->[2] eq 'f' and $_->[1] == 65535;
1490 0 0       0 my @line = unpack 'C*', pack $tpl, $_->[2] eq 'n' ? 1 : 0, @{$_}[0..1];
  0         0  
1491              
1492             $stream .= pack 'C*', 2, # prepend filtering method, "PNG Up"
1493 0         0 map {($line[$_] - $prev[$_] + 256) % 256 } 0 .. $#line;
  0         0  
1494 0         0 @prev = @line;
1495             }
1496 0         0 $tdict->{'Size'} = PDFNum($i + 1);
1497 0         0 $tdict->{'Index'} = PDFArray(map PDFNum( $_ ), @index);
1498 0         0 $tdict->{'W'} = PDFArray(map PDFNum( $_ ), 1, $len, 1);
1499 0         0 $tdict->{'Filter'} = PDFName('FlateDecode');
1500              
1501 0         0 $tdict->{'DecodeParms'} = PDFDict();
1502 0         0 $tdict->{'DecodeParms'}->val->{'Predictor'} = PDFNum(12);
1503 0         0 $tdict->{'DecodeParms'}->val->{'Columns'} = PDFNum($len + 2);
1504              
1505 0         0 $stream = PDF::API2::Basic::PDF::Filter::FlateDecode->new->outfilt($stream, 1);
1506 0         0 $tdict->{' stream'} = $stream;
1507 0         0 $tdict->{' nofilt'} = 1;
1508 0         0 delete $tdict->{'Length'};
1509 0         0 $self->ship_out();
1510             }
1511             else {
1512 148         685 $fh->print("xref\n", @out, "trailer\n");
1513 148         1545 $tdict->outobjdeep($fh, $self);
1514 148         426 $fh->print("\n");
1515             }
1516 148         1157 $fh->print("startxref\n$tloc\n%%EOF\n");
1517             }
1518              
1519              
1520             =head2 PDF::API2::Basic::PDF::File->_new
1521              
1522             Creates a very empty PDF file object (used by new and open)
1523              
1524             =cut
1525              
1526             sub _new {
1527 180     180   506 my $class = shift();
1528 180         346 my $self = {};
1529              
1530 180         399 bless $self, $class;
1531 180         699 $self->{' outlist'} = [];
1532 180         544 $self->{' outlist_cache'} = {}; # A cache of whats in the 'outlist'
1533 180         472 $self->{' maxobj'} = 1;
1534 180         487 $self->{' objcache'} = {};
1535 180         489 $self->{' objects'} = {};
1536              
1537 180         458 return $self;
1538             }
1539              
1540             1;
1541              
1542             =head1 AUTHOR
1543              
1544             Martin Hosken Martin_Hosken@sil.org
1545              
1546             Copyright Martin Hosken 1999 and onwards
1547              
1548             No warranty or expression of effectiveness, least of all regarding anyone's
1549             safety, is implied in this software or documentation.