File Coverage

blib/lib/PDF/API2/Basic/PDF/File.pm
Criterion Covered Total %
statement 542 717 75.5
branch 187 322 58.0
condition 41 90 45.5
subroutine 45 48 93.7
pod 27 27 100.0
total 842 1204 69.9


line stmt bran cond sub pod time code
1             # Code in the PDF::API2::Basic::PDF namespace was originally copied from the
2             # Text::PDF distribution.
3             #
4             # Copyright Martin Hosken
5             #
6             # Martin Hosken's code may be used under the terms of the MIT license.
7             # Subsequent versions of the code have the same license as PDF::API2.
8              
9             package PDF::API2::Basic::PDF::File;
10              
11 39     39   68589 use strict;
  39         108  
  39         2925  
12              
13             our $VERSION = '2.043'; # VERSION
14              
15             =head1 NAME
16              
17             PDF::API2::Basic::PDF::File - Low-level PDF file access
18              
19             =head1 SYNOPSIS
20              
21             $p = PDF::API2::Basic::PDF::File->open("filename.pdf", 1);
22             $p->new_obj($obj_ref);
23             $p->free_obj($obj_ref);
24             $p->append_file;
25             $p->close_file;
26             $p->release; # IMPORTANT!
27              
28             =head1 DESCRIPTION
29              
30             This class keeps track of the directory aspects of a PDF file. There are two
31             parts to the directory: the main directory object which is the parent to all
32             other objects and a chain of cross-reference tables and corresponding trailer
33             dictionaries starting with the main directory object.
34              
35             =head1 INSTANCE VARIABLES
36              
37             Within this class hierarchy, rather than making everything visible via methods,
38             which would be a lot of work, there are various instance variables which are
39             accessible via associative array referencing. To distinguish instance variables
40             from content variables (which may come from the PDF content itself), each such
41             variable will start with a space.
42              
43             Variables which do not start with a space directly reflect elements in a PDF
44             dictionary. In the case of a PDF::API2::Basic::PDF::File, the elements reflect those in the
45             trailer dictionary.
46              
47             Since some variables are not designed for class users to access, variables are
48             marked in the documentation with (R) to indicate that such an entry should only
49             be used as read-only information. (P) indicates that the information is private
50             and not designed for user use at all, but is included in the documentation for
51             completeness and to ensure that nobody else tries to use it.
52              
53             =over
54              
55             =item newroot
56              
57             This variable allows the user to create a new root entry to occur in the trailer
58             dictionary which is output when the file is written or appended. If you wish to
59             over-ride the root element in the dictionary you have, use this entry to indicate
60             that without losing the current Root entry. Notice that newroot should point to
61             a PDF level object and not just to a dictionary which does not have object status.
62              
63             =item INFILE (R)
64              
65             Contains the filehandle used to read this information into this PDF directory. Is
66             an IO object.
67              
68             =item fname (R)
69              
70             This is the filename which is reflected by INFILE, or the original IO object passed
71             in.
72              
73             =item update (R)
74              
75             This indicates that the read file has been opened for update and that at some
76             point, $p->appendfile() can be called to update the file with the changes that
77             have been made to the memory representation.
78              
79             =item maxobj (R)
80              
81             Contains the first usable object number above any that have already appeared
82             in the file so far.
83              
84             =item outlist (P)
85              
86             This is a list of Objind which are to be output when the next appendfile or outfile
87             occurs.
88              
89             =item firstfree (P)
90              
91             Contains the first free object in the free object list. Free objects are removed
92             from the front of the list and added to the end.
93              
94             =item lastfree (P)
95              
96             Contains the last free object in the free list. It may be the same as the firstfree
97             if there is only one free object.
98              
99             =item objcache (P)
100              
101             All objects are held in the cache to ensure that a system only has one occurrence of
102             each object. In effect, the objind class acts as a container type class to hold the
103             PDF object structure and it would be unfortunate if there were two identical
104             place-holders floating around a system.
105              
106             =item epos (P)
107              
108             The end location of the read-file.
109              
110             =back
111              
112             Each trailer dictionary contains a number of private instance variables which
113             hold the chain together.
114              
115             =over
116              
117             =item loc (P)
118              
119             Contains the location of the start of the cross-reference table preceding the
120             trailer.
121              
122             =item xref (P)
123              
124             Contains an anonymous array of each cross-reference table entry.
125              
126             =item prev (P)
127              
128             A reference to the previous table. Note this differs from the Prev entry which
129             is in PDF which contains the location of the previous cross-reference table.
130              
131             =back
132              
133             =head1 METHODS
134              
135             =cut
136              
137 39     39   275 use Scalar::Util qw(blessed weaken);
  39         90  
  39         2457  
138              
139 39     39   250 use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types);
  39         85  
  39         6451  
140              
141             $ws_char = '[ \t\r\n\f\0]';
142             $delim_char = '[][<>{}()/%]';
143             $reg_char = '[^][<>{}()/% \t\r\n\f\0]';
144             $irreg_char = '[][<>{}()/% \t\r\n\f\0]';
145             $cr = '\s*(?:\015|\012|(?:\015\012))';
146              
147             my $re_comment = qr/(?:\%[^\r\n]*)/;
148             my $re_whitespace = qr/(?:[ \t\r\n\f\0]|$re_comment)/;
149              
150             %types = (
151             'Page' => 'PDF::API2::Basic::PDF::Page',
152             'Pages' => 'PDF::API2::Basic::PDF::Pages',
153             );
154              
155             my $readDebug = 0;
156              
157 39     39   318 use Carp;
  39         109  
  39         2505  
158 39     39   753 use IO::File;
  39         8814  
  39         5410  
159              
160             # Now for the basic PDF types
161 39     39   708 use PDF::API2::Basic::PDF::Utils;
  39         87  
  39         3067  
162              
163 39     39   276 use PDF::API2::Basic::PDF::Array;
  39         118  
  39         1166  
164 39     39   249 use PDF::API2::Basic::PDF::Bool;
  39         84  
  39         985  
165 39     39   233 use PDF::API2::Basic::PDF::Dict;
  39         91  
  39         1060  
166 39     39   234 use PDF::API2::Basic::PDF::Name;
  39         116  
  39         1016  
167 39     39   220 use PDF::API2::Basic::PDF::Number;
  39         116  
  39         1096  
168 39     39   231 use PDF::API2::Basic::PDF::Objind;
  39         93  
  39         1265  
169 39     39   222 use PDF::API2::Basic::PDF::String;
  39         112  
  39         890  
170 39     39   17720 use PDF::API2::Basic::PDF::Page;
  39         117  
  39         1407  
171 39     39   284 use PDF::API2::Basic::PDF::Pages;
  39         85  
  39         765  
172 39     39   196 use PDF::API2::Basic::PDF::Null;
  39         97  
  39         984  
173 39     39   201 use POSIX qw(ceil floor);
  39         155  
  39         285  
174              
175 39     39   3311 no warnings qw[ deprecated recursion uninitialized ];
  39         95  
  39         44395  
176              
177              
178             =head2 PDF::API2::Basic::PDF::File->new
179              
180             Creates a new, empty file object which can act as the host to other PDF objects.
181             Since there is no file associated with this object, it is assumed that the
182             object is created in readiness for creating a new PDF file.
183              
184             =cut
185              
186             sub new {
187 164     164 1 425 my ($class, $root) = @_;
188 164         531 my $self = $class->_new();
189              
190 164 50       467 unless ($root) {
191 164         582 $root = PDFDict();
192 164         538 $root->{'Type'} = PDFName('Catalog');
193             }
194 164         610 $self->new_obj($root);
195 164         400 $self->{'Root'} = $root;
196 164         568 return $self;
197             }
198              
199              
200             =head2 $p = PDF::API2::Basic::PDF::File->open($filename, $update)
201              
202             Opens the file and reads all the trailers and cross reference tables to build
203             a complete directory of objects.
204              
205             $update specifies whether this file is being opened for updating and editing,
206             or simply to be read.
207              
208             $filename may be an IO object
209              
210             =cut
211              
212             sub open {
213 16     16 1 65 my ($class, $filename, $update) = @_;
214 16         33 my ($fh, $buffer);
215              
216 16         74 my $self = $class->_new();
217 16 100       75 if (ref $filename) {
218 8         22 $self->{' INFILE'} = $filename;
219 8 50       28 if ($update) {
220 8         18 $self->{' update'} = 1;
221 8         27 $self->{' OUTFILE'} = $filename;
222             }
223 8         16 $fh = $filename;
224             }
225             else {
226 8 50       115 die "File '$filename' does not exist" unless -f $filename;
227 8 50       114 die "File '$filename' is not readable" unless -r $filename;
228 8 50       44 if ($update) {
229 8 50       111 die "File '$filename' is not writable" unless -w $filename;
230             }
231 8   50     109 $fh = IO::File->new(($update ? '+' : '') . "<$filename")
232             || die "Error opening '$filename': $!";
233 8         892 $self->{' INFILE'} = $fh;
234 8 50       31 if ($update) {
235 8         27 $self->{' update'} = 1;
236 8         40 $self->{' OUTFILE'} = $fh;
237 8         25 $self->{' fname'} = $filename;
238             }
239             }
240 16         105 binmode $fh, ':raw';
241 16         162 $fh->seek(0, 0); # go to start of file
242 16         272 $fh->read($buffer, 255);
243 16 50       973 unless ($buffer =~ /^\%PDF\-([12]\.\d+)\s*$cr/m) {
244 0         0 croak "$filename does not appear to be a valid PDF";
245             }
246 16         101 $self->{' version'} = $1;
247              
248 16         92 $fh->seek(0, 2); # go to end of file
249 16         268 my $end = $fh->tell();
250 16         138 $self->{' epos'} = $end;
251 16         70 foreach my $offset (1..64) {
252 32         168 $fh->seek($end - 16 * $offset, 0);
253 32         352 $fh->read($buffer, 16 * $offset);
254 32 100       956 last if $buffer =~ m/startxref($cr|\s*)\d+($cr|\s*)\%\%eof.*?/i;
255             }
256 16 50       432 unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr|\s*)\%\%eof.*?/i) {
257 0         0 die "Malformed PDF file $filename";
258             }
259 16         72 my $xpos = $1;
260 16         65 $self->{' xref_position'} = $xpos;
261              
262 16         82 my $tdict = $self->readxrtr($xpos, $self);
263 16         82 foreach my $key (keys %$tdict) {
264 121         269 $self->{$key} = $tdict->{$key};
265             }
266 16         150 return $self;
267             }
268              
269             =head2 $p->version($version)
270              
271             Gets/sets the PDF version (e.g. 1.4)
272              
273             =cut
274              
275             sub version {
276 9     9 1 14 my $self = shift();
277              
278 9 100       26 if (@_) {
279 2         5 my $version = shift();
280 2 50       20 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
281 2         7 $self->header_version($version);
282 2 50       9 if ($version >= 1.4) {
283 2         8 $self->trailer_version($version);
284             }
285             else {
286 0         0 delete $self->{'Root'}->{'Version'};
287 0         0 $self->out_obj($self->{'Root'});
288             }
289 2         4 return $version;
290             }
291              
292 7         14 my $header_version = $self->header_version();
293 7         15 my $trailer_version = $self->trailer_version();
294 7 100       44 return $trailer_version if $trailer_version > $header_version;
295 3         18 return $header_version;
296             }
297              
298             =head2 $version = $p->header_version($version)
299              
300             Gets/sets the PDF version stored in the file header.
301              
302             =cut
303              
304             sub header_version {
305 14     14 1 43 my $self = shift();
306              
307 14 100       31 if (@_) {
308 5         12 my $version = shift();
309 5 50       30 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
310 5         14 $self->{' version'} = $version;
311             }
312              
313 14         32 return $self->{' version'};
314             }
315              
316             =head2 $version = $p->trailer_version($version)
317              
318             Gets/sets the PDF version stored in the document catalog.
319              
320             =cut
321              
322             sub trailer_version {
323 12     12 1 31 my $self = shift();
324              
325 12 100       23 if (@_) {
326 4         8 my $version = shift();
327 4 50       18 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
328 4         16 $self->{'Root'}->{'Version'} = PDFName($version);
329 4         16 $self->out_obj($self->{'Root'});
330 4         9 return $version;
331             }
332              
333 8 50       28 return unless $self->{'Root'}->{'Version'};
334 8         37 $self->{'Root'}->{'Version'}->realise();
335 8         28 return $self->{'Root'}->{'Version'}->val();
336             }
337              
338             =head2 $prev_version = $p->require_version($version)
339              
340             Ensures that the PDF version is at least C<$version>.
341              
342             =cut
343              
344             sub require_version {
345 3     3 1 1019 my ($self, $min_version) = @_;
346 3         12 my $current_version = $self->version();
347 3 100       13 $self->version($min_version) if $current_version < $min_version;
348 3         9 return $current_version;
349             }
350              
351             =head2 $p->release()
352              
353             Releases ALL of the memory used by the PDF document and all of its
354             component objects. After calling this method, do B expect to
355             have anything left in the C object (so if
356             you need to save, be sure to do it before calling this method).
357              
358             B, that it is important that you call this method on any
359             C object when you wish to destruct it and
360             free up its memory. Internally, PDF files have an enormous number of
361             cross-references and this causes circular references within the
362             internal data structures. Calling 'C' forces a brute-force
363             cleanup of the data structures, freeing up all of the memory. Once
364             you've called this method, though, don't expect to be able to do
365             anything else with the C object; it'll
366             have B internal state whatsoever.
367              
368             =cut
369              
370             # Maintainer's Question: Couldn't this be handled by a DESTROY method
371             # instead of requiring an explicit call to release()?
372             sub release {
373 618     618 1 925 my $self = shift();
374              
375 618 50       1229 return $self unless ref($self);
376 618         1423 my @tofree = values %$self;
377              
378 618         1353 foreach my $key (keys %$self) {
379 2271         2944 $self->{$key} = undef;
380 2271         3243 delete $self->{$key};
381             }
382              
383             # PDFs with highly-interconnected page trees or outlines can hit Perl's
384             # recursion limit pretty easily, so disable the warning for this specific
385             # loop.
386 39     39   364 no warnings 'recursion';
  39         94  
  39         299054  
387              
388 618         2243 while (my $item = shift @tofree) {
389 7109 100 100     24004 if (blessed($item) and $item->can('release')) {
    100          
    100          
390 2117         4280 $item->release(1);
391             }
392             elsif (ref($item) eq 'ARRAY') {
393 1554         4353 push @tofree, @$item;
394             }
395             elsif (ref($item) eq 'HASH') {
396 878         2158 push @tofree, values %$item;
397 878         2075 foreach my $key (keys %$item) {
398 3822         4899 $item->{$key} = undef;
399 3822         6609 delete $item->{$key};
400             }
401             }
402             else {
403 2560         5748 $item = undef;
404             }
405             }
406             }
407              
408             =head2 $p->append_file()
409              
410             Appends the objects for output to the read file and then appends the appropriate table.
411              
412             =cut
413              
414             sub append_file {
415 3     3 1 8 my $self = shift();
416 3 50       10 return unless $self->{' update'};
417              
418 3         8 my $fh = $self->{' INFILE'};
419              
420 3         11 my $tdict = PDFDict();
421 3         24 $tdict->{'Prev'} = PDFNum($self->{' loc'});
422 3         9 $tdict->{'Info'} = $self->{'Info'};
423 3 50       11 if (defined $self->{' newroot'}) {
424 0         0 $tdict->{'Root'} = $self->{' newroot'};
425             }
426             else {
427 3         7 $tdict->{'Root'} = $self->{'Root'};
428             }
429 3         9 $tdict->{'Size'} = $self->{'Size'};
430              
431 3         26 foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) {
  54         128  
432 9 50       24 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
433             }
434              
435 3         18 $fh->seek($self->{' epos'}, 0);
436 3         26 $self->out_trailer($tdict, $self->{' update'});
437 3         38 close $self->{' OUTFILE'};
438             }
439              
440              
441             =head2 $p->out_file($fname)
442              
443             Writes a PDF file to a file of the given filename based on the current list of
444             objects to be output. It creates the trailer dictionary based on information
445             in $self.
446              
447             $fname may be an IO object;
448              
449             =cut
450              
451             sub out_file {
452 140     140 1 347 my ($self, $fname) = @_;
453              
454 140         479 $self->create_file($fname);
455 140         434 $self->close_file();
456 140         364 return $self;
457             }
458              
459              
460             =head2 $p->create_file($fname)
461              
462             Creates a new output file (no check is made of an existing open file) of
463             the given filename or IO object. Note, make sure that $p->{' version'} is set
464             correctly before calling this function.
465              
466             =cut
467              
468             sub create_file {
469 140     140 1 289 my ($self, $filename) = @_;
470 140         223 my $fh;
471              
472 140         304 $self->{' fname'} = $filename;
473 140 50       363 if (ref $filename) {
474 140         229 $fh = $filename;
475             }
476             else {
477 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
478 0         0 binmode($fh,':raw');
479             }
480              
481 140         270 $self->{' OUTFILE'} = $fh;
482 140   50     842 $fh->print('%PDF-' . ($self->{' version'} // '1.4') . "\n");
483 140         1249 $fh->print("%\xC6\xCD\xCD\xB5\n"); # and some binary stuff in a comment
484 140         803 return $self;
485             }
486              
487              
488             =head2 $p->clone_file($fname)
489              
490             Creates a copy of the input file at the specified filename and sets it as the
491             output file for future writes. A file handle may be passed instead of a
492             filename.
493              
494             =cut
495              
496             sub clone_file {
497 4     4 1 22 my ($self, $filename) = @_;
498 4         9 my $fh;
499              
500 4         14 $self->{' fname'} = $filename;
501 4 50       17 if (ref $filename) {
502 4         10 $fh = $filename;
503             }
504             else {
505 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
506 0         0 binmode($fh,':raw');
507             }
508              
509 4         11 $self->{' OUTFILE'} = $fh;
510              
511 4         10 my $in = $self->{' INFILE'};
512 4         23 $in->seek(0, 0);
513 4         58 my $data;
514 4         26 while (not $in->eof()) {
515 4         96 $in->read($data, 1024 * 1024);
516 4         155 $fh->print($data);
517             }
518 4         80 return $self;
519             }
520              
521             =head2 $p->close_file
522              
523             Closes up the open file for output by outputting the trailer etc.
524              
525             =cut
526              
527             sub close_file {
528 145     145 1 227 my $self = shift();
529              
530 145         422 my $tdict = PDFDict();
531 145 50       570 $tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'};
532 145 50 33     550 $tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'};
533              
534             # remove all freed objects from the outlist, AND the outlist_cache if not updating
535             # NO! Don't do that thing! In fact, let out_trailer do the opposite!
536              
537 145   66     571 $tdict->{'Size'} = $self->{'Size'} || PDFNum(1);
538 145 100       483 $tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'};
539 145 100       388 if ($self->{' update'}) {
540 5         43 foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) {
  97         226  
541 16 50       47 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
542             }
543              
544 5         17 my $fh = $self->{' INFILE'};
545 5         32 $fh->seek($self->{' epos'}, 0);
546             }
547              
548 145         847 $self->out_trailer($tdict, $self->{' update'});
549 145         1433 close($self->{' OUTFILE'});
550 145 50 33     677 if ($^O eq 'MacOS' and not ref($self->{' fname'})) {
551 0         0 MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'});
552             }
553              
554 145         674 return $self;
555             }
556              
557             =head2 ($value, $str) = $p->readval($str, %opts)
558              
559             Reads a PDF value from the current position in the file. If $str is too short
560             then read some more from the current location in the file until the whole object
561             is read. This is a recursive call which may slurp in a whole big stream (unprocessed).
562              
563             Returns the recursive data structure read and also the current $str that has been
564             read from the file.
565              
566             =cut
567              
568             sub readval {
569 831     831 1 14354 my ($self, $str, %opts) = @_;
570 831         1415 my $fh = $self->{' INFILE'};
571 831         1235 my ($result, $value);
572              
573 831 100       1605 my $update = defined($opts{update}) ? $opts{update} : 1;
574 831 100       1861 $str = update($fh, $str) if $update;
575              
576 831         2462 $str =~ s/^$ws_char+//; # Ignore initial white space
577 831         2243 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
578              
579             # Dictionary
580 831 100       9870 if ($str =~ m/^<
    100          
    100          
    100          
    100          
    50          
    100          
    50          
    50          
    0          
581 146         413 $str = substr ($str, 2);
582 146 100       408 $str = update($fh, $str) if $update;
583 146         469 $result = PDFDict();
584              
585 146         441 while ($str !~ m/^>>/) {
586 387         1345 $str =~ s/^$ws_char+//; # Ignore initial white space
587 387         1116 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
588              
589 387 50       2066 if ($str =~ s|^/($reg_char+)||) {
    0          
    0          
590 387         1074 my $key = PDF::API2::Basic::PDF::Name::name_to_string($1, $self);
591 387         1472 ($value, $str) = $self->readval($str, %opts);
592 387 50 50     1243 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
593 387         904 $result->{$key} = $value;
594             }
595             }
596             elsif ($str =~ s|^/$ws_char+||) {
597             # fixes a broken key problem of acrobat. -- fredo
598 0         0 ($value, $str) = $self->readval($str, %opts);
599 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
600 0         0 $result->{'null'} = $value;
601             }
602             }
603             elsif ($str =~ s|^//|/|) {
604             # fixes again a broken key problem of illustrator/enfocus. -- fredo
605 0         0 ($value, $str) = $self->readval($str, %opts);
606 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
607 0         0 $result->{'null'} = $value;
608             }
609             }
610             else {
611 0         0 die "Invalid dictionary key";
612             }
613 387 100       1320 $str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk
614             }
615 146         499 $str =~ s/^>>//;
616 146 100       405 $str = update($fh, $str) if $update;
617             # streams can't be followed by a lone carriage-return.
618             # fredo: yes they can !!! -- use the MacOS Luke.
619 146 100 66     541 if (($str =~ s/^stream(?:(?:\015\012)|\012|\015)//) and ($result->{'Length'}->val != 0)) { # stream
620 10         35 my $length = $result->{'Length'}->val;
621 10         28 $result->{' streamsrc'} = $fh;
622 10         35 $result->{' streamloc'} = $fh->tell - length($str);
623              
624 10 50       112 unless ($opts{'nostreams'}) {
625 10 50       65 if ($length > length($str)) {
626 0         0 $value = $str;
627 0         0 $length -= length($str);
628 0         0 read $fh, $str, $length + 11; # slurp the whole stream!
629             }
630             else {
631 10         25 $value = '';
632             }
633 10         32 $value .= substr($str, 0, $length);
634 10         29 $result->{' stream'} = $value;
635 10         23 $result->{' nofilt'} = 1;
636 10 50       35 $str = update($fh, $str, 1) if $update; # tell update we are in-stream and only need an endstream
637 10         44 $str = substr($str, index($str, 'endstream') + 9);
638             }
639             }
640              
641 146 100 100     569 if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val}) {
642 34         119 bless $result, $types{$result->{'Type'}->val};
643             }
644             # gdj: FIXME: if any of the ws chars were crs, then the whole
645             # string might not have been read.
646             }
647              
648             # Indirect Object
649             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R/s) {
650 152         408 my $num = $1;
651 152         273 $value = $2;
652 152         1432 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R//s;
653 152 100       471 unless ($result = $self->test_obj($num, $value)) {
654 119         543 $result = PDF::API2::Basic::PDF::Objind->new();
655 119         392 $result->{' objnum'} = $num;
656 119         232 $result->{' objgen'} = $value;
657 119         365 $self->add_obj($result, $num, $value);
658             }
659 152         327 $result->{' parent'} = $self;
660 152         514 weaken $result->{' parent'};
661              
662             # Removed to address changes being lost when an indirect object is realised twice
663             # $result->{' realised'} = 0;
664              
665             # gdj: FIXME: if any of the ws chars were crs, then the whole
666             # string might not have been read.
667             }
668              
669             # Object
670             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj/s) {
671 95         182 my $obj;
672 95         262 my $num = $1;
673 95         181 $value = $2;
674 95         1124 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj//s;
675 95         398 ($obj, $str) = $self->readval($str, %opts);
676 95 100       248 if ($result = $self->test_obj($num, $value)) {
677 81         305 $result->merge($obj);
678             }
679             else {
680 14         24 $result = $obj;
681 14         57 $self->add_obj($result, $num, $value);
682 14         25 $result->{' realised'} = 1;
683             }
684 95 100       298 $str = update($fh, $str) if $update; # thanks to kundrat@kundrat.sk
685 95         640 $str =~ s/^endobj//;
686             }
687              
688             # Name
689             elsif ($str =~ m|^/($reg_char*)|s) {
690 276         677 $value = $1;
691 276         1447 $str =~ s|^/($reg_char*)||s;
692 276         1001 $result = PDF::API2::Basic::PDF::Name->from_pdf($value, $self);
693             }
694              
695             # Literal String
696             elsif ($str =~ m/^\(/) {
697             # We now need to find an unbalanced, unescaped right-paren.
698             # This can't be done with a regex.
699 2         13 my $value = '(';
700 2         8 $str = substr($str, 1);
701              
702 2         4 my $nested_level = 1;
703 2         5 while (1) {
704             # Ignore everything up to the first escaped or parenthesis character
705 2 50       13 if ($str =~ /^([^\\()]+)(.*)/s) {
706 2         7 $value .= $1;
707 2         6 $str = $2;
708             }
709              
710             # Ignore escaped parentheses
711 2 50       18 if ($str =~ /^(\\[()])/) {
    50          
    50          
    0          
712 0         0 $value .= $1;
713 0         0 $str = substr($str, 2);
714             }
715              
716             # Left parenthesis: increase nesting
717             elsif ($str =~ /^\(/) {
718 0         0 $value .= '(';
719 0         0 $str = substr($str, 1);
720 0         0 $nested_level++;
721             }
722              
723             # Right parenthesis: decrease nesting
724             elsif ($str =~ /^\)/) {
725 2         7 $value .= ')';
726 2         16 $str = substr($str, 1);
727 2         6 $nested_level--;
728 2 50       7 last unless $nested_level;
729             }
730              
731             # Other escaped character
732             elsif ($str =~ /^(\\[^()])/) {
733 0         0 $value .= $1;
734 0         0 $str = substr($str, 2);
735             }
736              
737             # If there wasn't an escaped or parenthesis character,
738             # read some more.
739             else {
740             # We don't use update because we don't want to remove
741             # whitespace or comments.
742 0 0       0 $fh->read($str, 255, length($str)) or die 'Unterminated string.';
743             }
744             }
745              
746 2         10 $result = PDF::API2::Basic::PDF::String->from_pdf($value);
747             }
748              
749             # Hex String
750             elsif ($str =~ m/^
751 0         0 $str =~ s/^
752 0         0 $fh->read($str, 255, length($str)) while (0 > index($str, '>'));
753 0         0 ($value, $str) = ($str =~ /^(.*?)>(.*)/s);
754 0         0 $result = PDF::API2::Basic::PDF::String->from_pdf('<' . $value . '>');
755             }
756              
757             # Array
758             elsif ($str =~ m/^\[/) {
759 74         301 $str =~ s/^\[//;
760 74 50       270 $str = update($fh, $str) if $update;
761 74         220 $result = PDFArray();
762 74         211 while ($str !~ m/^\]/) {
763 224         850 $str =~ s/^$ws_char+//; # Ignore initial white space
764 224         712 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
765              
766 224         681 ($value, $str) = $self->readval($str, %opts);
767 224         725 $result->add_elements($value);
768 224 50       533 $str = update($fh, $str) if $update; # str might just be exhausted!
769             }
770 74         288 $str =~ s/^\]//;
771             }
772              
773             # Boolean
774             elsif ($str =~ m/^(true|false)($irreg_char|$)/) {
775 0         0 $value = $1;
776 0         0 $str =~ s/^(?:true|false)//;
777 0         0 $result = PDF::API2::Basic::PDF::Bool->from_pdf($value);
778             }
779              
780             # Number
781             elsif ($str =~ m/^([+-.0-9]+)($irreg_char|$)/) {
782 86         259 $value = $1;
783 86         285 $str =~ s/^([+-.0-9]+)//;
784              
785             # If $str only consists of whitespace (or is empty), call update to
786             # see if this is the beginning of an indirect object or reference
787 86 100 100     1631 if ($update and ($str =~ /^$re_whitespace*$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace*$/s)) {
      100        
788 6         60 $str =~ s/^$re_whitespace+/ /s;
789 6         46 $str =~ s/$re_whitespace+$/ /s;
790 6         16 $str = update($fh, $str);
791 6 100       161 if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R|obj)/s) {
792 4         19 return $self->readval("$value $str", %opts);
793             }
794             }
795              
796 82         526 $result = PDF::API2::Basic::PDF::Number->from_pdf($value);
797             }
798              
799             # Null
800             elsif ($str =~ m/^null($irreg_char|$)/) {
801 0         0 $str =~ s/^null//;
802 0         0 $result = PDF::API2::Basic::PDF::Null->new;
803             }
804              
805             else {
806 0         0 die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . ".";
807             }
808              
809 827         3681 $str =~ s/^$ws_char+//s;
810 827         2919 return ($result, $str);
811             }
812              
813              
814             =head2 $ref = $p->read_obj($objind, %opts)
815              
816             Given an indirect object reference, locate it and read the object returning
817             the read in object.
818              
819             =cut
820              
821             sub read_obj {
822 77     77 1 173 my ($self, $objind, %opts) = @_;
823              
824 77   50     294 my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) || return;
825 77 50       287 $objind->merge($res) unless $objind eq $res;
826 77         283 return $objind;
827             }
828              
829              
830             =head2 $ref = $p->read_objnum($num, $gen, %opts)
831              
832             Returns a fully read object of given number and generation in this file
833              
834             =cut
835              
836             sub read_objnum {
837 85     85 1 2471 my ($self, $num, $gen, %opts) = @_;
838 85 50       197 croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num;
839 85 50       182 croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen;
840 85 50       410 croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/;
841 85 50       320 croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/;
842              
843 85   50     281 my $object_location = $self->locate_obj($num, $gen) || return;
844 85         166 my $object;
845              
846             # Compressed object
847 85 100       195 if (ref($object_location)) {
848 4         8 my ($object_stream_num, $object_stream_pos) = @{$object_location};
  4         10  
849              
850 4         32 my $object_stream = $self->read_objnum($object_stream_num, 0, %opts);
851 4 50       17 die 'Cannot find the compressed object stream' unless $object_stream;
852              
853 4 50       23 $object_stream->read_stream() if $object_stream->{' nofilt'};
854              
855             # An object stream starts with pairs of integers containing object numbers and
856             # stream offsets relative to the First key
857 4         12 my $fh;
858             my $pairs;
859 4 50       12 unless ($object_stream->{' streamfile'}) {
860 4         18 $pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val);
861             }
862             else {
863 0         0 CORE::open($fh, '<', $object_stream->{' streamfile'});
864 0         0 read($fh, $pairs, $object_stream->{'First'}->val());
865             }
866 4         32 my @map = split /\s+/, $pairs;
867              
868             # Find the offset of the object in the stream
869 4         21 my $index = $object_stream_pos * 2;
870 4 50       16 die "Objind $num does not exist at index $index" unless $map[$index] == $num;
871 4         16 my $start = $map[$index + 1];
872              
873             # Unless this is the last object in the stream, its length is determined by the
874             # offset of the next object
875 4         16 my $last_object_in_stream = $map[-2];
876 4         11 my $length;
877 4 100       15 if ($last_object_in_stream == $num) {
878 2 50       7 if ($object_stream->{' stream'}) {
879 2         17 $length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start;
880             }
881             else {
882 0         0 $length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start;
883             }
884             }
885             else {
886 2         11 my $next_start = $map[$index + 3];
887 2         5 $length = $next_start - $start;
888             }
889              
890             # Read the object from the stream
891 4         14 my $stream = "$num 0 obj ";
892 4 50       21 unless ($object_stream->{' streamfile'}) {
893 4         17 $stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length);
894             }
895             else {
896 0         0 seek($fh, $object_stream->{'First'}->val() + $start, 0);
897 0         0 read($fh, $stream, $length, length($stream));
898 0         0 close $fh;
899             }
900              
901 4         18 ($object) = $self->readval($stream, %opts, update => 0);
902 4         20 return $object;
903             }
904              
905 81         288 my $current_location = $self->{' INFILE'}->tell;
906 81         612 $self->{' INFILE'}->seek($object_location, 0);
907 81         852 ($object) = $self->readval('', %opts);
908 81         468 $self->{' INFILE'}->seek($current_location, 0);
909 81         1143 return $object;
910             }
911              
912              
913             =head2 $objind = $p->new_obj($obj)
914              
915             Creates a new, free object reference based on free space in the cross reference chain.
916             If nothing free then thinks up a new number. If $obj then turns that object into this
917             new object rather than returning a new object.
918              
919             =cut
920              
921             sub new_obj {
922 1072     1072 1 2021 my ($self, $base) = @_;
923 1072         1726 my $res;
924              
925 1072 50 66     2901 if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) {
  15         63  
926 0         0 $res = shift(@{$self->{' free'}});
  0         0  
927 0 0       0 if (defined $base) {
928 0         0 my ($num, $gen) = @{$self->{' objects'}{$res->uid}};
  0         0  
929 0         0 $self->remove_obj($res);
930 0         0 $self->add_obj($base, $num, $gen);
931 0         0 return $self->out_obj($base);
932             }
933             else {
934 0         0 $self->{' objects'}{$res->uid}[2] = 0;
935 0         0 return $res;
936             }
937             }
938              
939 1072         1610 my $tdict = $self;
940 1072         1464 my $i;
941 1072         2212 while (defined $tdict) {
942 1073 50       3238 $i = $tdict->{' xref'}{defined($i) ? $i : ''}[0];
943 1073   33     2671 while (defined $i and $i != 0) {
944 0         0 my ($ni, $ng) = @{$tdict->{' xref'}{$i}};
  0         0  
945 0 0       0 unless (defined $self->locate_obj($i, $ng)) {
946 0 0       0 if (defined $base) {
947 0         0 $self->add_obj($base, $i, $ng);
948 0         0 return $base;
949             }
950             else {
951 0   0     0 $res = $self->test_obj($i, $ng) || $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, $ng);
952 0         0 $self->out_obj($res);
953 0         0 return $res;
954             }
955             }
956 0         0 $i = $ni;
957             }
958 1073         2272 $tdict = $tdict->{' prev'};
959             }
960              
961 1072         1805 $i = $self->{' maxobj'}++;
962 1072 50       2039 if (defined $base) {
963 1072         3075 $self->add_obj($base, $i, 0);
964 1072         2719 $self->out_obj($base);
965 1072         2276 return $base;
966             }
967             else {
968 0         0 $res = $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, 0);
969 0         0 $self->out_obj($res);
970 0         0 return $res;
971             }
972             }
973              
974              
975             =head2 $p->out_obj($objind)
976              
977             Indicates that the given object reference should appear in the output xref
978             table whether with data or freed.
979              
980             =cut
981              
982             sub out_obj {
983 2676     2676 1 4544 my ($self, $obj) = @_;
984              
985             # This is why we've been keeping the outlist CACHE around; to speed
986             # up this method by orders of magnitude (it saves up from having to
987             # grep the full outlist each time through as we'll just do a lookup
988             # in the hash) (which is super-fast).
989 2676 100       7115 unless (exists $self->{' outlist_cache'}{$obj}) {
990 1087         1609 push @{$self->{' outlist'}}, $obj;
  1087         2442  
991             # weaken $self->{' outlist'}->[-1];
992 1087         3378 $self->{' outlist_cache'}{$obj} = 1;
993             }
994 2676         4748 return $obj;
995             }
996              
997              
998             =head2 $p->free_obj($objind)
999              
1000             Marks an object reference for output as being freed.
1001              
1002             =cut
1003              
1004             sub free_obj {
1005 0     0 1 0 my ($self, $obj) = @_;
1006              
1007 0         0 push @{$self->{' free'}}, $obj;
  0         0  
1008 0         0 $self->{' objects'}{$obj->uid()}[2] = 1;
1009 0         0 $self->out_obj($obj);
1010             }
1011              
1012              
1013             =head2 $p->remove_obj($objind)
1014              
1015             Removes the object from all places where we might remember it
1016              
1017             =cut
1018              
1019             sub remove_obj {
1020 0     0 1 0 my ($self, $objind) = @_;
1021              
1022             # who says it has to be fast
1023 0         0 delete $self->{' objects'}{$objind->uid()};
1024 0         0 delete $self->{' outlist_cache'}{$objind};
1025 0         0 delete $self->{' printed_cache'}{$objind};
1026 0         0 @{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}};
  0         0  
  0         0  
  0         0  
1027 0         0 @{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}};
  0         0  
  0         0  
  0         0  
1028             $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef
1029 0 0       0 if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind;
1030 0         0 return $self;
1031             }
1032              
1033              
1034             =head2 $p->ship_out(@objects)
1035              
1036             Ships the given objects (or all objects for output if @objects is empty) to
1037             the currently open output file (assuming there is one). Freed objects are not
1038             shipped, and once an object is shipped it is switched such that this file
1039             becomes its source and it will not be shipped again unless out_obj is called
1040             again. Notice that a shipped out object can be re-output or even freed, but
1041             that it will not cause the data already output to be changed.
1042              
1043             =cut
1044              
1045             sub ship_out {
1046 154     154 1 322 my ($self, @objs) = @_;
1047              
1048 154 50       387 die "No output file specified" unless defined $self->{' OUTFILE'};
1049 154         267 my $fh = $self->{' OUTFILE'};
1050 154         363 seek($fh, 0, 2); # go to the end of the file
1051              
1052 154 50       394 @objs = @{$self->{' outlist'}} unless scalar @objs > 0;
  154         422  
1053 154         417 foreach my $objind (@objs) {
1054 890 50       2438 next unless $objind->is_obj($self);
1055 890         1576 my $j = -1;
1056 890         1402 for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) {
  890         2029  
1057 890 50       2435 if ($self->{' outlist'}[$i] eq $objind) {
1058 890         1331 $j = $i;
1059 890         1596 last;
1060             }
1061             }
1062 890 50       1771 next if $j < 0;
1063 890         1221 splice(@{$self->{' outlist'}}, $j, 1);
  890         1757  
1064 890         2241 delete $self->{' outlist_cache'}{$objind};
1065 890 50       1235 next if grep { $_ eq $objind } @{$self->{' free'}};
  0         0  
  890         2281  
1066              
1067 890 50       1812 map { $fh->print("\% $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'};
  0         0  
1068 890         2446 $self->{' locs'}{$objind->uid()} = $fh->tell();
1069 890         1654 my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1];
  890         1826  
1070 890         2600 $fh->printf('%d %d obj ', $objnum, $objgen);
1071 890         9002 $objind->outobjdeep($fh, $self);
1072 890         3271 $fh->print(" endobj\n");
1073              
1074             # Note that we've output this obj, not forgetting to update
1075             # the cache of whats printed.
1076 890 50       5822 unless (exists $self->{' printed_cache'}{$objind}) {
1077 890         1281 push @{$self->{' printed'}}, $objind;
  890         1979  
1078 890         3012 $self->{' printed_cache'}{$objind}++;
1079             }
1080             }
1081 154         397 return $self;
1082             }
1083              
1084             =head2 $p->copy($outpdf, \&filter)
1085              
1086             Iterates over every object in the file reading the object, calling filter with the object
1087             and outputting the result. if filter is not defined, then just copies input to output.
1088              
1089             =cut
1090              
1091             sub copy {
1092 0     0 1 0 my ($self, $out, $filter) = @_;
1093 0         0 my ($obj, $minl, $mini, $ming);
1094              
1095 0         0 foreach my $key (grep { not m/^[\s\-]/ } keys %$self) {
  0         0  
1096 0 0       0 $out->{$key} = $self->{$key} unless defined $out->{$key};
1097             }
1098              
1099 0         0 my $tdict = $self;
1100 0         0 while (defined $tdict) {
1101 0         0 foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) {
  0         0  
  0         0  
1102 0         0 my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}};
  0         0  
1103 0 0       0 next unless $nt eq 'n';
1104              
1105 0 0 0     0 if ($nl < $minl or $mini == 0) {
1106 0         0 $mini = $i;
1107 0         0 $ming = $ng;
1108 0         0 $minl = $nl;
1109             }
1110 0 0       0 unless ($obj = $self->test_obj($i, $ng)) {
1111 0         0 $obj = PDF::API2::Basic::PDF::Objind->new();
1112 0         0 $obj->{' objnum'} = $i;
1113 0         0 $obj->{' objgen'} = $ng;
1114 0         0 $self->add_obj($obj, $i, $ng);
1115 0         0 $obj->{' parent'} = $self;
1116 0         0 weaken $obj->{' parent'};
1117 0         0 $obj->{' realised'} = 0;
1118             }
1119 0         0 $obj->realise;
1120 0 0       0 my $res = defined $filter ? &{$filter}($obj) : $obj;
  0         0  
1121 0 0 0     0 $out->new_obj($res) unless (!$res || $res->is_obj($out));
1122             }
1123 0         0 $tdict = $tdict->{' prev'};
1124             }
1125              
1126             # test for linearized and remove it from output
1127 0         0 $obj = $self->test_obj($mini, $ming);
1128 0 0 0     0 if ($obj->isa('PDF::API2::Basic::PDF::Dict') && $obj->{'Linearized'}) {
1129 0         0 $out->free_obj($obj);
1130             }
1131              
1132 0         0 return $self;
1133             }
1134              
1135              
1136             =head1 PRIVATE METHODS & FUNCTIONS
1137              
1138             The following methods and functions are considered private to this class. This
1139             does not mean you cannot use them if you have a need, just that they aren't really
1140             designed for users of this class.
1141              
1142             =head2 $offset = $p->locate_obj($num, $gen)
1143              
1144             Returns a file offset to the object asked for by following the chain of cross
1145             reference tables until it finds the one you want.
1146              
1147             =cut
1148              
1149             sub locate_obj {
1150 85     85 1 191 my ($self, $num, $gen) = @_;
1151              
1152 85         125 my $tdict = $self;
1153 85         204 while (defined $tdict) {
1154 94 100       310 if (ref $tdict->{' xref'}{$num}) {
1155 85         155 my $ref = $tdict->{' xref'}{$num};
1156 85 100       206 return $ref unless scalar(@$ref) == 3;
1157              
1158 81 50       278 if ($ref->[1] == $gen) {
1159 81 50       395 return $ref->[0] if $ref->[2] eq 'n';
1160 0         0 return; # if $ref->[2] eq 'f';
1161             }
1162             }
1163 9         38 $tdict = $tdict->{' prev'};
1164             }
1165 0         0 return;
1166             }
1167              
1168              
1169             =head2 update($fh, $str, $instream)
1170              
1171             Keeps reading $fh for more data to ensure that $str has at least a line full
1172             for C to work on. At this point we also take the opportunity to ignore
1173             comments.
1174              
1175             =cut
1176              
1177             sub update {
1178 1918     1918 1 3408 my ($fh, $str, $instream) = @_;
1179 1918 50       3401 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1180 1918 100       3111 if ($instream) {
1181             # we are inside a (possible binary) stream
1182             # so we fetch data till we see an 'endstream'
1183             # -- fredo/2004-09-03
1184 10   33     53 while ($str !~ m/endstream/ and not $fh->eof()) {
1185 0 0       0 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1186 0         0 $fh->read($str, 314, length($str));
1187             }
1188             }
1189             else {
1190 1908         7940 $str =~ s/^$ws_char*//;
1191 1908   100     15374 while ($str !~ m/$cr/ and not $fh->eof()) {
1192 107 50       1404 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1193 107         450 $fh->read($str, 314, length($str));
1194 107         2543 $str =~ s/^$ws_char*//so;
1195             }
1196 1908         4872 while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23
1197 1 50       5 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1198 1   33     34 $fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof());
1199 1         24 $str =~ s/^\%[^\015\012]*$ws_char*//so; # fixed for reportlab -- fredo
1200             }
1201             }
1202              
1203 1918         4557 return $str;
1204             }
1205              
1206             =head2 $objind = $p->test_obj($num, $gen)
1207              
1208             Tests the cache to see whether an object reference (which may or may not have
1209             been getobj()ed) has been cached. Returns it if it has.
1210              
1211             =cut
1212              
1213             sub test_obj {
1214 247     247 1 558 my ($self, $num, $gen) = @_;
1215 247         962 return $self->{' objcache'}{$num, $gen};
1216             }
1217              
1218              
1219             =head2 $p->add_obj($objind)
1220              
1221             Adds the given object to the internal object cache.
1222              
1223             =cut
1224              
1225             sub add_obj {
1226 1205     1205 1 2547 my ($self, $obj, $num, $gen) = @_;
1227              
1228 1205         4081 $self->{' objcache'}{$num, $gen} = $obj;
1229 1205         5293 $self->{' objects'}{$obj->uid()} = [$num, $gen];
1230             # weaken $self->{' objcache'}{$num, $gen};
1231 1205         2367 return $obj;
1232             }
1233              
1234              
1235             =head2 $tdict = $p->readxrtr($xpos)
1236              
1237             Recursive function which reads each of the cross-reference and trailer tables
1238             in turn until there are no more.
1239              
1240             Returns a dictionary corresponding to the trailer chain. Each trailer also
1241             includes the corresponding cross-reference table.
1242              
1243             The structure of the xref private element in a trailer dictionary is of an
1244             anonymous hash of cross reference elements by object number. Each element
1245             consists of an array of 3 elements corresponding to the three elements read
1246             in [location, generation number, free or used]. See the PDF specification
1247             for details.
1248              
1249             =cut
1250              
1251             sub _unpack_xref_stream {
1252 78     78   138 my ($self, $width, $data) = @_;
1253              
1254 78 100       161 return unpack('C', $data) if $width == 1;
1255 52 50       121 return unpack('n', $data) if $width == 2;
1256 0 0       0 return unpack('N', "\x00$data") if $width == 3;
1257 0 0       0 return unpack('N', $data) if $width == 4;
1258 0 0       0 return unpack('Q>', $data) if $width == 8;
1259              
1260 0         0 die "Unsupported xref stream entry width: $width";
1261             }
1262              
1263             sub readxrtr {
1264 19     19 1 68 my ($self, $xpos) = @_;
1265 19         43 my ($tdict, $buf, $xmin, $xnum, $xdiff);
1266              
1267 19         67 my $fh = $self->{' INFILE'};
1268 19         91 $fh->seek($xpos, 0);
1269 19         255 $fh->read($buf, 22);
1270 19         267 $buf = update($fh, $buf); # fix for broken JAWS xref calculation.
1271              
1272 19         47 my $xlist = {};
1273              
1274             ## seams that some products calculate wrong prev entries (short)
1275             ## so we seek ahead to find one -- fredo; save for now
1276             #while($buf !~ m/^xref$cr/i && !eof($fh))
1277             #{
1278             # $buf =~ s/^(\s+|\S+|.)//i;
1279             # $buf=update($fh,$buf);
1280             #}
1281              
1282 19 100       304 if ($buf =~ s/^xref$cr//i) {
    50          
1283             # Plain XRef tables.
1284 16         524 while ($buf =~ m/^$ws_char*([0-9]+)$ws_char+([0-9]+)$ws_char*$cr(.*?)$/s) {
1285 21         54 my $old_buf = $buf;
1286 21         57 $xmin = $1;
1287 21         48 $xnum = $2;
1288 21         65 $buf = $3;
1289 21 50       384 unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) {
1290             # See PDF 1.7 section 7.5.4: Cross-Reference Table
1291 0         0 warn q{Malformed xref in PDF file: subsection shall begin with a line containing two numbers separated by a SPACE (20h)};
1292             }
1293 21         57 $xdiff = length($buf);
1294              
1295 21         138 $fh->read($buf, 20 * $xnum - $xdiff + 15, $xdiff);
1296 21   66     703 while ($xnum-- > 0 and $buf =~ s/^0*([0-9]*)$ws_char+0*([0-9]+)$ws_char+([nf])$cr//) {
1297 118 50       616 $xlist->{$xmin} = [$1, $2, $3] unless exists $xlist->{$xmin};
1298 118         970 $xmin++;
1299             }
1300             }
1301              
1302 16 50       103 if ($buf !~ /^\s*trailer\b/i) {
1303 0         0 die "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell - length($buf));
1304             }
1305              
1306 16         78 $buf =~ s/^\s*trailer\b//i;
1307              
1308 16         94 ($tdict, $buf) = $self->readval($buf);
1309             }
1310             elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) {
1311 3         18 my ($xref_obj, $xref_gen) = ($1, $2);
1312              
1313             # XRef streams.
1314 3         13 ($tdict, $buf) = $self->readval($buf);
1315              
1316 3 50       14 unless ($tdict->{' stream'}) {
1317 0         0 die "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}";
1318             }
1319 3         14 $tdict->read_stream(1);
1320              
1321 3         9 my $stream = $tdict->{' stream'};
1322 3         5 my @widths = map { $_->val } @{$tdict->{W}->val};
  9         20  
  3         12  
1323              
1324 3         7 my $start = 0;
1325 3         7 my $last;
1326              
1327             my @index;
1328 3 100       11 if (defined $tdict->{Index}) {
1329 1         2 @index = map { $_->val() } @{$tdict->{Index}->val};
  2         4  
  1         7  
1330             }
1331             else {
1332 2         10 @index = (0, $tdict->{Size}->val);
1333             }
1334              
1335 3         15 while (scalar @index) {
1336 3         10 $start = shift(@index);
1337 3         11 $last = $start + shift(@index) - 1;
1338              
1339 3         13 for my $i ($start...$last) {
1340             # Replaced "for $xmin" because it creates a loop-specific local variable, and we
1341             # need $xmin to be correct for maxobj below.
1342 26         36 $xmin = $i;
1343              
1344 26         32 my @cols;
1345              
1346 26         42 for my $w (@widths) {
1347 78         96 my $data;
1348 78 50       208 $data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w;
1349              
1350 78         157 push @cols, $data;
1351             }
1352              
1353 26 100       54 $cols[0] = 1 unless defined $cols[0];
1354 26 50       48 if ($cols[0] > 2) {
1355 0         0 die "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj";
1356             }
1357              
1358 26 50       50 next if exists $xlist->{$xmin};
1359              
1360 26 50       60 my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535));
    100          
1361 26 100       66 push @objind, ($cols[0] == 0 ? 'f' : 'n') if $cols[0] < 2;
    100          
1362              
1363 26         72 $xlist->{$xmin} = \@objind;
1364             }
1365             }
1366             }
1367             else {
1368 0         0 die "Malformed xref in PDF file $self->{' fname'}";
1369             }
1370              
1371 19         69 $tdict->{' loc'} = $xpos;
1372 19         49 $tdict->{' xref'} = $xlist;
1373 19 100       127 $self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'};
1374             $tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val)
1375 19 100 66     100 if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val != 0);
1376 19 100       70 delete $tdict->{' prev'} unless defined $tdict->{' prev'};
1377 19         60 return $tdict;
1378             }
1379              
1380              
1381             =head2 $p->out_trailer($tdict)
1382              
1383             Outputs the body and trailer for a PDF file by outputting all the objects in
1384             the ' outlist' and then outputting a xref table for those objects and any
1385             freed ones. It then outputs the trailing dictionary and the trailer code.
1386              
1387             =cut
1388              
1389             sub out_trailer {
1390 148     148 1 529 my ($self, $tdict, $update) = @_;
1391 148         290 my $fh = $self->{' OUTFILE'};
1392              
1393 148         300 while (@{$self->{' outlist'}}) {
  302         811  
1394 154         476 $self->ship_out();
1395             }
1396              
1397             # When writing new trailers, most dictionary entries get copied from the
1398             # previous trailer, but entries related to cross-reference streams should
1399             # get removed (and possibly recreated below).
1400 148         952 delete $tdict->{$_} for (# Entries common to streams
1401             qw(Length Filter DecodeParms F FFilter FDecodeParms DL),
1402              
1403             # Entries specific to cross-reference streams
1404             qw(Index W XRefStm));
1405              
1406 148         555 $tdict->{'Size'} = PDFNum($self->{' maxobj'});
1407              
1408 148         459 my $tloc = $fh->tell();
1409 148         812 my @out;
1410              
1411 148 100       263 my @xreflist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } (@{$self->{' printed'} || []}, @{$self->{' free'} || []});
  1334 100       2702  
  148         476  
  148         680  
1412              
1413 148         360 my ($i, $j, $k);
1414 148 100       346 unless ($update) {
1415 140         239 $i = 1;
1416 140         374 for ($j = 0; $j < @xreflist; $j++) {
1417 868         1146 my @inserts;
1418 868         1234 $k = $xreflist[$j];
1419 868         1739 while ($i < $self->{' objects'}{$k->uid}[0]) {
1420 0         0 my ($n) = PDF::API2::Basic::PDF::Objind->new();
1421 0         0 $self->add_obj($n, $i, 0);
1422 0         0 $self->free_obj($n);
1423 0         0 push(@inserts, $n);
1424 0         0 $i++;
1425             }
1426 868         1353 splice(@xreflist, $j, 0, @inserts);
1427 868         1155 $j += @inserts;
1428 868         1712 $i++;
1429             }
1430             }
1431              
1432 148 100       273 my @freelist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } @{$self->{' free'} || []};
  0         0  
  148         554  
1433              
1434 148         280 $j = 0; my $first = -1; $k = 0;
  148         230  
  148         254  
1435 148         464 for ($i = 0; $i <= $#xreflist + 1; $i++) {
1436 1038 100 100     2799 if ($i > $#xreflist || $self->{' objects'}{$xreflist[$i]->uid}[0] != $j + 1) {
1437 160 100       729 push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid}[0] ") . ($i - $first) . "\n";
1438 160 100       422 if ($first == -1) {
1439 148 50       704 push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid}[0] : 0);
1440 148         288 $first = 0;
1441             }
1442 160         415 for ($j = $first; $j < $i; $j++) {
1443 890         1437 my $xref = $xreflist[$j];
1444 890 50 33     1993 if (defined($freelist[$k]) and defined($xref) and "$freelist[$k]" eq "$xref") {
      33        
1445 0         0 $k++;
1446             push @out, pack("A10AA5A4",
1447             sprintf("%010d", (defined $freelist[$k] ?
1448             $self->{' objects'}{$freelist[$k]->uid}[0] : 0)), " ",
1449 0 0       0 sprintf("%05d", $self->{' objects'}{$xref->uid}[1] + 1),
1450             " f \n");
1451             }
1452             else {
1453             push @out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid}), " ",
1454 890         2007 sprintf("%05d", $self->{' objects'}{$xref->uid}[1]),
1455             " n \n");
1456             }
1457             }
1458 160         299 $first = $i;
1459 160 100       637 $j = $self->{' objects'}{$xreflist[$i]->uid}[0] if ($i < scalar @xreflist);
1460             }
1461             else {
1462 878         1859 $j++;
1463             }
1464             }
1465 148 50 33     531 if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') {
1466 0         0 my (@index, @stream);
1467 0         0 for (@out) {
1468 0         0 my @a = split;
1469 0 0       0 @a == 2 ? push @index, @a : push @stream, \@a;
1470             }
1471 0         0 my $i = $self->{' maxobj'}++;
1472 0         0 $self->add_obj($tdict, $i, 0);
1473 0         0 $self->out_obj($tdict );
1474              
1475 0         0 push @index, $i, 1;
1476 0         0 push @stream, [$tloc, 0, 'n'];
1477              
1478 0 0       0 my $len = $tloc > 0xFFFF ? 4 : 2; # don't expect files > 4 Gb
1479 0 0       0 my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC'; # don't expect gennum > 255, it's absurd.
1480             # Adobe doesn't use them anymore anyway
1481 0         0 my $stream = '';
1482 0         0 my @prev = (0) x ($len + 2);
1483 0         0 for (@stream) {
1484 0 0 0     0 $_->[1] = 0 if $_->[2] eq 'f' and $_->[1] == 65535;
1485 0 0       0 my @line = unpack 'C*', pack $tpl, $_->[2] eq 'n' ? 1 : 0, @{$_}[0..1];
  0         0  
1486              
1487             $stream .= pack 'C*', 2, # prepend filtering method, "PNG Up"
1488 0         0 map {($line[$_] - $prev[$_] + 256) % 256 } 0 .. $#line;
  0         0  
1489 0         0 @prev = @line;
1490             }
1491 0         0 $tdict->{'Size'} = PDFNum($i + 1);
1492 0         0 $tdict->{'Index'} = PDFArray(map PDFNum( $_ ), @index);
1493 0         0 $tdict->{'W'} = PDFArray(map PDFNum( $_ ), 1, $len, 1);
1494 0         0 $tdict->{'Filter'} = PDFName('FlateDecode');
1495              
1496 0         0 $tdict->{'DecodeParms'} = PDFDict();
1497 0         0 $tdict->{'DecodeParms'}->val->{'Predictor'} = PDFNum(12);
1498 0         0 $tdict->{'DecodeParms'}->val->{'Columns'} = PDFNum($len + 2);
1499              
1500 0         0 $stream = PDF::API2::Basic::PDF::Filter::FlateDecode->new->outfilt($stream, 1);
1501 0         0 $tdict->{' stream'} = $stream;
1502 0         0 $tdict->{' nofilt'} = 1;
1503 0         0 delete $tdict->{'Length'};
1504 0         0 $self->ship_out();
1505             }
1506             else {
1507 148         622 $fh->print("xref\n", @out, "trailer\n");
1508 148         1554 $tdict->outobjdeep($fh, $self);
1509 148         424 $fh->print("\n");
1510             }
1511 148         1153 $fh->print("startxref\n$tloc\n%%EOF\n");
1512             }
1513              
1514              
1515             =head2 PDF::API2::Basic::PDF::File->_new
1516              
1517             Creates a very empty PDF file object (used by new and open)
1518              
1519             =cut
1520              
1521             sub _new {
1522 180     180   387 my $class = shift();
1523 180         361 my $self = {};
1524              
1525 180         402 bless $self, $class;
1526 180         679 $self->{' outlist'} = [];
1527 180         423 $self->{' outlist_cache'} = {}; # A cache of whats in the 'outlist'
1528 180         399 $self->{' maxobj'} = 1;
1529 180         401 $self->{' objcache'} = {};
1530 180         413 $self->{' objects'} = {};
1531              
1532 180         446 return $self;
1533             }
1534              
1535             1;
1536              
1537             =head1 AUTHOR
1538              
1539             Martin Hosken Martin_Hosken@sil.org
1540              
1541             Copyright Martin Hosken 1999 and onwards
1542              
1543             No warranty or expression of effectiveness, least of all regarding anyone's
1544             safety, is implied in this software or documentation.