File Coverage

blib/lib/Image/ExifTool/PDF.pm
Criterion Covered Total %
statement 535 1060 50.4
branch 290 742 39.0
condition 80 289 27.6
subroutine 22 29 75.8
pod 0 24 0.0
total 927 2144 43.2


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: PDF.pm
3             #
4             # Description: Read PDF meta information
5             #
6             # Revisions: 07/11/2005 - P. Harvey Created
7             # 07/25/2005 - P. Harvey Add support for encrypted documents
8             #
9             # References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
10             # 2) http://search.cpan.org/dist/Crypt-RC4/
11             # 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
12             # 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
13             # 5) http://tools.ietf.org/search/rfc3454
14             # 6) http://www.armware.dk/RFC/rfc/rfc4013.html
15             #------------------------------------------------------------------------------
16              
17             package Image::ExifTool::PDF;
18              
19 25     25   7442 use strict;
  25         64  
  25         1330  
20 25     25   155 use vars qw($VERSION $AUTOLOAD $lastFetched);
  25         62  
  25         1874  
21 25     25   149 use Image::ExifTool qw(:DataAccess :Utils);
  25         60  
  25         467579  
22             require Exporter;
23              
24             $VERSION = '1.61';
25              
26             sub FetchObject($$$$);
27             sub ExtractObject($$;$$);
28             sub ReadToNested($;$);
29             sub ProcessDict($$$$;$$);
30             sub ProcessAcroForm($$$$;$$);
31             sub ExpandArray($);
32             sub ReadPDFValue($);
33             sub CheckPDF($$$);
34              
35             # $lastFetched - last fetched object reference (used for decryption)
36             # (undefined if fetched object was already decrypted, eg. object from stream)
37              
38             my $cryptInfo; # encryption object reference (plus additional information)
39             my $cryptString; # flag that strings are encrypted
40             my $cryptStream; # flag that streams are encrypted
41             my $lastOffset; # last fetched object offset
42             my %streamObjs; # hash of stream objects
43             my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
44             my $pdfVer; # version of PDF file being processed (from header)
45              
46             # filters supported in DecodeStream()
47             my %supportedFilter = (
48             '/FlateDecode' => 1,
49             '/Crypt' => 1,
50             '/Identity' => 1, # (not filtered)
51             '/DCTDecode' => 1, # (JPEG image - not filtered)
52             '/JPXDecode' => 1, # (Jpeg2000 image - not filtered)
53             '/LZWDecode' => 1, # (usually a bitmapped image)
54             '/ASCIIHexDecode' => 1,
55             '/ASCII85Decode' => 1,
56             # other standard filters that we currently don't support
57             #'/JBIG2Decode' => 0, # (JBIG2 image format not supported)
58             #'/CCITTFaxDecode' => 0,
59             #'/RunLengthDecode' => 0,
60             );
61              
62             # tags in main PDF directories
63             %Image::ExifTool::PDF::Main = (
64             GROUPS => { 2 => 'Document' },
65             VARS => { CAPTURE => ['Main','Prev'] },
66             Info => {
67             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
68             # Adobe Acrobat 10.1.5 will create a duplicate Info dictionary with
69             # a different object number when metadata is edited. This flag
70             # is part of a patch to ignore this duplicate information (unless
71             # the IgnoreMinorErrors option is used)
72             IgnoreDuplicates => 1,
73             },
74             Root => {
75             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
76             },
77             Encrypt => {
78             NoProcess => 1, # don't process normally (processed in advance)
79             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
80             },
81             _linearized => {
82             Name => 'Linearized',
83             Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
84             PrintConv => { 'true' => 'Yes', 'false' => 'No' },
85             },
86             );
87              
88             # tags in PDF Info dictionary
89             %Image::ExifTool::PDF::Info = (
90             GROUPS => { 2 => 'Document' },
91             VARS => { CAPTURE => ['Info'] },
92             EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
93             WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
94             CHECK_PROC => \&CheckPDF,
95             WRITABLE => 'string',
96             # set PRIORITY to 0 so most recent Info dictionary takes precedence
97             # (Acrobat Pro bug? doesn't use same object/generation number for
98             # new Info dictionary when doing incremental update)
99             PRIORITY => 0,
100             NOTES => q{
101             As well as the tags listed below, the PDF specification allows for
102             user-defined tags to exist in the Info dictionary. These tags, which should
103             have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
104             object, are also extracted by ExifTool.
105              
106             B specifies the value format, and may be C, C,
107             C, C, C or C for PDF tags.
108             },
109             Title => { },
110             Author => { Groups => { 2 => 'Author' } },
111             Subject => { },
112             Keywords => {
113             List => 'string', # this is a string list
114             Notes => q{
115             stored as a string but treated as a comma- or semicolon-separated list of
116             items when reading if the string contains commas or semicolons, whichever is
117             more numerous, otherwise it is treated a space-separated list of items. The
118             list behaviour may be defeated by setting the API NoPDFList option. Written
119             as a comma-separated string. Note that the corresponding XMP-pdf:Keywords
120             tag is not treated as a list, so the NoPDFList option should be used when
121             copying between these two.
122             },
123             },
124             Creator => { },
125             Producer => { },
126             CreationDate => {
127             Name => 'CreateDate',
128             Writable => 'date',
129             PDF2 => 1, # not deprecated in PDF 2.0
130             Groups => { 2 => 'Time' },
131             Shift => 'Time',
132             PrintConv => '$self->ConvertDateTime($val)',
133             PrintConvInv => '$self->InverseDateTime($val)',
134             },
135             ModDate => {
136             Name => 'ModifyDate',
137             Writable => 'date',
138             PDF2 => 1, # not deprecated in PDF 2.0
139             Groups => { 2 => 'Time' },
140             Shift => 'Time',
141             PrintConv => '$self->ConvertDateTime($val)',
142             PrintConvInv => '$self->InverseDateTime($val)',
143             },
144             SourceModified => {
145             Name => 'SourceModified',
146             Writable => 'date',
147             PDF2 => 1,
148             Groups => { 2 => 'Time' },
149             Shift => 'Time',
150             PrintConv => '$self->ConvertDateTime($val)',
151             PrintConvInv => '$self->InverseDateTime($val)',
152             },
153             Trapped => {
154             Protected => 1,
155             # remove leading '/' from '/True' or '/False'
156             ValueConv => '$val=~s{^/}{}; $val',
157             ValueConvInv => '"/$val"',
158             },
159             'AAPL:Keywords' => { #PH
160             Name => 'AppleKeywords',
161             List => 'array', # this is an array of values
162             Notes => q{
163             keywords written by Apple utilities, although they seem to use PDF:Keywords
164             when reading
165             },
166             },
167             );
168              
169             # tags in the PDF Root document catalog
170             %Image::ExifTool::PDF::Root = (
171             GROUPS => { 2 => 'Document' },
172             # note: can't capture previous versions of Root since they are not parsed
173             VARS => { CAPTURE => ['Root'] },
174             NOTES => 'This is the PDF document catalog.',
175             MarkInfo => {
176             SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
177             },
178             Metadata => {
179             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
180             },
181             Pages => {
182             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
183             },
184             Perms => {
185             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
186             },
187             AcroForm => {
188             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AcroForm' },
189             },
190             AF => {
191             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AF' },
192             },
193             Lang => 'Language',
194             PageLayout => { },
195             PageMode => { },
196             Version => {
197             Name => 'PDFVersion',
198             RawConv => '$$self{PDFVersion} = $val if $$self{PDFVersion} < $val; $val',
199             },
200             );
201              
202             # tags extracted from the PDF Encrypt dictionary
203             %Image::ExifTool::PDF::Encrypt = (
204             GROUPS => { 2 => 'Document' },
205             NOTES => 'Tags extracted from the document Encrypt dictionary.',
206             Filter => {
207             Name => 'Encryption',
208             Notes => q{
209             extracted value is actually a combination of the Filter, SubFilter, V, R and
210             Length information from the Encrypt dictionary
211             },
212             },
213             P => {
214             Name => 'UserAccess',
215             ValueConv => '$val & 0x0f3c', # ignore reserved bits
216             PrintConvColumns => 2,
217             PrintConv => { BITMASK => {
218             2 => 'Print',
219             3 => 'Modify',
220             4 => 'Copy',
221             5 => 'Annotate',
222             8 => 'Fill forms',
223             9 => 'Extract',
224             10 => 'Assemble',
225             11 => 'Print high-res',
226             }},
227             },
228             );
229              
230             # tags in PDF Pages dictionary
231             %Image::ExifTool::PDF::Pages = (
232             GROUPS => { 2 => 'Document' },
233             Count => 'PageCount',
234             Kids => {
235             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
236             },
237             MediaBox => { Name => 'MediaBox', List => 1 },
238             );
239              
240             # tags in PDF Perms dictionary
241             %Image::ExifTool::PDF::Perms = (
242             NOTES => 'Additional document permissions imposed by digital signatures.',
243             DocMDP => {
244             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
245             },
246             FieldMDP => {
247             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
248             },
249             UR3 => {
250             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
251             },
252             );
253              
254             # tags in PDF Perms dictionary
255             %Image::ExifTool::PDF::AcroForm = (
256             PROCESS_PROC => \&ProcessAcroForm,
257             _has_xfa => {
258             Name => 'HasXFA',
259             Notes => q{
260             this tag is defined if a document contains form fields, and is true if it
261             uses XML Forms Architecture; not a real Tag ID
262             },
263             PrintConv => { 'true' => 'Yes', 'false' => 'No' },
264             },
265             );
266              
267             # tags extracted from AF dictionary
268             %Image::ExifTool::PDF::AF = (
269             PROCESS_PROC => \&ProcessAF,
270             NOTES => 'Processed only for C2PA information if AFRelationship is "/C2PA_Manifest".',
271             EF => {
272             SubDirectory => { TagTable => 'Image::ExifTool::PDF::EF' },
273             },
274             );
275              
276             # tags extracted from EF dictionary
277             %Image::ExifTool::PDF::EF = (
278             F => {
279             Name => 'F_', # (don't want single-letter tag names)
280             SubDirectory => { TagTable => 'Image::ExifTool::PDF::F' },
281             },
282             );
283              
284             # tags extracted from F dictionary
285             %Image::ExifTool::PDF::F = (
286             NOTES => 'C2PA JUMBF metadata extracted from "/C2PA_Manifest" file.',
287             _stream => {
288             Name => 'JUMBF',
289             Condition => '$$self{AFRelationship} eq "/C2PA_Manifest"',
290             SubDirectory => {
291             TagTable => 'Image::ExifTool::Jpeg2000::Main',
292             DirName => 'JUMBF',
293             ByteOrder => 'BigEndian',
294             },
295             },
296             );
297              
298             # tags in PDF Kids dictionary
299             %Image::ExifTool::PDF::Kids = (
300             Metadata => {
301             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
302             },
303             PieceInfo => {
304             SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
305             },
306             Resources => {
307             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
308             },
309             Kids => {
310             Condition => '$self->Options("ExtractEmbedded")',
311             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
312             },
313             );
314              
315             # tags in PDF Resources dictionary
316             %Image::ExifTool::PDF::Resources = (
317             ColorSpace => {
318             SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
319             },
320             XObject => {
321             Condition => '$self->Options("ExtractEmbedded")',
322             SubDirectory => { TagTable => 'Image::ExifTool::PDF::XObject' },
323             },
324             Properties => {
325             Condition => '$self->Options("ExtractEmbedded")',
326             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Properties' },
327             },
328             );
329              
330             # tags in PDF ColorSpace dictionary
331             %Image::ExifTool::PDF::ColorSpace = (
332             DefaultRGB => {
333             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
334             ConvertToDict => 1, # (not seen yet, but just in case)
335             },
336             DefaultCMYK => {
337             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
338             # hack: this is stored as an array instead of a dictionary in my
339             # sample, so convert to a dictionary to extract the ICCBased element
340             ConvertToDict => 1,
341             },
342             Cs1 => {
343             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
344             ConvertToDict => 1, # (just in case)
345             },
346             CS0 => {
347             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
348             ConvertToDict => 1, # (just in case)
349             },
350             );
351              
352             # tags in PDF DefaultRGB dictionary
353             %Image::ExifTool::PDF::DefaultRGB = (
354             ICCBased => {
355             SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
356             },
357             );
358              
359             # tags in PDF ICCBased, Cs1 and CS0 dictionaries
360             %Image::ExifTool::PDF::ICCBased = (
361             _stream => {
362             Name => 'ICC_Profile',
363             SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
364             },
365             );
366              
367             # tags in PDF XObject dictionary (parsed only if ExtractEmbedded is enabled)
368             %Image::ExifTool::PDF::XObject = (
369             EXTRACT_UNKNOWN => 0, # extract known but numbered tags (Im1, Im2, etc)
370             Im => {
371             Notes => q{
372             the L option enables information to be extracted from these
373             embedded images
374             },
375             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Im' },
376             },
377             );
378              
379             # tags in PDF Im# dictionary
380             %Image::ExifTool::PDF::Im = (
381             NOTES => q{
382             Information extracted from embedded images with the L option.
383             The EmbeddedImage and its metadata are extracted only for JPEG and Jpeg2000
384             image formats.
385             },
386             Width => 'EmbeddedImageWidth',
387             Height => 'EmbeddedImageHeight',
388             Filter => { Name => 'EmbeddedImageFilter', List => 1 },
389             ColorSpace => {
390             Name => 'EmbeddedImageColorSpace',
391             List => 1,
392             RawConv => 'ref $val ? undef : $val', # (ignore color space data)
393             },
394             Image_stream => {
395             Name => 'EmbeddedImage',
396             Groups => { 2 => 'Preview' },
397             Binary => 1,
398             },
399             );
400              
401             # tags in PDF Properties dictionary
402             %Image::ExifTool::PDF::Properties = (
403             EXTRACT_UNKNOWN => 0, # extract known but numbered tags (MC0, MC1, etc)
404             MC => {
405             Notes => q{
406             the L option enables information to be extracted from these
407             embedded metadata dictionaries
408             },
409             SubDirectory => { TagTable => 'Image::ExifTool::PDF::MC' },
410             }
411             );
412              
413             # tags in PDF MC# dictionary
414             %Image::ExifTool::PDF::MC = (
415             Metadata => {
416             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
417             }
418             );
419              
420             # tags in PDF PieceInfo dictionary
421             %Image::ExifTool::PDF::PieceInfo = (
422             AdobePhotoshop => {
423             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
424             },
425             Illustrator => {
426             # assume this is an illustrator file if it contains this directory
427             # and doesn't have a ".PDF" extension
428             Condition => q{
429             $self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
430             return 1;
431             },
432             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
433             },
434             );
435              
436             # tags in PDF AdobePhotoshop dictionary
437             %Image::ExifTool::PDF::AdobePhotoshop = (
438             Private => {
439             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
440             },
441             );
442              
443             # tags in PDF Illustrator dictionary
444             %Image::ExifTool::PDF::Illustrator = (
445             Private => {
446             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
447             },
448             );
449              
450             # tags in PDF Private dictionary
451             %Image::ExifTool::PDF::Private = (
452             ImageResources => {
453             SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
454             },
455             );
456              
457             # tags in PDF AI Private dictionary
458             %Image::ExifTool::PDF::AIPrivate = (
459             GROUPS => { 2 => 'Document' },
460             EXTRACT_UNKNOWN => 0, # extract known but numbered tags
461             AIMetaData => {
462             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
463             },
464             AIPrivateData => {
465             Notes => q{
466             the L option enables information to be extracted from embedded
467             PostScript documents in the AIPrivateData# and AIPDFPrivateData# streams
468             },
469             JoinStreams => 1, # join streams from numbered tags and process as one
470             SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
471             },
472             AIPDFPrivateData => {
473             JoinStreams => 1, # join streams from numbered tags and process as one
474             SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
475             },
476             RoundTripVersion => { },
477             ContainerVersion => { },
478             CreatorVersion => { },
479             );
480              
481             # tags in PDF AIMetaData dictionary
482             %Image::ExifTool::PDF::AIMetaData = (
483             _stream => {
484             Name => 'AIStream',
485             SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
486             },
487             );
488              
489             # tags in PDF ImageResources dictionary
490             %Image::ExifTool::PDF::ImageResources = (
491             _stream => {
492             Name => 'PhotoshopStream',
493             SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
494             },
495             );
496              
497             # tags in PDF MarkInfo dictionary
498             %Image::ExifTool::PDF::MarkInfo = (
499             GROUPS => { 2 => 'Document' },
500             Marked => {
501             Name => 'TaggedPDF',
502             Notes => "not a Tagged PDF if this tag is missing",
503             PrintConv => { 'true' => 'Yes', 'false' => 'No' },
504             },
505             );
506              
507             # tags in PDF Metadata dictionary
508             %Image::ExifTool::PDF::Metadata = (
509             GROUPS => { 2 => 'Document' },
510             XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
511             Name => 'XMP',
512             SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
513             },
514             );
515              
516             # tags in PDF signature directories (DocMDP, FieldMDP or UR3)
517             %Image::ExifTool::PDF::Signature = (
518             GROUPS => { 2 => 'Document' },
519             ContactInfo => 'SignerContactInfo',
520             Location => 'SigningLocation',
521             M => {
522             Name => 'SigningDate',
523             Format => 'date',
524             Groups => { 2 => 'Time' },
525             PrintConv => '$self->ConvertDateTime($val)',
526             },
527             Name => 'SigningAuthority',
528             Reason => 'SigningReason',
529             Reference => {
530             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
531             },
532             Prop_AuthTime => {
533             Name => 'AuthenticationTime',
534             PrintConv => 'ConvertTimeSpan($val) . " ago"',
535             },
536             Prop_AuthType => 'AuthenticationType',
537             );
538              
539             # tags in PDF Reference dictionary
540             %Image::ExifTool::PDF::Reference = (
541             TransformParams => {
542             SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
543             },
544             );
545              
546             # tags in PDF TransformParams dictionary
547             %Image::ExifTool::PDF::TransformParams = (
548             GROUPS => { 2 => 'Document' },
549             Annots => {
550             Name => 'AnnotationUsageRights',
551             Notes => q{
552             possible values are Create, Delete, Modify, Copy, Import and Export;
553             additional values for UR3 signatures are Online and SummaryView
554             },
555             List => 1,
556             },
557             Document => {
558             Name => 'DocumentUsageRights',
559             Notes => 'only possible value is FullSave',
560             List => 1,
561             },
562             Form => {
563             Name => 'FormUsageRights',
564             Notes => q{
565             possible values are FillIn, Import, Export, SubmitStandalone and
566             SpawnTemplate; additional values for UR3 signatures are BarcodePlaintext and
567             Online
568             },
569             List => 1,
570             },
571             FormEX => {
572             Name => 'FormExtraUsageRights',
573             Notes => 'UR signatures only; only possible value is BarcodePlaintext',
574             List => 1,
575             },
576             Signature => {
577             Name => 'SignatureUsageRights',
578             Notes => 'only possible value is Modify',
579             List => 1,
580             },
581             EF => {
582             Name => 'EmbeddedFileUsageRights',
583             Notes => 'possible values are Create, Delete, Modify and Import',
584             List => 1,
585             },
586             Msg => 'UsageRightsMessage',
587             P => {
588             Name => 'ModificationPermissions',
589             Notes => q{
590             1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
591             false
592             },
593             PrintConv => {
594             1 => 'No changes permitted',
595             2 => 'Fill forms, Create page templates, Sign',
596             3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
597             'true' => 'Restrict all applications to reader permissions',
598             'false' => 'Do not restrict applications to reader permissions',
599             },
600             },
601             Action => {
602             Name => 'FieldPermissions',
603             Notes => 'FieldMDP signatures only',
604             PrintConv => {
605             'All' => 'Disallow changes to all form fields',
606             'Include' => 'Disallow changes to specified form fields',
607             'Exclude' => 'Allow changes to specified form fields',
608             },
609             },
610             Fields => {
611             Notes => 'FieldMDP signatures only',
612             Name => 'FormFields',
613             List => 1,
614             },
615             );
616              
617             # unknown tags for use in verbose option
618             %Image::ExifTool::PDF::Unknown = (
619             GROUPS => { 2 => 'Unknown' },
620             );
621              
622             #------------------------------------------------------------------------------
623             # AutoLoad our writer routines when necessary
624             #
625             sub AUTOLOAD
626             {
627 19     19   183 return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
628             }
629              
630             #------------------------------------------------------------------------------
631             # Convert from PDF to EXIF-style date/time
632             # Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
633             # Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
634             sub ConvertPDFDate($)
635             {
636 10     10 0 17 my $date = shift;
637             # remove optional 'D:' prefix
638 10         53 $date =~ s/^D://;
639             # fill in default values if necessary
640             # YYYYmmddHHMMSS
641 10         24 my $default = '00000101000000';
642 10 50       31 if (length $date < length $default) {
643 0         0 $date .= substr($default, length $date);
644             }
645 10 50       58 $date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
646 10         73 $date = "$1:$2:$3 $4:$5:$6";
647 10 50       36 if ($7) {
648 10         26 my $tz = $7;
649 10 50       218 if ($tz =~ /^\s*Z/i) {
    50          
650             # ignore any "HH'mm'" after the Z (OS X 10.6 does this)
651 0         0 $date .= 'Z';
652             # tolerate some improper formatting in timezone specification
653             } elsif ($tz =~ /^\s*([-+])\s*(\d+)[': ]+(\d*)/) {
654 10   50     68 $date .= $1 . $2 . ':' . ($3 || '00');
655             }
656             }
657 10         25 return $date;
658             }
659              
660             #------------------------------------------------------------------------------
661             # Locate any object in the XRef tables (including compressed objects)
662             # Inputs: 0) XRef reference, 1) object reference string (or free object number)
663             # Returns: offset to object in file or compressed object reference string,
664             # 0 if object is free, or undefined on error
665             sub LocateAnyObject($$)
666             {
667 238     238 0 532 my ($xref, $ref) = @_;
668 238 50       555 return undef unless $xref;
669 238 100       1008 return $$xref{$ref} if exists $$xref{$ref};
670             # get the object number
671 7 50       49 return undef unless $ref =~ /^(\d+)/;
672 7         23 my $objNum = $1;
673             # return 0 if the object number has been reused (old object is free)
674 7 100       38 return 0 if defined $$xref{$objNum};
675             #
676             # scan our XRef stream dictionaries for this object
677             #
678 1 50       19 return undef unless $$xref{dicts};
679 0         0 my $dict;
680 0         0 foreach $dict (@{$$xref{dicts}}) {
  0         0  
681             # quick check to see if the object is in the range for this xref stream
682 0 0       0 next if $objNum >= $$dict{Size};
683 0         0 my $index = $$dict{Index};
684 0 0       0 next if $objNum < $$index[0];
685             # scan the tables for the specified object
686 0         0 my $size = $$dict{_entry_size};
687 0         0 my $num = scalar(@$index) / 2;
688 0         0 my $tot = 0;
689 0         0 my $i;
690 0         0 for ($i=0; $i<$num; ++$i) {
691 0         0 my $start = $$index[$i*2];
692 0         0 my $count = $$index[$i*2+1];
693             # table is in ascending order, so quit if we have passed the object
694 0 0       0 last if $objNum < $start;
695 0 0       0 if ($objNum < $start + $count) {
696 0         0 my $offset = $size * ($objNum - $start + $tot);
697 0 0       0 last if $offset + $size > length $$dict{_stream};
698 0         0 my @c = unpack("x$offset C$size", $$dict{_stream});
699             # extract values from this table entry
700             # (can be 1, 2, 3, 4, etc.. bytes per value)
701 0         0 my (@t, $j, $k);
702 0         0 my $w = $$dict{W};
703 0         0 for ($j=0; $j<3; ++$j) {
704             # use default value if W entry is 0 (as per spec)
705             # - 0th element defaults to 1, others default to 0
706 0 0       0 $$w[$j] or $t[$j] = ($j ? 0 : 1), next;
    0          
707 0         0 $t[$j] = shift(@c);
708 0         0 for ($k=1; $k < $$w[$j]; ++$k) {
709 0         0 $t[$j] = 256 * $t[$j] + shift(@c);
710             }
711             }
712             # by default, use "o g R" as the xref key
713             # (o = object number, g = generation number)
714 0         0 my $ref2 = "$objNum $t[2] R";
715 0 0       0 if ($t[0] == 1) {
    0          
    0          
716             # normal object reference:
717             # $t[1]=offset of object from start, $t[2]=generation number
718 0         0 $$xref{$ref2} = $t[1];
719             } elsif ($t[0] == 2) {
720             # compressed object reference:
721             # $t[1]=stream object number, $t[2]=index of object in stream
722 0         0 $ref2 = "$objNum 0 R";
723 0         0 $$xref{$ref2} = "I$t[2] $t[1] 0 R";
724             } elsif ($t[0] == 0) {
725             # free object:
726             # $t[1]=next free object in linked list, $t[2]=generation number
727 0         0 $$xref{$ref2} = 0;
728             } else {
729             # treat as a null object
730 0         0 $$xref{$ref2} = undef;
731             }
732 0         0 $$xref{$objNum} = $t[1]; # remember offsets by object number too
733 0 0       0 return $$xref{$ref} if $ref eq $ref2;
734 0         0 return 0; # object is free or was reused
735             }
736 0         0 $tot += $count;
737             }
738             }
739 0         0 return undef;
740             }
741              
742             #------------------------------------------------------------------------------
743             # Locate a regular object in the XRef tables (does not include compressed objects)
744             # Inputs: 0) XRef reference, 1) object reference string (or free object number)
745             # Returns: offset to object in file, 0 if object is free,
746             # or undef on error or if object was compressed
747             sub LocateObject($$)
748             {
749 41     41 0 147 my ($xref, $ref) = @_;
750 41         139 my $offset = LocateAnyObject($xref, $ref);
751 41 50 66     408 return undef if $offset and $offset =~ /^I/;
752 41         207 return $offset;
753             }
754              
755             #------------------------------------------------------------------------------
756             # Check that the correct object is located at the specified file offset
757             # Inputs: 0) ExifTool ref, 1) object name, 2) object reference string, 3) file offset
758             # Returns: first non-blank line at start of object, or undef on error
759             sub CheckObject($$$$)
760             {
761 217     217 0 562 my ($et, $tag, $ref, $offset) = @_;
762 217         350 my ($data, $obj, $dat, $pat);
763              
764 217         467 my $raf = $$et{RAF};
765 217 50       1098 $raf->Seek($offset+$$et{PDFBase}, 0) or $et->Warn("Bad $tag offset"), return undef;
766             # verify that we are reading the expected object
767 217         1235 ($obj = $ref) =~ s/R/obj/;
768 217         398 for (;;) {
769 217 50       637 $raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
770 217 50       5450 last if $data =~ s/^$obj//;
771 0 0       0 next if $data =~ /^\s+$/; # keep reading if this was a blank line
772             # handle cases where other whitespace characters are used in the object ID string
773 0         0 while ($data =~ /^\d+(\s+\d+)?\s*$/) {
774 0         0 $raf->ReadLine($dat);
775 0         0 $data .= $dat;
776             }
777 0         0 ($pat = $obj) =~ s/ /\\s+/g;
778 0 0       0 unless ($data =~ s/$pat//) {
779 0         0 $tag = ucfirst $tag;
780 0         0 $et->Warn("$tag object ($obj) not found at offset $offset");
781 0         0 return undef;
782             }
783 0         0 last;
784             }
785             # read the first line of data from the object (ignoring blank lines and comments)
786 217         481 for (;;) {
787 434 100 66     2789 last if $data =~ /\S/ and $data !~ /^\s*%/;
788 217 50       665 $raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
789             }
790 217         642 return $data;
791             }
792              
793             #------------------------------------------------------------------------------
794             # Fetch indirect object from file (from inside a stream if required)
795             # Inputs: 0) ExifTool object reference, 1) object reference string,
796             # 2) xref lookup, 3) object name (for warning messages)
797             # Returns: object data or undefined on error
798             # Notes: sets $lastFetched to the object reference, or undef if the object
799             # was extracted from an encrypted stream
800             sub FetchObject($$$$)
801             {
802 197     197 0 541 my ($et, $ref, $xref, $tag) = @_;
803 197         376 $lastFetched = $ref; # save this for decoding if necessary
804 197         1427 my $offset = LocateAnyObject($xref, $ref);
805 197         357 $lastOffset = $offset;
806 197 100       491 unless ($offset) {
807 5 50       20 $et->Warn("Bad $tag reference") unless defined $offset;
808 5         18 return undef;
809             }
810 192         349 my ($data, $obj);
811 192 50       688 if ($offset =~ s/^I(\d+) //) {
812 0         0 my $index = $1; # object index in stream
813 0         0 my ($objNum) = split ' ', $ref; # save original object number
814 0         0 $ref = $offset; # now a reference to the containing stream object
815 0         0 $obj = $streamObjs{$ref};
816 0 0       0 unless ($obj) {
817             # don't try to load the same object stream twice
818 0 0       0 return undef if defined $obj;
819 0         0 $streamObjs{$ref} = '';
820             # load the parent object stream
821 0         0 $obj = FetchObject($et, $ref, $xref, $tag);
822             # make sure it contains everything we need
823 0 0 0     0 return undef unless defined $obj and ref($obj) eq 'HASH';
824 0 0 0     0 return undef unless $$obj{First} and $$obj{N};
825 0 0       0 return undef unless DecodeStream($et, $obj);
826             # add a special '_table' entry to this dictionary which contains
827             # the list of object number/offset pairs from the stream header
828 0         0 my $num = $$obj{N} * 2;
829 0         0 my @table = split ' ', $$obj{_stream}, $num;
830 0 0       0 return undef unless @table == $num;
831             # remove everything before first object in stream
832 0         0 $$obj{_stream} = substr($$obj{_stream}, $$obj{First});
833 0         0 $table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
834 0         0 $$obj{_table} = \@table;
835             # save the object stream so we don't have to re-load it later
836 0         0 $streamObjs{$ref} = $obj;
837             }
838             # verify that we have the specified object
839 0         0 my $i = 2 * $index;
840 0         0 my $table = $$obj{_table};
841 0 0 0     0 unless ($index < $$obj{N} and $$table[$i] == $objNum) {
842 0         0 $et->Warn("Bad index for stream object $tag");
843 0         0 return undef;
844             }
845             # extract the object at the specified index in the stream
846             # (offsets in table are in sequential order, so we can subtract from
847             # the next offset to get the object length)
848 0         0 $offset = $$table[$i + 1];
849 0   0     0 my $len = ($$table[$i + 3] || length($$obj{_stream})) - $offset;
850 0         0 $data = substr($$obj{_stream}, $offset, $len);
851             # avoid re-decrypting data in already decrypted streams
852 0 0       0 undef $lastFetched if $cryptStream;
853 0         0 return ExtractObject($et, \$data);
854             }
855             # load the start of the object
856 192         507 $data = CheckObject($et, $tag, $ref, $offset);
857 192 50       516 return undef unless defined $data;
858              
859 192         792 return ExtractObject($et, \$data, $$et{RAF}, $xref);
860             }
861              
862             #------------------------------------------------------------------------------
863             # Convert PDF value to something readable
864             # Inputs: 0) PDF object data
865             # Returns: converted object
866             sub ReadPDFValue($)
867             {
868 223     223 0 466 my $str = shift;
869             # decode all strings in an array
870 223 100       533 if (ref $str eq 'ARRAY') {
871             # create new list to not alter the original data when rewriting
872 27         66 my ($val, @vals);
873 27         100 foreach $val (@$str) {
874 80         195 push @vals, ReadPDFValue($val);
875             }
876 27         89 return \@vals;
877             }
878 196 50       522 length $str or return $str;
879 196         474 my $delim = substr($str, 0, 1);
880 196 100       675 if ($delim eq '(') { # literal string
    100          
    50          
881 58 50       498 $str = $1 if $str =~ /^.*?\((.*)\)/s; # remove brackets
882             # decode escape sequences in literal strings
883 58         214 while ($str =~ /\\(.)/sg) {
884 0         0 my $n = pos($str) - 2;
885 0         0 my $c = $1;
886 0         0 my $r;
887 0 0       0 if ($c =~ /[0-7]/) {
    0          
    0          
888             # get up to 2 more octal digits
889 0 0       0 $c .= $1 if $str =~ /\G([0-7]{1,2})/g;
890             # convert octal escape code
891 0         0 $r = chr(oct($c) & 0xff);
892             } elsif ($c eq "\x0d") {
893             # the string is continued if the line ends with '\'
894             # (also remove "\x0d\x0a")
895 0 0       0 $c .= $1 if $str =~ /\G(\x0a)/g;
896 0         0 $r = '';
897             } elsif ($c eq "\x0a") {
898 0         0 $r = '';
899             } else {
900             # convert escaped characters
901 0         0 ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
902             }
903 0         0 substr($str, $n, length($c)+1) = $r;
904             # continue search after this character
905 0         0 pos($str) = $n + length($r);
906             }
907 58 50       137 Crypt(\$str, $lastFetched) if $cryptString;
908             } elsif ($delim eq '<') { # hex string
909             # decode hex data
910 41         127 $str =~ tr/0-9A-Fa-f//dc;
911 41 50       134 $str .= '0' if length($str) & 0x01; # (by the spec)
912 41         181 $str = pack('H*', $str);
913 41 100       151 Crypt(\$str, $lastFetched) if $cryptString;
914             } elsif ($delim eq '/') { # name
915 0         0 $str = substr($str, 1);
916             # convert escape codes (PDF 1.2 or later)
917 0 0       0 $str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
  0         0  
918             }
919 196         563 return $str;
920             }
921              
922             #------------------------------------------------------------------------------
923             # Extract PDF object from combination of buffered data and file
924             # Inputs: 0) ExifTool object reference, 1) data reference,
925             # 2) optional raf reference, 3) optional xref table
926             # Returns: converted PDF object or undef on error
927             # a) dictionary object --> hash reference
928             # b) array object --> array reference
929             # c) indirect reference --> scalar reference
930             # d) string, name, integer, boolean, null --> scalar value
931             # - updates $$dataPt on return to contain unused data
932             # - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
933             # the stream data and a list of the tags (not including '_stream' and '_tags')
934             # in their original order
935             sub ExtractObject($$;$$)
936             {
937 754     754 0 2016 my ($et, $dataPt, $raf, $xref) = @_;
938 754         1214 my (@tags, $data, $objData);
939 754         1240 my $dict = { };
940 754         1442 my $delim;
941              
942 754         1540 for (;;) {
943 824 100       4122 if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/s) {
    50          
944 754         1851 $delim = $1;
945 754         1948 $$dataPt =~ s/^\s+//; # remove leading white space
946 754         1809 $objData = ReadToNested($dataPt, $raf);
947 754 50       1663 return undef unless defined $objData;
948 754         1356 last;
949             } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{}s) {
950             #
951             # extract boolean, numerical, string, name, null object or indirect reference
952             #
953 0         0 $objData = $1;
954             # look for an indirect reference
955 0 0 0     0 if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
956 0         0 $objData .= "$1 R";
957 0         0 $objData = \$objData; # return scalar reference
958             }
959 0         0 return $objData; # return simple scalar or scalar reference
960             }
961 70 50 33     364 $raf and $raf->ReadLine($data) or return undef;
962 70         178 $$dataPt .= $data;
963             }
964             #
965             # return literal string or hex string without parsing
966             #
967 754 100 100     3678 if ($delim eq '(' or $delim eq '<') {
    100          
968 160         633 return $objData;
969             #
970             # extract array
971             #
972             } elsif ($delim eq '[') {
973 167 50       993 $objData =~ /^.*?\[(.*)\]/s or return undef;
974 167         433 my $data = $1; # brackets removed
975 167         307 my @list;
976 167         253 for (;;) {
977 594 100       1916 last unless $data =~ m{\s*(\S[^[(/<>\s]*)}sg;
978 427         819 my $val = $1;
979 427 100       1658 if ($val =~ /^(<{1,2}|\[|\()/) {
    100          
980 78         176 my $pos = pos($data) - length($val);
981             # nested dict, array, literal string or hex string
982 78         186 my $buff = substr($data, $pos);
983 78         227 $val = ReadToNested(\$buff);
984 78 50       265 last unless defined $val;
985 78         216 pos($data) = $pos + length($val);
986 78         277 $val = ExtractObject($et, \$val);
987             } elsif ($val =~ /^\d/) {
988 245         397 my $pos = pos($data);
989 245 100       731 if ($data =~ /\G\s+(\d+)\s+R/g) {
990 37         165 $val = \ "$val $1 R"; # make a reference
991             } else {
992 208         464 pos($data) = $pos;
993             }
994             }
995 427         1034 push @list, $val;
996             }
997 167         730 return \@list;
998             }
999             #
1000             # extract dictionary
1001             #
1002             # Note: entries are not necessarily separated by whitespace (doh!)
1003             # eg) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
1004             # Also, they may be separated by a comment (eg. "/Tag%comment\nValue"),
1005             # but comments have already been removed
1006 427         3072 while ($objData =~ m{(\s*)/([^/[\]()<>{}\s]+)\s*(\S[^[(/<>\s]*)}sg) {
1007 1229         2611 my $tag = $2;
1008 1229         2562 my $val = $3;
1009 1229 100       4603 if ($val =~ /^(<{1,2}|\[|\()/) {
    100          
1010             # nested dict, array, literal string or hex string
1011 396         1200 $objData = substr($objData, pos($objData)-length($val));
1012 396         1089 $val = ReadToNested(\$objData, $raf);
1013 396 50       930 last unless defined $val;
1014 396         1184 $val = ExtractObject($et, \$val);
1015 396         1117 pos($objData) = 0;
1016             } elsif ($val =~ /^\d/) {
1017 618         1039 my $pos = pos($objData);
1018 618 100       2179 if ($objData =~ /\G\s+(\d+)\s+R/sg) {
1019 416         1641 $val = \ "$val $1 R"; # make a reference
1020             } else {
1021 202         531 pos($objData) = $pos;
1022             }
1023             }
1024 1229 50       2850 if ($$dict{$tag}) {
1025             # duplicate dictionary entries are not allowed
1026 0         0 $et->Warn("Duplicate '${tag}' entry in dictionary (ignored)");
1027             } else {
1028             # save the entry
1029 1229         2920 push @tags, $tag;
1030 1229         6817 $$dict{$tag} = $val;
1031             }
1032             }
1033 427 50       1220 return undef unless @tags;
1034 427         1342 $$dict{_tags} = \@tags;
1035 427 100       1217 return $dict unless $raf; # direct objects can not have streams
1036             #
1037             # extract the stream object
1038             #
1039             # dictionary must specify stream Length
1040 262 100       12636 my $length = $$dict{Length} or return $dict;
1041 43 100       136 if (ref $length) {
1042 25         61 $length = $$length;
1043 25         111 my $oldpos = $raf->Tell();
1044             # get the location of the object specifying the length
1045             # (compressed objects are not allowed)
1046 25 50       103 my $offset = LocateObject($xref, $length) or return $dict;
1047 25 50       76 $offset or $et->Warn('Bad stream Length object'), return $dict;
1048 25         136 $data = CheckObject($et, 'stream Length', $length, $offset);
1049 25 50       99 defined $data or return $dict;
1050 25 50       163 $data =~ /^\s*(\d+)/ or $et->Warn('Stream Length not found'), return $dict;
1051 25         75 $length = $1;
1052 25         78 $raf->Seek($oldpos, 0); # restore position to start of stream
1053             }
1054             # extract the trailing stream data
1055 43         88 for (;;) {
1056             # find the stream token
1057 86 100       372 if ($$dataPt =~ /(\S+)/) {
1058 43 50       175 last unless $1 eq 'stream';
1059             # read an extra line because it may contain our \x0a
1060 43 50       145 $$dataPt .= $data if $raf->ReadLine($data);
1061             # remove our stream header
1062 43         433 $$dataPt =~ s/^\s*stream(\x0a|\x0d\x0a)//s;
1063 43         155 my $more = $length - length($$dataPt);
1064 43 100       175 if ($more > 0) {
    50          
1065 28 50       93 unless ($raf->Read($data, $more) == $more) {
1066 0         0 $et->Warn('Error reading stream data');
1067 0         0 $$dataPt = '';
1068 0         0 return $dict;
1069             }
1070 28         182 $$dict{_stream} = $$dataPt . $data;
1071 28         74 $$dataPt = '';
1072             } elsif ($more < 0) {
1073 15         141 $$dict{_stream} = substr($$dataPt, 0, $length);
1074 15         47 $$dataPt = substr($$dataPt, $length);
1075             } else {
1076 0         0 $$dict{_stream} = $$dataPt;
1077 0         0 $$dataPt = '';
1078             }
1079 43         117 last;
1080             }
1081 43 50       183 $raf->ReadLine($data) or last;
1082 43         199 $$dataPt .= $data;
1083             }
1084 43         190 return $dict;
1085             }
1086              
1087             #------------------------------------------------------------------------------
1088             # Read to nested delimiter
1089             # Inputs: 0) data reference, 1) optional raf reference
1090             # Returns: data up to and including matching delimiter (or undef on error)
1091             # - updates data reference with trailing data
1092             # - unescapes characters in literal strings
1093             my %closingDelim = ( # lookup for matching delimiter
1094             '(' => ')',
1095             '[' => ']',
1096             '<' => '>',
1097             '<<' => '>>',
1098             );
1099             sub ReadToNested($;$)
1100             {
1101 1228     1228 0 2622 my ($dataPt, $raf) = @_;
1102 1228         2613 my @delim = (''); # closing delimiter list, most deeply nested first
1103 1228         3701 pos($$dataPt) = 0; # begin at start of data
1104 1228         2352 for (;;) {
1105 5744 100       61695 unless ($$dataPt =~ /(\\*)(\(|\)|<{1,2}|>{1,2}|\[|\]|%)/g) {
1106             # must read some more data
1107 1148         1657 my $buff;
1108 1148 50 33     3953 last unless $raf and $raf->ReadLine($buff);
1109 1148         3056 $$dataPt .= $buff;
1110 1148         2837 pos($$dataPt) = length($$dataPt) - length($buff);
1111 1148         2207 next;
1112             }
1113             # are we in a literal string?
1114 4596 100       12858 if ($delim[0] eq ')') {
    50          
1115             # ignore escaped delimiters (preceded by odd number of \'s)
1116 434 50       1207 next if length($1) & 0x01;
1117             # ignore all delimiters but unescaped braces
1118 434 50 33     1875 next unless $2 eq '(' or $2 eq ')';
1119             } elsif ($2 eq '%') {
1120             # ignore the comment
1121 0         0 my $pos = pos($$dataPt) - 1;
1122             # remove everything from '%' up to but not including newline
1123 0         0 $$dataPt =~ /.*/g;
1124 0         0 my $end = pos($$dataPt);
1125 0         0 $$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
1126 0         0 pos($$dataPt) = $pos;
1127 0         0 next;
1128             }
1129 4596 100       11880 if ($closingDelim{$2}) {
1130             # push the corresponding closing delimiter
1131 2298         5713 unshift @delim, $closingDelim{$2};
1132 2298         3659 next;
1133             }
1134 2298 50       5671 unless ($2 eq $delim[0]) {
1135             # handle the case where we find a ">>>" and interpret it
1136             # as ">> >" instead of "> >>"
1137 0 0 0     0 next unless $2 eq '>>' and $delim[0] eq '>';
1138 0         0 pos($$dataPt) = pos($$dataPt) - 1;
1139             }
1140 2298         3613 shift @delim; # remove from nesting list
1141 2298 100       5211 next if $delim[0]; # keep going if we have more nested delimiters
1142 1228         1940 my $pos = pos($$dataPt);
1143 1228         2781 my $buff = substr($$dataPt, 0, $pos);
1144 1228         2715 $$dataPt = substr($$dataPt, $pos);
1145 1228         3687 return $buff; # success!
1146             }
1147 0         0 return undef; # didn't find matching delimiter
1148             }
1149              
1150             #------------------------------------------------------------------------------
1151             # Decode LZW-encoded data (ref 1)
1152             # Inputs: 0) data reference
1153             # Returns: true on success and data is decoded, or false and data is untouched
1154             sub DecodeLZW($)
1155             {
1156 0     0 0 0 my $dataPt = shift;
1157 0 0       0 return 0 if length $$dataPt < 4;
1158 0         0 my @lzw = (map(chr, 0..255), undef, undef); # LZW code table
1159 0         0 my $mask = 0x01ff; # mask for least-significant 9 bits
1160 0         0 my @dat = unpack 'n*', $$dataPt . "\0";
1161 0         0 my $word = ($dat[0] << 16) | $dat[1];
1162 0         0 my ($bit, $pos, $bits, $out) = (0, 2, 9, '');
1163 0         0 my $lastVal;
1164 0         0 for (;;) {
1165             # bits are packed MSB first in PDF LZW (the PDF spec doesn't mention this)
1166 0         0 my $shift = 32 - ($bit + $bits);
1167 0 0       0 if ($shift < 0) {
1168 0 0       0 return 0 if $pos >= @dat; # missing EOD marker
1169 0         0 $word = (($word & 0xffff) << 16) | $dat[$pos++]; # read next word
1170 0         0 $bit -= 16;
1171 0         0 $shift += 16;
1172             };
1173 0         0 my $code = ($word >> $shift) & $mask;
1174 0         0 $bit += $bits;
1175 0         0 my $val = $lzw[$code];
1176 0 0       0 if (defined $val) {
    0          
    0          
    0          
1177             # store new code as previous sequence plus 1st char of new sequence
1178 0 0       0 push @lzw, $lastVal . substr($val, 0, 1) if defined $lastVal;
1179             } elsif ($code == @lzw) { # new code
1180 0 0       0 return 0 unless defined $lastVal;
1181             # we are using the code that we are about to generate, so the last
1182             # character in the new sequence must be the same as the first
1183             # character in the previous sequence (makes sense if you think about it)
1184 0         0 $val = $lastVal . substr($lastVal, 0, 1);
1185 0         0 push @lzw, $val;
1186             } elsif ($code == 256) { # clear table
1187 0         0 splice @lzw, 258;
1188 0         0 $bits = 9;
1189 0         0 $mask = 0x1ff;
1190 0         0 undef $lastVal;
1191 0         0 next;
1192             } elsif ($code == 257) { # EOD marker
1193 0         0 last; # all done!
1194             } else {
1195 0         0 return 0;
1196             }
1197 0         0 $out .= $val; # add this byte sequence to the output
1198             # we added a new entry to the LZW table, so we must increase
1199             # the bit width if necessary, up to a maximum of 12
1200 0 0 0     0 @lzw >= $mask and $bits < 12 and ++$bits, $mask |= $mask << 1;
1201 0         0 $lastVal = $val;
1202             }
1203 0         0 $$dataPt = $out; # return decompressed data
1204 0         0 return 1;
1205             }
1206              
1207             #------------------------------------------------------------------------------
1208             # Decode filtered stream
1209             # Inputs: 0) ExifTool object reference, 1) dictionary reference
1210             # Returns: true if stream has been decoded OK
1211             sub DecodeStream($$)
1212             {
1213 43     43 0 73 local $_;
1214 43         121 my ($et, $dict) = @_;
1215              
1216 43 50       165 return 0 unless $$dict{_stream}; # no stream to decode
1217              
1218             # get list of filters
1219 43         124 my (@filters, @decodeParms, $filter);
1220 43 50       303 if (ref $$dict{Filter} eq 'ARRAY') {
    50          
1221 0         0 @filters = @{$$dict{Filter}};
  0         0  
1222             } elsif (defined $$dict{Filter}) {
1223 0         0 @filters = ($$dict{Filter});
1224             }
1225             # be sure we can process all the filters before we take the time to do the decryption
1226 43         154 foreach $filter (@filters) {
1227 0 0       0 next if $supportedFilter{$filter};
1228 0         0 $et->Warn("Unsupported Filter $filter");
1229 0         0 return 0;
1230             }
1231             # apply decryption first if required (and if the default encryption
1232             # has not been overridden by a Crypt filter. Note: the Crypt filter
1233             # must be first in the Filter array: ref 3, page 38)
1234 43 50 33     252 unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
      33        
1235 43         242 CryptStream($dict, $lastFetched);
1236             }
1237 43 50       267 return 1 unless $$dict{Filter}; # Filter entry is mandatory
1238 0 0       0 return 0 if defined $$dict{_filtered}; # avoid double-filtering
1239 0         0 $$dict{_filtered} = 1; # set flag to prevent double-filtering
1240              
1241             # get array of DecodeParms dictionaries
1242 0 0       0 if (ref $$dict{DecodeParms} eq 'ARRAY') {
1243 0         0 @decodeParms = @{$$dict{DecodeParms}};
  0         0  
1244             } else {
1245 0         0 @decodeParms = ($$dict{DecodeParms});
1246             }
1247              
1248 0         0 foreach $filter (@filters) {
1249 0         0 my $decodeParms = shift @decodeParms;
1250              
1251 0 0       0 if ($filter eq '/FlateDecode') {
    0          
    0          
    0          
    0          
1252             # make sure we support the predictor (if used) before decoding
1253 0         0 my $pre;
1254 0 0       0 if (ref $decodeParms eq 'HASH') {
1255 0         0 $pre = $$decodeParms{Predictor};
1256 0 0 0     0 if ($pre and $pre ne '1' and $pre ne '12') {
      0        
1257 0         0 $et->Warn("FlateDecode Predictor $pre currently not supported");
1258 0         0 return 0;
1259             }
1260             }
1261 0 0       0 if (eval { require Compress::Zlib }) {
  0         0  
1262 0         0 my $inflate = Compress::Zlib::inflateInit();
1263 0         0 my ($buff, $stat);
1264 0 0       0 $inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
1265 0 0 0     0 if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
1266 0         0 $$dict{_stream} = $buff;
1267             } else {
1268 0         0 $et->Warn('Error inflating stream');
1269 0         0 return 0;
1270             }
1271             } else {
1272 0         0 $et->Warn('Install Compress::Zlib to process filtered streams');
1273 0         0 return 0;
1274             }
1275 0 0 0     0 next unless $pre and $pre eq '12'; # 12 = 'up' prediction
1276              
1277             # apply anti-predictor
1278 0         0 my $cols = $$decodeParms{Columns};
1279 0 0       0 unless ($cols) {
1280             # currently only support 'up' prediction
1281 0         0 $et->Warn('No Columns for decoding stream');
1282 0         0 return 0;
1283             }
1284 0         0 my @bytes = unpack('C*', $$dict{_stream});
1285 0         0 my @pre = (0) x $cols; # initialize predictor array
1286 0         0 my $buff = '';
1287 0         0 while (@bytes > $cols) {
1288 0 0       0 unless (($_ = shift @bytes) == 2) {
1289 0         0 $et->Warn("Unsupported PNG filter $_"); # (yes, PNG)
1290 0         0 return 0;
1291             }
1292 0         0 foreach (@pre) {
1293 0         0 $_ = ($_ + shift(@bytes)) & 0xff;
1294             }
1295 0         0 $buff .= pack('C*', @pre);
1296             }
1297 0         0 $$dict{_stream} = $buff;
1298              
1299             } elsif ($filter eq '/Crypt') {
1300              
1301             # (we shouldn't have to check the _decrypted flag since we
1302             # already checked the _filtered flag, but what the heck...)
1303 0 0       0 next if defined $$dict{_decrypted};
1304             # assume Identity filter (the default) if DecodeParms are missing
1305 0 0       0 next unless ref $decodeParms eq 'HASH';
1306 0         0 my $name = $$decodeParms{Name};
1307 0 0 0     0 next unless defined $name or $name eq 'Identity';
1308 0 0       0 if ($name ne 'StdCF') {
1309 0         0 $et->Warn("Unsupported Crypt Filter $name");
1310 0         0 return 0;
1311             }
1312 0 0       0 unless ($cryptInfo) {
1313 0         0 $et->Warn('Missing Encrypt StdCF entry');
1314 0         0 return 0;
1315             }
1316             # decrypt the stream manually because we want to:
1317             # 1) ignore $cryptStream (StmF) setting
1318             # 2) ignore EncryptMetadata setting (I can't find mention of how to
1319             # reconcile this in the spec., but this would make sense)
1320             # 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
1321             # 4) set _decrypted flag so we will recrypt according to StmF when
1322             # writing (since we don't yet write Filter'd streams)
1323 0         0 Crypt(\$$dict{_stream}, 'none');
1324 0 0       0 $$dict{_decrypted} = ($cryptStream ? 1 : 0);
1325              
1326             } elsif ($filter eq '/LZWDecode') {
1327              
1328             # make sure we don't have any unsupported decoding parameters
1329 0 0       0 if (ref $decodeParms eq 'HASH') {
1330 0 0       0 if ($$decodeParms{Predictor}) {
    0          
1331 0         0 $et->Warn("LZWDecode Predictor $$decodeParms{Predictor} currently not supported");
1332 0         0 return 0;
1333             } elsif ($$decodeParms{EarlyChange}) {
1334 0         0 $et->Warn("LZWDecode EarlyChange currently not supported");
1335 0         0 return 0;
1336             }
1337             }
1338 0 0       0 unless (DecodeLZW(\$$dict{_stream})) {
1339 0         0 $et->Warn('LZW decompress error');
1340 0         0 return 0;
1341             }
1342              
1343             } elsif ($filter eq '/ASCIIHexDecode') {
1344              
1345 0         0 $$dict{_stream} =~ s/>.*//; # truncate at '>' (end of data mark)
1346 0         0 $$dict{_stream} =~ tr/0-9a-zA-Z//d; # remove illegal characters
1347 0         0 $$dict{_stream} = pack 'H*', $$dict{_stream};
1348              
1349             } elsif ($filter eq '/ASCII85Decode') {
1350              
1351 0         0 my ($err, @out, $i);
1352 0         0 my ($n, $val) = (0, 0);
1353 0         0 foreach (split //, $$dict{_stream}) {
1354 0 0 0     0 if ($_ ge '!' and $_ le 'u') {;
    0          
    0          
1355 0         0 $val = 85 * $val + ord($_) - 33;
1356 0 0       0 next unless ++$n == 5;
1357             } elsif ($_ eq '~') {
1358 0 0       0 $n == 1 and $err = 1; # error to have a single char in the last group of 5
1359 0         0 for ($i=$n; $i<5; ++$i) { $val *= 85; }
  0         0  
1360             } elsif ($_ eq 'z') {
1361 0 0       0 $n and $err = 2, last; # error if 'z' isn't the first char
1362 0         0 $n = 5;
1363             } else {
1364 0 0       0 next if /^\s$/; # ignore white space
1365 0         0 $err = 3, last; # any other character is an error
1366             }
1367 0         0 $val = unpack('V', pack('N', $val)); # reverse byte order
1368 0         0 while (--$n > 0) {
1369 0         0 push @out, $val & 0xff;
1370 0         0 $val >>= 8;
1371             }
1372 0 0       0 last if $_ eq '~';
1373             # (both $n and $val are zero again now)
1374             }
1375 0 0       0 $err and $et->Warn("ASCII85Decode error $err");
1376 0         0 $$dict{_stream} = pack('C*', @out);
1377             }
1378             }
1379 0         0 return 1;
1380             }
1381              
1382             #------------------------------------------------------------------------------
1383             # Initialize state for RC4 en/decryption (ref 2)
1384             # Inputs: 0) RC4 key string
1385             # Returns: RC4 key hash reference
1386             sub RC4Init($)
1387             {
1388 22     22 0 90 my @key = unpack('C*', shift);
1389 22         322 my @state = (0 .. 255);
1390 22         63 my ($i, $j) = (0, 0);
1391 22         59 while ($i < 256) {
1392 5632         9139 my $st = $state[$i];
1393 5632         10555 $j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
1394 5632         9771 $state[$i++] = $state[$j];
1395 5632         12326 $state[$j] = $st;
1396             }
1397 22         161 return { State => \@state, XY => [ 0, 0 ] };
1398             }
1399              
1400             #------------------------------------------------------------------------------
1401             # Apply RC4 en/decryption (ref 2)
1402             # Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
1403             # - can call this method directly with a key string, or with with the key
1404             # reference returned by RC4Init
1405             # - RC4 is a symmetric algorithm, so encryption is the same as decryption
1406             sub RC4Crypt($$)
1407             {
1408 22     22 0 69 my ($dataPt, $key) = @_;
1409 22 50       82 $key = RC4Init($key) unless ref $key eq 'HASH';
1410 22         53 my $state = $$key{State};
1411 22         34 my ($x, $y) = @{$$key{XY}};
  22         74  
1412              
1413 22         95 my @data = unpack('C*', $$dataPt);
1414 22         57 foreach (@data) {
1415 356         645 $x = ($x + 1) & 0xff;
1416 356         613 my $stx = $$state[$x];
1417 356         569 $y = ($stx + $y) & 0xff;
1418 356         654 my $sty = $$state[$x] = $$state[$y];
1419 356         539 $$state[$y] = $stx;
1420 356         739 $_ ^= $$state[($stx + $sty) & 0xff];
1421             }
1422 22         70 $$key{XY} = [ $x, $y ];
1423 22         253 $$dataPt = pack('C*', @data);
1424             }
1425              
1426             #------------------------------------------------------------------------------
1427             # Update AES cipher with a bit of data
1428             # Inputs: 0) data
1429             # Returns: encrypted data
1430             my $cipherMore;
1431             sub CipherUpdate($)
1432             {
1433 0     0 0 0 my $dat = shift;
1434 0         0 my $pos = 0;
1435 0 0       0 $dat = $cipherMore . $dat if length $dat;
1436 0         0 while ($pos + 16 <= length($dat)) {
1437 0         0 substr($dat,$pos,16) = Image::ExifTool::AES::Cipher(substr($dat,$pos,16));
1438 0         0 $pos += 16;
1439             }
1440 0 0       0 if ($pos < length $dat) {
1441 0         0 $cipherMore = substr($dat,$pos);
1442 0         0 $dat = substr($dat,0,$pos);
1443             } else {
1444 0         0 $cipherMore = '';
1445             }
1446 0         0 return $dat;
1447             }
1448              
1449             #------------------------------------------------------------------------------
1450             # Get encrypted hash
1451             # Inputs: 0) Password, 1) salt, 2) vector, 3) encryption revision
1452             # Returns: hash
1453             sub GetHash($$$$)
1454             {
1455 6     6 0 37 my ($password, $salt, $vector, $rev) = @_;
1456              
1457             # return Rev 5 hash
1458 6 50       70 return Digest::SHA::sha256($password, $salt, $vector) if $rev == 5;
1459              
1460             # compute Rev 6 hardened hash
1461             # (ref http://code.google.com/p/origami-pdf/source/browse/lib/origami/encryption.rb)
1462 0         0 my $blockSize = 32;
1463 0         0 my $input = Digest::SHA::sha256($password, $salt, $vector) . ("\0" x 32);
1464 0         0 my $key = substr($input, 0, 16);
1465 0         0 my $iv = substr($input, 16, 16);
1466 0         0 my $h;
1467 0         0 my $x = '';
1468 0         0 my $i = 0;
1469 0   0     0 while ($i < 64 or $i < ord(substr($x,-1,1))+32) {
1470              
1471 0         0 my $block = substr($input, 0, $blockSize);
1472 0         0 $x = '';
1473 0         0 Image::ExifTool::AES::Crypt(\$x, $key, $iv, 1);
1474 0         0 $cipherMore = '';
1475              
1476 0         0 my ($j, $digest);
1477 0         0 for ($j=0; $j<64; ++$j) {
1478 0         0 $x = '';
1479 0 0       0 $x .= CipherUpdate($password) if length $password;
1480 0         0 $x .= CipherUpdate($block);
1481 0 0       0 $x .= CipherUpdate($vector) if length $vector;
1482 0 0       0 if ($j == 0) {
1483 0         0 my @a = unpack('C16', $x);
1484 0         0 my $sum = 0;
1485 0         0 $sum += $_ foreach @a;
1486             # set SHA block size (32, 48 or 64 bytes = SHA-256, 384 or 512)
1487 0         0 $blockSize = 32 + ($sum % 3) * 16;
1488 0         0 $digest = Digest::SHA->new($blockSize * 8);
1489             }
1490 0         0 $digest->add($x);
1491             }
1492              
1493 0         0 $h = $digest->digest();
1494 0         0 $key = substr($h, 0, 16);
1495 0         0 substr($input,0,16) = $h;
1496 0         0 $iv = substr($h, 16, 16);
1497 0         0 ++$i;
1498             }
1499 0         0 return substr($h, 0, 32);
1500             }
1501              
1502             #------------------------------------------------------------------------------
1503             # Initialize decryption
1504             # Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
1505             # 2) ID from file trailer dictionary
1506             # Returns: error string or undef on success (and sets $cryptInfo)
1507             sub DecryptInit($$$)
1508             {
1509 4     4 0 38 local $_;
1510 4         17 my ($et, $encrypt, $id) = @_;
1511              
1512 4         11 undef $cryptInfo;
1513 4 50 33     30 unless ($encrypt and ref $encrypt eq 'HASH') {
1514 0         0 return 'Error loading Encrypt object';
1515             }
1516 4         15 my $filt = $$encrypt{Filter};
1517 4 50 33     43 unless ($filt and $filt =~ s/^\///) {
1518 0         0 return 'Encrypt dictionary has no Filter!';
1519             }
1520             # extract some interesting tags
1521 4   50     49 my $ver = $$encrypt{V} || 0;
1522 4   100     20 my $rev = $$encrypt{R} || 0;
1523 4         15 my $enc = "$filt V$ver";
1524 4 50       23 $enc .= ".$rev" if $filt eq 'Standard';
1525 4 50 33     23 $enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
1526 4 50 100     36 $enc .= ' (' . ($$encrypt{Length} || 40) . '-bit)' if $filt eq 'Standard';
1527 4         19 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
1528 4         35 $et->HandleTag($tagTablePtr, 'Filter', $enc);
1529 4 50       30 if ($filt ne 'Standard') {
    50          
1530 0         0 return "Encryption filter $filt currently not supported";
1531             } elsif (not defined $$encrypt{R}) {
1532 0         0 return 'Standard security handler missing revision';
1533             }
1534 4 50 33     77 unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
      33        
1535 0         0 return 'Incomplete Encrypt specification';
1536             }
1537 4 50       39 if ("$ver.$rev" >= 5.6) {
1538             # apologize for poor performance (AES is a pure Perl implementation)
1539 0         0 $et->Warn('Decryption is very slow for encryption V5.6 or higher', 3);
1540             }
1541 4         20 $et->HandleTag($tagTablePtr, 'P', $$encrypt{P});
1542              
1543 4         9 my %parm; # optional parameters extracted from Encrypt dictionary
1544              
1545 4 100 66     45 if ($ver == 1 or $ver == 2) {
    50 66        
1546 1         3 $cryptString = $cryptStream = 1;
1547             } elsif ($ver == 4 or $ver == 5) {
1548             # initialize our $cryptString and $cryptStream flags
1549 3         11 foreach ('StrF', 'StmF') {
1550 6 100       24 my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
1551 6         22 $$flagPt = $$encrypt{$_};
1552 6 50 33     48 undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
1553 6 50 33     37 return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
1554             }
1555 3 50 33     29 if ($cryptString or $cryptStream) {
1556             return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
1557 3 50 33     42 ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
      33        
1558 3         8 my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
1559 3 50       26 unless ($cryptMeth =~ /^\/(V2|AESV2|AESV3)$/) {
1560 0         0 return "Unsupported encryption method $cryptMeth";
1561             }
1562             # set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
1563 3 50       32 $$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2|AESV3)$/;
1564             }
1565 3 100       13 if ($ver == 5) {
1566             # validate OE and UE entries
1567 2         9 foreach ('OE', 'UE') {
1568 4 50       16 return "Missing Encrypt $_ entry" unless $$encrypt{$_};
1569 4         16 $parm{$_} = ReadPDFValue($$encrypt{$_});
1570 4 50       19 return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
1571             }
1572 2         19 require Image::ExifTool::AES; # will need this later
1573             }
1574             } else {
1575 0         0 return "Encryption version $ver currently not supported";
1576             }
1577 4 50       21 $id or return "Can't decrypt (no document ID)";
1578              
1579             # make sure we have the necessary libraries available
1580 4 100       14 if ($ver < 5) {
1581 2 50       7 unless (eval { require Digest::MD5 }) {
  2         24  
1582 0         0 return "Install Digest::MD5 to process encrypted PDF";
1583             }
1584             } else {
1585 2 50       8 unless (eval { require Digest::SHA }) {
  2         56  
1586 0         0 return "Install Digest::SHA to process AES-256 encrypted PDF";
1587             }
1588             }
1589              
1590             # calculate file-level en/decryption key
1591 4         28 my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
1592             "\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
1593 4         19 my $o = ReadPDFValue($$encrypt{O});
1594 4         16 my $u = ReadPDFValue($$encrypt{U});
1595              
1596             # set flag indicating whether metadata is encrypted
1597             # (in version 4 and higher, metadata streams may not be encrypted)
1598 4 100 100     52 if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
      66        
1599 3         9 $$encrypt{_meta} = 1;
1600             }
1601             # try no password first, then try provided password if available
1602 4         10 my ($try, $key);
1603 4         12 for ($try=0; ; ++$try) {
1604 5         12 my $password;
1605 5 100       19 if ($try == 0) {
    50          
1606 4         12 $password = '';
1607             } elsif ($try == 1) {
1608 1         8 $password = $et->Options('Password');
1609 1 50       6 return 'Document is password protected (use Password option)' unless defined $password;
1610             # make sure there is no UTF-8 flag on the password
1611 1 50 33     12 if ($] >= 5.006 and ($$et{OPTIONS}{EncodeHangs} or
      33        
1612             eval { require Encode; Encode::is_utf8($password) } or $@))
1613             {
1614 0     0   0 local $SIG{'__WARN__'} = sub { };
1615             # repack by hand if Encode isn't available
1616 0 0 0     0 $password = ($$et{OPTIONS}{EncodeHangs} or $@) ? pack('C*', unpack($] < 5.010000 ?
    0          
1617             'U0C*' : 'C0C*', $password)) : Encode::encode('utf8', $password);
1618             }
1619             } else {
1620 0         0 return 'Incorrect password';
1621             }
1622 5 100       20 if ($ver < 5) {
1623 2 50       7 if (length $password) {
1624             # password must be encoding in PDFDocEncoding (ref iso32000)
1625 0         0 $password = $et->Encode($password, 'PDFDoc');
1626             # truncate or pad the password to exactly 32 bytes
1627 0 0       0 if (length($password) > 32) {
    0          
1628 0         0 $password = substr($password, 0, 32);
1629             } elsif (length($password) < 32) {
1630 0         0 $password .= substr($pad, 0, 32-length($password));
1631             }
1632             } else {
1633 2         4 $password = $pad;
1634             }
1635 2         11 $key = $password . $o . pack('V', $$encrypt{P}) . $id;
1636 2         5 my $rep = 1;
1637 2 100 66     14 if ($rev == 3 or $rev == 4) {
1638             # must add this if metadata not encrypted
1639 1 50       5 $key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
1640 1         3 $rep += 50; # repeat MD5 50 more times if revision is 3 or greater
1641             }
1642 2         6 my ($len, $i, $dat);
1643 2 100       7 if ($ver == 1) {
1644 1         2 $len = 5;
1645             } else {
1646 1   50     9 $len = $$encrypt{Length} || 40;
1647 1 50       5 $len >= 40 or return 'Bad Encrypt Length';
1648 1         21 $len = int($len / 8);
1649             }
1650 2         10 for ($i=0; $i<$rep; ++$i) {
1651 52         169 $key = substr(Digest::MD5::md5($key), 0, $len);
1652             }
1653             # decrypt U to see if a user password is required
1654 2 100       7 if ($rev >= 3) {
1655 1         6 $dat = Digest::MD5::md5($pad . $id);
1656 1         6 RC4Crypt(\$dat, $key);
1657 1         7 for ($i=1; $i<=19; ++$i) {
1658 19         56 my @key = unpack('C*', $key);
1659 19         39 foreach (@key) { $_ ^= $i; }
  304         547  
1660 19         68 RC4Crypt(\$dat, pack('C*', @key));
1661             }
1662 1         8 $dat .= substr($u, 16);
1663             } else {
1664 1         2 $dat = $pad;
1665 1         5 RC4Crypt(\$dat, $key);
1666             }
1667 2 50       14 last if $dat eq $u; # all done if this was the correct key
1668             } else {
1669 3 50 33     37 return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
1670 3 100       12 if (length $password) {
1671             # Note: this should be good for passwords containing reasonable characters,
1672             # but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
1673             # of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
1674 1         8 $password = $et->Encode($password, 'UTF8');
1675 1 50       5 $password = substr($password, 0, 127) if length($password) > 127;
1676             }
1677             # test for the owner password
1678 3         19 my $sha = GetHash($password, substr($o,32,8), substr($u,0,48), $rev);
1679 3 100       27 if ($sha eq substr($o, 0, 32)) {
1680 2         7 $key = GetHash($password, substr($o,40,8), substr($u,0,48), $rev);
1681 2         8 my $dat = ("\0" x 16) . $parm{OE};
1682             # decrypt with no padding
1683 2         15 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1684 2 50       11 return $err if $err;
1685 2         5 $key = $dat; # use this as the file decryption key
1686 2         7 last;
1687             }
1688             # test for the user password
1689 1         5 $sha = GetHash($password, substr($u,32,8), '', $rev);
1690 1 50       8 if ($sha eq substr($u, 0, 32)) {
1691 0         0 $key = GetHash($password, substr($u,40,8), '', $rev);
1692 0         0 my $dat = ("\0" x 16) . $parm{UE};
1693 0         0 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1694 0 0       0 return $err if $err;
1695 0         0 $key = $dat; # use this as the file decryption key
1696 0         0 last;
1697             }
1698             }
1699             }
1700 4         86 $$encrypt{_key} = $key; # save the file-level encryption key
1701 4         16 $cryptInfo = $encrypt; # save reference to the file-level Encrypt object
1702 4         39 return undef; # success!
1703             }
1704              
1705             #------------------------------------------------------------------------------
1706             # Decrypt/Encrypt data
1707             # Inputs: 0) data ref
1708             # 1) PDF object reference to use as crypt key extension (may be 'none' to
1709             # avoid extending the encryption key, as for streams with Crypt Filter)
1710             # 2) encrypt flag (false for decryption)
1711             sub Crypt($$;$)
1712             {
1713 29 100   29 0 162 return unless $cryptInfo;
1714 4         14 my ($dataPt, $keyExt, $encrypt) = @_;
1715             # do not decrypt if the key extension object is undefined
1716             # (this doubles as a flag to disable decryption/encryption)
1717 4 50       16 return unless defined $keyExt;
1718 4         14 my $key = $$cryptInfo{_key};
1719             # apply the necessary crypt key extension
1720 4 100       19 unless ($$cryptInfo{_aesv3}) {
1721 2 50       11 unless ($keyExt eq 'none') {
1722             # extend crypt key using object and generation number
1723 2 50       21 unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
1724 0         0 $$cryptInfo{_error} = 'Invalid object reference for encryption';
1725 0         0 return;
1726             }
1727 2         24 $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
1728             }
1729             # add AES-128 salt if necessary (this little gem is conveniently
1730             # omitted from the Adobe PDF 1.6 documentation, causing me to
1731             # waste 12 hours trying to figure out why this wasn't working --
1732             # it appears in ISO32000 though, so I should have been using that)
1733 2 100       9 $key .= 'sAlT' if $$cryptInfo{_aesv2};
1734 2         7 my $len = length($key);
1735 2         13 $key = Digest::MD5::md5($key); # get 16-byte MD5 digest
1736 2 100       10 $key = substr($key, 0, $len) if $len < 16; # trim if necessary
1737             }
1738             # perform the decryption/encryption
1739 4 100 100     32 if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
1740 3         33 require Image::ExifTool::AES;
1741 3         23 my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
1742 3 50       28 $err and $$cryptInfo{_error} = $err;
1743             } else {
1744 1         3 RC4Crypt($dataPt, $key);
1745             }
1746             }
1747              
1748             #------------------------------------------------------------------------------
1749             # Decrypt/Encrypt stream data
1750             # Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
1751             sub CryptStream($$)
1752             {
1753 52 50   52 0 158 return unless $cryptStream;
1754 0         0 my ($dict, $keyExt) = @_;
1755 0   0     0 my $type = $$dict{Type} || '';
1756             # XRef streams are not encrypted (ref 3, page 50),
1757             # and Metadata may or may not be encrypted
1758 0 0 0     0 if ($cryptInfo and $type ne '/XRef' and
      0        
      0        
1759             ($$cryptInfo{_meta} or $type ne '/Metadata'))
1760             {
1761 0         0 Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
1762             # toggle _decrypted flag
1763 0 0       0 $$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
1764             } else {
1765 0         0 $$dict{_decrypted} = 0; # stream should never be encrypted
1766             }
1767             }
1768              
1769             #------------------------------------------------------------------------------
1770             # Generate a new PDF tag (based on its ID) and add it to a tag table
1771             # Inputs: 0) tag table ref, 1) tag ID
1772             # Returns: tag info ref
1773             sub NewPDFTag($$)
1774             {
1775 0     0 0 0 my ($tagTablePtr, $tag) = @_;
1776 0         0 my $name = $tag;
1777             # translate URL-like escape sequences
1778 0         0 $name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
  0         0  
1779 0         0 $name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
1780 0         0 $name =~ s/(^|_)([a-z])/\U$2/g; # start words with upper case
1781 0         0 my $tagInfo = { Name => $name };
1782 0         0 AddTagToTable($tagTablePtr, $tag, $tagInfo);
1783 0         0 return $tagInfo;
1784             }
1785              
1786             #------------------------------------------------------------------------------
1787             # Process AcroForm dictionary to set HasXMLFormsArchitecture flag
1788             # Inputs: Same as ProcessDict
1789             sub ProcessAcroForm($$$$;$$)
1790             {
1791 0     0 0 0 my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1792 0 0       0 $et->HandleTag($tagTablePtr, '_has_xfa', $$dict{XFA} ? 'true' : 'false');
1793 0 0       0 return 1 unless $et->Options('Verbose');
1794 0         0 return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
1795             }
1796              
1797             #------------------------------------------------------------------------------
1798             # Process AF dictionary to extract C2PA manifest
1799             # Inputs: Same as ProcessDict
1800             sub ProcessAF($$$$;$$)
1801             {
1802 0     0 0 0 my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1803 0   0     0 $$et{AFRelationship} = $$dict{AFRelationship} || '';
1804             # go no further unless Verbose or this is the C2PA_Manifest item
1805 0 0 0     0 return 1 unless $et->Options('Verbose') or $$et{AFRelationship} eq '/C2PA_Manifest';
1806 0         0 return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
1807             }
1808              
1809             #------------------------------------------------------------------------------
1810             # Expand array into a string
1811             # Inputs: 0) array ref
1812             # Return: string
1813             sub ExpandArray($)
1814             {
1815 0     0 0 0 my $val = shift;
1816 0         0 my @list = @$val;
1817 0         0 foreach (@list) {
1818 0 0       0 ref $_ eq 'SCALAR' and $_ = "ref($$_)", next;
1819 0 0       0 ref $_ eq 'ARRAY' and $_ = ExpandArray($_), next;
1820 0 0       0 defined $_ or $_ = '', next;
1821             }
1822 0         0 return '[' . join(',',@list) . ']';
1823             }
1824              
1825             #------------------------------------------------------------------------------
1826             # Process PDF dictionary extract tag values
1827             # Inputs: 0) ExifTool object reference, 1) tag table reference
1828             # 2) dictionary reference, 3) cross-reference table reference,
1829             # 4) nesting depth, 5) dictionary capture type
1830             sub ProcessDict($$$$;$$)
1831             {
1832 350     350 0 577 local $_;
1833 350         970 my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1834 350         1349 my $verbose = $et->Options('Verbose');
1835 350         837 my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
1836 350   33     1183 my $embedded = (defined $unknown and not $unknown and $et->Options('ExtractEmbedded'));
1837 350         560 my @tags = @{$$dict{_tags}};
  350         1685  
1838 350         761 my ($next, %join, $validInfo);
1839 350         534 my $index = 0;
1840              
1841 350   100     1137 $nesting = ($nesting || 0) + 1;
1842 350 50       1024 if ($nesting > 50) {
1843 0         0 $et->Warn('Nesting too deep (directory ignored)');
1844 0         0 return;
1845             }
1846             # save entire dictionary for rewriting if specified
1847 350 50 100     1464 if ($$et{PDF_CAPTURE} and $$tagTablePtr{VARS} and
      66        
1848             $tagTablePtr->{VARS}->{CAPTURE})
1849             {
1850 66         111 my $name;
1851 66         113 foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
  66         218  
1852 82 100       302 next if $$et{PDF_CAPTURE}{$name};
1853             # make sure we load the right type if indicated
1854 66 50 66     243 next if $type and $type ne $name;
1855 66         238 $$et{PDF_CAPTURE}{$name} = $dict;
1856 66         273 last;
1857             }
1858             }
1859 350   33     1158 $validInfo = ($et->Options('Validate') and $tagTablePtr eq \%Image::ExifTool::PDF::Info);
1860             #
1861             # extract information from all tags in the dictionary
1862             #
1863 350         598 for (;;) {
1864 1473         2323 my ($tag, $isSubDoc);
1865 1473 100 33     3214 if (@tags) {
    50          
1866 1123         2552 $tag = shift @tags;
1867             } elsif (defined $next and not $next) {
1868 0         0 $tag = 'Next';
1869 0         0 $next = 1;
1870             } else {
1871 350         609 last;
1872             }
1873 1123         2719 my $val = $$dict{$tag};
1874 1123         3248 my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag);
1875 1123 100 33     3673 if ($tagInfo) {
    50 33        
      0        
      0        
1876 434 50       1168 undef $tagInfo if $$tagInfo{NoProcess};
1877             } elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
1878             $$tagTablePtr{$1} and (ref $val ne 'SCALAR' or not $fetched{$$val}))
1879             {
1880 0         0 my ($name, $num) = ($1, $2);
1881 0         0 $tagInfo = $et->GetTagInfo($tagTablePtr, $name);
1882 0 0 0     0 if (ref $tagInfo eq 'HASH' and $$tagInfo{JoinStreams}) {
1883 0         0 $fetched{$$val} = 1;
1884 0         0 my $obj = FetchObject($et, $$val, $xref, $tag);
1885 0 0       0 $join{$name} = [] unless $join{$name};
1886 0 0 0     0 next unless ref $obj eq 'HASH' and $$obj{_stream};
1887             # save all the stream data to join later
1888 0         0 DecodeStream($et, $obj);
1889 0         0 $join{$name}->[$num] = $$obj{_stream};
1890 0         0 undef $tagInfo; # don't process
1891             } else {
1892 0         0 $isSubDoc = 1; # treat as a sub-document
1893             }
1894             }
1895 1123 0 33     2516 if ($validInfo and $$et{PDFVersion} >= 2.0 and (not $tagInfo or not $$tagInfo{PDF2})) {
      0        
      33        
1896 0 0       0 my $name = $tagInfo ? ":$$tagInfo{Name}" : " Info tag '${tag}'";
1897 0         0 $et->Warn("PDF$name is deprecated in PDF 2.0");
1898             }
1899 1123 50       2424 if ($verbose) {
1900 0         0 my ($val2, $extra);
1901 0 0       0 if (ref $val eq 'SCALAR') {
    0          
    0          
1902 0         0 $extra = ", indirect object ($$val)";
1903 0 0 0     0 if ($fetched{$$val}) {
    0          
1904 0         0 $val2 = "ref($$val)";
1905             } elsif ($tag eq 'Next' and not $next) {
1906             # handle 'Next' links after all others
1907 0         0 $next = 0;
1908 0         0 next;
1909             } else {
1910 0         0 $fetched{$$val} = 1;
1911 0         0 $val = FetchObject($et, $$val, $xref, $tag);
1912 0 0       0 unless (defined $val) {
1913 0         0 my $str;
1914 0 0       0 if (defined $lastOffset) {
1915 0         0 $val2 = '';
1916 0         0 $str = 'Object was freed';
1917             } else {
1918 0         0 $val2 = '';
1919 0         0 $str = 'Error reading object';
1920             }
1921 0         0 $et->VPrint(0, "$$et{INDENT}${str}:\n");
1922             }
1923             }
1924             } elsif (ref $val eq 'HASH') {
1925 0         0 $extra = ', direct dictionary';
1926             } elsif (ref $val eq 'ARRAY') {
1927 0         0 $extra = ', direct array of ' . scalar(@$val) . ' objects';
1928             } else {
1929 0         0 $extra = ', direct object';
1930             }
1931 0         0 my $isSubdir;
1932 0 0       0 if (ref $val eq 'HASH') {
    0          
1933 0         0 $isSubdir = 1;
1934             } elsif (ref $val eq 'ARRAY') {
1935             # recurse into objects in arrays only if they are lists of
1936             # dictionaries or indirect objects which could be dictionaries
1937 0 0       0 $isSubdir = 1 if @$val;
1938 0         0 foreach (@$val) {
1939 0 0 0     0 next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
1940 0         0 undef $isSubdir;
1941 0         0 last;
1942             }
1943             }
1944 0 0       0 if ($isSubdir) {
1945             # create bogus subdirectory to recurse into this dict
1946 0 0       0 $tagInfo or $tagInfo = {
1947             Name => $tag,
1948             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Unknown' },
1949             };
1950             } else {
1951 0 0       0 $val2 = ExpandArray($val) if ref $val eq 'ARRAY';
1952             # generate tag info if we will use it later
1953 0 0 0     0 if (not $tagInfo and defined $val and $unknown) {
      0        
1954 0         0 $tagInfo = NewPDFTag($tagTablePtr, $tag);
1955             }
1956             }
1957 0   0     0 $et->VerboseInfo($tag, $tagInfo,
1958             Value => $val2 || $val,
1959             Extra => $extra,
1960             Index => $index++,
1961             );
1962 0 0       0 next unless defined $val;
1963             }
1964 1123 100       2533 unless ($tagInfo) {
1965             # add any tag found in Info dictionary to table
1966 689 50       1674 next unless $unknown;
1967 0         0 $tagInfo = NewPDFTag($tagTablePtr, $tag);
1968             }
1969             # increment document number if necessary
1970 434         785 my ($oldDocNum, $oldNumTags);
1971 434 50       916 if ($isSubDoc) {
1972 0         0 $oldDocNum = $$et{DOC_NUM};
1973 0         0 $oldNumTags = $$et{NUM_FOUND};
1974 0         0 $$et{DOC_NUM} = ++$$et{DOC_COUNT};
1975             }
1976 434 100       1061 if ($$tagInfo{SubDirectory}) {
1977             # process the subdirectory
1978 332         557 my @subDicts;
1979 332 100       813 if (ref $val eq 'ARRAY') {
1980             # hack to convert array to dictionary if necessary
1981 37 50 33     191 if ($$tagInfo{ConvertToDict} and @$val == 2 and not ref $$val[0]) {
      33        
1982 0         0 my $tg = $$val[0];
1983 0         0 $tg =~ s(^/)(); # remove name
1984 0         0 my %dict = ( _tags => [ $tg ], $tg => $$val[1] );
1985 0         0 @subDicts = ( \%dict );
1986             } else {
1987 37         64 @subDicts = @{$val};
  37         135  
1988             }
1989             } else {
1990 295         698 @subDicts = ( $val );
1991             }
1992             # loop through all values of this tag
1993 332         601 for (;;) {
1994 664 100       1978 my $subDict = shift @subDicts or last;
1995             # save last fetched object in case we fetch another one here
1996 332         701 my $prevFetched = $lastFetched;
1997 332 100       891 if (ref $subDict eq 'SCALAR') {
1998             # only fetch once (other copies are obsolete)
1999 244 100       770 next if $fetched{$$subDict};
2000 197 100       621 if ($$tagInfo{IgnoreDuplicates}) {
2001 28         64 my $flag = "ProcessedPDF_$tag";
2002 28 50       100 if ($$et{$flag}) {
2003 0 0       0 next if $et->Warn("Ignored duplicate $tag dictionary", 2);
2004             } else {
2005 28         100 $$et{$flag} = 1;
2006             }
2007             }
2008             # load dictionary via an indirect reference
2009 197         627 $fetched{$$subDict} = 1;
2010 197         669 my $obj = FetchObject($et, $$subDict, $xref, $tag);
2011 197 100       635 unless (defined $obj) {
2012 5 50       19 unless (defined $lastOffset) {
2013 0         0 $et->Warn("Error reading $tag object ($$subDict)");
2014             }
2015 5         13 next;
2016             }
2017 192         406 $subDict = $obj;
2018             }
2019 280 50       917 if (ref $subDict eq 'ARRAY') {
2020             # convert array of key/value pairs to a hash
2021 0 0       0 next if @$subDict < 2;
2022 0         0 my %hash = ( _tags => [] );
2023 0         0 while (@$subDict >= 2) {
2024 0         0 my $key = shift @$subDict;
2025 0         0 $key =~ s/^\///;
2026 0         0 push @{$hash{_tags}}, $key;
  0         0  
2027 0         0 $hash{$key} = shift @$subDict;
2028             }
2029 0         0 $subDict = \%hash;
2030             } else {
2031 280 50       758 next unless ref $subDict eq 'HASH';
2032             }
2033             # set flag to re-crypt all strings when rewriting if the dictionary
2034             # came from an encrypted stream
2035 280 50       624 $$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
2036 280         1472 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
2037 280 50       621 if (not $verbose) {
    0          
2038 280   50     1367 my $proc = $$subTablePtr{PROCESS_PROC} || \&ProcessDict;
2039 280         1052 &$proc($et, $subTablePtr, $subDict, $xref, $nesting);
2040             } elsif ($next) {
2041             # handle 'Next' links at this level to avoid deep recursion
2042 0         0 undef $next;
2043 0         0 $index = 0;
2044 0         0 $tagTablePtr = $subTablePtr;
2045 0         0 $dict = $subDict;
2046 0         0 @tags = @{$$subDict{_tags}};
  0         0  
2047 0         0 $et->VerboseDir($tag, scalar(@tags));
2048             } else {
2049 0         0 my $oldIndent = $$et{INDENT};
2050 0         0 my $oldDir = $$et{DIR_NAME};
2051 0         0 $$et{INDENT} .= '| ';
2052 0         0 $$et{DIR_NAME} = $tag;
2053 0         0 $et->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
  0         0  
2054 0   0     0 my $proc = $$subTablePtr{PROCESS_PROC} || \&ProcessDict;
2055 0         0 &$proc($et, $subTablePtr, $subDict, $xref, $nesting);
2056 0         0 $$et{INDENT} = $oldIndent;
2057 0         0 $$et{DIR_NAME} = $oldDir;
2058             }
2059 280         1981 $lastFetched = $prevFetched;
2060             }
2061             } else {
2062             # fetch object if necessary
2063             # (OS X 10.6 writes indirect objects in the Info dictionary!)
2064 102 50       311 if (ref $val eq 'SCALAR') {
2065 0         0 my $prevFetched = $lastFetched;
2066             # (note: fetching the same object multiple times is OK here)
2067 0         0 $val = FetchObject($et, $$val, $xref, $tag);
2068 0 0       0 if (defined $val) {
2069 0         0 $val = ReadPDFValue($val);
2070             # set flag to re-encrypt if necessary if rewritten
2071 0 0       0 $$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
    0          
2072 0         0 $lastFetched = $prevFetched; # restore last fetched object reference
2073             }
2074             } else {
2075 102         426 $val = ReadPDFValue($val);
2076             }
2077 102 100       337 if (ref $val) {
    50          
2078 27 50       119 if (ref $val eq 'ARRAY') {
2079 27 50       171 delete $$et{LIST_TAGS}{$tagInfo} if $$tagInfo{List};
2080 27         69 my $v;
2081 27         64 foreach $v (@$val) {
2082 80         239 $et->FoundTag($tagInfo, $v);
2083             }
2084             }
2085             } elsif (defined $val) {
2086             # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
2087             # unless this is binary data (hex-encoded strings would not have been converted)
2088 75   100     558 my $format = $$tagInfo{Format} || $$tagInfo{Writable} || 'string';
2089 75 100       265 $val = ConvertPDFDate($val) if $format eq 'date';
2090 75 50 33     556 if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
2091             # text string is already in Unicode if it starts with "\xfe\xff",
2092             # otherwise we must first convert from PDFDocEncoding
2093 0 0       0 $val = $et->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
2094             }
2095 75 100 66     348 if ($$tagInfo{List} and not $$et{OPTIONS}{NoPDFList}) {
2096             # separate tokens in comma or whitespace delimited lists
2097 12         44 my $comma = $val =~ tr/,/,/;
2098 12         65 my $semi = $val =~ tr/;/;/;
2099 12         23 my $split;
2100 12 50 33     49 if ($comma or $semi) {
2101 12 50       49 $split = $comma > $semi ? ',+\\s*' : ';+\\s*';
2102             } else {
2103 0         0 $split = ' ';
2104             }
2105 12         571 my @values = split $split, $val;
2106 12         88 $et->FoundTag($tagInfo, $_) foreach @values;
2107             } else {
2108             # a simple tag value
2109 63         251 $et->FoundTag($tagInfo, $val);
2110             }
2111             }
2112             }
2113 434 50       1292 if ($isSubDoc) {
2114             # restore original document number
2115 0         0 $$et{DOC_NUM} = $oldDocNum;
2116 0 0       0 --$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2117             }
2118             }
2119             #
2120             # extract information from joined streams if necessary
2121             #
2122              
2123 350 50       862 if (%join) {
2124 0         0 my ($tag, $i);
2125 0         0 foreach $tag (sort keys %join) {
2126 0         0 my $list = $join{$tag};
2127 0 0 0     0 last unless defined $$list[1] and $$list[1] =~ /^%.*?([\x0d\x0a]*)/;
2128 0         0 my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
2129 0         0 for ($i=1; defined $$list[$i]; ++$i) {
2130 0         0 $buff .= $$list[$i];
2131 0         0 undef $$list[$i]; # free memory
2132             }
2133             # increment document number for tags extracted from embedded EPS
2134 0         0 my $oldDocNum = $$et{DOC_NUM};
2135 0         0 my $oldNumTags = $$et{NUM_FOUND};
2136 0         0 $$et{DOC_NUM} = ++$$et{DOC_COUNT};
2137             # extract PostScript information
2138 0         0 $et->HandleTag($tagTablePtr, $tag, $buff);
2139 0         0 $$et{DOC_NUM} = $oldDocNum;
2140             # revert document counter if we didn't add any new tags
2141 0 0       0 --$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2142 0         0 delete $$et{DOC_NUM};
2143             }
2144             }
2145             #
2146             # extract information from stream object if it exists (eg. Metadata stream)
2147             #
2148 350         498 for (;;) { # (cheap goto)
2149 350 100       1909 last unless $$dict{_stream};
2150 43         137 my $tag = '_stream';
2151             # add Subtype (if it exists) to stream name and remove leading '/'
2152 43 100       315 ($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
2153 43 50       174 last unless $$tagTablePtr{$tag};
2154 43 50       163 my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag) or last;
2155 43         124 my $subdir = $$tagInfo{SubDirectory};
2156 43 50       147 unless ($subdir) {
2157             # don't build filter lists across different images
2158 0         0 delete $$et{LIST_TAGS}{$$tagTablePtr{Filter}};
2159             # we arrive here only when extracting embedded images
2160             # - only extract known image types and ignore others
2161 0   0     0 my $filter = $$dict{Filter} || '';
2162 0 0       0 $filter = @$filter[-1] if ref $filter eq 'ARRAY'; # (get last Filter type)
2163 0         0 my $result;
2164 0 0 0     0 if ($filter eq '/DCTDecode' or $filter eq '/JPXDecode') {
2165 0 0       0 DecodeStream($et, $dict) or last;
2166             # save the image itself
2167 0         0 $et->FoundTag($tagInfo, \$$dict{_stream});
2168             # extract information from embedded image
2169 0         0 $result = $et->ExtractInfo(\$$dict{_stream}, { ReEntry => 1 });
2170             }
2171 0 0       0 unless ($result) {
2172 0 0       0 $et->FoundTag('FileType', defined $result ? '(unknown)' : '(unsupported)');
2173             }
2174 0         0 last;
2175             }
2176             # decode stream if necessary
2177 43 0 0     241 if ($cryptInfo and ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3} and
      33        
      0        
      0        
2178             $$dict{Length} and $$dict{Length} > 10000) and not $$dict{_decrypted} and
2179             not $$et{PDF_CAPTURE}) # (capturing PDF for writing?)
2180             {
2181 0   0     0 my $type = $$dict{Type} || '';
2182 0 0 0     0 if ($type ne '/Metadata' or $$dict{Length} > 100000) {
2183 0 0       0 if ($$et{OPTIONS}{IgnoreMinorErrors}) {
2184 0         0 $et->Warn("Decrypting large $$tagInfo{Name} (will be slow)");
2185             } else {
2186 0         0 $et->Warn("Skipping large AES-encrypted $$tagInfo{Name}", 2);
2187 0         0 last;
2188             }
2189             }
2190             }
2191 43 50       293 DecodeStream($et, $dict) or last;
2192 43 50       115 if ($verbose > 2) {
2193 0         0 $et->VPrint(2,"$$et{INDENT}$$et{DIR_NAME} stream data\n");
2194 0         0 $et->VerboseDump(\$$dict{_stream});
2195             }
2196             # extract information from stream
2197             my %dirInfo = (
2198             DataPt => \$$dict{_stream},
2199             DataLen => length $$dict{_stream},
2200             DirStart => 0,
2201             DirLen => length $$dict{_stream},
2202             Parent => 'PDF',
2203             DirName => $$subdir{DirName},
2204 43         453 );
2205 43         195 my $subTablePtr = GetTagTable($$subdir{TagTable});
2206 43 50       242 unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
2207 0         0 $et->Warn("Error processing $$tagInfo{Name} information");
2208             }
2209 43         453 last;
2210             }
2211             }
2212              
2213             #------------------------------------------------------------------------------
2214             # Extract information from PDF file
2215             # Inputs: 0) ExifTool object reference, 1) dirInfo reference
2216             # Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
2217             sub ReadPDF($$)
2218             {
2219 37     37 0 88 my ($et, $dirInfo) = @_;
2220 37         141 my $raf = $$dirInfo{RAF};
2221 37         155 my $verbose = $et->Options('Verbose');
2222 37         85 my ($buff, $encrypt, $id);
2223             #
2224             # validate PDF file
2225             #
2226             # (linearization dictionary must be in the first 1024 bytes of the file)
2227 37 50       173 $raf->Read($buff, 1024) >= 8 or return 0;
2228 37 50       324 $buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
2229 37 50       235 $$et{PDFBase} = length $1 and $et->Warn('PDF header is not at start of file',1);
2230 37         164 $pdfVer = $$et{PDFVersion} = $2;
2231 37         260 $et->SetFileType(); # set the FileType tag
2232             # store PDFVersion tag
2233 37         144 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
2234 37         263 $et->HandleTag($tagTablePtr, 'Version', $pdfVer);
2235 37         158 $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
2236             #
2237             # check for a linearized PDF (only if reading)
2238             #
2239 37         111 my $capture = $$et{PDF_CAPTURE};
2240 37 100       147 unless ($capture) {
2241 18         34 my $lin = 'false';
2242 18 50       124 if ($buff =~ /<
2243 18         85 $buff = substr($buff, pos($buff) - 2);
2244 18         99 my $dict = ExtractObject($et, \$buff);
2245 18 0 33     161 if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
      33        
2246 0 0       0 if (not $$et{VALUE}{FileSize}) {
    0          
2247 0         0 undef $lin; # can't determine if it is linearized
2248             } elsif ($$dict{L} == $$et{VALUE}{FileSize} - $$et{PDFBase}) {
2249 0         0 $lin = 'true';
2250             }
2251             }
2252             }
2253 18 50       98 $et->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
2254             }
2255             #
2256             # read the xref tables referenced from startxref at the end of the file
2257             #
2258 37         104 my @xrefOffsets;
2259 37 50       167 $raf->Seek(0, 2) or return -2;
2260             # the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
2261 37         158 my $len = $raf->Tell();
2262 37 50       176 $len = 1024 if $len > 1024;
2263 37 50       130 $raf->Seek(-$len, 2) or return -2;
2264 37 50       144 $raf->Read($buff, $len) == $len or return -3;
2265             # find the LAST xref table in the file (may be multiple %%EOF marks,
2266             # and comments between "startxref" and "%%EOF")
2267 37 50       514 $buff =~ /^.*startxref(\s+)(\d+)(\s+)((%[^\x0d\x0a]*\s+)*)%%EOF/s or return -4;
2268             # parse comments to read SEAL information
2269 37 50       208 if ($4) {
2270 0         0 my @com = split /[\x0d\x0d]+/, $4;
2271 0         0 foreach (@com) {
2272 0 0       0 /^(%+\s*)
2273 0         0 my $dat = substr $_, length($1);
2274 0         0 my $tbl = GetTagTable('Image::ExifTool::XMP::SEAL');
2275 0         0 $et->ProcessDirectory({ DataPt => \$dat }, $tbl);
2276             }
2277             }
2278 37         198 my $ws = $1 . $3;
2279 37         131 my $xr = $2;
2280 37         138 push @xrefOffsets, $xr, 'Main';
2281             # set input record separator
2282 37 50       448 local $/ = $ws =~ /(\x0d\x0a|\x0d|\x0a)/ ? $1 : "\x0a";
2283 37         94 my (%xref, @mainDicts, %loaded, $mainFree);
2284 37         125 my ($xrefSize, $mainDictSize) = (0, 0);
2285             # initialize variables to capture when rewriting
2286 37 100       109 if ($capture) {
2287 19         108 $capture->{startxref} = $xr;
2288 19         68 $capture->{xref} = \%xref;
2289 19         64 $capture->{newline} = $/;
2290 19         76 $capture->{mainFree} = $mainFree = { };
2291             }
2292             XRef:
2293 37         112 while (@xrefOffsets) {
2294 70         157 my $offset = shift @xrefOffsets;
2295 70         127 my $type = shift @xrefOffsets;
2296 70 50       220 next if $loaded{$offset}; # avoid infinite recursion
2297 70 50       366 unless ($raf->Seek($offset+$$et{PDFBase}, 0)) {
2298 0 0       0 %loaded or return -5;
2299 0         0 $et->Warn('Bad offset for secondary xref table');
2300 0         0 next;
2301             }
2302             # Note: care must be taken because ReadLine may read more than we want if
2303             # the newline sequence for this table is different than the rest of the file
2304 70         119 for (;;) {
2305 70 50       269 unless ($raf->ReadLine($buff)) {
2306 0 0       0 %loaded or return -6;
2307 0         0 $et->Warn('Bad offset for secondary xref table');
2308 0         0 next XRef;
2309             }
2310 70 50       383 last if $buff =~/\S/; # skip blank lines
2311             }
2312 70         124 my $loadXRefStream;
2313 70 50       517 if ($buff =~ s/^\s*xref\s+//s) {
    0          
2314             # load xref table
2315 70         136 for (;;) {
2316             # read another line if necessary (skipping blank lines)
2317 177   50     662 $raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
2318 177 100       824 last if $buff =~ s/^\s*trailer([\s<[(])/$1/s;
2319 107 50       607 $buff =~ s/^\s*(\d+)\s+(\d+)\s+//s or return -4;
2320 107         377 my ($start, $num) = ($1, $2);
2321 107 50       1460 $raf->Seek(-length($buff), 1) or return -4;
2322 107         195 my $i;
2323 107         335 for ($i=0; $i<$num; ++$i) {
2324 622 50       1540 $raf->Read($buff, 20) == 20 or return -6;
2325 622 50       2717 $buff =~ /^\s*(\d{10}) (\d{5}) (f|n)/s or return -4;
2326 622         1091 my $num = $start + $i;
2327 622 100       1176 $xrefSize = $num if $num > $xrefSize;
2328             # locate object to generate entry from stream if necessary
2329             # (must do this before we test $xref{$num})
2330 622 50       1428 LocateAnyObject(\%xref, $num) if $xref{dicts};
2331             # save offset for newest copy of all objects
2332             # (or next object number for free objects)
2333 622 100       1716 unless (defined $xref{$num}) {
2334 526         1680 my ($offset, $gen) = (int($1), int($2));
2335 526         1306 $xref{$num} = $offset;
2336 526 100       1306 if ($3 eq 'f') {
2337             # save free objects in last xref table for rewriting
2338 52 100       256 $$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
2339 52         195 next;
2340             }
2341             # also save offset keyed by object reference string
2342 474         1801 $xref{"$num $gen R"} = $offset;
2343             }
2344             }
2345             # (I have a sample from Adobe which has an empty xref table)
2346             # %xref or return -4; # xref table may not be empty
2347 107         215 $buff = '';
2348             }
2349 70         177 undef $mainFree; # only do this for the last xref table
2350             } elsif ($buff =~ s/^\s*(\d+)\s+(\d+)\s+obj//s) {
2351             # this is a PDF-1.5 cross-reference stream dictionary
2352 0         0 $loadXRefStream = 1;
2353             } else {
2354 0 0       0 %loaded or return -4;
2355 0         0 $et->Warn('Invalid secondary xref table');
2356 0         0 next;
2357             }
2358 70         316 my $mainDict = ExtractObject($et, \$buff, $raf, \%xref);
2359 70 50       275 unless (ref $mainDict eq 'HASH') {
2360 0 0       0 %loaded or return -8;
2361 0         0 $et->Warn('Error loading secondary dictionary');
2362 0         0 next;
2363             }
2364             # keep track of total trailer dictionary Size
2365 70 100 66     541 $mainDictSize = $$mainDict{Size} if $$mainDict{Size} and $$mainDict{Size} > $mainDictSize;
2366 70 50       242 if ($loadXRefStream) {
2367             # decode and save our XRef stream from PDF-1.5 file
2368             # (but parse it later as required to save time)
2369             # Note: this technique can potentially result in an old object
2370             # being used if the file was incrementally updated and an older
2371             # object from an xref table was replaced by a newer object in an
2372             # xref stream. But doing so isn't a good idea (if allowed at all)
2373             # because a PDF 1.4 consumer would also make this same mistake.
2374 0 0 0     0 if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
      0        
      0        
      0        
2375 0         0 @{$$mainDict{W}} > 2 and $$mainDict{Size} and
2376             DecodeStream($et, $mainDict))
2377             {
2378             # create Index entry if it doesn't exist
2379 0 0       0 $$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
2380             # create '_entry_size' entry for internal use
2381 0         0 my $w = $$mainDict{W};
2382 0         0 my $size = 0;
2383 0         0 foreach (@$w) { $size += $_; }
  0         0  
2384 0         0 $$mainDict{_entry_size} = $size;
2385             # save this stream dictionary to use later if required
2386 0 0       0 $xref{dicts} = [] unless $xref{dicts};
2387 0         0 push @{$xref{dicts}}, $mainDict;
  0         0  
2388             } else {
2389 0 0       0 %loaded or return -9;
2390 0         0 $et->Warn('Invalid xref stream in secondary dictionary');
2391             }
2392             }
2393 70         239 $loaded{$offset} = 1;
2394             # load XRef stream in hybrid file if it exists
2395 70 50       653 push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
2396 70 50       193 $encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
2397 70 50 33     253 undef $encrypt if $encrypt and $encrypt eq 'null'; # (have seen "null")
2398 70 100 66     318 if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
2399 29         108 $id = ReadPDFValue($mainDict->{ID}->[0]);
2400             }
2401 70         190 push @mainDicts, $mainDict, $type;
2402             # load previous xref table if it exists
2403 70 100       367 push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
2404             }
2405 37 50       153 if ($xrefSize > $mainDictSize) {
2406 0         0 my $str = "Objects in xref table ($xrefSize) exceed trailer dictionary Size ($mainDictSize)";
2407 0 0       0 $capture ? $et->Error($str) : $et->Warn($str);
2408             }
2409             #
2410             # extract encryption information if necessary
2411             #
2412 37 50       121 if ($encrypt) {
2413 0 0       0 if (ref $encrypt eq 'SCALAR') {
2414 0         0 $encrypt = FetchObject($et, $$encrypt, \%xref, 'Encrypt');
2415             }
2416             # generate Encryption tag information
2417 0         0 my $err = DecryptInit($et, $encrypt, $id);
2418 0 0       0 if ($err) {
2419 0         0 $et->Warn($err);
2420 0 0       0 $$capture{Error} = $err if $capture;
2421 0         0 return -1;
2422             }
2423             }
2424             #
2425             # extract the information beginning with each of the main dictionaries
2426             #
2427 37         70 my $i = 0;
2428 37         114 my $num = (scalar @mainDicts) / 2;
2429 37         129 while (@mainDicts) {
2430 70         161 my $dict = shift @mainDicts;
2431 70         147 my $type = shift @mainDicts;
2432 70 50       222 if ($verbose) {
2433 0         0 ++$i;
2434 0         0 my $n = scalar(@{$$dict{_tags}});
  0         0  
2435 0         0 $et->VPrint(0, "PDF dictionary ($i of $num) with $n entries:\n");
2436             }
2437 70         341 ProcessDict($et, $tagTablePtr, $dict, \%xref, 0, $type);
2438             }
2439             # handle any decryption errors
2440 37 50       178 if ($encrypt) {
2441 0         0 my $err = $$encrypt{_error};
2442 0 0       0 if ($err) {
2443 0         0 $et->Warn($err);
2444 0 0       0 $$capture{Error} = $err if $capture;
2445 0         0 return -1;
2446             }
2447             }
2448 37         628 return 1;
2449             }
2450              
2451             #------------------------------------------------------------------------------
2452             # ReadPDF() warning strings for each error return value
2453             my %pdfWarning = (
2454             # -1 is reserved as error return value with no associated warning
2455             -2 => 'Error seeking in file',
2456             -3 => 'Error reading file',
2457             -4 => 'Invalid xref table',
2458             -5 => 'Invalid xref offset',
2459             -6 => 'Error reading xref table',
2460             -7 => 'Error reading trailer',
2461             -8 => 'Error reading main dictionary',
2462             -9 => 'Invalid xref stream in main dictionary',
2463             );
2464              
2465             #------------------------------------------------------------------------------
2466             # Extract information from PDF file
2467             # Inputs: 0) ExifTool object reference, 1) dirInfo reference
2468             # Returns: 1 if this was a valid PDF file
2469             sub ProcessPDF($$)
2470             {
2471 37     37 0 127 my ($et, $dirInfo) = @_;
2472              
2473 37         139 undef $cryptInfo; # (must not delete after returning so writer can use it)
2474 37         83 undef $cryptStream;
2475 37         86 undef $cryptString;
2476 37         161 my $result = ReadPDF($et, $dirInfo);
2477 37 50       143 if ($result < 0) {
2478 0 0       0 $et->Warn($pdfWarning{$result}) if $pdfWarning{$result};
2479 0         0 $result = 1;
2480             }
2481             # clean up and return
2482 37         95 undef %streamObjs;
2483 37         154 undef %fetched;
2484 37         179 return $result;
2485             }
2486              
2487             1; # end
2488              
2489              
2490             __END__