File Coverage

blib/lib/Image/ExifTool/PDF.pm

Criterion	Covered	Total	%
statement	535	1060	50.4
branch	290	742	39.0
condition	80	289	27.6
subroutine	22	29	75.8
pod	0	24	0.0
total	927	2144	43.2

line	stmt	bran	cond	sub	pod	time	code
1							#------------------------------------------------------------------------------
2							# File: PDF.pm
3							#
4							# Description: Read PDF meta information
5							#
6							# Revisions: 07/11/2005 - P. Harvey Created
7							# 07/25/2005 - P. Harvey Add support for encrypted documents
8							#
9							# References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
10							# 2) http://search.cpan.org/dist/Crypt-RC4/
11							# 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
12							# 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
13							# 5) http://tools.ietf.org/search/rfc3454
14							# 6) http://www.armware.dk/RFC/rfc/rfc4013.html
15							#------------------------------------------------------------------------------
16
17							package Image::ExifTool::PDF;
18
19	25			25		7442	use strict;
	25					64
	25					1330
20	25			25		155	use vars qw($VERSION $AUTOLOAD $lastFetched);
	25					62
	25					1874
21	25			25		149	use Image::ExifTool qw(:DataAccess :Utils);
	25					60
	25					467579
22							require Exporter;
23
24							$VERSION = '1.61';
25
26							sub FetchObject($$$$);
27							sub ExtractObject($$;$$);
28							sub ReadToNested($;$);
29							sub ProcessDict($$$$;$$);
30							sub ProcessAcroForm($$$$;$$);
31							sub ExpandArray($);
32							sub ReadPDFValue($);
33							sub CheckPDF($$$);
34
35							# $lastFetched - last fetched object reference (used for decryption)
36							# (undefined if fetched object was already decrypted, eg. object from stream)
37
38							my $cryptInfo; # encryption object reference (plus additional information)
39							my $cryptString; # flag that strings are encrypted
40							my $cryptStream; # flag that streams are encrypted
41							my $lastOffset; # last fetched object offset
42							my %streamObjs; # hash of stream objects
43							my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
44							my $pdfVer; # version of PDF file being processed (from header)
45
46							# filters supported in DecodeStream()
47							my %supportedFilter = (
48							'/FlateDecode' => 1,
49							'/Crypt' => 1,
50							'/Identity' => 1, # (not filtered)
51							'/DCTDecode' => 1, # (JPEG image - not filtered)
52							'/JPXDecode' => 1, # (Jpeg2000 image - not filtered)
53							'/LZWDecode' => 1, # (usually a bitmapped image)
54							'/ASCIIHexDecode' => 1,
55							'/ASCII85Decode' => 1,
56							# other standard filters that we currently don't support
57							#'/JBIG2Decode' => 0, # (JBIG2 image format not supported)
58							#'/CCITTFaxDecode' => 0,
59							#'/RunLengthDecode' => 0,
60							);
61
62							# tags in main PDF directories
63							%Image::ExifTool::PDF::Main = (
64							GROUPS => { 2 => 'Document' },
65							VARS => { CAPTURE => ['Main','Prev'] },
66							Info => {
67							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
68							# Adobe Acrobat 10.1.5 will create a duplicate Info dictionary with
69							# a different object number when metadata is edited. This flag
70							# is part of a patch to ignore this duplicate information (unless
71							# the IgnoreMinorErrors option is used)
72							IgnoreDuplicates => 1,
73							},
74							Root => {
75							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
76							},
77							Encrypt => {
78							NoProcess => 1, # don't process normally (processed in advance)
79							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
80							},
81							_linearized => {
82							Name => 'Linearized',
83							Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
84							PrintConv => { 'true' => 'Yes', 'false' => 'No' },
85							},
86							);
87
88							# tags in PDF Info dictionary
89							%Image::ExifTool::PDF::Info = (
90							GROUPS => { 2 => 'Document' },
91							VARS => { CAPTURE => ['Info'] },
92							EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
93							WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
94							CHECK_PROC => \&CheckPDF,
95							WRITABLE => 'string',
96							# set PRIORITY to 0 so most recent Info dictionary takes precedence
97							# (Acrobat Pro bug? doesn't use same object/generation number for
98							# new Info dictionary when doing incremental update)
99							PRIORITY => 0,
100							NOTES => q{
101							As well as the tags listed below, the PDF specification allows for
102							user-defined tags to exist in the Info dictionary. These tags, which should
103							have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
104							object, are also extracted by ExifTool.
105
106							B specifies the value format, and may be C, C,
107							C, C, C or C for PDF tags.
108							},
109							Title => { },
110							Author => { Groups => { 2 => 'Author' } },
111							Subject => { },
112							Keywords => {
113							List => 'string', # this is a string list
114							Notes => q{
115							stored as a string but treated as a comma- or semicolon-separated list of
116							items when reading if the string contains commas or semicolons, whichever is
117							more numerous, otherwise it is treated a space-separated list of items. The
118							list behaviour may be defeated by setting the API NoPDFList option. Written
119							as a comma-separated string. Note that the corresponding XMP-pdf:Keywords
120							tag is not treated as a list, so the NoPDFList option should be used when
121							copying between these two.
122							},
123							},
124							Creator => { },
125							Producer => { },
126							CreationDate => {
127							Name => 'CreateDate',
128							Writable => 'date',
129							PDF2 => 1, # not deprecated in PDF 2.0
130							Groups => { 2 => 'Time' },
131							Shift => 'Time',
132							PrintConv => '$self->ConvertDateTime($val)',
133							PrintConvInv => '$self->InverseDateTime($val)',
134							},
135							ModDate => {
136							Name => 'ModifyDate',
137							Writable => 'date',
138							PDF2 => 1, # not deprecated in PDF 2.0
139							Groups => { 2 => 'Time' },
140							Shift => 'Time',
141							PrintConv => '$self->ConvertDateTime($val)',
142							PrintConvInv => '$self->InverseDateTime($val)',
143							},
144							SourceModified => {
145							Name => 'SourceModified',
146							Writable => 'date',
147							PDF2 => 1,
148							Groups => { 2 => 'Time' },
149							Shift => 'Time',
150							PrintConv => '$self->ConvertDateTime($val)',
151							PrintConvInv => '$self->InverseDateTime($val)',
152							},
153							Trapped => {
154							Protected => 1,
155							# remove leading '/' from '/True' or '/False'
156							ValueConv => '$val=~s{^/}{}; $val',
157							ValueConvInv => '"/$val"',
158							},
159							'AAPL:Keywords' => { #PH
160							Name => 'AppleKeywords',
161							List => 'array', # this is an array of values
162							Notes => q{
163							keywords written by Apple utilities, although they seem to use PDF:Keywords
164							when reading
165							},
166							},
167							);
168
169							# tags in the PDF Root document catalog
170							%Image::ExifTool::PDF::Root = (
171							GROUPS => { 2 => 'Document' },
172							# note: can't capture previous versions of Root since they are not parsed
173							VARS => { CAPTURE => ['Root'] },
174							NOTES => 'This is the PDF document catalog.',
175							MarkInfo => {
176							SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
177							},
178							Metadata => {
179							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
180							},
181							Pages => {
182							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
183							},
184							Perms => {
185							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
186							},
187							AcroForm => {
188							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AcroForm' },
189							},
190							AF => {
191							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AF' },
192							},
193							Lang => 'Language',
194							PageLayout => { },
195							PageMode => { },
196							Version => {
197							Name => 'PDFVersion',
198							RawConv => '$$self{PDFVersion} = $val if $$self{PDFVersion} < $val; $val',
199							},
200							);
201
202							# tags extracted from the PDF Encrypt dictionary
203							%Image::ExifTool::PDF::Encrypt = (
204							GROUPS => { 2 => 'Document' },
205							NOTES => 'Tags extracted from the document Encrypt dictionary.',
206							Filter => {
207							Name => 'Encryption',
208							Notes => q{
209							extracted value is actually a combination of the Filter, SubFilter, V, R and
210							Length information from the Encrypt dictionary
211							},
212							},
213							P => {
214							Name => 'UserAccess',
215							ValueConv => '$val & 0x0f3c', # ignore reserved bits
216							PrintConvColumns => 2,
217							PrintConv => { BITMASK => {
218							2 => 'Print',
219							3 => 'Modify',
220							4 => 'Copy',
221							5 => 'Annotate',
222							8 => 'Fill forms',
223							9 => 'Extract',
224							10 => 'Assemble',
225							11 => 'Print high-res',
226							}},
227							},
228							);
229
230							# tags in PDF Pages dictionary
231							%Image::ExifTool::PDF::Pages = (
232							GROUPS => { 2 => 'Document' },
233							Count => 'PageCount',
234							Kids => {
235							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
236							},
237							MediaBox => { Name => 'MediaBox', List => 1 },
238							);
239
240							# tags in PDF Perms dictionary
241							%Image::ExifTool::PDF::Perms = (
242							NOTES => 'Additional document permissions imposed by digital signatures.',
243							DocMDP => {
244							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
245							},
246							FieldMDP => {
247							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
248							},
249							UR3 => {
250							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
251							},
252							);
253
254							# tags in PDF Perms dictionary
255							%Image::ExifTool::PDF::AcroForm = (
256							PROCESS_PROC => \&ProcessAcroForm,
257							_has_xfa => {
258							Name => 'HasXFA',
259							Notes => q{
260							this tag is defined if a document contains form fields, and is true if it
261							uses XML Forms Architecture; not a real Tag ID
262							},
263							PrintConv => { 'true' => 'Yes', 'false' => 'No' },
264							},
265							);
266
267							# tags extracted from AF dictionary
268							%Image::ExifTool::PDF::AF = (
269							PROCESS_PROC => \&ProcessAF,
270							NOTES => 'Processed only for C2PA information if AFRelationship is "/C2PA_Manifest".',
271							EF => {
272							SubDirectory => { TagTable => 'Image::ExifTool::PDF::EF' },
273							},
274							);
275
276							# tags extracted from EF dictionary
277							%Image::ExifTool::PDF::EF = (
278							F => {
279							Name => 'F_', # (don't want single-letter tag names)
280							SubDirectory => { TagTable => 'Image::ExifTool::PDF::F' },
281							},
282							);
283
284							# tags extracted from F dictionary
285							%Image::ExifTool::PDF::F = (
286							NOTES => 'C2PA JUMBF metadata extracted from "/C2PA_Manifest" file.',
287							_stream => {
288							Name => 'JUMBF',
289							Condition => '$$self{AFRelationship} eq "/C2PA_Manifest"',
290							SubDirectory => {
291							TagTable => 'Image::ExifTool::Jpeg2000::Main',
292							DirName => 'JUMBF',
293							ByteOrder => 'BigEndian',
294							},
295							},
296							);
297
298							# tags in PDF Kids dictionary
299							%Image::ExifTool::PDF::Kids = (
300							Metadata => {
301							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
302							},
303							PieceInfo => {
304							SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
305							},
306							Resources => {
307							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
308							},
309							Kids => {
310							Condition => '$self->Options("ExtractEmbedded")',
311							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
312							},
313							);
314
315							# tags in PDF Resources dictionary
316							%Image::ExifTool::PDF::Resources = (
317							ColorSpace => {
318							SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
319							},
320							XObject => {
321							Condition => '$self->Options("ExtractEmbedded")',
322							SubDirectory => { TagTable => 'Image::ExifTool::PDF::XObject' },
323							},
324							Properties => {
325							Condition => '$self->Options("ExtractEmbedded")',
326							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Properties' },
327							},
328							);
329
330							# tags in PDF ColorSpace dictionary
331							%Image::ExifTool::PDF::ColorSpace = (
332							DefaultRGB => {
333							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
334							ConvertToDict => 1, # (not seen yet, but just in case)
335							},
336							DefaultCMYK => {
337							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
338							# hack: this is stored as an array instead of a dictionary in my
339							# sample, so convert to a dictionary to extract the ICCBased element
340							ConvertToDict => 1,
341							},
342							Cs1 => {
343							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
344							ConvertToDict => 1, # (just in case)
345							},
346							CS0 => {
347							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
348							ConvertToDict => 1, # (just in case)
349							},
350							);
351
352							# tags in PDF DefaultRGB dictionary
353							%Image::ExifTool::PDF::DefaultRGB = (
354							ICCBased => {
355							SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
356							},
357							);
358
359							# tags in PDF ICCBased, Cs1 and CS0 dictionaries
360							%Image::ExifTool::PDF::ICCBased = (
361							_stream => {
362							Name => 'ICC_Profile',
363							SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
364							},
365							);
366
367							# tags in PDF XObject dictionary (parsed only if ExtractEmbedded is enabled)
368							%Image::ExifTool::PDF::XObject = (
369							EXTRACT_UNKNOWN => 0, # extract known but numbered tags (Im1, Im2, etc)
370							Im => {
371							Notes => q{
372							the L option enables information to be extracted from these
373							embedded images
374							},
375							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Im' },
376							},
377							);
378
379							# tags in PDF Im# dictionary
380							%Image::ExifTool::PDF::Im = (
381							NOTES => q{
382							Information extracted from embedded images with the L option.
383							The EmbeddedImage and its metadata are extracted only for JPEG and Jpeg2000
384							image formats.
385							},
386							Width => 'EmbeddedImageWidth',
387							Height => 'EmbeddedImageHeight',
388							Filter => { Name => 'EmbeddedImageFilter', List => 1 },
389							ColorSpace => {
390							Name => 'EmbeddedImageColorSpace',
391							List => 1,
392							RawConv => 'ref $val ? undef : $val', # (ignore color space data)
393							},
394							Image_stream => {
395							Name => 'EmbeddedImage',
396							Groups => { 2 => 'Preview' },
397							Binary => 1,
398							},
399							);
400
401							# tags in PDF Properties dictionary
402							%Image::ExifTool::PDF::Properties = (
403							EXTRACT_UNKNOWN => 0, # extract known but numbered tags (MC0, MC1, etc)
404							MC => {
405							Notes => q{
406							the L option enables information to be extracted from these
407							embedded metadata dictionaries
408							},
409							SubDirectory => { TagTable => 'Image::ExifTool::PDF::MC' },
410							}
411							);
412
413							# tags in PDF MC# dictionary
414							%Image::ExifTool::PDF::MC = (
415							Metadata => {
416							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
417							}
418							);
419
420							# tags in PDF PieceInfo dictionary
421							%Image::ExifTool::PDF::PieceInfo = (
422							AdobePhotoshop => {
423							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
424							},
425							Illustrator => {
426							# assume this is an illustrator file if it contains this directory
427							# and doesn't have a ".PDF" extension
428							Condition => q{
429							$self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
430							return 1;
431							},
432							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
433							},
434							);
435
436							# tags in PDF AdobePhotoshop dictionary
437							%Image::ExifTool::PDF::AdobePhotoshop = (
438							Private => {
439							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
440							},
441							);
442
443							# tags in PDF Illustrator dictionary
444							%Image::ExifTool::PDF::Illustrator = (
445							Private => {
446							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
447							},
448							);
449
450							# tags in PDF Private dictionary
451							%Image::ExifTool::PDF::Private = (
452							ImageResources => {
453							SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
454							},
455							);
456
457							# tags in PDF AI Private dictionary
458							%Image::ExifTool::PDF::AIPrivate = (
459							GROUPS => { 2 => 'Document' },
460							EXTRACT_UNKNOWN => 0, # extract known but numbered tags
461							AIMetaData => {
462							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
463							},
464							AIPrivateData => {
465							Notes => q{
466							the L option enables information to be extracted from embedded
467							PostScript documents in the AIPrivateData# and AIPDFPrivateData# streams
468							},
469							JoinStreams => 1, # join streams from numbered tags and process as one
470							SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
471							},
472							AIPDFPrivateData => {
473							JoinStreams => 1, # join streams from numbered tags and process as one
474							SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
475							},
476							RoundTripVersion => { },
477							ContainerVersion => { },
478							CreatorVersion => { },
479							);
480
481							# tags in PDF AIMetaData dictionary
482							%Image::ExifTool::PDF::AIMetaData = (
483							_stream => {
484							Name => 'AIStream',
485							SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
486							},
487							);
488
489							# tags in PDF ImageResources dictionary
490							%Image::ExifTool::PDF::ImageResources = (
491							_stream => {
492							Name => 'PhotoshopStream',
493							SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
494							},
495							);
496
497							# tags in PDF MarkInfo dictionary
498							%Image::ExifTool::PDF::MarkInfo = (
499							GROUPS => { 2 => 'Document' },
500							Marked => {
501							Name => 'TaggedPDF',
502							Notes => "not a Tagged PDF if this tag is missing",
503							PrintConv => { 'true' => 'Yes', 'false' => 'No' },
504							},
505							);
506
507							# tags in PDF Metadata dictionary
508							%Image::ExifTool::PDF::Metadata = (
509							GROUPS => { 2 => 'Document' },
510							XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
511							Name => 'XMP',
512							SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
513							},
514							);
515
516							# tags in PDF signature directories (DocMDP, FieldMDP or UR3)
517							%Image::ExifTool::PDF::Signature = (
518							GROUPS => { 2 => 'Document' },
519							ContactInfo => 'SignerContactInfo',
520							Location => 'SigningLocation',
521							M => {
522							Name => 'SigningDate',
523							Format => 'date',
524							Groups => { 2 => 'Time' },
525							PrintConv => '$self->ConvertDateTime($val)',
526							},
527							Name => 'SigningAuthority',
528							Reason => 'SigningReason',
529							Reference => {
530							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
531							},
532							Prop_AuthTime => {
533							Name => 'AuthenticationTime',
534							PrintConv => 'ConvertTimeSpan($val) . " ago"',
535							},
536							Prop_AuthType => 'AuthenticationType',
537							);
538
539							# tags in PDF Reference dictionary
540							%Image::ExifTool::PDF::Reference = (
541							TransformParams => {
542							SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
543							},
544							);
545
546							# tags in PDF TransformParams dictionary
547							%Image::ExifTool::PDF::TransformParams = (
548							GROUPS => { 2 => 'Document' },
549							Annots => {
550							Name => 'AnnotationUsageRights',
551							Notes => q{
552							possible values are Create, Delete, Modify, Copy, Import and Export;
553							additional values for UR3 signatures are Online and SummaryView
554							},
555							List => 1,
556							},
557							Document => {
558							Name => 'DocumentUsageRights',
559							Notes => 'only possible value is FullSave',
560							List => 1,
561							},
562							Form => {
563							Name => 'FormUsageRights',
564							Notes => q{
565							possible values are FillIn, Import, Export, SubmitStandalone and
566							SpawnTemplate; additional values for UR3 signatures are BarcodePlaintext and
567							Online
568							},
569							List => 1,
570							},
571							FormEX => {
572							Name => 'FormExtraUsageRights',
573							Notes => 'UR signatures only; only possible value is BarcodePlaintext',
574							List => 1,
575							},
576							Signature => {
577							Name => 'SignatureUsageRights',
578							Notes => 'only possible value is Modify',
579							List => 1,
580							},
581							EF => {
582							Name => 'EmbeddedFileUsageRights',
583							Notes => 'possible values are Create, Delete, Modify and Import',
584							List => 1,
585							},
586							Msg => 'UsageRightsMessage',
587							P => {
588							Name => 'ModificationPermissions',
589							Notes => q{
590							1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
591							false
592							},
593							PrintConv => {
594							1 => 'No changes permitted',
595							2 => 'Fill forms, Create page templates, Sign',
596							3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
597							'true' => 'Restrict all applications to reader permissions',
598							'false' => 'Do not restrict applications to reader permissions',
599							},
600							},
601							Action => {
602							Name => 'FieldPermissions',
603							Notes => 'FieldMDP signatures only',
604							PrintConv => {
605							'All' => 'Disallow changes to all form fields',
606							'Include' => 'Disallow changes to specified form fields',
607							'Exclude' => 'Allow changes to specified form fields',
608							},
609							},
610							Fields => {
611							Notes => 'FieldMDP signatures only',
612							Name => 'FormFields',
613							List => 1,
614							},
615							);
616
617							# unknown tags for use in verbose option
618							%Image::ExifTool::PDF::Unknown = (
619							GROUPS => { 2 => 'Unknown' },
620							);
621
622							#------------------------------------------------------------------------------
623							# AutoLoad our writer routines when necessary
624							#
625							sub AUTOLOAD
626							{
627	19			19		183	return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
628							}
629
630							#------------------------------------------------------------------------------
631							# Convert from PDF to EXIF-style date/time
632							# Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
633							# Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
634							sub ConvertPDFDate($)
635							{
636	10			10	0	17	my $date = shift;
637							# remove optional 'D:' prefix
638	10					53	$date =~ s/^D://;
639							# fill in default values if necessary
640							# YYYYmmddHHMMSS
641	10					24	my $default = '00000101000000';
642	10	50				31	if (length $date < length $default) {
643	0					0	$date .= substr($default, length $date);
644							}
645	10	50				58	$date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
646	10					73	$date = "$1:$2:$3 $4:$5:$6";
647	10	50				36	if ($7) {
648	10					26	my $tz = $7;
649	10	50				218	if ($tz =~ /^\s*Z/i) {
		50
650							# ignore any "HH'mm'" after the Z (OS X 10.6 does this)
651	0					0	$date .= 'Z';
652							# tolerate some improper formatting in timezone specification
653							} elsif ($tz =~ /^\s([-+])\s(\d+)[': ]+(\d*)/) {
654	10		50			68	$date .= $1 . $2 . ':' . ($3 \|\| '00');
655							}
656							}
657	10					25	return $date;
658							}
659
660							#------------------------------------------------------------------------------
661							# Locate any object in the XRef tables (including compressed objects)
662							# Inputs: 0) XRef reference, 1) object reference string (or free object number)
663							# Returns: offset to object in file or compressed object reference string,
664							# 0 if object is free, or undefined on error
665							sub LocateAnyObject($$)
666							{
667	238			238	0	532	my ($xref, $ref) = @_;
668	238	50				555	return undef unless $xref;
669	238	100				1008	return $$xref{$ref} if exists $$xref{$ref};
670							# get the object number
671	7	50				49	return undef unless $ref =~ /^(\d+)/;
672	7					23	my $objNum = $1;
673							# return 0 if the object number has been reused (old object is free)
674	7	100				38	return 0 if defined $$xref{$objNum};
675							#
676							# scan our XRef stream dictionaries for this object
677							#
678	1	50				19	return undef unless $$xref{dicts};
679	0					0	my $dict;
680	0					0	foreach $dict (@{$$xref{dicts}}) {
	0					0
681							# quick check to see if the object is in the range for this xref stream
682	0	0				0	next if $objNum >= $$dict{Size};
683	0					0	my $index = $$dict{Index};
684	0	0				0	next if $objNum < $$index[0];
685							# scan the tables for the specified object
686	0					0	my $size = $$dict{_entry_size};
687	0					0	my $num = scalar(@$index) / 2;
688	0					0	my $tot = 0;
689	0					0	my $i;
690	0					0	for ($i=0; $i<$num; ++$i) {
691	0					0	my $start = $$index[$i*2];
692	0					0	my $count = $$index[$i*2+1];
693							# table is in ascending order, so quit if we have passed the object
694	0	0				0	last if $objNum < $start;
695	0	0				0	if ($objNum < $start + $count) {
696	0					0	my $offset = $size * ($objNum - $start + $tot);
697	0	0				0	last if $offset + $size > length $$dict{_stream};
698	0					0	my @c = unpack("x$offset C$size", $$dict{_stream});
699							# extract values from this table entry
700							# (can be 1, 2, 3, 4, etc.. bytes per value)
701	0					0	my (@t, $j, $k);
702	0					0	my $w = $$dict{W};
703	0					0	for ($j=0; $j<3; ++$j) {
704							# use default value if W entry is 0 (as per spec)
705							# - 0th element defaults to 1, others default to 0
706	0	0				0	$$w[$j] or $t[$j] = ($j ? 0 : 1), next;
		0
707	0					0	$t[$j] = shift(@c);
708	0					0	for ($k=1; $k < $$w[$j]; ++$k) {
709	0					0	$t[$j] = 256 * $t[$j] + shift(@c);
710							}
711							}
712							# by default, use "o g R" as the xref key
713							# (o = object number, g = generation number)
714	0					0	my $ref2 = "$objNum $t[2] R";
715	0	0				0	if ($t[0] == 1) {
		0
		0
716							# normal object reference:
717							# $t[1]=offset of object from start, $t[2]=generation number
718	0					0	$$xref{$ref2} = $t[1];
719							} elsif ($t[0] == 2) {
720							# compressed object reference:
721							# $t[1]=stream object number, $t[2]=index of object in stream
722	0					0	$ref2 = "$objNum 0 R";
723	0					0	$$xref{$ref2} = "I$t[2] $t[1] 0 R";
724							} elsif ($t[0] == 0) {
725							# free object:
726							# $t[1]=next free object in linked list, $t[2]=generation number
727	0					0	$$xref{$ref2} = 0;
728							} else {
729							# treat as a null object
730	0					0	$$xref{$ref2} = undef;
731							}
732	0					0	$$xref{$objNum} = $t[1]; # remember offsets by object number too
733	0	0				0	return $$xref{$ref} if $ref eq $ref2;
734	0					0	return 0; # object is free or was reused
735							}
736	0					0	$tot += $count;
737							}
738							}
739	0					0	return undef;
740							}
741
742							#------------------------------------------------------------------------------
743							# Locate a regular object in the XRef tables (does not include compressed objects)
744							# Inputs: 0) XRef reference, 1) object reference string (or free object number)
745							# Returns: offset to object in file, 0 if object is free,
746							# or undef on error or if object was compressed
747							sub LocateObject($$)
748							{
749	41			41	0	147	my ($xref, $ref) = @_;
750	41					139	my $offset = LocateAnyObject($xref, $ref);
751	41	50	66			408	return undef if $offset and $offset =~ /^I/;
752	41					207	return $offset;
753							}
754
755							#------------------------------------------------------------------------------
756							# Check that the correct object is located at the specified file offset
757							# Inputs: 0) ExifTool ref, 1) object name, 2) object reference string, 3) file offset
758							# Returns: first non-blank line at start of object, or undef on error
759							sub CheckObject($$$$)
760							{
761	217			217	0	562	my ($et, $tag, $ref, $offset) = @_;
762	217					350	my ($data, $obj, $dat, $pat);
763
764	217					467	my $raf = $$et{RAF};
765	217	50				1098	$raf->Seek($offset+$$et{PDFBase}, 0) or $et->Warn("Bad $tag offset"), return undef;
766							# verify that we are reading the expected object
767	217					1235	($obj = $ref) =~ s/R/obj/;
768	217					398	for (;;) {
769	217	50				637	$raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
770	217	50				5450	last if $data =~ s/^$obj//;
771	0	0				0	next if $data =~ /^\s+$/; # keep reading if this was a blank line
772							# handle cases where other whitespace characters are used in the object ID string
773	0					0	while ($data =~ /^\d+(\s+\d+)?\s*$/) {
774	0					0	$raf->ReadLine($dat);
775	0					0	$data .= $dat;
776							}
777	0					0	($pat = $obj) =~ s/ /\\s+/g;
778	0	0				0	unless ($data =~ s/$pat//) {
779	0					0	$tag = ucfirst $tag;
780	0					0	$et->Warn("$tag object ($obj) not found at offset $offset");
781	0					0	return undef;
782							}
783	0					0	last;
784							}
785							# read the first line of data from the object (ignoring blank lines and comments)
786	217					481	for (;;) {
787	434	100	66			2789	last if $data =~ /\S/ and $data !~ /^\s*%/;
788	217	50				665	$raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
789							}
790	217					642	return $data;
791							}
792
793							#------------------------------------------------------------------------------
794							# Fetch indirect object from file (from inside a stream if required)
795							# Inputs: 0) ExifTool object reference, 1) object reference string,
796							# 2) xref lookup, 3) object name (for warning messages)
797							# Returns: object data or undefined on error
798							# Notes: sets $lastFetched to the object reference, or undef if the object
799							# was extracted from an encrypted stream
800							sub FetchObject($$$$)
801							{
802	197			197	0	541	my ($et, $ref, $xref, $tag) = @_;
803	197					376	$lastFetched = $ref; # save this for decoding if necessary
804	197					1427	my $offset = LocateAnyObject($xref, $ref);
805	197					357	$lastOffset = $offset;
806	197	100				491	unless ($offset) {
807	5	50				20	$et->Warn("Bad $tag reference") unless defined $offset;
808	5					18	return undef;
809							}
810	192					349	my ($data, $obj);
811	192	50				688	if ($offset =~ s/^I(\d+) //) {
812	0					0	my $index = $1; # object index in stream
813	0					0	my ($objNum) = split ' ', $ref; # save original object number
814	0					0	$ref = $offset; # now a reference to the containing stream object
815	0					0	$obj = $streamObjs{$ref};
816	0	0				0	unless ($obj) {
817							# don't try to load the same object stream twice
818	0	0				0	return undef if defined $obj;
819	0					0	$streamObjs{$ref} = '';
820							# load the parent object stream
821	0					0	$obj = FetchObject($et, $ref, $xref, $tag);
822							# make sure it contains everything we need
823	0	0	0			0	return undef unless defined $obj and ref($obj) eq 'HASH';
824	0	0	0			0	return undef unless $$obj{First} and $$obj{N};
825	0	0				0	return undef unless DecodeStream($et, $obj);
826							# add a special '_table' entry to this dictionary which contains
827							# the list of object number/offset pairs from the stream header
828	0					0	my $num = $$obj{N} * 2;
829	0					0	my @table = split ' ', $$obj{_stream}, $num;
830	0	0				0	return undef unless @table == $num;
831							# remove everything before first object in stream
832	0					0	$$obj{_stream} = substr($$obj{_stream}, $$obj{First});
833	0					0	$table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
834	0					0	$$obj{_table} = \@table;
835							# save the object stream so we don't have to re-load it later
836	0					0	$streamObjs{$ref} = $obj;
837							}
838							# verify that we have the specified object
839	0					0	my $i = 2 * $index;
840	0					0	my $table = $$obj{_table};
841	0	0	0			0	unless ($index < $$obj{N} and $$table[$i] == $objNum) {
842	0					0	$et->Warn("Bad index for stream object $tag");
843	0					0	return undef;
844							}
845							# extract the object at the specified index in the stream
846							# (offsets in table are in sequential order, so we can subtract from
847							# the next offset to get the object length)
848	0					0	$offset = $$table[$i + 1];
849	0		0			0	my $len = ($$table[$i + 3] \|\| length($$obj{_stream})) - $offset;
850	0					0	$data = substr($$obj{_stream}, $offset, $len);
851							# avoid re-decrypting data in already decrypted streams
852	0	0				0	undef $lastFetched if $cryptStream;
853	0					0	return ExtractObject($et, \$data);
854							}
855							# load the start of the object
856	192					507	$data = CheckObject($et, $tag, $ref, $offset);
857	192	50				516	return undef unless defined $data;
858
859	192					792	return ExtractObject($et, \$data, $$et{RAF}, $xref);
860							}
861
862							#------------------------------------------------------------------------------
863							# Convert PDF value to something readable
864							# Inputs: 0) PDF object data
865							# Returns: converted object
866							sub ReadPDFValue($)
867							{
868	223			223	0	466	my $str = shift;
869							# decode all strings in an array
870	223	100				533	if (ref $str eq 'ARRAY') {
871							# create new list to not alter the original data when rewriting
872	27					66	my ($val, @vals);
873	27					100	foreach $val (@$str) {
874	80					195	push @vals, ReadPDFValue($val);
875							}
876	27					89	return \@vals;
877							}
878	196	50				522	length $str or return $str;
879	196					474	my $delim = substr($str, 0, 1);
880	196	100				675	if ($delim eq '(') { # literal string
		100
		50
881	58	50				498	$str = $1 if $str =~ /^.?$(.)$/s; # remove brackets
882							# decode escape sequences in literal strings
883	58					214	while ($str =~ /\\(.)/sg) {
884	0					0	my $n = pos($str) - 2;
885	0					0	my $c = $1;
886	0					0	my $r;
887	0	0				0	if ($c =~ /[0-7]/) {
		0
		0
888							# get up to 2 more octal digits
889	0	0				0	$c .= $1 if $str =~ /\G([0-7]{1,2})/g;
890							# convert octal escape code
891	0					0	$r = chr(oct($c) & 0xff);
892							} elsif ($c eq "\x0d") {
893							# the string is continued if the line ends with '\'
894							# (also remove "\x0d\x0a")
895	0	0				0	$c .= $1 if $str =~ /\G(\x0a)/g;
896	0					0	$r = '';
897							} elsif ($c eq "\x0a") {
898	0					0	$r = '';
899							} else {
900							# convert escaped characters
901	0					0	($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
902							}
903	0					0	substr($str, $n, length($c)+1) = $r;
904							# continue search after this character
905	0					0	pos($str) = $n + length($r);
906							}
907	58	50				137	Crypt(\$str, $lastFetched) if $cryptString;
908							} elsif ($delim eq '<') { # hex string
909							# decode hex data
910	41					127	$str =~ tr/0-9A-Fa-f//dc;
911	41	50				134	$str .= '0' if length($str) & 0x01; # (by the spec)
912	41					181	$str = pack('H*', $str);
913	41	100				151	Crypt(\$str, $lastFetched) if $cryptString;
914							} elsif ($delim eq '/') { # name
915	0					0	$str = substr($str, 1);
916							# convert escape codes (PDF 1.2 or later)
917	0	0				0	$str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
	0					0
918							}
919	196					563	return $str;
920							}
921
922							#------------------------------------------------------------------------------
923							# Extract PDF object from combination of buffered data and file
924							# Inputs: 0) ExifTool object reference, 1) data reference,
925							# 2) optional raf reference, 3) optional xref table
926							# Returns: converted PDF object or undef on error
927							# a) dictionary object --> hash reference
928							# b) array object --> array reference
929							# c) indirect reference --> scalar reference
930							# d) string, name, integer, boolean, null --> scalar value
931							# - updates $$dataPt on return to contain unused data
932							# - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
933							# the stream data and a list of the tags (not including '_stream' and '_tags')
934							# in their original order
935							sub ExtractObject($$;$$)
936							{
937	754			754	0	2016	my ($et, $dataPt, $raf, $xref) = @_;
938	754					1214	my (@tags, $data, $objData);
939	754					1240	my $dict = { };
940	754					1442	my $delim;
941
942	754					1540	for (;;) {
943	824	100				4122	if ($$dataPt =~ /^\s*(<{1,2}\|\[\|\()/s) {
		50
944	754					1851	$delim = $1;
945	754					1948	$$dataPt =~ s/^\s+//; # remove leading white space
946	754					1809	$objData = ReadToNested($dataPt, $raf);
947	754	50				1663	return undef unless defined $objData;
948	754					1356	last;
949							} elsif ($$dataPt =~ s{^\s(\S[^[(/<>\s])\s*}{}s) {
950							#
951							# extract boolean, numerical, string, name, null object or indirect reference
952							#
953	0					0	$objData = $1;
954							# look for an indirect reference
955	0	0	0			0	if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
956	0					0	$objData .= "$1 R";
957	0					0	$objData = \$objData; # return scalar reference
958							}
959	0					0	return $objData; # return simple scalar or scalar reference
960							}
961	70	50	33			364	$raf and $raf->ReadLine($data) or return undef;
962	70					178	$$dataPt .= $data;
963							}
964							#
965							# return literal string or hex string without parsing
966							#
967	754	100	100			3678	if ($delim eq '(' or $delim eq '<') {
		100
968	160					633	return $objData;
969							#
970							# extract array
971							#
972							} elsif ($delim eq '[') {
973	167	50				993	$objData =~ /^.?\[(.)\]/s or return undef;
974	167					433	my $data = $1; # brackets removed
975	167					307	my @list;
976	167					253	for (;;) {
977	594	100				1916	last unless $data =~ m{\s(\S[^[(/<>\s])}sg;
978	427					819	my $val = $1;
979	427	100				1658	if ($val =~ /^(<{1,2}\|\[\|\()/) {
		100
980	78					176	my $pos = pos($data) - length($val);
981							# nested dict, array, literal string or hex string
982	78					186	my $buff = substr($data, $pos);
983	78					227	$val = ReadToNested(\$buff);
984	78	50				265	last unless defined $val;
985	78					216	pos($data) = $pos + length($val);
986	78					277	$val = ExtractObject($et, \$val);
987							} elsif ($val =~ /^\d/) {
988	245					397	my $pos = pos($data);
989	245	100				731	if ($data =~ /\G\s+(\d+)\s+R/g) {
990	37					165	$val = \ "$val $1 R"; # make a reference
991							} else {
992	208					464	pos($data) = $pos;
993							}
994							}
995	427					1034	push @list, $val;
996							}
997	167					730	return \@list;
998							}
999							#
1000							# extract dictionary
1001							#
1002							# Note: entries are not necessarily separated by whitespace (doh!)
1003							# eg) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
1004							# Also, they may be separated by a comment (eg. "/Tag%comment\nValue"),
1005							# but comments have already been removed
1006	427					3072	while ($objData =~ m{(\s)/([^/[\]()<>{}\s]+)\s(\S[^[(/<>\s]*)}sg) {
1007	1229					2611	my $tag = $2;
1008	1229					2562	my $val = $3;
1009	1229	100				4603	if ($val =~ /^(<{1,2}\|\[\|\()/) {
		100
1010							# nested dict, array, literal string or hex string
1011	396					1200	$objData = substr($objData, pos($objData)-length($val));
1012	396					1089	$val = ReadToNested(\$objData, $raf);
1013	396	50				930	last unless defined $val;
1014	396					1184	$val = ExtractObject($et, \$val);
1015	396					1117	pos($objData) = 0;
1016							} elsif ($val =~ /^\d/) {
1017	618					1039	my $pos = pos($objData);
1018	618	100				2179	if ($objData =~ /\G\s+(\d+)\s+R/sg) {
1019	416					1641	$val = \ "$val $1 R"; # make a reference
1020							} else {
1021	202					531	pos($objData) = $pos;
1022							}
1023							}
1024	1229	50				2850	if ($$dict{$tag}) {
1025							# duplicate dictionary entries are not allowed
1026	0					0	$et->Warn("Duplicate '${tag}' entry in dictionary (ignored)");
1027							} else {
1028							# save the entry
1029	1229					2920	push @tags, $tag;
1030	1229					6817	$$dict{$tag} = $val;
1031							}
1032							}
1033	427	50				1220	return undef unless @tags;
1034	427					1342	$$dict{_tags} = \@tags;
1035	427	100				1217	return $dict unless $raf; # direct objects can not have streams
1036							#
1037							# extract the stream object
1038							#
1039							# dictionary must specify stream Length
1040	262	100				12636	my $length = $$dict{Length} or return $dict;
1041	43	100				136	if (ref $length) {
1042	25					61	$length = $$length;
1043	25					111	my $oldpos = $raf->Tell();
1044							# get the location of the object specifying the length
1045							# (compressed objects are not allowed)
1046	25	50				103	my $offset = LocateObject($xref, $length) or return $dict;
1047	25	50				76	$offset or $et->Warn('Bad stream Length object'), return $dict;
1048	25					136	$data = CheckObject($et, 'stream Length', $length, $offset);
1049	25	50				99	defined $data or return $dict;
1050	25	50				163	$data =~ /^\s*(\d+)/ or $et->Warn('Stream Length not found'), return $dict;
1051	25					75	$length = $1;
1052	25					78	$raf->Seek($oldpos, 0); # restore position to start of stream
1053							}
1054							# extract the trailing stream data
1055	43					88	for (;;) {
1056							# find the stream token
1057	86	100				372	if ($$dataPt =~ /(\S+)/) {
1058	43	50				175	last unless $1 eq 'stream';
1059							# read an extra line because it may contain our \x0a
1060	43	50				145	$$dataPt .= $data if $raf->ReadLine($data);
1061							# remove our stream header
1062	43					433	$$dataPt =~ s/^\s*stream(\x0a\|\x0d\x0a)//s;
1063	43					155	my $more = $length - length($$dataPt);
1064	43	100				175	if ($more > 0) {
		50
1065	28	50				93	unless ($raf->Read($data, $more) == $more) {
1066	0					0	$et->Warn('Error reading stream data');
1067	0					0	$$dataPt = '';
1068	0					0	return $dict;
1069							}
1070	28					182	$$dict{_stream} = $$dataPt . $data;
1071	28					74	$$dataPt = '';
1072							} elsif ($more < 0) {
1073	15					141	$$dict{_stream} = substr($$dataPt, 0, $length);
1074	15					47	$$dataPt = substr($$dataPt, $length);
1075							} else {
1076	0					0	$$dict{_stream} = $$dataPt;
1077	0					0	$$dataPt = '';
1078							}
1079	43					117	last;
1080							}
1081	43	50				183	$raf->ReadLine($data) or last;
1082	43					199	$$dataPt .= $data;
1083							}
1084	43					190	return $dict;
1085							}
1086
1087							#------------------------------------------------------------------------------
1088							# Read to nested delimiter
1089							# Inputs: 0) data reference, 1) optional raf reference
1090							# Returns: data up to and including matching delimiter (or undef on error)
1091							# - updates data reference with trailing data
1092							# - unescapes characters in literal strings
1093							my %closingDelim = ( # lookup for matching delimiter
1094							'(' => ')',
1095							'[' => ']',
1096							'<' => '>',
1097							'<<' => '>>',
1098							);
1099							sub ReadToNested($;$)
1100							{
1101	1228			1228	0	2622	my ($dataPt, $raf) = @_;
1102	1228					2613	my @delim = (''); # closing delimiter list, most deeply nested first
1103	1228					3701	pos($$dataPt) = 0; # begin at start of data
1104	1228					2352	for (;;) {
1105	5744	100				61695	unless ($$dataPt =~ /(\\*)($\|$\|<{1,2}\|>{1,2}\|\[\|\]\|%)/g) {
1106							# must read some more data
1107	1148					1657	my $buff;
1108	1148	50	33			3953	last unless $raf and $raf->ReadLine($buff);
1109	1148					3056	$$dataPt .= $buff;
1110	1148					2837	pos($$dataPt) = length($$dataPt) - length($buff);
1111	1148					2207	next;
1112							}
1113							# are we in a literal string?
1114	4596	100				12858	if ($delim[0] eq ')') {
		50
1115							# ignore escaped delimiters (preceded by odd number of \'s)
1116	434	50				1207	next if length($1) & 0x01;
1117							# ignore all delimiters but unescaped braces
1118	434	50	33			1875	next unless $2 eq '(' or $2 eq ')';
1119							} elsif ($2 eq '%') {
1120							# ignore the comment
1121	0					0	my $pos = pos($$dataPt) - 1;
1122							# remove everything from '%' up to but not including newline
1123	0					0	$$dataPt =~ /.*/g;
1124	0					0	my $end = pos($$dataPt);
1125	0					0	$$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
1126	0					0	pos($$dataPt) = $pos;
1127	0					0	next;
1128							}
1129	4596	100				11880	if ($closingDelim{$2}) {
1130							# push the corresponding closing delimiter
1131	2298					5713	unshift @delim, $closingDelim{$2};
1132	2298					3659	next;
1133							}
1134	2298	50				5671	unless ($2 eq $delim[0]) {
1135							# handle the case where we find a ">>>" and interpret it
1136							# as ">> >" instead of "> >>"
1137	0	0	0			0	next unless $2 eq '>>' and $delim[0] eq '>';
1138	0					0	pos($$dataPt) = pos($$dataPt) - 1;
1139							}
1140	2298					3613	shift @delim; # remove from nesting list
1141	2298	100				5211	next if $delim[0]; # keep going if we have more nested delimiters
1142	1228					1940	my $pos = pos($$dataPt);
1143	1228					2781	my $buff = substr($$dataPt, 0, $pos);
1144	1228					2715	$$dataPt = substr($$dataPt, $pos);
1145	1228					3687	return $buff; # success!
1146							}
1147	0					0	return undef; # didn't find matching delimiter
1148							}
1149
1150							#------------------------------------------------------------------------------
1151							# Decode LZW-encoded data (ref 1)
1152							# Inputs: 0) data reference
1153							# Returns: true on success and data is decoded, or false and data is untouched
1154							sub DecodeLZW($)
1155							{
1156	0			0	0	0	my $dataPt = shift;
1157	0	0				0	return 0 if length $$dataPt < 4;
1158	0					0	my @lzw = (map(chr, 0..255), undef, undef); # LZW code table
1159	0					0	my $mask = 0x01ff; # mask for least-significant 9 bits
1160	0					0	my @dat = unpack 'n*', $$dataPt . "\0";
1161	0					0	my $word = ($dat[0] << 16) \| $dat[1];
1162	0					0	my ($bit, $pos, $bits, $out) = (0, 2, 9, '');
1163	0					0	my $lastVal;
1164	0					0	for (;;) {
1165							# bits are packed MSB first in PDF LZW (the PDF spec doesn't mention this)
1166	0					0	my $shift = 32 - ($bit + $bits);
1167	0	0				0	if ($shift < 0) {
1168	0	0				0	return 0 if $pos >= @dat; # missing EOD marker
1169	0					0	$word = (($word & 0xffff) << 16) \| $dat[$pos++]; # read next word
1170	0					0	$bit -= 16;
1171	0					0	$shift += 16;
1172							};
1173	0					0	my $code = ($word >> $shift) & $mask;
1174	0					0	$bit += $bits;
1175	0					0	my $val = $lzw[$code];
1176	0	0				0	if (defined $val) {
		0
		0
		0
1177							# store new code as previous sequence plus 1st char of new sequence
1178	0	0				0	push @lzw, $lastVal . substr($val, 0, 1) if defined $lastVal;
1179							} elsif ($code == @lzw) { # new code
1180	0	0				0	return 0 unless defined $lastVal;
1181							# we are using the code that we are about to generate, so the last
1182							# character in the new sequence must be the same as the first
1183							# character in the previous sequence (makes sense if you think about it)
1184	0					0	$val = $lastVal . substr($lastVal, 0, 1);
1185	0					0	push @lzw, $val;
1186							} elsif ($code == 256) { # clear table
1187	0					0	splice @lzw, 258;
1188	0					0	$bits = 9;
1189	0					0	$mask = 0x1ff;
1190	0					0	undef $lastVal;
1191	0					0	next;
1192							} elsif ($code == 257) { # EOD marker
1193	0					0	last; # all done!
1194							} else {
1195	0					0	return 0;
1196							}
1197	0					0	$out .= $val; # add this byte sequence to the output
1198							# we added a new entry to the LZW table, so we must increase
1199							# the bit width if necessary, up to a maximum of 12
1200	0	0	0			0	@lzw >= $mask and $bits < 12 and ++$bits, $mask \|= $mask << 1;
1201	0					0	$lastVal = $val;
1202							}
1203	0					0	$$dataPt = $out; # return decompressed data
1204	0					0	return 1;
1205							}
1206
1207							#------------------------------------------------------------------------------
1208							# Decode filtered stream
1209							# Inputs: 0) ExifTool object reference, 1) dictionary reference
1210							# Returns: true if stream has been decoded OK
1211							sub DecodeStream($$)
1212							{
1213	43			43	0	73	local $_;
1214	43					121	my ($et, $dict) = @_;
1215
1216	43	50				165	return 0 unless $$dict{_stream}; # no stream to decode
1217
1218							# get list of filters
1219	43					124	my (@filters, @decodeParms, $filter);
1220	43	50				303	if (ref $$dict{Filter} eq 'ARRAY') {
		50
1221	0					0	@filters = @{$$dict{Filter}};
	0					0
1222							} elsif (defined $$dict{Filter}) {
1223	0					0	@filters = ($$dict{Filter});
1224							}
1225							# be sure we can process all the filters before we take the time to do the decryption
1226	43					154	foreach $filter (@filters) {
1227	0	0				0	next if $supportedFilter{$filter};
1228	0					0	$et->Warn("Unsupported Filter $filter");
1229	0					0	return 0;
1230							}
1231							# apply decryption first if required (and if the default encryption
1232							# has not been overridden by a Crypt filter. Note: the Crypt filter
1233							# must be first in the Filter array: ref 3, page 38)
1234	43	50	33			252	unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
			33
1235	43					242	CryptStream($dict, $lastFetched);
1236							}
1237	43	50				267	return 1 unless $$dict{Filter}; # Filter entry is mandatory
1238	0	0				0	return 0 if defined $$dict{_filtered}; # avoid double-filtering
1239	0					0	$$dict{_filtered} = 1; # set flag to prevent double-filtering
1240
1241							# get array of DecodeParms dictionaries
1242	0	0				0	if (ref $$dict{DecodeParms} eq 'ARRAY') {
1243	0					0	@decodeParms = @{$$dict{DecodeParms}};
	0					0
1244							} else {
1245	0					0	@decodeParms = ($$dict{DecodeParms});
1246							}
1247
1248	0					0	foreach $filter (@filters) {
1249	0					0	my $decodeParms = shift @decodeParms;
1250
1251	0	0				0	if ($filter eq '/FlateDecode') {
		0
		0
		0
		0
1252							# make sure we support the predictor (if used) before decoding
1253	0					0	my $pre;
1254	0	0				0	if (ref $decodeParms eq 'HASH') {
1255	0					0	$pre = $$decodeParms{Predictor};
1256	0	0	0			0	if ($pre and $pre ne '1' and $pre ne '12') {
			0
1257	0					0	$et->Warn("FlateDecode Predictor $pre currently not supported");
1258	0					0	return 0;
1259							}
1260							}
1261	0	0				0	if (eval { require Compress::Zlib }) {
	0					0
1262	0					0	my $inflate = Compress::Zlib::inflateInit();
1263	0					0	my ($buff, $stat);
1264	0	0				0	$inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
1265	0	0	0			0	if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
1266	0					0	$$dict{_stream} = $buff;
1267							} else {
1268	0					0	$et->Warn('Error inflating stream');
1269	0					0	return 0;
1270							}
1271							} else {
1272	0					0	$et->Warn('Install Compress::Zlib to process filtered streams');
1273	0					0	return 0;
1274							}
1275	0	0	0			0	next unless $pre and $pre eq '12'; # 12 = 'up' prediction
1276
1277							# apply anti-predictor
1278	0					0	my $cols = $$decodeParms{Columns};
1279	0	0				0	unless ($cols) {
1280							# currently only support 'up' prediction
1281	0					0	$et->Warn('No Columns for decoding stream');
1282	0					0	return 0;
1283							}
1284	0					0	my @bytes = unpack('C*', $$dict{_stream});
1285	0					0	my @pre = (0) x $cols; # initialize predictor array
1286	0					0	my $buff = '';
1287	0					0	while (@bytes > $cols) {
1288	0	0				0	unless (($_ = shift @bytes) == 2) {
1289	0					0	$et->Warn("Unsupported PNG filter $_"); # (yes, PNG)
1290	0					0	return 0;
1291							}
1292	0					0	foreach (@pre) {
1293	0					0	$_ = ($_ + shift(@bytes)) & 0xff;
1294							}
1295	0					0	$buff .= pack('C*', @pre);
1296							}
1297	0					0	$$dict{_stream} = $buff;
1298
1299							} elsif ($filter eq '/Crypt') {
1300
1301							# (we shouldn't have to check the _decrypted flag since we
1302							# already checked the _filtered flag, but what the heck...)
1303	0	0				0	next if defined $$dict{_decrypted};
1304							# assume Identity filter (the default) if DecodeParms are missing
1305	0	0				0	next unless ref $decodeParms eq 'HASH';
1306	0					0	my $name = $$decodeParms{Name};
1307	0	0	0			0	next unless defined $name or $name eq 'Identity';
1308	0	0				0	if ($name ne 'StdCF') {
1309	0					0	$et->Warn("Unsupported Crypt Filter $name");
1310	0					0	return 0;
1311							}
1312	0	0				0	unless ($cryptInfo) {
1313	0					0	$et->Warn('Missing Encrypt StdCF entry');
1314	0					0	return 0;
1315							}
1316							# decrypt the stream manually because we want to:
1317							# 1) ignore $cryptStream (StmF) setting
1318							# 2) ignore EncryptMetadata setting (I can't find mention of how to
1319							# reconcile this in the spec., but this would make sense)
1320							# 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
1321							# 4) set _decrypted flag so we will recrypt according to StmF when
1322							# writing (since we don't yet write Filter'd streams)
1323	0					0	Crypt(\$$dict{_stream}, 'none');
1324	0	0				0	$$dict{_decrypted} = ($cryptStream ? 1 : 0);
1325
1326							} elsif ($filter eq '/LZWDecode') {
1327
1328							# make sure we don't have any unsupported decoding parameters
1329	0	0				0	if (ref $decodeParms eq 'HASH') {
1330	0	0				0	if ($$decodeParms{Predictor}) {
		0
1331	0					0	$et->Warn("LZWDecode Predictor $$decodeParms{Predictor} currently not supported");
1332	0					0	return 0;
1333							} elsif ($$decodeParms{EarlyChange}) {
1334	0					0	$et->Warn("LZWDecode EarlyChange currently not supported");
1335	0					0	return 0;
1336							}
1337							}
1338	0	0				0	unless (DecodeLZW(\$$dict{_stream})) {
1339	0					0	$et->Warn('LZW decompress error');
1340	0					0	return 0;
1341							}
1342
1343							} elsif ($filter eq '/ASCIIHexDecode') {
1344
1345	0					0	$$dict{_stream} =~ s/>.*//; # truncate at '>' (end of data mark)
1346	0					0	$$dict{_stream} =~ tr/0-9a-zA-Z//d; # remove illegal characters
1347	0					0	$$dict{_stream} = pack 'H*', $$dict{_stream};
1348
1349							} elsif ($filter eq '/ASCII85Decode') {
1350
1351	0					0	my ($err, @out, $i);
1352	0					0	my ($n, $val) = (0, 0);
1353	0					0	foreach (split //, $$dict{_stream}) {
1354	0	0	0			0	if ($_ ge '!' and $_ le 'u') {;
		0
		0
1355	0					0	$val = 85 * $val + ord($_) - 33;
1356	0	0				0	next unless ++$n == 5;
1357							} elsif ($_ eq '~') {
1358	0	0				0	$n == 1 and $err = 1; # error to have a single char in the last group of 5
1359	0					0	for ($i=$n; $i<5; ++$i) { $val *= 85; }
	0					0
1360							} elsif ($_ eq 'z') {
1361	0	0				0	$n and $err = 2, last; # error if 'z' isn't the first char
1362	0					0	$n = 5;
1363							} else {
1364	0	0				0	next if /^\s$/; # ignore white space
1365	0					0	$err = 3, last; # any other character is an error
1366							}
1367	0					0	$val = unpack('V', pack('N', $val)); # reverse byte order
1368	0					0	while (--$n > 0) {
1369	0					0	push @out, $val & 0xff;
1370	0					0	$val >>= 8;
1371							}
1372	0	0				0	last if $_ eq '~';
1373							# (both $n and $val are zero again now)
1374							}
1375	0	0				0	$err and $et->Warn("ASCII85Decode error $err");
1376	0					0	$$dict{_stream} = pack('C*', @out);
1377							}
1378							}
1379	0					0	return 1;
1380							}
1381
1382							#------------------------------------------------------------------------------
1383							# Initialize state for RC4 en/decryption (ref 2)
1384							# Inputs: 0) RC4 key string
1385							# Returns: RC4 key hash reference
1386							sub RC4Init($)
1387							{
1388	22			22	0	90	my @key = unpack('C*', shift);
1389	22					322	my @state = (0 .. 255);
1390	22					63	my ($i, $j) = (0, 0);
1391	22					59	while ($i < 256) {
1392	5632					9139	my $st = $state[$i];
1393	5632					10555	$j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
1394	5632					9771	$state[$i++] = $state[$j];
1395	5632					12326	$state[$j] = $st;
1396							}
1397	22					161	return { State => \@state, XY => [ 0, 0 ] };
1398							}
1399
1400							#------------------------------------------------------------------------------
1401							# Apply RC4 en/decryption (ref 2)
1402							# Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
1403							# - can call this method directly with a key string, or with with the key
1404							# reference returned by RC4Init
1405							# - RC4 is a symmetric algorithm, so encryption is the same as decryption
1406							sub RC4Crypt($$)
1407							{
1408	22			22	0	69	my ($dataPt, $key) = @_;
1409	22	50				82	$key = RC4Init($key) unless ref $key eq 'HASH';
1410	22					53	my $state = $$key{State};
1411	22					34	my ($x, $y) = @{$$key{XY}};
	22					74
1412
1413	22					95	my @data = unpack('C*', $$dataPt);
1414	22					57	foreach (@data) {
1415	356					645	$x = ($x + 1) & 0xff;
1416	356					613	my $stx = $$state[$x];
1417	356					569	$y = ($stx + $y) & 0xff;
1418	356					654	my $sty = $$state[$x] = $$state[$y];
1419	356					539	$$state[$y] = $stx;
1420	356					739	$_ ^= $$state[($stx + $sty) & 0xff];
1421							}
1422	22					70	$$key{XY} = [ $x, $y ];
1423	22					253	$$dataPt = pack('C*', @data);
1424							}
1425
1426							#------------------------------------------------------------------------------
1427							# Update AES cipher with a bit of data
1428							# Inputs: 0) data
1429							# Returns: encrypted data
1430							my $cipherMore;
1431							sub CipherUpdate($)
1432							{
1433	0			0	0	0	my $dat = shift;
1434	0					0	my $pos = 0;
1435	0	0				0	$dat = $cipherMore . $dat if length $dat;
1436	0					0	while ($pos + 16 <= length($dat)) {
1437	0					0	substr($dat,$pos,16) = Image::ExifTool::AES::Cipher(substr($dat,$pos,16));
1438	0					0	$pos += 16;
1439							}
1440	0	0				0	if ($pos < length $dat) {
1441	0					0	$cipherMore = substr($dat,$pos);
1442	0					0	$dat = substr($dat,0,$pos);
1443							} else {
1444	0					0	$cipherMore = '';
1445							}
1446	0					0	return $dat;
1447							}
1448
1449							#------------------------------------------------------------------------------
1450							# Get encrypted hash
1451							# Inputs: 0) Password, 1) salt, 2) vector, 3) encryption revision
1452							# Returns: hash
1453							sub GetHash($$$$)
1454							{
1455	6			6	0	37	my ($password, $salt, $vector, $rev) = @_;
1456
1457							# return Rev 5 hash
1458	6	50				70	return Digest::SHA::sha256($password, $salt, $vector) if $rev == 5;
1459
1460							# compute Rev 6 hardened hash
1461							# (ref http://code.google.com/p/origami-pdf/source/browse/lib/origami/encryption.rb)
1462	0					0	my $blockSize = 32;
1463	0					0	my $input = Digest::SHA::sha256($password, $salt, $vector) . ("\0" x 32);
1464	0					0	my $key = substr($input, 0, 16);
1465	0					0	my $iv = substr($input, 16, 16);
1466	0					0	my $h;
1467	0					0	my $x = '';
1468	0					0	my $i = 0;
1469	0		0			0	while ($i < 64 or $i < ord(substr($x,-1,1))+32) {
1470
1471	0					0	my $block = substr($input, 0, $blockSize);
1472	0					0	$x = '';
1473	0					0	Image::ExifTool::AES::Crypt(\$x, $key, $iv, 1);
1474	0					0	$cipherMore = '';
1475
1476	0					0	my ($j, $digest);
1477	0					0	for ($j=0; $j<64; ++$j) {
1478	0					0	$x = '';
1479	0	0				0	$x .= CipherUpdate($password) if length $password;
1480	0					0	$x .= CipherUpdate($block);
1481	0	0				0	$x .= CipherUpdate($vector) if length $vector;
1482	0	0				0	if ($j == 0) {
1483	0					0	my @a = unpack('C16', $x);
1484	0					0	my $sum = 0;
1485	0					0	$sum += $_ foreach @a;
1486							# set SHA block size (32, 48 or 64 bytes = SHA-256, 384 or 512)
1487	0					0	$blockSize = 32 + ($sum % 3) * 16;
1488	0					0	$digest = Digest::SHA->new($blockSize * 8);
1489							}
1490	0					0	$digest->add($x);
1491							}
1492
1493	0					0	$h = $digest->digest();
1494	0					0	$key = substr($h, 0, 16);
1495	0					0	substr($input,0,16) = $h;
1496	0					0	$iv = substr($h, 16, 16);
1497	0					0	++$i;
1498							}
1499	0					0	return substr($h, 0, 32);
1500							}
1501
1502							#------------------------------------------------------------------------------
1503							# Initialize decryption
1504							# Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
1505							# 2) ID from file trailer dictionary
1506							# Returns: error string or undef on success (and sets $cryptInfo)
1507							sub DecryptInit($$$)
1508							{
1509	4			4	0	38	local $_;
1510	4					17	my ($et, $encrypt, $id) = @_;
1511
1512	4					11	undef $cryptInfo;
1513	4	50	33			30	unless ($encrypt and ref $encrypt eq 'HASH') {
1514	0					0	return 'Error loading Encrypt object';
1515							}
1516	4					15	my $filt = $$encrypt{Filter};
1517	4	50	33			43	unless ($filt and $filt =~ s/^\///) {
1518	0					0	return 'Encrypt dictionary has no Filter!';
1519							}
1520							# extract some interesting tags
1521	4		50			49	my $ver = $$encrypt{V} \|\| 0;
1522	4		100			20	my $rev = $$encrypt{R} \|\| 0;
1523	4					15	my $enc = "$filt V$ver";
1524	4	50				23	$enc .= ".$rev" if $filt eq 'Standard';
1525	4	50	33			23	$enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
1526	4	50	100			36	$enc .= ' (' . ($$encrypt{Length} \|\| 40) . '-bit)' if $filt eq 'Standard';
1527	4					19	my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
1528	4					35	$et->HandleTag($tagTablePtr, 'Filter', $enc);
1529	4	50				30	if ($filt ne 'Standard') {
		50
1530	0					0	return "Encryption filter $filt currently not supported";
1531							} elsif (not defined $$encrypt{R}) {
1532	0					0	return 'Standard security handler missing revision';
1533							}
1534	4	50	33			77	unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
			33
1535	0					0	return 'Incomplete Encrypt specification';
1536							}
1537	4	50				39	if ("$ver.$rev" >= 5.6) {
1538							# apologize for poor performance (AES is a pure Perl implementation)
1539	0					0	$et->Warn('Decryption is very slow for encryption V5.6 or higher', 3);
1540							}
1541	4					20	$et->HandleTag($tagTablePtr, 'P', $$encrypt{P});
1542
1543	4					9	my %parm; # optional parameters extracted from Encrypt dictionary
1544
1545	4	100	66			45	if ($ver == 1 or $ver == 2) {
		50	66
1546	1					3	$cryptString = $cryptStream = 1;
1547							} elsif ($ver == 4 or $ver == 5) {
1548							# initialize our $cryptString and $cryptStream flags
1549	3					11	foreach ('StrF', 'StmF') {
1550	6	100				24	my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
1551	6					22	$$flagPt = $$encrypt{$_};
1552	6	50	33			48	undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
1553	6	50	33			37	return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
1554							}
1555	3	50	33			29	if ($cryptString or $cryptStream) {
1556							return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
1557	3	50	33			42	ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
			33
1558	3					8	my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
1559	3	50				26	unless ($cryptMeth =~ /^\/(V2\|AESV2\|AESV3)$/) {
1560	0					0	return "Unsupported encryption method $cryptMeth";
1561							}
1562							# set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
1563	3	50				32	$$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2\|AESV3)$/;
1564							}
1565	3	100				13	if ($ver == 5) {
1566							# validate OE and UE entries
1567	2					9	foreach ('OE', 'UE') {
1568	4	50				16	return "Missing Encrypt $_ entry" unless $$encrypt{$_};
1569	4					16	$parm{$_} = ReadPDFValue($$encrypt{$_});
1570	4	50				19	return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
1571							}
1572	2					19	require Image::ExifTool::AES; # will need this later
1573							}
1574							} else {
1575	0					0	return "Encryption version $ver currently not supported";
1576							}
1577	4	50				21	$id or return "Can't decrypt (no document ID)";
1578
1579							# make sure we have the necessary libraries available
1580	4	100				14	if ($ver < 5) {
1581	2	50				7	unless (eval { require Digest::MD5 }) {
	2					24
1582	0					0	return "Install Digest::MD5 to process encrypted PDF";
1583							}
1584							} else {
1585	2	50				8	unless (eval { require Digest::SHA }) {
	2					56
1586	0					0	return "Install Digest::SHA to process AES-256 encrypted PDF";
1587							}
1588							}
1589
1590							# calculate file-level en/decryption key
1591	4					28	my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
1592							"\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
1593	4					19	my $o = ReadPDFValue($$encrypt{O});
1594	4					16	my $u = ReadPDFValue($$encrypt{U});
1595
1596							# set flag indicating whether metadata is encrypted
1597							# (in version 4 and higher, metadata streams may not be encrypted)
1598	4	100	100			52	if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
			66
1599	3					9	$$encrypt{_meta} = 1;
1600							}
1601							# try no password first, then try provided password if available
1602	4					10	my ($try, $key);
1603	4					12	for ($try=0; ; ++$try) {
1604	5					12	my $password;
1605	5	100				19	if ($try == 0) {
		50
1606	4					12	$password = '';
1607							} elsif ($try == 1) {
1608	1					8	$password = $et->Options('Password');
1609	1	50				6	return 'Document is password protected (use Password option)' unless defined $password;
1610							# make sure there is no UTF-8 flag on the password
1611	1	50	33			12	if ($] >= 5.006 and ($$et{OPTIONS}{EncodeHangs} or
			33
1612							eval { require Encode; Encode::is_utf8($password) } or $@))
1613							{
1614	0			0		0	local $SIG{'__WARN__'} = sub { };
1615							# repack by hand if Encode isn't available
1616	0	0	0			0	$password = ($$et{OPTIONS}{EncodeHangs} or $@) ? pack('C*', unpack($] < 5.010000 ?
		0
1617							'U0C' : 'C0C', $password)) : Encode::encode('utf8', $password);
1618							}
1619							} else {
1620	0					0	return 'Incorrect password';
1621							}
1622	5	100				20	if ($ver < 5) {
1623	2	50				7	if (length $password) {
1624							# password must be encoding in PDFDocEncoding (ref iso32000)
1625	0					0	$password = $et->Encode($password, 'PDFDoc');
1626							# truncate or pad the password to exactly 32 bytes
1627	0	0				0	if (length($password) > 32) {
		0
1628	0					0	$password = substr($password, 0, 32);
1629							} elsif (length($password) < 32) {
1630	0					0	$password .= substr($pad, 0, 32-length($password));
1631							}
1632							} else {
1633	2					4	$password = $pad;
1634							}
1635	2					11	$key = $password . $o . pack('V', $$encrypt{P}) . $id;
1636	2					5	my $rep = 1;
1637	2	100	66			14	if ($rev == 3 or $rev == 4) {
1638							# must add this if metadata not encrypted
1639	1	50				5	$key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
1640	1					3	$rep += 50; # repeat MD5 50 more times if revision is 3 or greater
1641							}
1642	2					6	my ($len, $i, $dat);
1643	2	100				7	if ($ver == 1) {
1644	1					2	$len = 5;
1645							} else {
1646	1		50			9	$len = $$encrypt{Length} \|\| 40;
1647	1	50				5	$len >= 40 or return 'Bad Encrypt Length';
1648	1					21	$len = int($len / 8);
1649							}
1650	2					10	for ($i=0; $i<$rep; ++$i) {
1651	52					169	$key = substr(Digest::MD5::md5($key), 0, $len);
1652							}
1653							# decrypt U to see if a user password is required
1654	2	100				7	if ($rev >= 3) {
1655	1					6	$dat = Digest::MD5::md5($pad . $id);
1656	1					6	RC4Crypt(\$dat, $key);
1657	1					7	for ($i=1; $i<=19; ++$i) {
1658	19					56	my @key = unpack('C*', $key);
1659	19					39	foreach (@key) { $_ ^= $i; }
	304					547
1660	19					68	RC4Crypt(\$dat, pack('C*', @key));
1661							}
1662	1					8	$dat .= substr($u, 16);
1663							} else {
1664	1					2	$dat = $pad;
1665	1					5	RC4Crypt(\$dat, $key);
1666							}
1667	2	50				14	last if $dat eq $u; # all done if this was the correct key
1668							} else {
1669	3	50	33			37	return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
1670	3	100				12	if (length $password) {
1671							# Note: this should be good for passwords containing reasonable characters,
1672							# but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
1673							# of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
1674	1					8	$password = $et->Encode($password, 'UTF8');
1675	1	50				5	$password = substr($password, 0, 127) if length($password) > 127;
1676							}
1677							# test for the owner password
1678	3					19	my $sha = GetHash($password, substr($o,32,8), substr($u,0,48), $rev);
1679	3	100				27	if ($sha eq substr($o, 0, 32)) {
1680	2					7	$key = GetHash($password, substr($o,40,8), substr($u,0,48), $rev);
1681	2					8	my $dat = ("\0" x 16) . $parm{OE};
1682							# decrypt with no padding
1683	2					15	my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1684	2	50				11	return $err if $err;
1685	2					5	$key = $dat; # use this as the file decryption key
1686	2					7	last;
1687							}
1688							# test for the user password
1689	1					5	$sha = GetHash($password, substr($u,32,8), '', $rev);
1690	1	50				8	if ($sha eq substr($u, 0, 32)) {
1691	0					0	$key = GetHash($password, substr($u,40,8), '', $rev);
1692	0					0	my $dat = ("\0" x 16) . $parm{UE};
1693	0					0	my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1694	0	0				0	return $err if $err;
1695	0					0	$key = $dat; # use this as the file decryption key
1696	0					0	last;
1697							}
1698							}
1699							}
1700	4					86	$$encrypt{_key} = $key; # save the file-level encryption key
1701	4					16	$cryptInfo = $encrypt; # save reference to the file-level Encrypt object
1702	4					39	return undef; # success!
1703							}
1704
1705							#------------------------------------------------------------------------------
1706							# Decrypt/Encrypt data
1707							# Inputs: 0) data ref
1708							# 1) PDF object reference to use as crypt key extension (may be 'none' to
1709							# avoid extending the encryption key, as for streams with Crypt Filter)
1710							# 2) encrypt flag (false for decryption)
1711							sub Crypt($$;$)
1712							{
1713	29	100		29	0	162	return unless $cryptInfo;
1714	4					14	my ($dataPt, $keyExt, $encrypt) = @_;
1715							# do not decrypt if the key extension object is undefined
1716							# (this doubles as a flag to disable decryption/encryption)
1717	4	50				16	return unless defined $keyExt;
1718	4					14	my $key = $$cryptInfo{_key};
1719							# apply the necessary crypt key extension
1720	4	100				19	unless ($$cryptInfo{_aesv3}) {
1721	2	50				11	unless ($keyExt eq 'none') {
1722							# extend crypt key using object and generation number
1723	2	50				21	unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
1724	0					0	$$cryptInfo{_error} = 'Invalid object reference for encryption';
1725	0					0	return;
1726							}
1727	2					24	$key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
1728							}
1729							# add AES-128 salt if necessary (this little gem is conveniently
1730							# omitted from the Adobe PDF 1.6 documentation, causing me to
1731							# waste 12 hours trying to figure out why this wasn't working --
1732							# it appears in ISO32000 though, so I should have been using that)
1733	2	100				9	$key .= 'sAlT' if $$cryptInfo{_aesv2};
1734	2					7	my $len = length($key);
1735	2					13	$key = Digest::MD5::md5($key); # get 16-byte MD5 digest
1736	2	100				10	$key = substr($key, 0, $len) if $len < 16; # trim if necessary
1737							}
1738							# perform the decryption/encryption
1739	4	100	100			32	if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
1740	3					33	require Image::ExifTool::AES;
1741	3					23	my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
1742	3	50				28	$err and $$cryptInfo{_error} = $err;
1743							} else {
1744	1					3	RC4Crypt($dataPt, $key);
1745							}
1746							}
1747
1748							#------------------------------------------------------------------------------
1749							# Decrypt/Encrypt stream data
1750							# Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
1751							sub CryptStream($$)
1752							{
1753	52	50		52	0	158	return unless $cryptStream;
1754	0					0	my ($dict, $keyExt) = @_;
1755	0		0			0	my $type = $$dict{Type} \|\| '';
1756							# XRef streams are not encrypted (ref 3, page 50),
1757							# and Metadata may or may not be encrypted
1758	0	0	0			0	if ($cryptInfo and $type ne '/XRef' and
			0
			0
1759							($$cryptInfo{_meta} or $type ne '/Metadata'))
1760							{
1761	0					0	Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
1762							# toggle _decrypted flag
1763	0	0				0	$$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
1764							} else {
1765	0					0	$$dict{_decrypted} = 0; # stream should never be encrypted
1766							}
1767							}
1768
1769							#------------------------------------------------------------------------------
1770							# Generate a new PDF tag (based on its ID) and add it to a tag table
1771							# Inputs: 0) tag table ref, 1) tag ID
1772							# Returns: tag info ref
1773							sub NewPDFTag($$)
1774							{
1775	0			0	0	0	my ($tagTablePtr, $tag) = @_;
1776	0					0	my $name = $tag;
1777							# translate URL-like escape sequences
1778	0					0	$name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
	0					0
1779	0					0	$name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
1780	0					0	$name =~ s/(^\|_)([a-z])/\U$2/g; # start words with upper case
1781	0					0	my $tagInfo = { Name => $name };
1782	0					0	AddTagToTable($tagTablePtr, $tag, $tagInfo);
1783	0					0	return $tagInfo;
1784							}
1785
1786							#------------------------------------------------------------------------------
1787							# Process AcroForm dictionary to set HasXMLFormsArchitecture flag
1788							# Inputs: Same as ProcessDict
1789							sub ProcessAcroForm($$$$;$$)
1790							{
1791	0			0	0	0	my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1792	0	0				0	$et->HandleTag($tagTablePtr, '_has_xfa', $$dict{XFA} ? 'true' : 'false');
1793	0	0				0	return 1 unless $et->Options('Verbose');
1794	0					0	return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
1795							}
1796
1797							#------------------------------------------------------------------------------
1798							# Process AF dictionary to extract C2PA manifest
1799							# Inputs: Same as ProcessDict
1800							sub ProcessAF($$$$;$$)
1801							{
1802	0			0	0	0	my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1803	0		0			0	$$et{AFRelationship} = $$dict{AFRelationship} \|\| '';
1804							# go no further unless Verbose or this is the C2PA_Manifest item
1805	0	0	0			0	return 1 unless $et->Options('Verbose') or $$et{AFRelationship} eq '/C2PA_Manifest';
1806	0					0	return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
1807							}
1808
1809							#------------------------------------------------------------------------------
1810							# Expand array into a string
1811							# Inputs: 0) array ref
1812							# Return: string
1813							sub ExpandArray($)
1814							{
1815	0			0	0	0	my $val = shift;
1816	0					0	my @list = @$val;
1817	0					0	foreach (@list) {
1818	0	0				0	ref $_ eq 'SCALAR' and $_ = "ref($$_)", next;
1819	0	0				0	ref $_ eq 'ARRAY' and $_ = ExpandArray($_), next;
1820	0	0				0	defined $_ or $_ = '', next;
1821							}
1822	0					0	return '[' . join(',',@list) . ']';
1823							}
1824
1825							#------------------------------------------------------------------------------
1826							# Process PDF dictionary extract tag values
1827							# Inputs: 0) ExifTool object reference, 1) tag table reference
1828							# 2) dictionary reference, 3) cross-reference table reference,
1829							# 4) nesting depth, 5) dictionary capture type
1830							sub ProcessDict($$$$;$$)
1831							{
1832	350			350	0	577	local $_;
1833	350					970	my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1834	350					1349	my $verbose = $et->Options('Verbose');
1835	350					837	my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
1836	350		33			1183	my $embedded = (defined $unknown and not $unknown and $et->Options('ExtractEmbedded'));
1837	350					560	my @tags = @{$$dict{_tags}};
	350					1685
1838	350					761	my ($next, %join, $validInfo);
1839	350					534	my $index = 0;
1840
1841	350		100			1137	$nesting = ($nesting \|\| 0) + 1;
1842	350	50				1024	if ($nesting > 50) {
1843	0					0	$et->Warn('Nesting too deep (directory ignored)');
1844	0					0	return;
1845							}
1846							# save entire dictionary for rewriting if specified
1847	350	50	100			1464	if ($$et{PDF_CAPTURE} and $$tagTablePtr{VARS} and
			66
1848							$tagTablePtr->{VARS}->{CAPTURE})
1849							{
1850	66					111	my $name;
1851	66					113	foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
	66					218
1852	82	100				302	next if $$et{PDF_CAPTURE}{$name};
1853							# make sure we load the right type if indicated
1854	66	50	66			243	next if $type and $type ne $name;
1855	66					238	$$et{PDF_CAPTURE}{$name} = $dict;
1856	66					273	last;
1857							}
1858							}
1859	350		33			1158	$validInfo = ($et->Options('Validate') and $tagTablePtr eq \%Image::ExifTool::PDF::Info);
1860							#
1861							# extract information from all tags in the dictionary
1862							#
1863	350					598	for (;;) {
1864	1473					2323	my ($tag, $isSubDoc);
1865	1473	100	33			3214	if (@tags) {
		50
1866	1123					2552	$tag = shift @tags;
1867							} elsif (defined $next and not $next) {
1868	0					0	$tag = 'Next';
1869	0					0	$next = 1;
1870							} else {
1871	350					609	last;
1872							}
1873	1123					2719	my $val = $$dict{$tag};
1874	1123					3248	my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag);
1875	1123	100	33			3673	if ($tagInfo) {
		50	33
			0
			0
1876	434	50				1168	undef $tagInfo if $$tagInfo{NoProcess};
1877							} elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
1878							$$tagTablePtr{$1} and (ref $val ne 'SCALAR' or not $fetched{$$val}))
1879							{
1880	0					0	my ($name, $num) = ($1, $2);
1881	0					0	$tagInfo = $et->GetTagInfo($tagTablePtr, $name);
1882	0	0	0			0	if (ref $tagInfo eq 'HASH' and $$tagInfo{JoinStreams}) {
1883	0					0	$fetched{$$val} = 1;
1884	0					0	my $obj = FetchObject($et, $$val, $xref, $tag);
1885	0	0				0	$join{$name} = [] unless $join{$name};
1886	0	0	0			0	next unless ref $obj eq 'HASH' and $$obj{_stream};
1887							# save all the stream data to join later
1888	0					0	DecodeStream($et, $obj);
1889	0					0	$join{$name}->[$num] = $$obj{_stream};
1890	0					0	undef $tagInfo; # don't process
1891							} else {
1892	0					0	$isSubDoc = 1; # treat as a sub-document
1893							}
1894							}
1895	1123	0	33			2516	if ($validInfo and $$et{PDFVersion} >= 2.0 and (not $tagInfo or not $$tagInfo{PDF2})) {
			0
			33
1896	0	0				0	my $name = $tagInfo ? ":$$tagInfo{Name}" : " Info tag '${tag}'";
1897	0					0	$et->Warn("PDF$name is deprecated in PDF 2.0");
1898							}
1899	1123	50				2424	if ($verbose) {
1900	0					0	my ($val2, $extra);
1901	0	0				0	if (ref $val eq 'SCALAR') {
		0
		0
1902	0					0	$extra = ", indirect object ($$val)";
1903	0	0	0			0	if ($fetched{$$val}) {
		0
1904	0					0	$val2 = "ref($$val)";
1905							} elsif ($tag eq 'Next' and not $next) {
1906							# handle 'Next' links after all others
1907	0					0	$next = 0;
1908	0					0	next;
1909							} else {
1910	0					0	$fetched{$$val} = 1;
1911	0					0	$val = FetchObject($et, $$val, $xref, $tag);
1912	0	0				0	unless (defined $val) {
1913	0					0	my $str;
1914	0	0				0	if (defined $lastOffset) {
1915	0					0	$val2 = '';
1916	0					0	$str = 'Object was freed';
1917							} else {
1918	0					0	$val2 = '';
1919	0					0	$str = 'Error reading object';
1920							}
1921	0					0	$et->VPrint(0, "$$et{INDENT}${str}:\n");
1922							}
1923							}
1924							} elsif (ref $val eq 'HASH') {
1925	0					0	$extra = ', direct dictionary';
1926							} elsif (ref $val eq 'ARRAY') {
1927	0					0	$extra = ', direct array of ' . scalar(@$val) . ' objects';
1928							} else {
1929	0					0	$extra = ', direct object';
1930							}
1931	0					0	my $isSubdir;
1932	0	0				0	if (ref $val eq 'HASH') {
		0
1933	0					0	$isSubdir = 1;
1934							} elsif (ref $val eq 'ARRAY') {
1935							# recurse into objects in arrays only if they are lists of
1936							# dictionaries or indirect objects which could be dictionaries
1937	0	0				0	$isSubdir = 1 if @$val;
1938	0					0	foreach (@$val) {
1939	0	0	0			0	next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
1940	0					0	undef $isSubdir;
1941	0					0	last;
1942							}
1943							}
1944	0	0				0	if ($isSubdir) {
1945							# create bogus subdirectory to recurse into this dict
1946	0	0				0	$tagInfo or $tagInfo = {
1947							Name => $tag,
1948							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Unknown' },
1949							};
1950							} else {
1951	0	0				0	$val2 = ExpandArray($val) if ref $val eq 'ARRAY';
1952							# generate tag info if we will use it later
1953	0	0	0			0	if (not $tagInfo and defined $val and $unknown) {
			0
1954	0					0	$tagInfo = NewPDFTag($tagTablePtr, $tag);
1955							}
1956							}
1957	0		0			0	$et->VerboseInfo($tag, $tagInfo,
1958							Value => $val2 \|\| $val,
1959							Extra => $extra,
1960							Index => $index++,
1961							);
1962	0	0				0	next unless defined $val;
1963							}
1964	1123	100				2533	unless ($tagInfo) {
1965							# add any tag found in Info dictionary to table
1966	689	50				1674	next unless $unknown;
1967	0					0	$tagInfo = NewPDFTag($tagTablePtr, $tag);
1968							}
1969							# increment document number if necessary
1970	434					785	my ($oldDocNum, $oldNumTags);
1971	434	50				916	if ($isSubDoc) {
1972	0					0	$oldDocNum = $$et{DOC_NUM};
1973	0					0	$oldNumTags = $$et{NUM_FOUND};
1974	0					0	$$et{DOC_NUM} = ++$$et{DOC_COUNT};
1975							}
1976	434	100				1061	if ($$tagInfo{SubDirectory}) {
1977							# process the subdirectory
1978	332					557	my @subDicts;
1979	332	100				813	if (ref $val eq 'ARRAY') {
1980							# hack to convert array to dictionary if necessary
1981	37	50	33			191	if ($$tagInfo{ConvertToDict} and @$val == 2 and not ref $$val[0]) {
			33
1982	0					0	my $tg = $$val[0];
1983	0					0	$tg =~ s(^/)(); # remove name
1984	0					0	my %dict = ( _tags => [ $tg ], $tg => $$val[1] );
1985	0					0	@subDicts = ( \%dict );
1986							} else {
1987	37					64	@subDicts = @{$val};
	37					135
1988							}
1989							} else {
1990	295					698	@subDicts = ( $val );
1991							}
1992							# loop through all values of this tag
1993	332					601	for (;;) {
1994	664	100				1978	my $subDict = shift @subDicts or last;
1995							# save last fetched object in case we fetch another one here
1996	332					701	my $prevFetched = $lastFetched;
1997	332	100				891	if (ref $subDict eq 'SCALAR') {
1998							# only fetch once (other copies are obsolete)
1999	244	100				770	next if $fetched{$$subDict};
2000	197	100				621	if ($$tagInfo{IgnoreDuplicates}) {
2001	28					64	my $flag = "ProcessedPDF_$tag";
2002	28	50				100	if ($$et{$flag}) {
2003	0	0				0	next if $et->Warn("Ignored duplicate $tag dictionary", 2);
2004							} else {
2005	28					100	$$et{$flag} = 1;
2006							}
2007							}
2008							# load dictionary via an indirect reference
2009	197					627	$fetched{$$subDict} = 1;
2010	197					669	my $obj = FetchObject($et, $$subDict, $xref, $tag);
2011	197	100				635	unless (defined $obj) {
2012	5	50				19	unless (defined $lastOffset) {
2013	0					0	$et->Warn("Error reading $tag object ($$subDict)");
2014							}
2015	5					13	next;
2016							}
2017	192					406	$subDict = $obj;
2018							}
2019	280	50				917	if (ref $subDict eq 'ARRAY') {
2020							# convert array of key/value pairs to a hash
2021	0	0				0	next if @$subDict < 2;
2022	0					0	my %hash = ( _tags => [] );
2023	0					0	while (@$subDict >= 2) {
2024	0					0	my $key = shift @$subDict;
2025	0					0	$key =~ s/^\///;
2026	0					0	push @{$hash{_tags}}, $key;
	0					0
2027	0					0	$hash{$key} = shift @$subDict;
2028							}
2029	0					0	$subDict = \%hash;
2030							} else {
2031	280	50				758	next unless ref $subDict eq 'HASH';
2032							}
2033							# set flag to re-crypt all strings when rewriting if the dictionary
2034							# came from an encrypted stream
2035	280	50				624	$$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
2036	280					1472	my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
2037	280	50				621	if (not $verbose) {
		0
2038	280		50			1367	my $proc = $$subTablePtr{PROCESS_PROC} \|\| \&ProcessDict;
2039	280					1052	&$proc($et, $subTablePtr, $subDict, $xref, $nesting);
2040							} elsif ($next) {
2041							# handle 'Next' links at this level to avoid deep recursion
2042	0					0	undef $next;
2043	0					0	$index = 0;
2044	0					0	$tagTablePtr = $subTablePtr;
2045	0					0	$dict = $subDict;
2046	0					0	@tags = @{$$subDict{_tags}};
	0					0
2047	0					0	$et->VerboseDir($tag, scalar(@tags));
2048							} else {
2049	0					0	my $oldIndent = $$et{INDENT};
2050	0					0	my $oldDir = $$et{DIR_NAME};
2051	0					0	$$et{INDENT} .= '\| ';
2052	0					0	$$et{DIR_NAME} = $tag;
2053	0					0	$et->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
	0					0
2054	0		0			0	my $proc = $$subTablePtr{PROCESS_PROC} \|\| \&ProcessDict;
2055	0					0	&$proc($et, $subTablePtr, $subDict, $xref, $nesting);
2056	0					0	$$et{INDENT} = $oldIndent;
2057	0					0	$$et{DIR_NAME} = $oldDir;
2058							}
2059	280					1981	$lastFetched = $prevFetched;
2060							}
2061							} else {
2062							# fetch object if necessary
2063							# (OS X 10.6 writes indirect objects in the Info dictionary!)
2064	102	50				311	if (ref $val eq 'SCALAR') {
2065	0					0	my $prevFetched = $lastFetched;
2066							# (note: fetching the same object multiple times is OK here)
2067	0					0	$val = FetchObject($et, $$val, $xref, $tag);
2068	0	0				0	if (defined $val) {
2069	0					0	$val = ReadPDFValue($val);
2070							# set flag to re-encrypt if necessary if rewritten
2071	0	0				0	$$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
		0
2072	0					0	$lastFetched = $prevFetched; # restore last fetched object reference
2073							}
2074							} else {
2075	102					426	$val = ReadPDFValue($val);
2076							}
2077	102	100				337	if (ref $val) {
		50
2078	27	50				119	if (ref $val eq 'ARRAY') {
2079	27	50				171	delete $$et{LIST_TAGS}{$tagInfo} if $$tagInfo{List};
2080	27					69	my $v;
2081	27					64	foreach $v (@$val) {
2082	80					239	$et->FoundTag($tagInfo, $v);
2083							}
2084							}
2085							} elsif (defined $val) {
2086							# convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
2087							# unless this is binary data (hex-encoded strings would not have been converted)
2088	75		100			558	my $format = $$tagInfo{Format} \|\| $$tagInfo{Writable} \|\| 'string';
2089	75	100				265	$val = ConvertPDFDate($val) if $format eq 'date';
2090	75	50	33			556	if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
2091							# text string is already in Unicode if it starts with "\xfe\xff",
2092							# otherwise we must first convert from PDFDocEncoding
2093	0	0				0	$val = $et->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
2094							}
2095	75	100	66			348	if ($$tagInfo{List} and not $$et{OPTIONS}{NoPDFList}) {
2096							# separate tokens in comma or whitespace delimited lists
2097	12					44	my $comma = $val =~ tr/,/,/;
2098	12					65	my $semi = $val =~ tr/;/;/;
2099	12					23	my $split;
2100	12	50	33			49	if ($comma or $semi) {
2101	12	50				49	$split = $comma > $semi ? ',+\\s' : ';+\\s';
2102							} else {
2103	0					0	$split = ' ';
2104							}
2105	12					571	my @values = split $split, $val;
2106	12					88	$et->FoundTag($tagInfo, $_) foreach @values;
2107							} else {
2108							# a simple tag value
2109	63					251	$et->FoundTag($tagInfo, $val);
2110							}
2111							}
2112							}
2113	434	50				1292	if ($isSubDoc) {
2114							# restore original document number
2115	0					0	$$et{DOC_NUM} = $oldDocNum;
2116	0	0				0	--$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2117							}
2118							}
2119							#
2120							# extract information from joined streams if necessary
2121							#
2122
2123	350	50				862	if (%join) {
2124	0					0	my ($tag, $i);
2125	0					0	foreach $tag (sort keys %join) {
2126	0					0	my $list = $join{$tag};
2127	0	0	0			0	last unless defined $$list[1] and $$list[1] =~ /^%.?([\x0d\x0a])/;
2128	0					0	my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
2129	0					0	for ($i=1; defined $$list[$i]; ++$i) {
2130	0					0	$buff .= $$list[$i];
2131	0					0	undef $$list[$i]; # free memory
2132							}
2133							# increment document number for tags extracted from embedded EPS
2134	0					0	my $oldDocNum = $$et{DOC_NUM};
2135	0					0	my $oldNumTags = $$et{NUM_FOUND};
2136	0					0	$$et{DOC_NUM} = ++$$et{DOC_COUNT};
2137							# extract PostScript information
2138	0					0	$et->HandleTag($tagTablePtr, $tag, $buff);
2139	0					0	$$et{DOC_NUM} = $oldDocNum;
2140							# revert document counter if we didn't add any new tags
2141	0	0				0	--$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2142	0					0	delete $$et{DOC_NUM};
2143							}
2144							}
2145							#
2146							# extract information from stream object if it exists (eg. Metadata stream)
2147							#
2148	350					498	for (;;) { # (cheap goto)
2149	350	100				1909	last unless $$dict{_stream};
2150	43					137	my $tag = '_stream';
2151							# add Subtype (if it exists) to stream name and remove leading '/'
2152	43	100				315	($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
2153	43	50				174	last unless $$tagTablePtr{$tag};
2154	43	50				163	my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag) or last;
2155	43					124	my $subdir = $$tagInfo{SubDirectory};
2156	43	50				147	unless ($subdir) {
2157							# don't build filter lists across different images
2158	0					0	delete $$et{LIST_TAGS}{$$tagTablePtr{Filter}};
2159							# we arrive here only when extracting embedded images
2160							# - only extract known image types and ignore others
2161	0		0			0	my $filter = $$dict{Filter} \|\| '';
2162	0	0				0	$filter = @$filter[-1] if ref $filter eq 'ARRAY'; # (get last Filter type)
2163	0					0	my $result;
2164	0	0	0			0	if ($filter eq '/DCTDecode' or $filter eq '/JPXDecode') {
2165	0	0				0	DecodeStream($et, $dict) or last;
2166							# save the image itself
2167	0					0	$et->FoundTag($tagInfo, \$$dict{_stream});
2168							# extract information from embedded image
2169	0					0	$result = $et->ExtractInfo(\$$dict{_stream}, { ReEntry => 1 });
2170							}
2171	0	0				0	unless ($result) {
2172	0	0				0	$et->FoundTag('FileType', defined $result ? '(unknown)' : '(unsupported)');
2173							}
2174	0					0	last;
2175							}
2176							# decode stream if necessary
2177	43	0	0			241	if ($cryptInfo and ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3} and
			33
			0
			0
2178							$$dict{Length} and $$dict{Length} > 10000) and not $$dict{_decrypted} and
2179							not $$et{PDF_CAPTURE}) # (capturing PDF for writing?)
2180							{
2181	0		0			0	my $type = $$dict{Type} \|\| '';
2182	0	0	0			0	if ($type ne '/Metadata' or $$dict{Length} > 100000) {
2183	0	0				0	if ($$et{OPTIONS}{IgnoreMinorErrors}) {
2184	0					0	$et->Warn("Decrypting large $$tagInfo{Name} (will be slow)");
2185							} else {
2186	0					0	$et->Warn("Skipping large AES-encrypted $$tagInfo{Name}", 2);
2187	0					0	last;
2188							}
2189							}
2190							}
2191	43	50				293	DecodeStream($et, $dict) or last;
2192	43	50				115	if ($verbose > 2) {
2193	0					0	$et->VPrint(2,"$$et{INDENT}$$et{DIR_NAME} stream data\n");
2194	0					0	$et->VerboseDump(\$$dict{_stream});
2195							}
2196							# extract information from stream
2197							my %dirInfo = (
2198							DataPt => \$$dict{_stream},
2199							DataLen => length $$dict{_stream},
2200							DirStart => 0,
2201							DirLen => length $$dict{_stream},
2202							Parent => 'PDF',
2203							DirName => $$subdir{DirName},
2204	43					453	);
2205	43					195	my $subTablePtr = GetTagTable($$subdir{TagTable});
2206	43	50				242	unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
2207	0					0	$et->Warn("Error processing $$tagInfo{Name} information");
2208							}
2209	43					453	last;
2210							}
2211							}
2212
2213							#------------------------------------------------------------------------------
2214							# Extract information from PDF file
2215							# Inputs: 0) ExifTool object reference, 1) dirInfo reference
2216							# Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
2217							sub ReadPDF($$)
2218							{
2219	37			37	0	88	my ($et, $dirInfo) = @_;
2220	37					141	my $raf = $$dirInfo{RAF};
2221	37					155	my $verbose = $et->Options('Verbose');
2222	37					85	my ($buff, $encrypt, $id);
2223							#
2224							# validate PDF file
2225							#
2226							# (linearization dictionary must be in the first 1024 bytes of the file)
2227	37	50				173	$raf->Read($buff, 1024) >= 8 or return 0;
2228	37	50				324	$buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
2229	37	50				235	$$et{PDFBase} = length $1 and $et->Warn('PDF header is not at start of file',1);
2230	37					164	$pdfVer = $$et{PDFVersion} = $2;
2231	37					260	$et->SetFileType(); # set the FileType tag
2232							# store PDFVersion tag
2233	37					144	my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
2234	37					263	$et->HandleTag($tagTablePtr, 'Version', $pdfVer);
2235	37					158	$tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
2236							#
2237							# check for a linearized PDF (only if reading)
2238							#
2239	37					111	my $capture = $$et{PDF_CAPTURE};
2240	37	100				147	unless ($capture) {
2241	18					34	my $lin = 'false';
2242	18	50				124	if ($buff =~ /<
2243	18					85	$buff = substr($buff, pos($buff) - 2);
2244	18					99	my $dict = ExtractObject($et, \$buff);
2245	18	0	33			161	if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
			33
2246	0	0				0	if (not $$et{VALUE}{FileSize}) {
		0
2247	0					0	undef $lin; # can't determine if it is linearized
2248							} elsif ($$dict{L} == $$et{VALUE}{FileSize} - $$et{PDFBase}) {
2249	0					0	$lin = 'true';
2250							}
2251							}
2252							}
2253	18	50				98	$et->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
2254							}
2255							#
2256							# read the xref tables referenced from startxref at the end of the file
2257							#
2258	37					104	my @xrefOffsets;
2259	37	50				167	$raf->Seek(0, 2) or return -2;
2260							# the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
2261	37					158	my $len = $raf->Tell();
2262	37	50				176	$len = 1024 if $len > 1024;
2263	37	50				130	$raf->Seek(-$len, 2) or return -2;
2264	37	50				144	$raf->Read($buff, $len) == $len or return -3;
2265							# find the LAST xref table in the file (may be multiple %%EOF marks,
2266							# and comments between "startxref" and "%%EOF")
2267	37	50				514	$buff =~ /^.startxref(\s+)(\d+)(\s+)((%[^\x0d\x0a]\s+)*)%%EOF/s or return -4;
2268							# parse comments to read SEAL information
2269	37	50				208	if ($4) {
2270	0					0	my @com = split /[\x0d\x0d]+/, $4;
2271	0					0	foreach (@com) {
2272	0	0				0	/^(%+\s*)
2273	0					0	my $dat = substr $_, length($1);
2274	0					0	my $tbl = GetTagTable('Image::ExifTool::XMP::SEAL');
2275	0					0	$et->ProcessDirectory({ DataPt => \$dat }, $tbl);
2276							}
2277							}
2278	37					198	my $ws = $1 . $3;
2279	37					131	my $xr = $2;
2280	37					138	push @xrefOffsets, $xr, 'Main';
2281							# set input record separator
2282	37	50				448	local $/ = $ws =~ /(\x0d\x0a\|\x0d\|\x0a)/ ? $1 : "\x0a";
2283	37					94	my (%xref, @mainDicts, %loaded, $mainFree);
2284	37					125	my ($xrefSize, $mainDictSize) = (0, 0);
2285							# initialize variables to capture when rewriting
2286	37	100				109	if ($capture) {
2287	19					108	$capture->{startxref} = $xr;
2288	19					68	$capture->{xref} = \%xref;
2289	19					64	$capture->{newline} = $/;
2290	19					76	$capture->{mainFree} = $mainFree = { };
2291							}
2292							XRef:
2293	37					112	while (@xrefOffsets) {
2294	70					157	my $offset = shift @xrefOffsets;
2295	70					127	my $type = shift @xrefOffsets;
2296	70	50				220	next if $loaded{$offset}; # avoid infinite recursion
2297	70	50				366	unless ($raf->Seek($offset+$$et{PDFBase}, 0)) {
2298	0	0				0	%loaded or return -5;
2299	0					0	$et->Warn('Bad offset for secondary xref table');
2300	0					0	next;
2301							}
2302							# Note: care must be taken because ReadLine may read more than we want if
2303							# the newline sequence for this table is different than the rest of the file
2304	70					119	for (;;) {
2305	70	50				269	unless ($raf->ReadLine($buff)) {
2306	0	0				0	%loaded or return -6;
2307	0					0	$et->Warn('Bad offset for secondary xref table');
2308	0					0	next XRef;
2309							}
2310	70	50				383	last if $buff =~/\S/; # skip blank lines
2311							}
2312	70					124	my $loadXRefStream;
2313	70	50				517	if ($buff =~ s/^\s*xref\s+//s) {
		0
2314							# load xref table
2315	70					136	for (;;) {
2316							# read another line if necessary (skipping blank lines)
2317	177		50			662	$raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
2318	177	100				824	last if $buff =~ s/^\s*trailer([\s<[(])/$1/s;
2319	107	50				607	$buff =~ s/^\s*(\d+)\s+(\d+)\s+//s or return -4;
2320	107					377	my ($start, $num) = ($1, $2);
2321	107	50				1460	$raf->Seek(-length($buff), 1) or return -4;
2322	107					195	my $i;
2323	107					335	for ($i=0; $i<$num; ++$i) {
2324	622	50				1540	$raf->Read($buff, 20) == 20 or return -6;
2325	622	50				2717	$buff =~ /^\s*(\d{10}) (\d{5}) (f\|n)/s or return -4;
2326	622					1091	my $num = $start + $i;
2327	622	100				1176	$xrefSize = $num if $num > $xrefSize;
2328							# locate object to generate entry from stream if necessary
2329							# (must do this before we test $xref{$num})
2330	622	50				1428	LocateAnyObject(\%xref, $num) if $xref{dicts};
2331							# save offset for newest copy of all objects
2332							# (or next object number for free objects)
2333	622	100				1716	unless (defined $xref{$num}) {
2334	526					1680	my ($offset, $gen) = (int($1), int($2));
2335	526					1306	$xref{$num} = $offset;
2336	526	100				1306	if ($3 eq 'f') {
2337							# save free objects in last xref table for rewriting
2338	52	100				256	$$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
2339	52					195	next;
2340							}
2341							# also save offset keyed by object reference string
2342	474					1801	$xref{"$num $gen R"} = $offset;
2343							}
2344							}
2345							# (I have a sample from Adobe which has an empty xref table)
2346							# %xref or return -4; # xref table may not be empty
2347	107					215	$buff = '';
2348							}
2349	70					177	undef $mainFree; # only do this for the last xref table
2350							} elsif ($buff =~ s/^\s*(\d+)\s+(\d+)\s+obj//s) {
2351							# this is a PDF-1.5 cross-reference stream dictionary
2352	0					0	$loadXRefStream = 1;
2353							} else {
2354	0	0				0	%loaded or return -4;
2355	0					0	$et->Warn('Invalid secondary xref table');
2356	0					0	next;
2357							}
2358	70					316	my $mainDict = ExtractObject($et, \$buff, $raf, \%xref);
2359	70	50				275	unless (ref $mainDict eq 'HASH') {
2360	0	0				0	%loaded or return -8;
2361	0					0	$et->Warn('Error loading secondary dictionary');
2362	0					0	next;
2363							}
2364							# keep track of total trailer dictionary Size
2365	70	100	66			541	$mainDictSize = $$mainDict{Size} if $$mainDict{Size} and $$mainDict{Size} > $mainDictSize;
2366	70	50				242	if ($loadXRefStream) {
2367							# decode and save our XRef stream from PDF-1.5 file
2368							# (but parse it later as required to save time)
2369							# Note: this technique can potentially result in an old object
2370							# being used if the file was incrementally updated and an older
2371							# object from an xref table was replaced by a newer object in an
2372							# xref stream. But doing so isn't a good idea (if allowed at all)
2373							# because a PDF 1.4 consumer would also make this same mistake.
2374	0	0	0			0	if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
			0
			0
			0
2375	0					0	@{$$mainDict{W}} > 2 and $$mainDict{Size} and
2376							DecodeStream($et, $mainDict))
2377							{
2378							# create Index entry if it doesn't exist
2379	0	0				0	$$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
2380							# create '_entry_size' entry for internal use
2381	0					0	my $w = $$mainDict{W};
2382	0					0	my $size = 0;
2383	0					0	foreach (@$w) { $size += $_; }
	0					0
2384	0					0	$$mainDict{_entry_size} = $size;
2385							# save this stream dictionary to use later if required
2386	0	0				0	$xref{dicts} = [] unless $xref{dicts};
2387	0					0	push @{$xref{dicts}}, $mainDict;
	0					0
2388							} else {
2389	0	0				0	%loaded or return -9;
2390	0					0	$et->Warn('Invalid xref stream in secondary dictionary');
2391							}
2392							}
2393	70					239	$loaded{$offset} = 1;
2394							# load XRef stream in hybrid file if it exists
2395	70	50				653	push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
2396	70	50				193	$encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
2397	70	50	33			253	undef $encrypt if $encrypt and $encrypt eq 'null'; # (have seen "null")
2398	70	100	66			318	if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
2399	29					108	$id = ReadPDFValue($mainDict->{ID}->[0]);
2400							}
2401	70					190	push @mainDicts, $mainDict, $type;
2402							# load previous xref table if it exists
2403	70	100				367	push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
2404							}
2405	37	50				153	if ($xrefSize > $mainDictSize) {
2406	0					0	my $str = "Objects in xref table ($xrefSize) exceed trailer dictionary Size ($mainDictSize)";
2407	0	0				0	$capture ? $et->Error($str) : $et->Warn($str);
2408							}
2409							#
2410							# extract encryption information if necessary
2411							#
2412	37	50				121	if ($encrypt) {
2413	0	0				0	if (ref $encrypt eq 'SCALAR') {
2414	0					0	$encrypt = FetchObject($et, $$encrypt, \%xref, 'Encrypt');
2415							}
2416							# generate Encryption tag information
2417	0					0	my $err = DecryptInit($et, $encrypt, $id);
2418	0	0				0	if ($err) {
2419	0					0	$et->Warn($err);
2420	0	0				0	$$capture{Error} = $err if $capture;
2421	0					0	return -1;
2422							}
2423							}
2424							#
2425							# extract the information beginning with each of the main dictionaries
2426							#
2427	37					70	my $i = 0;
2428	37					114	my $num = (scalar @mainDicts) / 2;
2429	37					129	while (@mainDicts) {
2430	70					161	my $dict = shift @mainDicts;
2431	70					147	my $type = shift @mainDicts;
2432	70	50				222	if ($verbose) {
2433	0					0	++$i;
2434	0					0	my $n = scalar(@{$$dict{_tags}});
	0					0
2435	0					0	$et->VPrint(0, "PDF dictionary ($i of $num) with $n entries:\n");
2436							}
2437	70					341	ProcessDict($et, $tagTablePtr, $dict, \%xref, 0, $type);
2438							}
2439							# handle any decryption errors
2440	37	50				178	if ($encrypt) {
2441	0					0	my $err = $$encrypt{_error};
2442	0	0				0	if ($err) {
2443	0					0	$et->Warn($err);
2444	0	0				0	$$capture{Error} = $err if $capture;
2445	0					0	return -1;
2446							}
2447							}
2448	37					628	return 1;
2449							}
2450
2451							#------------------------------------------------------------------------------
2452							# ReadPDF() warning strings for each error return value
2453							my %pdfWarning = (
2454							# -1 is reserved as error return value with no associated warning
2455							-2 => 'Error seeking in file',
2456							-3 => 'Error reading file',
2457							-4 => 'Invalid xref table',
2458							-5 => 'Invalid xref offset',
2459							-6 => 'Error reading xref table',
2460							-7 => 'Error reading trailer',
2461							-8 => 'Error reading main dictionary',
2462							-9 => 'Invalid xref stream in main dictionary',
2463							);
2464
2465							#------------------------------------------------------------------------------
2466							# Extract information from PDF file
2467							# Inputs: 0) ExifTool object reference, 1) dirInfo reference
2468							# Returns: 1 if this was a valid PDF file
2469							sub ProcessPDF($$)
2470							{
2471	37			37	0	127	my ($et, $dirInfo) = @_;
2472
2473	37					139	undef $cryptInfo; # (must not delete after returning so writer can use it)
2474	37					83	undef $cryptStream;
2475	37					86	undef $cryptString;
2476	37					161	my $result = ReadPDF($et, $dirInfo);
2477	37	50				143	if ($result < 0) {
2478	0	0				0	$et->Warn($pdfWarning{$result}) if $pdfWarning{$result};
2479	0					0	$result = 1;
2480							}
2481							# clean up and return
2482	37					95	undef %streamObjs;
2483	37					154	undef %fetched;
2484	37					179	return $result;
2485							}
2486
2487							1; # end
2488
2489
2490							__END__