File Coverage

blib/lib/Image/ExifTool/PDF.pm

Criterion	Covered	Total	%
statement	526	1029	51.1
branch	289	724	39.9
condition	75	247	30.3
subroutine	22	27	81.4
pod	0	23	0.0
total	912	2050	44.4

line	stmt	bran	cond	sub	pod	time	code
1							#------------------------------------------------------------------------------
2							# File: PDF.pm
3							#
4							# Description: Read PDF meta information
5							#
6							# Revisions: 07/11/2005 - P. Harvey Created
7							# 07/25/2005 - P. Harvey Add support for encrypted documents
8							#
9							# References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
10							# 2) http://search.cpan.org/dist/Crypt-RC4/
11							# 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
12							# 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
13							# 5) http://tools.ietf.org/search/rfc3454
14							# 6) http://www.armware.dk/RFC/rfc/rfc4013.html
15							#------------------------------------------------------------------------------
16
17							package Image::ExifTool::PDF;
18
19	25			25		3999	use strict;
	25					56
	25					832
20	25			25		121	use vars qw($VERSION $AUTOLOAD $lastFetched);
	25					58
	25					1293
21	25			25		135	use Image::ExifTool qw(:DataAccess :Utils);
	25					56
	25					282217
22							require Exporter;
23
24							$VERSION = '1.55';
25
26							sub FetchObject($$$$);
27							sub ExtractObject($$;$$);
28							sub ReadToNested($;$);
29							sub ProcessDict($$$$;$$);
30							sub ProcessAcroForm($$$$;$$);
31							sub ExpandArray($);
32							sub ReadPDFValue($);
33							sub CheckPDF($$$);
34
35							# $lastFetched - last fetched object reference (used for decryption)
36							# (undefined if fetched object was already decrypted, eg. object from stream)
37
38							my $cryptInfo; # encryption object reference (plus additional information)
39							my $cryptString; # flag that strings are encrypted
40							my $cryptStream; # flag that streams are encrypted
41							my $lastOffset; # last fetched object offset
42							my %streamObjs; # hash of stream objects
43							my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
44							my $pdfVer; # version of PDF file being processed
45
46							# filters supported in DecodeStream()
47							my %supportedFilter = (
48							'/FlateDecode' => 1,
49							'/Crypt' => 1,
50							'/Identity' => 1, # (not filtered)
51							'/DCTDecode' => 1, # (JPEG image - not filtered)
52							'/JPXDecode' => 1, # (Jpeg2000 image - not filtered)
53							'/LZWDecode' => 1, # (usually a bitmapped image)
54							'/ASCIIHexDecode' => 1,
55							'/ASCII85Decode' => 1,
56							# other standard filters that we currently don't support
57							#'/JBIG2Decode' => 0, # (JBIG2 image format not supported)
58							#'/CCITTFaxDecode' => 0,
59							#'/RunLengthDecode' => 0,
60							);
61
62							# tags in main PDF directories
63							%Image::ExifTool::PDF::Main = (
64							GROUPS => { 2 => 'Document' },
65							VARS => { CAPTURE => ['Main','Prev'] },
66							Info => {
67							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
68							# Adobe Acrobat 10.1.5 will create a duplicate Info dictionary with
69							# a different object number when metadata is edited. This flag
70							# is part of a patch to ignore this duplicate information (unless
71							# the IgnoreMinorErrors option is used)
72							IgnoreDuplicates => 1,
73							},
74							Root => {
75							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
76							},
77							Encrypt => {
78							NoProcess => 1, # don't process normally (processed in advance)
79							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
80							},
81							_linearized => {
82							Name => 'Linearized',
83							Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
84							PrintConv => { 'true' => 'Yes', 'false' => 'No' },
85							},
86							);
87
88							# tags in PDF Info dictionary
89							%Image::ExifTool::PDF::Info = (
90							GROUPS => { 2 => 'Document' },
91							VARS => { CAPTURE => ['Info'] },
92							EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
93							WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
94							CHECK_PROC => \&CheckPDF,
95							WRITABLE => 'string',
96							# set PRIORITY to 0 so most recent Info dictionary takes precedence
97							# (Acrobat Pro bug? doesn't use same object/generation number for
98							# new Info dictionary when doing incremental update)
99							PRIORITY => 0,
100							NOTES => q{
101							As well as the tags listed below, the PDF specification allows for
102							user-defined tags to exist in the Info dictionary. These tags, which should
103							have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
104							object, are also extracted by ExifTool.
105
106							B specifies the value format, and may be C, C,
107							C, C, C or C for PDF tags.
108							},
109							Title => { },
110							Author => { Groups => { 2 => 'Author' } },
111							Subject => { },
112							Keywords => { List => 'string' }, # this is a string list
113							Creator => { },
114							Producer => { },
115							CreationDate => {
116							Name => 'CreateDate',
117							Writable => 'date',
118							Groups => { 2 => 'Time' },
119							Shift => 'Time',
120							PrintConv => '$self->ConvertDateTime($val)',
121							PrintConvInv => '$self->InverseDateTime($val)',
122							},
123							ModDate => {
124							Name => 'ModifyDate',
125							Writable => 'date',
126							Groups => { 2 => 'Time' },
127							Shift => 'Time',
128							PrintConv => '$self->ConvertDateTime($val)',
129							PrintConvInv => '$self->InverseDateTime($val)',
130							},
131							Trapped => {
132							Protected => 1,
133							# remove leading '/' from '/True' or '/False'
134							ValueConv => '$val=~s{^/}{}; $val',
135							ValueConvInv => '"/$val"',
136							},
137							'AAPL:Keywords' => { #PH
138							Name => 'AppleKeywords',
139							List => 'array', # this is an array of values
140							Notes => q{
141							keywords written by Apple utilities, although they seem to use PDF:Keywords
142							when reading
143							},
144							},
145							);
146
147							# tags in the PDF Root document catalog
148							%Image::ExifTool::PDF::Root = (
149							GROUPS => { 2 => 'Document' },
150							# note: can't capture previous versions of Root since they are not parsed
151							VARS => { CAPTURE => ['Root'] },
152							NOTES => 'This is the PDF document catalog.',
153							MarkInfo => {
154							SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
155							},
156							Metadata => {
157							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
158							},
159							Pages => {
160							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
161							},
162							Perms => {
163							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
164							},
165							AcroForm => {
166							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AcroForm' },
167							},
168							Lang => 'Language',
169							PageLayout => { },
170							PageMode => { },
171							Version => 'PDFVersion',
172							);
173
174							# tags extracted from the PDF Encrypt dictionary
175							%Image::ExifTool::PDF::Encrypt = (
176							GROUPS => { 2 => 'Document' },
177							NOTES => 'Tags extracted from the document Encrypt dictionary.',
178							Filter => {
179							Name => 'Encryption',
180							Notes => q{
181							extracted value is actually a combination of the Filter, SubFilter, V, R and
182							Length information from the Encrypt dictionary
183							},
184							},
185							P => {
186							Name => 'UserAccess',
187							ValueConv => '$val & 0x0f3c', # ignore reserved bits
188							PrintConvColumns => 2,
189							PrintConv => { BITMASK => {
190							2 => 'Print',
191							3 => 'Modify',
192							4 => 'Copy',
193							5 => 'Annotate',
194							8 => 'Fill forms',
195							9 => 'Extract',
196							10 => 'Assemble',
197							11 => 'Print high-res',
198							}},
199							},
200							);
201
202							# tags in PDF Pages dictionary
203							%Image::ExifTool::PDF::Pages = (
204							GROUPS => { 2 => 'Document' },
205							Count => 'PageCount',
206							Kids => {
207							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
208							},
209							);
210
211							# tags in PDF Perms dictionary
212							%Image::ExifTool::PDF::Perms = (
213							NOTES => 'Additional document permissions imposed by digital signatures.',
214							DocMDP => {
215							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
216							},
217							FieldMDP => {
218							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
219							},
220							UR3 => {
221							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
222							},
223							);
224
225							# tags in PDF Perms dictionary
226							%Image::ExifTool::PDF::AcroForm = (
227							PROCESS_PROC => \&ProcessAcroForm,
228							_has_xfa => {
229							Name => 'HasXFA',
230							Notes => q{
231							this tag is defined if a document contains form fields, and is true if it
232							uses XML Forms Architecture; not a real Tag ID
233							},
234							PrintConv => { 'true' => 'Yes', 'false' => 'No' },
235							},
236							);
237
238							# tags in PDF Kids dictionary
239							%Image::ExifTool::PDF::Kids = (
240							Metadata => {
241							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
242							},
243							PieceInfo => {
244							SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
245							},
246							Resources => {
247							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
248							},
249							Kids => {
250							Condition => '$self->Options("ExtractEmbedded")',
251							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
252							},
253							);
254
255							# tags in PDF Resources dictionary
256							%Image::ExifTool::PDF::Resources = (
257							ColorSpace => {
258							SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
259							},
260							XObject => {
261							Condition => '$self->Options("ExtractEmbedded")',
262							SubDirectory => { TagTable => 'Image::ExifTool::PDF::XObject' },
263							},
264							Properties => {
265							Condition => '$self->Options("ExtractEmbedded")',
266							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Properties' },
267							},
268							);
269
270							# tags in PDF ColorSpace dictionary
271							%Image::ExifTool::PDF::ColorSpace = (
272							DefaultRGB => {
273							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
274							ConvertToDict => 1, # (not seen yet, but just in case)
275							},
276							DefaultCMYK => {
277							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
278							# hack: this is stored as an array instead of a dictionary in my
279							# sample, so convert to a dictionary to extract the ICCBased element
280							ConvertToDict => 1,
281							},
282							Cs1 => {
283							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
284							ConvertToDict => 1, # (just in case)
285							},
286							CS0 => {
287							SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
288							ConvertToDict => 1, # (just in case)
289							},
290							);
291
292							# tags in PDF DefaultRGB dictionary
293							%Image::ExifTool::PDF::DefaultRGB = (
294							ICCBased => {
295							SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
296							},
297							);
298
299							# tags in PDF ICCBased, Cs1 and CS0 dictionaries
300							%Image::ExifTool::PDF::ICCBased = (
301							_stream => {
302							SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
303							},
304							);
305
306							# tags in PDF XObject dictionary (parsed only if ExtractEmbedded is enabled)
307							%Image::ExifTool::PDF::XObject = (
308							EXTRACT_UNKNOWN => 0, # extract known but numbered tags (Im1, Im2, etc)
309							Im => {
310							Notes => q{
311							the L option enables information to be extracted from these
312							embedded images
313							},
314							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Im' },
315							},
316							);
317
318							# tags in PDF Im# dictionary
319							%Image::ExifTool::PDF::Im = (
320							NOTES => q{
321							Information extracted from embedded images with the L option.
322							The EmbeddedImage and its metadata are extracted only for JPEG and Jpeg2000
323							image formats.
324							},
325							Width => 'EmbeddedImageWidth',
326							Height => 'EmbeddedImageHeight',
327							Filter => { Name => 'EmbeddedImageFilter', List => 1 },
328							ColorSpace => {
329							Name => 'EmbeddedImageColorSpace',
330							List => 1,
331							RawConv => 'ref $val ? undef : $val', # (ignore color space data)
332							},
333							Image_stream => {
334							Name => 'EmbeddedImage',
335							Groups => { 2 => 'Preview' },
336							Binary => 1,
337							},
338							);
339
340							# tags in PDF Properties dictionary
341							%Image::ExifTool::PDF::Properties = (
342							EXTRACT_UNKNOWN => 0, # extract known but numbered tags (MC0, MC1, etc)
343							MC => {
344							Notes => q{
345							the L option enables information to be extracted from these
346							embedded metadata dictionaries
347							},
348							SubDirectory => { TagTable => 'Image::ExifTool::PDF::MC' },
349							}
350							);
351
352							# tags in PDF MC# dictionary
353							%Image::ExifTool::PDF::MC = (
354							Metadata => {
355							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
356							}
357							);
358
359							# tags in PDF PieceInfo dictionary
360							%Image::ExifTool::PDF::PieceInfo = (
361							AdobePhotoshop => {
362							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
363							},
364							Illustrator => {
365							# assume this is an illustrator file if it contains this directory
366							# and doesn't have a ".PDF" extension
367							Condition => q{
368							$self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
369							return 1;
370							},
371							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
372							},
373							);
374
375							# tags in PDF AdobePhotoshop dictionary
376							%Image::ExifTool::PDF::AdobePhotoshop = (
377							Private => {
378							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
379							},
380							);
381
382							# tags in PDF Illustrator dictionary
383							%Image::ExifTool::PDF::Illustrator = (
384							Private => {
385							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
386							},
387							);
388
389							# tags in PDF Private dictionary
390							%Image::ExifTool::PDF::Private = (
391							ImageResources => {
392							SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
393							},
394							);
395
396							# tags in PDF AI Private dictionary
397							%Image::ExifTool::PDF::AIPrivate = (
398							GROUPS => { 2 => 'Document' },
399							EXTRACT_UNKNOWN => 0, # extract known but numbered tags
400							AIMetaData => {
401							SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
402							},
403							AIPrivateData => {
404							Notes => q{
405							the L option enables information to be extracted from embedded
406							PostScript documents in the AIPrivateData# and AIPDFPrivateData# streams
407							},
408							JoinStreams => 1, # join streams from numbered tags and process as one
409							SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
410							},
411							AIPDFPrivateData => {
412							JoinStreams => 1, # join streams from numbered tags and process as one
413							SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
414							},
415							RoundTripVersion => { },
416							ContainerVersion => { },
417							CreatorVersion => { },
418							);
419
420							# tags in PDF AIMetaData dictionary
421							%Image::ExifTool::PDF::AIMetaData = (
422							_stream => {
423							SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
424							},
425							);
426
427							# tags in PDF ImageResources dictionary
428							%Image::ExifTool::PDF::ImageResources = (
429							_stream => {
430							SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
431							},
432							);
433
434							# tags in PDF MarkInfo dictionary
435							%Image::ExifTool::PDF::MarkInfo = (
436							GROUPS => { 2 => 'Document' },
437							Marked => {
438							Name => 'TaggedPDF',
439							Notes => "not a Tagged PDF if this tag is missing",
440							PrintConv => { 'true' => 'Yes', 'false' => 'No' },
441							},
442							);
443
444							# tags in PDF Metadata dictionary
445							%Image::ExifTool::PDF::Metadata = (
446							GROUPS => { 2 => 'Document' },
447							XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
448							Name => 'XMP',
449							SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
450							},
451							);
452
453							# tags in PDF signature directories (DocMDP, FieldMDP or UR3)
454							%Image::ExifTool::PDF::Signature = (
455							GROUPS => { 2 => 'Document' },
456							ContactInfo => 'SignerContactInfo',
457							Location => 'SigningLocation',
458							M => {
459							Name => 'SigningDate',
460							Format => 'date',
461							Groups => { 2 => 'Time' },
462							PrintConv => '$self->ConvertDateTime($val)',
463							},
464							Name => 'SigningAuthority',
465							Reason => 'SigningReason',
466							Reference => {
467							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
468							},
469							Prop_AuthTime => {
470							Name => 'AuthenticationTime',
471							PrintConv => 'ConvertTimeSpan($val) . " ago"',
472							},
473							Prop_AuthType => 'AuthenticationType',
474							);
475
476							# tags in PDF Reference dictionary
477							%Image::ExifTool::PDF::Reference = (
478							TransformParams => {
479							SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
480							},
481							);
482
483							# tags in PDF TransformParams dictionary
484							%Image::ExifTool::PDF::TransformParams = (
485							GROUPS => { 2 => 'Document' },
486							Annots => {
487							Name => 'AnnotationUsageRights',
488							Notes => q{
489							possible values are Create, Delete, Modify, Copy, Import and Export;
490							additional values for UR3 signatures are Online and SummaryView
491							},
492							List => 1,
493							},
494							Document => {
495							Name => 'DocumentUsageRights',
496							Notes => 'only possible value is FullSave',
497							List => 1,
498							},
499							Form => {
500							Name => 'FormUsageRights',
501							Notes => q{
502							possible values are FillIn, Import, Export, SubmitStandalone and
503							SpawnTemplate; additional values for UR3 signatures are BarcodePlaintext and
504							Online
505							},
506							List => 1,
507							},
508							FormEX => {
509							Name => 'FormExtraUsageRights',
510							Notes => 'UR signatures only; only possible value is BarcodePlaintext',
511							List => 1,
512							},
513							Signature => {
514							Name => 'SignatureUsageRights',
515							Notes => 'only possible value is Modify',
516							List => 1,
517							},
518							EF => {
519							Name => 'EmbeddedFileUsageRights',
520							Notes => 'possible values are Create, Delete, Modify and Import',
521							List => 1,
522							},
523							Msg => 'UsageRightsMessage',
524							P => {
525							Name => 'ModificationPermissions',
526							Notes => q{
527							1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
528							false
529							},
530							PrintConv => {
531							1 => 'No changes permitted',
532							2 => 'Fill forms, Create page templates, Sign',
533							3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
534							'true' => 'Restrict all applications to reader permissions',
535							'false' => 'Do not restrict applications to reader permissions',
536							},
537							},
538							Action => {
539							Name => 'FieldPermissions',
540							Notes => 'FieldMDP signatures only',
541							PrintConv => {
542							'All' => 'Disallow changes to all form fields',
543							'Include' => 'Disallow changes to specified form fields',
544							'Exclude' => 'Allow changes to specified form fields',
545							},
546							},
547							Fields => {
548							Notes => 'FieldMDP signatures only',
549							Name => 'FormFields',
550							List => 1,
551							},
552							);
553
554							# unknown tags for use in verbose option
555							%Image::ExifTool::PDF::Unknown = (
556							GROUPS => { 2 => 'Unknown' },
557							);
558
559							#------------------------------------------------------------------------------
560							# AutoLoad our writer routines when necessary
561							#
562							sub AUTOLOAD
563							{
564	19			19		100	return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
565							}
566
567							#------------------------------------------------------------------------------
568							# Convert from PDF to EXIF-style date/time
569							# Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
570							# Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
571							sub ConvertPDFDate($)
572							{
573	10			10	0	12	my $date = shift;
574							# remove optional 'D:' prefix
575	10					32	$date =~ s/^D://;
576							# fill in default values if necessary
577							# YYYYmmddHHMMSS
578	10					27	my $default = '00000101000000';
579	10	50				25	if (length $date < length $default) {
580	0					0	$date .= substr($default, length $date);
581							}
582	10	50				40	$date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
583	10					49	$date = "$1:$2:$3 $4:$5:$6";
584	10	50				26	if ($7) {
585	10					16	my $tz = $7;
586	10	50				43	if ($tz =~ /^\s*Z/i) {
		50
587							# ignore any "HH'mm'" after the Z (OS X 10.6 does this)
588	0					0	$date .= 'Z';
589							# tolerate some improper formatting in timezone specification
590							} elsif ($tz =~ /^\s([-+])\s(\d+)[': ]+(\d*)/) {
591	10		50			36	$date .= $1 . $2 . ':' . ($3 \|\| '00');
592							}
593							}
594	10					42	return $date;
595							}
596
597							#------------------------------------------------------------------------------
598							# Locate any object in the XRef tables (including compressed objects)
599							# Inputs: 0) XRef reference, 1) object reference string (or free object number)
600							# Returns: offset to object in file or compressed object reference string,
601							# 0 if object is free, or undefined on error
602							sub LocateAnyObject($$)
603							{
604	238			238	0	348	my ($xref, $ref) = @_;
605	238	50				387	return undef unless $xref;
606	238	100				587	return $$xref{$ref} if exists $$xref{$ref};
607							# get the object number
608	7	50				38	return undef unless $ref =~ /^(\d+)/;
609	7					16	my $objNum = $1;
610							# return 0 if the object number has been reused (old object is free)
611	7	100				24	return 0 if defined $$xref{$objNum};
612							#
613							# scan our XRef stream dictionaries for this object
614							#
615	1	50				5	return undef unless $$xref{dicts};
616	0					0	my $dict;
617	0					0	foreach $dict (@{$$xref{dicts}}) {
	0					0
618							# quick check to see if the object is in the range for this xref stream
619	0	0				0	next if $objNum >= $$dict{Size};
620	0					0	my $index = $$dict{Index};
621	0	0				0	next if $objNum < $$index[0];
622							# scan the tables for the specified object
623	0					0	my $size = $$dict{_entry_size};
624	0					0	my $num = scalar(@$index) / 2;
625	0					0	my $tot = 0;
626	0					0	my $i;
627	0					0	for ($i=0; $i<$num; ++$i) {
628	0					0	my $start = $$index[$i*2];
629	0					0	my $count = $$index[$i*2+1];
630							# table is in ascending order, so quit if we have passed the object
631	0	0				0	last if $objNum < $start;
632	0	0				0	if ($objNum < $start + $count) {
633	0					0	my $offset = $size * ($objNum - $start + $tot);
634	0	0				0	last if $offset + $size > length $$dict{_stream};
635	0					0	my @c = unpack("x$offset C$size", $$dict{_stream});
636							# extract values from this table entry
637							# (can be 1, 2, 3, 4, etc.. bytes per value)
638	0					0	my (@t, $j, $k);
639	0					0	my $w = $$dict{W};
640	0					0	for ($j=0; $j<3; ++$j) {
641							# use default value if W entry is 0 (as per spec)
642							# - 0th element defaults to 1, others default to 0
643	0	0				0	$$w[$j] or $t[$j] = ($j ? 0 : 1), next;
		0
644	0					0	$t[$j] = shift(@c);
645	0					0	for ($k=1; $k < $$w[$j]; ++$k) {
646	0					0	$t[$j] = 256 * $t[$j] + shift(@c);
647							}
648							}
649							# by default, use "o g R" as the xref key
650							# (o = object number, g = generation number)
651	0					0	my $ref2 = "$objNum $t[2] R";
652	0	0				0	if ($t[0] == 1) {
		0
		0
653							# normal object reference:
654							# $t[1]=offset of object from start, $t[2]=generation number
655	0					0	$$xref{$ref2} = $t[1];
656							} elsif ($t[0] == 2) {
657							# compressed object reference:
658							# $t[1]=stream object number, $t[2]=index of object in stream
659	0					0	$ref2 = "$objNum 0 R";
660	0					0	$$xref{$ref2} = "I$t[2] $t[1] 0 R";
661							} elsif ($t[0] == 0) {
662							# free object:
663							# $t[1]=next free object in linked list, $t[2]=generation number
664	0					0	$$xref{$ref2} = 0;
665							} else {
666							# treat as a null object
667	0					0	$$xref{$ref2} = undef;
668							}
669	0					0	$$xref{$objNum} = $t[1]; # remember offsets by object number too
670	0	0				0	return $$xref{$ref} if $ref eq $ref2;
671	0					0	return 0; # object is free or was reused
672							}
673	0					0	$tot += $count;
674							}
675							}
676	0					0	return undef;
677							}
678
679							#------------------------------------------------------------------------------
680							# Locate a regular object in the XRef tables (does not include compressed objects)
681							# Inputs: 0) XRef reference, 1) object reference string (or free object number)
682							# Returns: offset to object in file, 0 if object is free,
683							# or undef on error or if object was compressed
684							sub LocateObject($$)
685							{
686	41			41	0	82	my ($xref, $ref) = @_;
687	41					69	my $offset = LocateAnyObject($xref, $ref);
688	41	50	66			151	return undef if $offset and $offset =~ /^I/;
689	41					111	return $offset;
690							}
691
692							#------------------------------------------------------------------------------
693							# Check that the correct object is located at the specified file offset
694							# Inputs: 0) ExifTool ref, 1) object name, 2) object reference string, 3) file offset
695							# Returns: first non-blank line at start of object, or undef on error
696							sub CheckObject($$$$)
697							{
698	217			217	0	375	my ($et, $tag, $ref, $offset) = @_;
699	217					282	my ($data, $obj, $dat, $pat);
700
701	217					318	my $raf = $$et{RAF};
702	217	50				623	$raf->Seek($offset+$$et{PDFBase}, 0) or $et->Warn("Bad $tag offset"), return undef;
703							# verify that we are reading the expected object
704	217					875	($obj = $ref) =~ s/R/obj/;
705	217					376	for (;;) {
706	217	50				524	$raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
707	217	50				3237	last if $data =~ s/^$obj//;
708	0	0				0	next if $data =~ /^\s+$/; # keep reading if this was a blank line
709							# handle cases where other whitespace characters are used in the object ID string
710	0					0	while ($data =~ /^\d+(\s+\d+)?\s*$/) {
711	0					0	$raf->ReadLine($dat);
712	0					0	$data .= $dat;
713							}
714	0					0	($pat = $obj) =~ s/ /\\s+/g;
715	0	0				0	unless ($data =~ s/$pat//) {
716	0					0	$tag = ucfirst $tag;
717	0					0	$et->Warn("$tag object ($obj) not found at offset $offset");
718	0					0	return undef;
719							}
720	0					0	last;
721							}
722							# read the first line of data from the object (ignoring blank lines and comments)
723	217					360	for (;;) {
724	434	100	66			1565	last if $data =~ /\S/ and $data !~ /^\s*%/;
725	217	50				575	$raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
726							}
727	217					452	return $data;
728							}
729
730							#------------------------------------------------------------------------------
731							# Fetch indirect object from file (from inside a stream if required)
732							# Inputs: 0) ExifTool object reference, 1) object reference string,
733							# 2) xref lookup, 3) object name (for warning messages)
734							# Returns: object data or undefined on error
735							# Notes: sets $lastFetched to the object reference, or undef if the object
736							# was extracted from an encrypted stream
737							sub FetchObject($$$$)
738							{
739	197			197	0	323	my ($et, $ref, $xref, $tag) = @_;
740	197					268	$lastFetched = $ref; # save this for decoding if necessary
741	197					304	my $offset = LocateAnyObject($xref, $ref);
742	197					257	$lastOffset = $offset;
743	197	100				348	unless ($offset) {
744	5	50				11	$et->Warn("Bad $tag reference") unless defined $offset;
745	5					10	return undef;
746							}
747	192					236	my ($data, $obj);
748	192	50				448	if ($offset =~ s/^I(\d+) //) {
749	0					0	my $index = $1; # object index in stream
750	0					0	my ($objNum) = split ' ', $ref; # save original object number
751	0					0	$ref = $offset; # now a reference to the containing stream object
752	0					0	$obj = $streamObjs{$ref};
753	0	0				0	unless ($obj) {
754							# don't try to load the same object stream twice
755	0	0				0	return undef if defined $obj;
756	0					0	$streamObjs{$ref} = '';
757							# load the parent object stream
758	0					0	$obj = FetchObject($et, $ref, $xref, $tag);
759							# make sure it contains everything we need
760	0	0	0			0	return undef unless defined $obj and ref($obj) eq 'HASH';
761	0	0	0			0	return undef unless $$obj{First} and $$obj{N};
762	0	0				0	return undef unless DecodeStream($et, $obj);
763							# add a special '_table' entry to this dictionary which contains
764							# the list of object number/offset pairs from the stream header
765	0					0	my $num = $$obj{N} * 2;
766	0					0	my @table = split ' ', $$obj{_stream}, $num;
767	0	0				0	return undef unless @table == $num;
768							# remove everything before first object in stream
769	0					0	$$obj{_stream} = substr($$obj{_stream}, $$obj{First});
770	0					0	$table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
771	0					0	$$obj{_table} = \@table;
772							# save the object stream so we don't have to re-load it later
773	0					0	$streamObjs{$ref} = $obj;
774							}
775							# verify that we have the specified object
776	0					0	my $i = 2 * $index;
777	0					0	my $table = $$obj{_table};
778	0	0	0			0	unless ($index < $$obj{N} and $$table[$i] == $objNum) {
779	0					0	$et->Warn("Bad index for stream object $tag");
780	0					0	return undef;
781							}
782							# extract the object at the specified index in the stream
783							# (offsets in table are in sequential order, so we can subtract from
784							# the next offset to get the object length)
785	0					0	$offset = $$table[$i + 1];
786	0		0			0	my $len = ($$table[$i + 3] \|\| length($$obj{_stream})) - $offset;
787	0					0	$data = substr($$obj{_stream}, $offset, $len);
788							# avoid re-decrypting data in already decrypted streams
789	0	0				0	undef $lastFetched if $cryptStream;
790	0					0	return ExtractObject($et, \$data);
791							}
792							# load the start of the object
793	192					329	$data = CheckObject($et, $tag, $ref, $offset);
794	192	50				331	return undef unless defined $data;
795
796	192					515	return ExtractObject($et, \$data, $$et{RAF}, $xref);
797							}
798
799							#------------------------------------------------------------------------------
800							# Convert PDF value to something readable
801							# Inputs: 0) PDF object data
802							# Returns: converted object
803							sub ReadPDFValue($)
804							{
805	148			148	0	229	my $str = shift;
806							# decode all strings in an array
807	148	100				237	if (ref $str eq 'ARRAY') {
808							# create new list to not alter the original data when rewriting
809	12					19	my ($val, @vals);
810	12					21	foreach $val (@$str) {
811	20					40	push @vals, ReadPDFValue($val);
812							}
813	12					28	return \@vals;
814							}
815	136	50				232	length $str or return $str;
816	136					210	my $delim = substr($str, 0, 1);
817	136	100				300	if ($delim eq '(') { # literal string
		100
		50
818	58	50				268	$str = $1 if $str =~ /^.?$(.)$/s; # remove brackets
819							# decode escape sequences in literal strings
820	58					145	while ($str =~ /\\(.)/sg) {
821	0					0	my $n = pos($str) - 2;
822	0					0	my $c = $1;
823	0					0	my $r;
824	0	0				0	if ($c =~ /[0-7]/) {
		0
		0
825							# get up to 2 more octal digits
826	0	0				0	$c .= $1 if $str =~ /\G([0-7]{1,2})/g;
827							# convert octal escape code
828	0					0	$r = chr(oct($c) & 0xff);
829							} elsif ($c eq "\x0d") {
830							# the string is continued if the line ends with '\'
831							# (also remove "\x0d\x0a")
832	0	0				0	$c .= $1 if $str =~ /\G(\x0a)/g;
833	0					0	$r = '';
834							} elsif ($c eq "\x0a") {
835	0					0	$r = '';
836							} else {
837							# convert escaped characters
838	0					0	($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
839							}
840	0					0	substr($str, $n, length($c)+1) = $r;
841							# continue search after this character
842	0					0	pos($str) = $n + length($r);
843							}
844	58	50				91	Crypt(\$str, $lastFetched) if $cryptString;
845							} elsif ($delim eq '<') { # hex string
846							# decode hex data
847	41					83	$str =~ tr/0-9A-Fa-f//dc;
848	41	50				84	$str .= '0' if length($str) & 0x01; # (by the spec)
849	41					131	$str = pack('H*', $str);
850	41	100				82	Crypt(\$str, $lastFetched) if $cryptString;
851							} elsif ($delim eq '/') { # name
852	0					0	$str = substr($str, 1);
853							# convert escape codes (PDF 1.2 or later)
854	0	0				0	$str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
	0					0
855							}
856	136					248	return $str;
857							}
858
859							#------------------------------------------------------------------------------
860							# Extract PDF object from combination of buffered data and file
861							# Inputs: 0) ExifTool object reference, 1) data reference,
862							# 2) optional raf reference, 3) optional xref table
863							# Returns: converted PDF object or undef on error
864							# a) dictionary object --> hash reference
865							# b) array object --> array reference
866							# c) indirect reference --> scalar reference
867							# d) string, name, integer, boolean, null --> scalar value
868							# - updates $$dataPt on return to contain unused data
869							# - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
870							# the stream data and a list of the tags (not including '_stream' and '_tags')
871							# in their original order
872							sub ExtractObject($$;$$)
873							{
874	754			754	0	1280	my ($et, $dataPt, $raf, $xref) = @_;
875	754					911	my (@tags, $data, $objData);
876	754					1022	my $dict = { };
877	754					836	my $delim;
878
879	754					807	for (;;) {
880	824	100				2330	if ($$dataPt =~ /^\s*(<{1,2}\|\[\|\()/s) {
		50
881	754					1252	$delim = $1;
882	754					1271	$$dataPt =~ s/^\s+//; # remove leading white space
883	754					1303	$objData = ReadToNested($dataPt, $raf);
884	754	50				1213	return undef unless defined $objData;
885	754					968	last;
886							} elsif ($$dataPt =~ s{^\s(\S[^[(/<>\s])\s*}{}s) {
887							#
888							# extract boolean, numerical, string, name, null object or indirect reference
889							#
890	0					0	$objData = $1;
891							# look for an indirect reference
892	0	0	0			0	if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
893	0					0	$objData .= "$1 R";
894	0					0	$objData = \$objData; # return scalar reference
895							}
896	0					0	return $objData; # return simple scalar or scalar reference
897							}
898	70	50	33			254	$raf and $raf->ReadLine($data) or return undef;
899	70					137	$$dataPt .= $data;
900							}
901							#
902							# return literal string or hex string without parsing
903							#
904	754	100	100			2146	if ($delim eq '(' or $delim eq '<') {
		100
905	160					386	return $objData;
906							#
907							# extract array
908							#
909							} elsif ($delim eq '[') {
910	167	50				561	$objData =~ /^.?\[(.)\]/s or return undef;
911	167					290	my $data = $1; # brackets removed
912	167					184	my @list;
913	167					178	for (;;) {
914	594	100				1447	last unless $data =~ m{\s(\S[^[(/<>\s])}sg;
915	427					711	my $val = $1;
916	427	100				1062	if ($val =~ /^(<{1,2}\|\[\|\()/) {
		100
917	78					115	my $pos = pos($data) - length($val);
918							# nested dict, array, literal string or hex string
919	78					123	my $buff = substr($data, $pos);
920	78					139	$val = ReadToNested(\$buff);
921	78	50				142	last unless defined $val;
922	78					135	pos($data) = $pos + length($val);
923	78					150	$val = ExtractObject($et, \$val);
924							} elsif ($val =~ /^\d/) {
925	245					294	my $pos = pos($data);
926	245	100				443	if ($data =~ /\G\s+(\d+)\s+R/g) {
927	37					104	$val = \ "$val $1 R"; # make a reference
928							} else {
929	208					368	pos($data) = $pos;
930							}
931							}
932	427					733	push @list, $val;
933							}
934	167					502	return \@list;
935							}
936							#
937							# extract dictionary
938							#
939							# Note: entries are not necessarily separated by whitespace (doh!)
940							# eg) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
941							# Also, they may be separated by a comment (eg. "/Tag%comment\nValue"),
942							# but comments have already been removed
943	427					1841	while ($objData =~ m{(\s)/([^/[\]()<>{}\s]+)\s(\S[^[(/<>\s]*)}sg) {
944	1229					1984	my $tag = $2;
945	1229					1719	my $val = $3;
946	1229	100				3407	if ($val =~ /^(<{1,2}\|\[\|\()/) {
		100
947							# nested dict, array, literal string or hex string
948	396					885	$objData = substr($objData, pos($objData)-length($val));
949	396					681	$val = ReadToNested(\$objData, $raf);
950	396	50				675	last unless defined $val;
951	396					689	$val = ExtractObject($et, \$val);
952	396					651	pos($objData) = 0;
953							} elsif ($val =~ /^\d/) {
954	618					768	my $pos = pos($objData);
955	618	100				1423	if ($objData =~ /\G\s+(\d+)\s+R/sg) {
956	416					1058	$val = \ "$val $1 R"; # make a reference
957							} else {
958	202					344	pos($objData) = $pos;
959							}
960							}
961	1229	50				2023	if ($$dict{$tag}) {
962							# duplicate dictionary entries are not allowed
963	0					0	$et->Warn("Duplicate '${tag}' entry in dictionary (ignored)");
964							} else {
965							# save the entry
966	1229					1802	push @tags, $tag;
967	1229					4446	$$dict{$tag} = $val;
968							}
969							}
970	427	50				742	return undef unless @tags;
971	427					749	$$dict{_tags} = \@tags;
972	427	100				850	return $dict unless $raf; # direct objects can not have streams
973							#
974							# extract the stream object
975							#
976							# dictionary must specify stream Length
977	262	100				826	my $length = $$dict{Length} or return $dict;
978	43	100				81	if (ref $length) {
979	25					40	$length = $$length;
980	25					58	my $oldpos = $raf->Tell();
981							# get the location of the object specifying the length
982							# (compressed objects are not allowed)
983	25	50				65	my $offset = LocateObject($xref, $length) or return $dict;
984	25	50				47	$offset or $et->Warn('Bad stream Length object'), return $dict;
985	25					45	$data = CheckObject($et, 'stream Length', $length, $offset);
986	25	50				74	defined $data or return $dict;
987	25	50				88	$data =~ /^\s*(\d+)/ or $et->Warn('Stream Length not found'), return $dict;
988	25					53	$length = $1;
989	25					63	$raf->Seek($oldpos, 0); # restore position to start of stream
990							}
991							# extract the trailing stream data
992	43					52	for (;;) {
993							# find the stream token
994	86	100				276	if ($$dataPt =~ /(\S+)/) {
995	43	50				113	last unless $1 eq 'stream';
996							# read an extra line because it may contain our \x0a
997	43	50				91	$$dataPt .= $data if $raf->ReadLine($data);
998							# remove our stream header
999	43					268	$$dataPt =~ s/^\s*stream(\x0a\|\x0d\x0a)//s;
1000	43					121	my $more = $length - length($$dataPt);
1001	43	100				105	if ($more > 0) {
		50
1002	28	50				74	unless ($raf->Read($data, $more) == $more) {
1003	0					0	$et->Warn('Error reading stream data');
1004	0					0	$$dataPt = '';
1005	0					0	return $dict;
1006							}
1007	28					132	$$dict{_stream} = $$dataPt . $data;
1008	28					56	$$dataPt = '';
1009							} elsif ($more < 0) {
1010	15					90	$$dict{_stream} = substr($$dataPt, 0, $length);
1011	15					36	$$dataPt = substr($$dataPt, $length);
1012							} else {
1013	0					0	$$dict{_stream} = $$dataPt;
1014	0					0	$$dataPt = '';
1015							}
1016	43					75	last;
1017							}
1018	43	50				106	$raf->ReadLine($data) or last;
1019	43					120	$$dataPt .= $data;
1020							}
1021	43					155	return $dict;
1022							}
1023
1024							#------------------------------------------------------------------------------
1025							# Read to nested delimiter
1026							# Inputs: 0) data reference, 1) optional raf reference
1027							# Returns: data up to and including matching delimiter (or undef on error)
1028							# - updates data reference with trailing data
1029							# - unescapes characters in literal strings
1030							my %closingDelim = ( # lookup for matching delimiter
1031							'(' => ')',
1032							'[' => ']',
1033							'<' => '>',
1034							'<<' => '>>',
1035							);
1036							sub ReadToNested($;$)
1037							{
1038	1228			1228	0	1728	my ($dataPt, $raf) = @_;
1039	1228					1734	my @delim = (''); # closing delimiter list, most deeply nested first
1040	1228					2172	pos($$dataPt) = 0; # begin at start of data
1041	1228					1707	for (;;) {
1042	5744	100				30017	unless ($$dataPt =~ /(\\*)($\|$\|<{1,2}\|>{1,2}\|\[\|\]\|%)/g) {
1043							# must read some more data
1044	1148					1239	my $buff;
1045	1148	50	33			2616	last unless $raf and $raf->ReadLine($buff);
1046	1148					2135	$$dataPt .= $buff;
1047	1148					1932	pos($$dataPt) = length($$dataPt) - length($buff);
1048	1148					1675	next;
1049							}
1050							# are we in a literal string?
1051	4596	100				9080	if ($delim[0] eq ')') {
		50
1052							# ignore escaped delimiters (preceded by odd number of \'s)
1053	434	50				763	next if length($1) & 0x01;
1054							# ignore all delimiters but unescaped braces
1055	434	50	33			1176	next unless $2 eq '(' or $2 eq ')';
1056							} elsif ($2 eq '%') {
1057							# ignore the comment
1058	0					0	my $pos = pos($$dataPt) - 1;
1059							# remove everything from '%' up to but not including newline
1060	0					0	$$dataPt =~ /.*/g;
1061	0					0	my $end = pos($$dataPt);
1062	0					0	$$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
1063	0					0	pos($$dataPt) = $pos;
1064	0					0	next;
1065							}
1066	4596	100				7367	if ($closingDelim{$2}) {
1067							# push the corresponding closing delimiter
1068	2298					3894	unshift @delim, $closingDelim{$2};
1069	2298					2732	next;
1070							}
1071	2298	50				3359	unless ($2 eq $delim[0]) {
1072							# handle the case where we find a ">>>" and interpret it
1073							# as ">> >" instead of "> >>"
1074	0	0	0			0	next unless $2 eq '>>' and $delim[0] eq '>';
1075	0					0	pos($$dataPt) = pos($$dataPt) - 1;
1076							}
1077	2298					2446	shift @delim; # remove from nesting list
1078	2298	100				3566	next if $delim[0]; # keep going if we have more nested delimiters
1079	1228					1449	my $pos = pos($$dataPt);
1080	1228					1924	my $buff = substr($$dataPt, 0, $pos);
1081	1228					1930	$$dataPt = substr($$dataPt, $pos);
1082	1228					2278	return $buff; # success!
1083							}
1084	0					0	return undef; # didn't find matching delimiter
1085							}
1086
1087							#------------------------------------------------------------------------------
1088							# Decode LZW-encoded data (ref 1)
1089							# Inputs: 0) data reference
1090							# Returns: true on success and data is decoded, or false and data is untouched
1091							sub DecodeLZW($)
1092							{
1093	0			0	0	0	my $dataPt = shift;
1094	0	0				0	return 0 if length $$dataPt < 4;
1095	0					0	my @lzw = (map(chr, 0..255), undef, undef); # LZW code table
1096	0					0	my $mask = 0x01ff; # mask for least-significant 9 bits
1097	0					0	my @dat = unpack 'n*', $$dataPt . "\0";
1098	0					0	my $word = ($dat[0] << 16) \| $dat[1];
1099	0					0	my ($bit, $pos, $bits, $out) = (0, 2, 9, '');
1100	0					0	my $lastVal;
1101	0					0	for (;;) {
1102							# bits are packed MSB first in PDF LZW (the PDF spec doesn't mention this)
1103	0					0	my $shift = 32 - ($bit + $bits);
1104	0	0				0	if ($shift < 0) {
1105	0	0				0	return 0 if $pos >= @dat; # missing EOD marker
1106	0					0	$word = (($word & 0xffff) << 16) \| $dat[$pos++]; # read next word
1107	0					0	$bit -= 16;
1108	0					0	$shift += 16;
1109							};
1110	0					0	my $code = ($word >> $shift) & $mask;
1111	0					0	$bit += $bits;
1112	0					0	my $val = $lzw[$code];
1113	0	0				0	if (defined $val) {
		0
		0
		0
1114							# store new code as previous sequence plus 1st char of new sequence
1115	0	0				0	push @lzw, $lastVal . substr($val, 0, 1) if defined $lastVal;
1116							} elsif ($code == @lzw) { # new code
1117	0	0				0	return 0 unless defined $lastVal;
1118							# we are using the code that we are about to generate, so the last
1119							# character in the new sequence must be the same as the first
1120							# character in the previous sequence (makes sense if you think about it)
1121	0					0	$val = $lastVal . substr($lastVal, 0, 1);
1122	0					0	push @lzw, $val;
1123							} elsif ($code == 256) { # clear table
1124	0					0	splice @lzw, 258;
1125	0					0	$bits = 9;
1126	0					0	$mask = 0x1ff;
1127	0					0	undef $lastVal;
1128	0					0	next;
1129							} elsif ($code == 257) { # EOD marker
1130	0					0	last; # all done!
1131							} else {
1132	0					0	return 0;
1133							}
1134	0					0	$out .= $val; # add this byte sequence to the output
1135							# we added a new entry to the LZW table, so we must increase
1136							# the bit width if necessary, up to a maximum of 12
1137	0	0	0			0	@lzw >= $mask and $bits < 12 and ++$bits, $mask \|= $mask << 1;
1138	0					0	$lastVal = $val;
1139							}
1140	0					0	$$dataPt = $out; # return decompressed data
1141	0					0	return 1;
1142							}
1143
1144							#------------------------------------------------------------------------------
1145							# Decode filtered stream
1146							# Inputs: 0) ExifTool object reference, 1) dictionary reference
1147							# Returns: true if stream has been decoded OK
1148							sub DecodeStream($$)
1149							{
1150	43			43	0	53	local $_;
1151	43					68	my ($et, $dict) = @_;
1152
1153	43	50				77	return 0 unless $$dict{_stream}; # no stream to decode
1154
1155							# get list of filters
1156	43					70	my (@filters, @decodeParms, $filter);
1157	43	50				112	if (ref $$dict{Filter} eq 'ARRAY') {
		50
1158	0					0	@filters = @{$$dict{Filter}};
	0					0
1159							} elsif (defined $$dict{Filter}) {
1160	0					0	@filters = ($$dict{Filter});
1161							}
1162							# be sure we can process all the filters before we take the time to do the decryption
1163	43					72	foreach $filter (@filters) {
1164	0	0				0	next if $supportedFilter{$filter};
1165	0					0	$et->WarnOnce("Unsupported Filter $filter");
1166	0					0	return 0;
1167							}
1168							# apply decryption first if required (and if the default encryption
1169							# has not been overridden by a Crypt filter. Note: the Crypt filter
1170							# must be first in the Filter array: ref 3, page 38)
1171	43	50	33			134	unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
			33
1172	43					79	CryptStream($dict, $lastFetched);
1173							}
1174	43	50				123	return 1 unless $$dict{Filter}; # Filter entry is mandatory
1175	0	0				0	return 0 if defined $$dict{_filtered}; # avoid double-filtering
1176	0					0	$$dict{_filtered} = 1; # set flag to prevent double-filtering
1177
1178							# get array of DecodeParms dictionaries
1179	0	0				0	if (ref $$dict{DecodeParms} eq 'ARRAY') {
1180	0					0	@decodeParms = @{$$dict{DecodeParms}};
	0					0
1181							} else {
1182	0					0	@decodeParms = ($$dict{DecodeParms});
1183							}
1184
1185	0					0	foreach $filter (@filters) {
1186	0					0	my $decodeParms = shift @decodeParms;
1187
1188	0	0				0	if ($filter eq '/FlateDecode') {
		0
		0
		0
		0
1189							# make sure we support the predictor (if used) before decoding
1190	0					0	my $pre;
1191	0	0				0	if (ref $decodeParms eq 'HASH') {
1192	0					0	$pre = $$decodeParms{Predictor};
1193	0	0	0			0	if ($pre and $pre ne '1' and $pre ne '12') {
			0
1194	0					0	$et->WarnOnce("FlateDecode Predictor $pre currently not supported");
1195	0					0	return 0;
1196							}
1197							}
1198	0	0				0	if (eval { require Compress::Zlib }) {
	0					0
1199	0					0	my $inflate = Compress::Zlib::inflateInit();
1200	0					0	my ($buff, $stat);
1201	0	0				0	$inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
1202	0	0	0			0	if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
1203	0					0	$$dict{_stream} = $buff;
1204							} else {
1205	0					0	$et->Warn('Error inflating stream');
1206	0					0	return 0;
1207							}
1208							} else {
1209	0					0	$et->WarnOnce('Install Compress::Zlib to process filtered streams');
1210	0					0	return 0;
1211							}
1212	0	0	0			0	next unless $pre and $pre eq '12'; # 12 = 'up' prediction
1213
1214							# apply anti-predictor
1215	0					0	my $cols = $$decodeParms{Columns};
1216	0	0				0	unless ($cols) {
1217							# currently only support 'up' prediction
1218	0					0	$et->WarnOnce('No Columns for decoding stream');
1219	0					0	return 0;
1220							}
1221	0					0	my @bytes = unpack('C*', $$dict{_stream});
1222	0					0	my @pre = (0) x $cols; # initialize predictor array
1223	0					0	my $buff = '';
1224	0					0	while (@bytes > $cols) {
1225	0	0				0	unless (($_ = shift @bytes) == 2) {
1226	0					0	$et->WarnOnce("Unsupported PNG filter $_"); # (yes, PNG)
1227	0					0	return 0;
1228							}
1229	0					0	foreach (@pre) {
1230	0					0	$_ = ($_ + shift(@bytes)) & 0xff;
1231							}
1232	0					0	$buff .= pack('C*', @pre);
1233							}
1234	0					0	$$dict{_stream} = $buff;
1235
1236							} elsif ($filter eq '/Crypt') {
1237
1238							# (we shouldn't have to check the _decrypted flag since we
1239							# already checked the _filtered flag, but what the heck...)
1240	0	0				0	next if defined $$dict{_decrypted};
1241							# assume Identity filter (the default) if DecodeParms are missing
1242	0	0				0	next unless ref $decodeParms eq 'HASH';
1243	0					0	my $name = $$decodeParms{Name};
1244	0	0	0			0	next unless defined $name or $name eq 'Identity';
1245	0	0				0	if ($name ne 'StdCF') {
1246	0					0	$et->WarnOnce("Unsupported Crypt Filter $name");
1247	0					0	return 0;
1248							}
1249	0	0				0	unless ($cryptInfo) {
1250	0					0	$et->WarnOnce('Missing Encrypt StdCF entry');
1251	0					0	return 0;
1252							}
1253							# decrypt the stream manually because we want to:
1254							# 1) ignore $cryptStream (StmF) setting
1255							# 2) ignore EncryptMetadata setting (I can't find mention of how to
1256							# reconcile this in the spec., but this would make sense)
1257							# 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
1258							# 4) set _decrypted flag so we will recrypt according to StmF when
1259							# writing (since we don't yet write Filter'd streams)
1260	0					0	Crypt(\$$dict{_stream}, 'none');
1261	0	0				0	$$dict{_decrypted} = ($cryptStream ? 1 : 0);
1262
1263							} elsif ($filter eq '/LZWDecode') {
1264
1265							# make sure we don't have any unsupported decoding parameters
1266	0	0				0	if (ref $decodeParms eq 'HASH') {
1267	0	0				0	if ($$decodeParms{Predictor}) {
		0
1268	0					0	$et->WarnOnce("LZWDecode Predictor $$decodeParms{Predictor} currently not supported");
1269	0					0	return 0;
1270							} elsif ($$decodeParms{EarlyChange}) {
1271	0					0	$et->WarnOnce("LZWDecode EarlyChange currently not supported");
1272	0					0	return 0;
1273							}
1274							}
1275	0	0				0	unless (DecodeLZW(\$$dict{_stream})) {
1276	0					0	$et->WarnOnce('LZW decompress error');
1277	0					0	return 0;
1278							}
1279
1280							} elsif ($filter eq '/ASCIIHexDecode') {
1281
1282	0					0	$$dict{_stream} =~ s/>.*//; # truncate at '>' (end of data mark)
1283	0					0	$$dict{_stream} =~ tr/0-9a-zA-Z//d; # remove illegal characters
1284	0					0	$$dict{_stream} = pack 'H*', $$dict{_stream};
1285
1286							} elsif ($filter eq '/ASCII85Decode') {
1287
1288	0					0	my ($err, @out, $i);
1289	0					0	my ($n, $val) = (0, 0);
1290	0					0	foreach (split //, $$dict{_stream}) {
1291	0	0	0			0	if ($_ ge '!' and $_ le 'u') {;
		0
		0
1292	0					0	$val = 85 * $val + ord($_) - 33;
1293	0	0				0	next unless ++$n == 5;
1294							} elsif ($_ eq '~') {
1295	0	0				0	$n == 1 and $err = 1; # error to have a single char in the last group of 5
1296	0					0	for ($i=$n; $i<5; ++$i) { $val *= 85; }
	0					0
1297							} elsif ($_ eq 'z') {
1298	0	0				0	$n and $err = 2, last; # error if 'z' isn't the first char
1299	0					0	$n = 5;
1300							} else {
1301	0	0				0	next if /^\s$/; # ignore white space
1302	0					0	$err = 3, last; # any other character is an error
1303							}
1304	0					0	$val = unpack('V', pack('N', $val)); # reverse byte order
1305	0					0	while (--$n > 0) {
1306	0					0	push @out, $val & 0xff;
1307	0					0	$val >>= 8;
1308							}
1309	0	0				0	last if $_ eq '~';
1310							# (both $n and $val are zero again now)
1311							}
1312	0	0				0	$err and $et->WarnOnce("ASCII85Decode error $err");
1313	0					0	$$dict{_stream} = pack('C*', @out);
1314							}
1315							}
1316	0					0	return 1;
1317							}
1318
1319							#------------------------------------------------------------------------------
1320							# Initialize state for RC4 en/decryption (ref 2)
1321							# Inputs: 0) RC4 key string
1322							# Returns: RC4 key hash reference
1323							sub RC4Init($)
1324							{
1325	22			22	0	49	my @key = unpack('C*', shift);
1326	22					172	my @state = (0 .. 255);
1327	22					31	my ($i, $j) = (0, 0);
1328	22					38	while ($i < 256) {
1329	5632					5874	my $st = $state[$i];
1330	5632					6635	$j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
1331	5632					6082	$state[$i++] = $state[$j];
1332	5632					7473	$state[$j] = $st;
1333							}
1334	22					80	return { State => \@state, XY => [ 0, 0 ] };
1335							}
1336
1337							#------------------------------------------------------------------------------
1338							# Apply RC4 en/decryption (ref 2)
1339							# Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
1340							# - can call this method directly with a key string, or with with the key
1341							# reference returned by RC4Init
1342							# - RC4 is a symmetric algorithm, so encryption is the same as decryption
1343							sub RC4Crypt($$)
1344							{
1345	22			22	0	39	my ($dataPt, $key) = @_;
1346	22	50				46	$key = RC4Init($key) unless ref $key eq 'HASH';
1347	22					34	my $state = $$key{State};
1348	22					26	my ($x, $y) = @{$$key{XY}};
	22					33
1349
1350	22					59	my @data = unpack('C*', $$dataPt);
1351	22					35	foreach (@data) {
1352	356					373	$x = ($x + 1) & 0xff;
1353	356					365	my $stx = $$state[$x];
1354	356					367	$y = ($stx + $y) & 0xff;
1355	356					383	my $sty = $$state[$x] = $$state[$y];
1356	356					357	$$state[$y] = $stx;
1357	356					440	$_ ^= $$state[($stx + $sty) & 0xff];
1358							}
1359	22					44	$$key{XY} = [ $x, $y ];
1360	22					155	$$dataPt = pack('C*', @data);
1361							}
1362
1363							#------------------------------------------------------------------------------
1364							# Update AES cipher with a bit of data
1365							# Inputs: 0) data
1366							# Returns: encrypted data
1367							my $cipherMore;
1368							sub CipherUpdate($)
1369							{
1370	0			0	0	0	my $dat = shift;
1371	0					0	my $pos = 0;
1372	0	0				0	$dat = $cipherMore . $dat if length $dat;
1373	0					0	while ($pos + 16 <= length($dat)) {
1374	0					0	substr($dat,$pos,16) = Image::ExifTool::AES::Cipher(substr($dat,$pos,16));
1375	0					0	$pos += 16;
1376							}
1377	0	0				0	if ($pos < length $dat) {
1378	0					0	$cipherMore = substr($dat,$pos);
1379	0					0	$dat = substr($dat,0,$pos);
1380							} else {
1381	0					0	$cipherMore = '';
1382							}
1383	0					0	return $dat;
1384							}
1385
1386							#------------------------------------------------------------------------------
1387							# Get encrypted hash
1388							# Inputs: 0) Password, 1) salt, 2) vector, 3) encryption revision
1389							# Returns: hash
1390							sub GetHash($$$$)
1391							{
1392	6			6	0	20	my ($password, $salt, $vector, $rev) = @_;
1393
1394							# return Rev 5 hash
1395	6	50				46	return Digest::SHA::sha256($password, $salt, $vector) if $rev == 5;
1396
1397							# compute Rev 6 hardened hash
1398							# (ref http://code.google.com/p/origami-pdf/source/browse/lib/origami/encryption.rb)
1399	0					0	my $blockSize = 32;
1400	0					0	my $input = Digest::SHA::sha256($password, $salt, $vector) . ("\0" x 32);
1401	0					0	my $key = substr($input, 0, 16);
1402	0					0	my $iv = substr($input, 16, 16);
1403	0					0	my $h;
1404	0					0	my $x = '';
1405	0					0	my $i = 0;
1406	0		0			0	while ($i < 64 or $i < ord(substr($x,-1,1))+32) {
1407
1408	0					0	my $block = substr($input, 0, $blockSize);
1409	0					0	$x = '';
1410	0					0	Image::ExifTool::AES::Crypt(\$x, $key, $iv, 1);
1411	0					0	$cipherMore = '';
1412
1413	0					0	my ($j, $digest);
1414	0					0	for ($j=0; $j<64; ++$j) {
1415	0					0	$x = '';
1416	0	0				0	$x .= CipherUpdate($password) if length $password;
1417	0					0	$x .= CipherUpdate($block);
1418	0	0				0	$x .= CipherUpdate($vector) if length $vector;
1419	0	0				0	if ($j == 0) {
1420	0					0	my @a = unpack('C16', $x);
1421	0					0	my $sum = 0;
1422	0					0	$sum += $_ foreach @a;
1423							# set SHA block size (32, 48 or 64 bytes = SHA-256, 384 or 512)
1424	0					0	$blockSize = 32 + ($sum % 3) * 16;
1425	0					0	$digest = Digest::SHA->new($blockSize * 8);
1426							}
1427	0					0	$digest->add($x);
1428							}
1429
1430	0					0	$h = $digest->digest();
1431	0					0	$key = substr($h, 0, 16);
1432	0					0	substr($input,0,16) = $h;
1433	0					0	$iv = substr($h, 16, 16);
1434	0					0	++$i;
1435							}
1436	0					0	return substr($h, 0, 32);
1437							}
1438
1439							#------------------------------------------------------------------------------
1440							# Initialize decryption
1441							# Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
1442							# 2) ID from file trailer dictionary
1443							# Returns: error string or undef on success (and sets $cryptInfo)
1444							sub DecryptInit($$$)
1445							{
1446	4			4	0	22	local $_;
1447	4					10	my ($et, $encrypt, $id) = @_;
1448
1449	4					5	undef $cryptInfo;
1450	4	50	33			19	unless ($encrypt and ref $encrypt eq 'HASH') {
1451	0					0	return 'Error loading Encrypt object';
1452							}
1453	4					7	my $filt = $$encrypt{Filter};
1454	4	50	33			28	unless ($filt and $filt =~ s/^\///) {
1455	0					0	return 'Encrypt dictionary has no Filter!';
1456							}
1457							# extract some interesting tags
1458	4		50			12	my $ver = $$encrypt{V} \|\| 0;
1459	4		100			14	my $rev = $$encrypt{R} \|\| 0;
1460	4					10	my $enc = "$filt V$ver";
1461	4	50				14	$enc .= ".$rev" if $filt eq 'Standard';
1462	4	50	33			12	$enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
1463	4	50	100			18	$enc .= ' (' . ($$encrypt{Length} \|\| 40) . '-bit)' if $filt eq 'Standard';
1464	4					11	my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
1465	4					16	$et->HandleTag($tagTablePtr, 'Filter', $enc);
1466	4	50				16	if ($filt ne 'Standard') {
		50
1467	0					0	return "Encryption filter $filt currently not supported";
1468							} elsif (not defined $$encrypt{R}) {
1469	0					0	return 'Standard security handler missing revision';
1470							}
1471	4	50	33			26	unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
			33
1472	0					0	return 'Incomplete Encrypt specification';
1473							}
1474	4	50				19	if ("$ver.$rev" >= 5.6) {
1475							# apologize for poor performance (AES is a pure Perl implementation)
1476	0					0	$et->Warn('Decryption is very slow for encryption V5.6 or higher', 3);
1477							}
1478	4					15	$et->HandleTag($tagTablePtr, 'P', $$encrypt{P});
1479
1480	4					7	my %parm; # optional parameters extracted from Encrypt dictionary
1481
1482	4	100	66			25	if ($ver == 1 or $ver == 2) {
		50	66
1483	1					3	$cryptString = $cryptStream = 1;
1484							} elsif ($ver == 4 or $ver == 5) {
1485							# initialize our $cryptString and $cryptStream flags
1486	3					4	foreach ('StrF', 'StmF') {
1487	6	100				13	my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
1488	6					12	$$flagPt = $$encrypt{$_};
1489	6	50	33			18	undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
1490	6	50	33			20	return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
1491							}
1492	3	50	33			8	if ($cryptString or $cryptStream) {
1493							return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
1494	3	50	33			18	ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
			33
1495	3					7	my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
1496	3	50				15	unless ($cryptMeth =~ /^\/(V2\|AESV2\|AESV3)$/) {
1497	0					0	return "Unsupported encryption method $cryptMeth";
1498							}
1499							# set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
1500	3	50				18	$$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2\|AESV3)$/;
1501							}
1502	3	100				9	if ($ver == 5) {
1503							# validate OE and UE entries
1504	2					5	foreach ('OE', 'UE') {
1505	4	50				9	return "Missing Encrypt $_ entry" unless $$encrypt{$_};
1506	4					9	$parm{$_} = ReadPDFValue($$encrypt{$_});
1507	4	50				11	return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
1508							}
1509	2					10	require Image::ExifTool::AES; # will need this later
1510							}
1511							} else {
1512	0					0	return "Encryption version $ver currently not supported";
1513							}
1514	4	50				8	$id or return "Can't decrypt (no document ID)";
1515
1516							# make sure we have the necessary libraries available
1517	4	100				12	if ($ver < 5) {
1518	2	50				4	unless (eval { require Digest::MD5 }) {
	2					14
1519	0					0	return "Install Digest::MD5 to process encrypted PDF";
1520							}
1521							} else {
1522	2	50				4	unless (eval { require Digest::SHA }) {
	2					10
1523	0					0	return "Install Digest::SHA to process AES-256 encrypted PDF";
1524							}
1525							}
1526
1527							# calculate file-level en/decryption key
1528	4					8	my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
1529							"\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
1530	4					10	my $o = ReadPDFValue($$encrypt{O});
1531	4					10	my $u = ReadPDFValue($$encrypt{U});
1532
1533							# set flag indicating whether metadata is encrypted
1534							# (in version 4 and higher, metadata streams may not be encrypted)
1535	4	100	100			25	if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
			66
1536	3					7	$$encrypt{_meta} = 1;
1537							}
1538							# try no password first, then try provided password if available
1539	4					6	my ($try, $key);
1540	4					7	for ($try=0; ; ++$try) {
1541	5					7	my $password;
1542	5	100				12	if ($try == 0) {
		50
1543	4					8	$password = '';
1544							} elsif ($try == 1) {
1545	1					4	$password = $et->Options('Password');
1546	1	50				5	return 'Document is password protected (use Password option)' unless defined $password;
1547							# make sure there is no UTF-8 flag on the password
1548	1	50	33			5	if ($] >= 5.006 and (eval { require Encode; Encode::is_utf8($password) } or $@)) {
			33
1549							# repack by hand if Encode isn't available
1550	0	0				0	$password = $@ ? pack('C',unpack($] < 5.010000 ? 'U0C' : 'C0C*',$password)) : Encode::encode('utf8',$password);
		0
1551							}
1552							} else {
1553	0					0	return 'Incorrect password';
1554							}
1555	5	100				8	if ($ver < 5) {
1556	2	50				6	if (length $password) {
1557							# password must be encoding in PDFDocEncoding (ref iso32000)
1558	0					0	$password = $et->Encode($password, 'PDFDoc');
1559							# truncate or pad the password to exactly 32 bytes
1560	0	0				0	if (length($password) > 32) {
		0
1561	0					0	$password = substr($password, 0, 32);
1562							} elsif (length($password) < 32) {
1563	0					0	$password .= substr($pad, 0, 32-length($password));
1564							}
1565							} else {
1566	2					4	$password = $pad;
1567							}
1568	2					10	$key = $password . $o . pack('V', $$encrypt{P}) . $id;
1569	2					3	my $rep = 1;
1570	2	100	66			12	if ($rev == 3 or $rev == 4) {
1571							# must add this if metadata not encrypted
1572	1	50				4	$key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
1573	1					3	$rep += 50; # repeat MD5 50 more times if revision is 3 or greater
1574							}
1575	2					3	my ($len, $i, $dat);
1576	2	100				5	if ($ver == 1) {
1577	1					2	$len = 5;
1578							} else {
1579	1		50			13	$len = $$encrypt{Length} \|\| 40;
1580	1	50				4	$len >= 40 or return 'Bad Encrypt Length';
1581	1					3	$len = int($len / 8);
1582							}
1583	2					7	for ($i=0; $i<$rep; ++$i) {
1584	52					119	$key = substr(Digest::MD5::md5($key), 0, $len);
1585							}
1586							# decrypt U to see if a user password is required
1587	2	100				6	if ($rev >= 3) {
1588	1					4	$dat = Digest::MD5::md5($pad . $id);
1589	1					4	RC4Crypt(\$dat, $key);
1590	1					4	for ($i=1; $i<=19; ++$i) {
1591	19					35	my @key = unpack('C*', $key);
1592	19					27	foreach (@key) { $_ ^= $i; }
	304					316
1593	19					42	RC4Crypt(\$dat, pack('C*', @key));
1594							}
1595	1					3	$dat .= substr($u, 16);
1596							} else {
1597	1					2	$dat = $pad;
1598	1					3	RC4Crypt(\$dat, $key);
1599							}
1600	2	50				19	last if $dat eq $u; # all done if this was the correct key
1601							} else {
1602	3	50	33			13	return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
1603	3	100				6	if (length $password) {
1604							# Note: this should be good for passwords containing reasonable characters,
1605							# but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
1606							# of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
1607	1					5	$password = $et->Encode($password, 'UTF8');
1608	1	50				5	$password = substr($password, 0, 127) if length($password) > 127;
1609							}
1610							# test for the owner password
1611	3					10	my $sha = GetHash($password, substr($o,32,8), substr($u,0,48), $rev);
1612	3	100				11	if ($sha eq substr($o, 0, 32)) {
1613	2					7	$key = GetHash($password, substr($o,40,8), substr($u,0,48), $rev);
1614	2					5	my $dat = ("\0" x 16) . $parm{OE};
1615							# decrypt with no padding
1616	2					8	my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1617	2	50				7	return $err if $err;
1618	2					4	$key = $dat; # use this as the file decryption key
1619	2					4	last;
1620							}
1621							# test for the user password
1622	1					4	$sha = GetHash($password, substr($u,32,8), '', $rev);
1623	1	50				5	if ($sha eq substr($u, 0, 32)) {
1624	0					0	$key = GetHash($password, substr($u,40,8), '', $rev);
1625	0					0	my $dat = ("\0" x 16) . $parm{UE};
1626	0					0	my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1627	0	0				0	return $err if $err;
1628	0					0	$key = $dat; # use this as the file decryption key
1629	0					0	last;
1630							}
1631							}
1632							}
1633	4					11	$$encrypt{_key} = $key; # save the file-level encryption key
1634	4					6	$cryptInfo = $encrypt; # save reference to the file-level Encrypt object
1635	4					16	return undef; # success!
1636							}
1637
1638							#------------------------------------------------------------------------------
1639							# Decrypt/Encrypt data
1640							# Inputs: 0) data ref
1641							# 1) PDF object reference to use as crypt key extension (may be 'none' to
1642							# avoid extending the encryption key, as for streams with Crypt Filter)
1643							# 2) encrypt flag (false for decryption)
1644							sub Crypt($$;$)
1645							{
1646	29	100		29	0	84	return unless $cryptInfo;
1647	4					9	my ($dataPt, $keyExt, $encrypt) = @_;
1648							# do not decrypt if the key extension object is undefined
1649							# (this doubles as a flag to disable decryption/encryption)
1650	4	50				10	return unless defined $keyExt;
1651	4					7	my $key = $$cryptInfo{_key};
1652							# apply the necessary crypt key extension
1653	4	100				11	unless ($$cryptInfo{_aesv3}) {
1654	2	50				7	unless ($keyExt eq 'none') {
1655							# extend crypt key using object and generation number
1656	2	50				15	unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
1657	0					0	$$cryptInfo{_error} = 'Invalid object reference for encryption';
1658	0					0	return;
1659							}
1660	2					25	$key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
1661							}
1662							# add AES-128 salt if necessary (this little gem is conveniently
1663							# omitted from the Adobe PDF 1.6 documentation, causing me to
1664							# waste 12 hours trying to figure out why this wasn't working --
1665							# it appears in ISO32000 though, so I should have been using that)
1666	2	100				7	$key .= 'sAlT' if $$cryptInfo{_aesv2};
1667	2					4	my $len = length($key);
1668	2					8	$key = Digest::MD5::md5($key); # get 16-byte MD5 digest
1669	2	100				8	$key = substr($key, 0, $len) if $len < 16; # trim if necessary
1670							}
1671							# perform the decryption/encryption
1672	4	100	100			17	if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
1673	3					15	require Image::ExifTool::AES;
1674	3					8	my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
1675	3	50				11	$err and $$cryptInfo{_error} = $err;
1676							} else {
1677	1					3	RC4Crypt($dataPt, $key);
1678							}
1679							}
1680
1681							#------------------------------------------------------------------------------
1682							# Decrypt/Encrypt stream data
1683							# Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
1684							sub CryptStream($$)
1685							{
1686	52	50		52	0	104	return unless $cryptStream;
1687	0					0	my ($dict, $keyExt) = @_;
1688	0		0			0	my $type = $$dict{Type} \|\| '';
1689							# XRef streams are not encrypted (ref 3, page 50),
1690							# and Metadata may or may not be encrypted
1691	0	0	0			0	if ($cryptInfo and $type ne '/XRef' and
			0
			0
1692							($$cryptInfo{_meta} or $type ne '/Metadata'))
1693							{
1694	0					0	Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
1695							# toggle _decrypted flag
1696	0	0				0	$$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
1697							} else {
1698	0					0	$$dict{_decrypted} = 0; # stream should never be encrypted
1699							}
1700							}
1701
1702							#------------------------------------------------------------------------------
1703							# Generate a new PDF tag (based on its ID) and add it to a tag table
1704							# Inputs: 0) tag table ref, 1) tag ID
1705							# Returns: tag info ref
1706							sub NewPDFTag($$)
1707							{
1708	0			0	0	0	my ($tagTablePtr, $tag) = @_;
1709	0					0	my $name = $tag;
1710							# translate URL-like escape sequences
1711	0					0	$name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
	0					0
1712	0					0	$name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
1713	0					0	$name =~ s/(^\|_)([a-z])/\U$2/g; # start words with upper case
1714	0					0	my $tagInfo = { Name => $name };
1715	0					0	AddTagToTable($tagTablePtr, $tag, $tagInfo);
1716	0					0	return $tagInfo;
1717							}
1718
1719							#------------------------------------------------------------------------------
1720							# Process AcroForm dictionary to set HasXMLFormsArchitecture flag
1721							# Inputs: Same as ProcessDict
1722							sub ProcessAcroForm($$$$;$$)
1723							{
1724	0			0	0	0	my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1725	0	0				0	$et->HandleTag($tagTablePtr, '_has_xfa', $$dict{XFA} ? 'true' : 'false');
1726	0					0	return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
1727							}
1728
1729							#------------------------------------------------------------------------------
1730							# Expand array into a string
1731							# Inputs: 0) array ref
1732							# Return: string
1733							sub ExpandArray($)
1734							{
1735	0			0	0	0	my $val = shift;
1736	0					0	my @list = @$val;
1737	0					0	foreach (@list) {
1738	0	0				0	ref $_ eq 'SCALAR' and $_ = "ref($$_)", next;
1739	0	0				0	ref $_ eq 'ARRAY' and $_ = ExpandArray($_), next;
1740	0	0				0	defined $_ or $_ = '', next;
1741							}
1742	0					0	return '[' . join(',',@list) . ']';
1743							}
1744
1745							#------------------------------------------------------------------------------
1746							# Process PDF dictionary extract tag values
1747							# Inputs: 0) ExifTool object reference, 1) tag table reference
1748							# 2) dictionary reference, 3) cross-reference table reference,
1749							# 4) nesting depth, 5) dictionary capture type
1750							sub ProcessDict($$$$;$$)
1751							{
1752	350			350	0	665	my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1753	350					850	my $verbose = $et->Options('Verbose');
1754	350					582	my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
1755	350		33			839	my $embedded = (defined $unknown and not $unknown and $et->Options('ExtractEmbedded'));
1756	350					398	my @tags = @{$$dict{_tags}};
	350					943
1757	350					486	my ($next, %join);
1758	350					398	my $index = 0;
1759
1760	350		100			699	$nesting = ($nesting \|\| 0) + 1;
1761	350	50				563	if ($nesting > 50) {
1762	0					0	$et->WarnOnce('Nesting too deep (directory ignored)');
1763	0					0	return;
1764							}
1765							# save entire dictionary for rewriting if specified
1766	350	50	100			919	if ($$et{PDF_CAPTURE} and $$tagTablePtr{VARS} and
			66
1767							$tagTablePtr->{VARS}->{CAPTURE})
1768							{
1769	66					82	my $name;
1770	66					83	foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
	66					165
1771	82	100				156	next if $$et{PDF_CAPTURE}{$name};
1772							# make sure we load the right type if indicated
1773	66	50	66			160	next if $type and $type ne $name;
1774	66					118	$$et{PDF_CAPTURE}{$name} = $dict;
1775	66					87	last;
1776							}
1777							}
1778							#
1779							# extract information from all tags in the dictionary
1780							#
1781	350					419	for (;;) {
1782	1473					1693	my ($tag, $isSubDoc);
1783	1473	100	33			2442	if (@tags) {
		50
1784	1123					1501	$tag = shift @tags;
1785							} elsif (defined $next and not $next) {
1786	0					0	$tag = 'Next';
1787	0					0	$next = 1;
1788							} else {
1789	350					426	last;
1790							}
1791	1123					1650	my $val = $$dict{$tag};
1792	1123					2285	my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag);
1793	1123	100	33			2355	if ($tagInfo) {
		50	33
			0
			0
1794	419	50				659	undef $tagInfo if $$tagInfo{NoProcess};
1795							} elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
1796							$$tagTablePtr{$1} and (ref $val ne 'SCALAR' or not $fetched{$$val}))
1797							{
1798	0					0	my ($name, $num) = ($1, $2);
1799	0					0	$tagInfo = $et->GetTagInfo($tagTablePtr, $name);
1800	0	0	0			0	if (ref $tagInfo eq 'HASH' and $$tagInfo{JoinStreams}) {
1801	0					0	$fetched{$$val} = 1;
1802	0					0	my $obj = FetchObject($et, $$val, $xref, $tag);
1803	0	0				0	$join{$name} = [] unless $join{$name};
1804	0	0	0			0	next unless ref $obj eq 'HASH' and $$obj{_stream};
1805							# save all the stream data to join later
1806	0					0	DecodeStream($et, $obj);
1807	0					0	$join{$name}->[$num] = $$obj{_stream};
1808	0					0	undef $tagInfo; # don't process
1809							} else {
1810	0					0	$isSubDoc = 1; # treat as a sub-document
1811							}
1812							}
1813	1123	50				1600	if ($verbose) {
1814	0					0	my ($val2, $extra);
1815	0	0				0	if (ref $val eq 'SCALAR') {
		0
		0
1816	0					0	$extra = ", indirect object ($$val)";
1817	0	0	0			0	if ($fetched{$$val}) {
		0
1818	0					0	$val2 = "ref($$val)";
1819							} elsif ($tag eq 'Next' and not $next) {
1820							# handle 'Next' links after all others
1821	0					0	$next = 0;
1822	0					0	next;
1823							} else {
1824	0					0	$fetched{$$val} = 1;
1825	0					0	$val = FetchObject($et, $$val, $xref, $tag);
1826	0	0				0	unless (defined $val) {
1827	0					0	my $str;
1828	0	0				0	if (defined $lastOffset) {
1829	0					0	$val2 = '';
1830	0					0	$str = 'Object was freed';
1831							} else {
1832	0					0	$val2 = '';
1833	0					0	$str = 'Error reading object';
1834							}
1835	0					0	$et->VPrint(0, "$$et{INDENT}${str}:\n");
1836							}
1837							}
1838							} elsif (ref $val eq 'HASH') {
1839	0					0	$extra = ', direct dictionary';
1840							} elsif (ref $val eq 'ARRAY') {
1841	0					0	$extra = ', direct array of ' . scalar(@$val) . ' objects';
1842							} else {
1843	0					0	$extra = ', direct object';
1844							}
1845	0					0	my $isSubdir;
1846	0	0				0	if (ref $val eq 'HASH') {
		0
1847	0					0	$isSubdir = 1;
1848							} elsif (ref $val eq 'ARRAY') {
1849							# recurse into objects in arrays only if they are lists of
1850							# dictionaries or indirect objects which could be dictionaries
1851	0	0				0	$isSubdir = 1 if @$val;
1852	0					0	foreach (@$val) {
1853	0	0	0			0	next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
1854	0					0	undef $isSubdir;
1855	0					0	last;
1856							}
1857							}
1858	0	0				0	if ($isSubdir) {
1859							# create bogus subdirectory to recurse into this dict
1860	0	0				0	$tagInfo or $tagInfo = {
1861							Name => $tag,
1862							SubDirectory => { TagTable => 'Image::ExifTool::PDF::Unknown' },
1863							};
1864							} else {
1865	0	0				0	$val2 = ExpandArray($val) if ref $val eq 'ARRAY';
1866							# generate tag info if we will use it later
1867	0	0	0			0	if (not $tagInfo and defined $val and $unknown) {
			0
1868	0					0	$tagInfo = NewPDFTag($tagTablePtr, $tag);
1869							}
1870							}
1871	0		0			0	$et->VerboseInfo($tag, $tagInfo,
1872							Value => $val2 \|\| $val,
1873							Extra => $extra,
1874							Index => $index++,
1875							);
1876	0	0				0	next unless defined $val;
1877							}
1878	1123	100				1610	unless ($tagInfo) {
1879							# add any tag found in Info dictionary to table
1880	704	50				1170	next unless $unknown;
1881	0					0	$tagInfo = NewPDFTag($tagTablePtr, $tag);
1882							}
1883							# increment document number if necessary
1884	419					558	my ($oldDocNum, $oldNumTags);
1885	419	50				722	if ($isSubDoc) {
1886	0					0	$oldDocNum = $$et{DOC_NUM};
1887	0					0	$oldNumTags = $$et{NUM_FOUND};
1888	0					0	$$et{DOC_NUM} = ++$$et{DOC_COUNT};
1889							}
1890	419	100				720	if ($$tagInfo{SubDirectory}) {
1891							# process the subdirectory
1892	332					391	my @subDicts;
1893	332	100				636	if (ref $val eq 'ARRAY') {
1894							# hack to convert array to dictionary if necessary
1895	37	50	33			97	if ($$tagInfo{ConvertToDict} and @$val == 2 and not ref $$val[0]) {
			33
1896	0					0	my $tg = $$val[0];
1897	0					0	$tg =~ s(^/)(); # remove name
1898	0					0	my %dict = ( _tags => [ $tg ], $tg => $$val[1] );
1899	0					0	@subDicts = ( \%dict );
1900							} else {
1901	37					40	@subDicts = @{$val};
	37					71
1902							}
1903							} else {
1904	295					431	@subDicts = ( $val );
1905							}
1906							# loop through all values of this tag
1907	332					457	for (;;) {
1908	664	100				1171	my $subDict = shift @subDicts or last;
1909							# save last fetched object in case we fetch another one here
1910	332					452	my $prevFetched = $lastFetched;
1911	332	100				660	if (ref $subDict eq 'SCALAR') {
1912							# only fetch once (other copies are obsolete)
1913	244	100				480	next if $fetched{$$subDict};
1914	197	100				322	if ($$tagInfo{IgnoreDuplicates}) {
1915	28					53	my $flag = "ProcessedPDF_$tag";
1916	28	50				57	if ($$et{$flag}) {
1917	0	0				0	next if $et->WarnOnce("Ignored duplicate $tag dictionary", 2);
1918							} else {
1919	28					52	$$et{$flag} = 1;
1920							}
1921							}
1922							# load dictionary via an indirect reference
1923	197					302	$fetched{$$subDict} = 1;
1924	197					360	my $obj = FetchObject($et, $$subDict, $xref, $tag);
1925	197	100				410	unless (defined $obj) {
1926	5	50				12	unless (defined $lastOffset) {
1927	0					0	$et->Warn("Error reading $tag object ($$subDict)");
1928							}
1929	5					11	next;
1930							}
1931	192					337	$subDict = $obj;
1932							}
1933	280	50				572	if (ref $subDict eq 'ARRAY') {
1934							# convert array of key/value pairs to a hash
1935	0	0				0	next if @$subDict < 2;
1936	0					0	my %hash = ( _tags => [] );
1937	0					0	while (@$subDict >= 2) {
1938	0					0	my $key = shift @$subDict;
1939	0					0	$key =~ s/^\///;
1940	0					0	push @{$hash{_tags}}, $key;
	0					0
1941	0					0	$hash{$key} = shift @$subDict;
1942							}
1943	0					0	$subDict = \%hash;
1944							} else {
1945	280	50				473	next unless ref $subDict eq 'HASH';
1946							}
1947							# set flag to re-crypt all strings when rewriting if the dictionary
1948							# came from an encrypted stream
1949	280	50				443	$$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
1950	280					848	my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
1951	280	50				445	if (not $verbose) {
		0
1952	280		50			904	my $proc = $$subTablePtr{PROCESS_PROC} \|\| \&ProcessDict;
1953	280					652	&$proc($et, $subTablePtr, $subDict, $xref, $nesting);
1954							} elsif ($next) {
1955							# handle 'Next' links at this level to avoid deep recursion
1956	0					0	undef $next;
1957	0					0	$index = 0;
1958	0					0	$tagTablePtr = $subTablePtr;
1959	0					0	$dict = $subDict;
1960	0					0	@tags = @{$$subDict{_tags}};
	0					0
1961	0					0	$et->VerboseDir($tag, scalar(@tags));
1962							} else {
1963	0					0	my $oldIndent = $$et{INDENT};
1964	0					0	my $oldDir = $$et{DIR_NAME};
1965	0					0	$$et{INDENT} .= '\| ';
1966	0					0	$$et{DIR_NAME} = $tag;
1967	0					0	$et->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
	0					0
1968	0					0	ProcessDict($et, $subTablePtr, $subDict, $xref, $nesting);
1969	0					0	$$et{INDENT} = $oldIndent;
1970	0					0	$$et{DIR_NAME} = $oldDir;
1971							}
1972	280					805	$lastFetched = $prevFetched;
1973							}
1974							} else {
1975							# fetch object if necessary
1976							# (OS X 10.6 writes indirect objects in the Info dictionary!)
1977	87	50				144	if (ref $val eq 'SCALAR') {
1978	0					0	my $prevFetched = $lastFetched;
1979							# (note: fetching the same object multiple times is OK here)
1980	0					0	$val = FetchObject($et, $$val, $xref, $tag);
1981	0	0				0	if (defined $val) {
1982	0					0	$val = ReadPDFValue($val);
1983							# set flag to re-encrypt if necessary if rewritten
1984	0	0				0	$$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
		0
1985	0					0	$lastFetched = $prevFetched; # restore last fetched object reference
1986							}
1987							} else {
1988	87					144	$val = ReadPDFValue($val);
1989							}
1990	87	100				231	if (ref $val) {
		50
1991	12	50				31	if (ref $val eq 'ARRAY') {
1992	12	50				34	delete $$et{LIST_TAGS}{$tagInfo} if $$tagInfo{List};
1993	12					16	my $v;
1994	12					24	foreach $v (@$val) {
1995	20					46	$et->FoundTag($tagInfo, $v);
1996							}
1997							}
1998							} elsif (defined $val) {
1999							# convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
2000							# unless this is binary data (hex-encoded strings would not have been converted)
2001	75		100			313	my $format = $$tagInfo{Format} \|\| $$tagInfo{Writable} \|\| 'string';
2002	75	100				162	$val = ConvertPDFDate($val) if $format eq 'date';
2003	75	50	33			289	if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
2004							# text string is already in Unicode if it starts with "\xfe\xff",
2005							# otherwise we must first convert from PDFDocEncoding
2006	0	0				0	$val = $et->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
2007							}
2008	75	100	66			187	if ($$tagInfo{List} and not $$et{OPTIONS}{NoPDFList}) {
2009							# separate tokens in comma or whitespace delimited lists
2010	12	50				70	my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val;
2011	12					30	foreach $val (@values) {
2012	28					58	$et->FoundTag($tagInfo, $val);
2013							}
2014							} else {
2015							# a simple tag value
2016	63					157	$et->FoundTag($tagInfo, $val);
2017							}
2018							}
2019							}
2020	419	50				731	if ($isSubDoc) {
2021							# restore original document number
2022	0					0	$$et{DOC_NUM} = $oldDocNum;
2023	0	0				0	--$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2024							}
2025							}
2026							#
2027							# extract information from joined streams if necessary
2028							#
2029
2030	350	50				573	if (%join) {
2031	0					0	my ($tag, $i);
2032	0					0	foreach $tag (sort keys %join) {
2033	0					0	my $list = $join{$tag};
2034	0	0	0			0	last unless defined $$list[1] and $$list[1] =~ /^%.?([\x0d\x0a])/;
2035	0					0	my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
2036	0					0	for ($i=1; defined $$list[$i]; ++$i) {
2037	0					0	$buff .= $$list[$i];
2038	0					0	undef $$list[$i]; # free memory
2039							}
2040							# increment document number for tags extracted from embedded EPS
2041	0					0	my $oldDocNum = $$et{DOC_NUM};
2042	0					0	my $oldNumTags = $$et{NUM_FOUND};
2043	0					0	$$et{DOC_NUM} = ++$$et{DOC_COUNT};
2044							# extract PostScript information
2045	0					0	$et->HandleTag($tagTablePtr, $tag, $buff);
2046	0					0	$$et{DOC_NUM} = $oldDocNum;
2047							# revert document counter if we didn't add any new tags
2048	0	0				0	--$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2049	0					0	delete $$et{DOC_NUM};
2050							}
2051							}
2052							#
2053							# extract information from stream object if it exists (eg. Metadata stream)
2054							#
2055	350					414	for (;;) { # (cheap goto)
2056	350	100				914	last unless $$dict{_stream};
2057	43					84	my $tag = '_stream';
2058							# add Subtype (if it exists) to stream name and remove leading '/'
2059	43	100				166	($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
2060	43	50				107	last unless $$tagTablePtr{$tag};
2061	43	50				87	my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag) or last;
2062	43	50				111	unless ($$tagInfo{SubDirectory}) {
2063							# don't build filter lists across different images
2064	0					0	delete $$et{LIST_TAGS}{$$tagTablePtr{Filter}};
2065							# we arrive here only when extracting embedded images
2066							# - only extract known image types and ignore others
2067	0		0			0	my $filter = $$dict{Filter} \|\| '';
2068	0	0				0	$filter = @$filter[-1] if ref $filter eq 'ARRAY'; # (get last Filter type)
2069	0					0	my $result;
2070	0	0	0			0	if ($filter eq '/DCTDecode' or $filter eq '/JPXDecode') {
2071	0	0				0	DecodeStream($et, $dict) or last;
2072							# save the image itself
2073	0					0	$et->FoundTag($tagInfo, \$$dict{_stream});
2074							# extract information from embedded image
2075	0					0	$result = $et->ExtractInfo(\$$dict{_stream}, { ReEntry => 1 });
2076							}
2077	0	0				0	unless ($result) {
2078	0	0				0	$et->FoundTag('FileType', defined $result ? '(unknown)' : '(unsupported)');
2079							}
2080	0					0	last;
2081							}
2082							# decode stream if necessary
2083	43	50				107	DecodeStream($et, $dict) or last;
2084	43	50				81	if ($verbose > 2) {
2085	0					0	$et->VPrint(2,"$$et{INDENT}$$et{DIR_NAME} stream data\n");
2086	0					0	$et->VerboseDump(\$$dict{_stream});
2087							}
2088							# extract information from stream
2089							my %dirInfo = (
2090							DataPt => \$$dict{_stream},
2091							DataLen => length $$dict{_stream},
2092							DirStart => 0,
2093							DirLen => length $$dict{_stream},
2094	43					182	Parent => 'PDF',
2095							);
2096	43					129	my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
2097	43	50				115	unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
2098	0					0	$et->Warn("Error processing $$tagInfo{Name} information");
2099							}
2100	43					159	last;
2101							}
2102							}
2103
2104							#------------------------------------------------------------------------------
2105							# Extract information from PDF file
2106							# Inputs: 0) ExifTool object reference, 1) dirInfo reference
2107							# Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
2108							sub ReadPDF($$)
2109							{
2110	37			37	0	54	my ($et, $dirInfo) = @_;
2111	37					58	my $raf = $$dirInfo{RAF};
2112	37					103	my $verbose = $et->Options('Verbose');
2113	37					54	my ($buff, $encrypt, $id);
2114							#
2115							# validate PDF file
2116							#
2117							# (linearization dictionary must be in the first 1024 bytes of the file)
2118	37	50				108	$raf->Read($buff, 1024) >= 8 or return 0;
2119	37	50				226	$buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
2120	37	50				143	$$et{PDFBase} = length $1 and $et->Warn('PDF header is not at start of file',1);
2121	37					66	$pdfVer = $2;
2122	37					152	$et->SetFileType(); # set the FileType tag
2123	37	50				143	$et->Warn("The PDF $pdfVer specification is held hostage by the ISO") if $pdfVer >= 2.0;
2124							# store PDFVersion tag
2125	37					75	my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
2126	37					109	$et->HandleTag($tagTablePtr, 'Version', $pdfVer);
2127	37					74	$tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
2128							#
2129							# check for a linearized PDF (only if reading)
2130							#
2131	37					725	my $capture = $$et{PDF_CAPTURE};
2132	37	100				75	unless ($capture) {
2133	18					27	my $lin = 'false';
2134	18	50				74	if ($buff =~ /<
2135	18					61	$buff = substr($buff, pos($buff) - 2);
2136	18					39	my $dict = ExtractObject($et, \$buff);
2137	18	0	33			106	if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
			33
2138	0	0				0	if (not $$et{VALUE}{FileSize}) {
		0
2139	0					0	undef $lin; # can't determine if it is linearized
2140							} elsif ($$dict{L} == $$et{VALUE}{FileSize} - $$et{PDFBase}) {
2141	0					0	$lin = 'true';
2142							}
2143							}
2144							}
2145	18	50				69	$et->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
2146							}
2147							#
2148							# read the xref tables referenced from startxref at the end of the file
2149							#
2150	37					71	my @xrefOffsets;
2151	37	50				94	$raf->Seek(0, 2) or return -2;
2152							# the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
2153	37					125	my $len = $raf->Tell();
2154	37	50				106	$len = 1024 if $len > 1024;
2155	37	50				83	$raf->Seek(-$len, 2) or return -2;
2156	37	50				107	$raf->Read($buff, $len) == $len or return -3;
2157							# find the LAST xref table in the file (may be multiple %%EOF marks,
2158							# and comments between "startxref" and "%%EOF")
2159	37	50				343	$buff =~ /^.startxref(\s+)(\d+)(\s+)(%[^\x0d\x0a]\s+)*%%EOF/s or return -4;
2160	37					121	my $ws = $1 . $3;
2161	37					69	my $xr = $2;
2162	37					93	push @xrefOffsets, $xr, 'Main';
2163							# set input record separator
2164	37	50				261	local $/ = $ws =~ /(\x0d\x0a\|\x0d\|\x0a)/ ? $1 : "\x0a";
2165	37					71	my (%xref, @mainDicts, %loaded, $mainFree);
2166	37					75	my ($xrefSize, $mainDictSize) = (0, 0);
2167							# initialize variables to capture when rewriting
2168	37	100				71	if ($capture) {
2169	19					50	$capture->{startxref} = $xr;
2170	19					38	$capture->{xref} = \%xref;
2171	19					43	$capture->{newline} = $/;
2172	19					40	$capture->{mainFree} = $mainFree = { };
2173							}
2174							XRef:
2175	37					73	while (@xrefOffsets) {
2176	70					111	my $offset = shift @xrefOffsets;
2177	70					95	my $type = shift @xrefOffsets;
2178	70	50				147	next if $loaded{$offset}; # avoid infinite recursion
2179	70	50				262	unless ($raf->Seek($offset+$$et{PDFBase}, 0)) {
2180	0	0				0	%loaded or return -5;
2181	0					0	$et->Warn('Bad offset for secondary xref table');
2182	0					0	next;
2183							}
2184							# Note: care must be taken because ReadLine may read more than we want if
2185							# the newline sequence for this table is different than the rest of the file
2186	70					120	for (;;) {
2187	70	50				185	unless ($raf->ReadLine($buff)) {
2188	0	0				0	%loaded or return -6;
2189	0					0	$et->Warn('Bad offset for secondary xref table');
2190	0					0	next XRef;
2191							}
2192	70	50				273	last if $buff =~/\S/; # skip blank lines
2193							}
2194	70					102	my $loadXRefStream;
2195	70	50				369	if ($buff =~ s/^\s*xref\s+//s) {
		0
2196							# load xref table
2197	70					100	for (;;) {
2198							# read another line if necessary (skipping blank lines)
2199	177		50			491	$raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
2200	177	100				560	last if $buff =~ s/^\s*trailer([\s<[(])/$1/s;
2201	107	50				379	$buff =~ s/^\s*(\d+)\s+(\d+)\s+//s or return -4;
2202	107					285	my ($start, $num) = ($1, $2);
2203	107	50				253	$raf->Seek(-length($buff), 1) or return -4;
2204	107					171	my $i;
2205	107					243	for ($i=0; $i<$num; ++$i) {
2206	622	50				1191	$raf->Read($buff, 20) == 20 or return -6;
2207	622	50				1960	$buff =~ /^\s*(\d{10}) (\d{5}) (f\|n)/s or return -4;
2208	622					822	my $num = $start + $i;
2209	622	100				917	$xrefSize = $num if $num > $xrefSize;
2210							# locate object to generate entry from stream if necessary
2211							# (must do this before we test $xref{$num})
2212	622	50				899	LocateAnyObject(\%xref, $num) if $xref{dicts};
2213							# save offset for newest copy of all objects
2214							# (or next object number for free objects)
2215	622	100				1040	unless (defined $xref{$num}) {
2216	526					1187	my ($offset, $gen) = (int($1), int($2));
2217	526					808	$xref{$num} = $offset;
2218	526	100				910	if ($3 eq 'f') {
2219							# save free objects in last xref table for rewriting
2220	52	100				124	$$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
2221	52					115	next;
2222							}
2223							# also save offset keyed by object reference string
2224	474					1262	$xref{"$num $gen R"} = $offset;
2225							}
2226							}
2227							# (I have a sample from Adobe which has an empty xref table)
2228							# %xref or return -4; # xref table may not be empty
2229	107					214	$buff = '';
2230							}
2231	70					127	undef $mainFree; # only do this for the last xref table
2232							} elsif ($buff =~ s/^\s*(\d+)\s+(\d+)\s+obj//s) {
2233							# this is a PDF-1.5 cross-reference stream dictionary
2234	0					0	$loadXRefStream = 1;
2235							} else {
2236	0	0				0	%loaded or return -4;
2237	0					0	$et->Warn('Invalid secondary xref table');
2238	0					0	next;
2239							}
2240	70					181	my $mainDict = ExtractObject($et, \$buff, $raf, \%xref);
2241	70	50				172	unless (ref $mainDict eq 'HASH') {
2242	0	0				0	%loaded or return -8;
2243	0					0	$et->Warn('Error loading secondary dictionary');
2244	0					0	next;
2245							}
2246							# keep track of total trailer dictionary Size
2247	70	100	66			284	$mainDictSize = $$mainDict{Size} if $$mainDict{Size} and $$mainDict{Size} > $mainDictSize;
2248	70	50				124	if ($loadXRefStream) {
2249							# decode and save our XRef stream from PDF-1.5 file
2250							# (but parse it later as required to save time)
2251							# Note: this technique can potentially result in an old object
2252							# being used if the file was incrementally updated and an older
2253							# object from an xref table was replaced by a newer object in an
2254							# xref stream. But doing so isn't a good idea (if allowed at all)
2255							# because a PDF 1.4 consumer would also make this same mistake.
2256	0	0	0			0	if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
			0
			0
			0
2257	0					0	@{$$mainDict{W}} > 2 and $$mainDict{Size} and
2258							DecodeStream($et, $mainDict))
2259							{
2260							# create Index entry if it doesn't exist
2261	0	0				0	$$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
2262							# create '_entry_size' entry for internal use
2263	0					0	my $w = $$mainDict{W};
2264	0					0	my $size = 0;
2265	0					0	foreach (@$w) { $size += $_; }
	0					0
2266	0					0	$$mainDict{_entry_size} = $size;
2267							# save this stream dictionary to use later if required
2268	0	0				0	$xref{dicts} = [] unless $xref{dicts};
2269	0					0	push @{$xref{dicts}}, $mainDict;
	0					0
2270							} else {
2271	0	0				0	%loaded or return -9;
2272	0					0	$et->Warn('Invalid xref stream in secondary dictionary');
2273							}
2274							}
2275	70					136	$loaded{$offset} = 1;
2276							# load XRef stream in hybrid file if it exists
2277	70	50				146	push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
2278	70	50				114	$encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
2279	70	50	33			128	undef $encrypt if $encrypt and $encrypt eq 'null'; # (have seen "null")
2280	70	100	66			179	if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
2281	29					62	$id = ReadPDFValue($mainDict->{ID}->[0]);
2282							}
2283	70					143	push @mainDicts, $mainDict, $type;
2284							# load previous xref table if it exists
2285	70	100				222	push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
2286							}
2287	37	50				75	if ($xrefSize > $mainDictSize) {
2288	0					0	my $str = "Objects in xref table ($xrefSize) exceed trailer dictionary Size ($mainDictSize)";
2289	0	0				0	$capture ? $et->Error($str) : $et->Warn($str);
2290							}
2291							#
2292							# extract encryption information if necessary
2293							#
2294	37	50				68	if ($encrypt) {
2295	0	0				0	if (ref $encrypt eq 'SCALAR') {
2296	0					0	$encrypt = FetchObject($et, $$encrypt, \%xref, 'Encrypt');
2297							}
2298							# generate Encryption tag information
2299	0					0	my $err = DecryptInit($et, $encrypt, $id);
2300	0	0				0	if ($err) {
2301	0					0	$et->Warn($err);
2302	0	0				0	$$capture{Error} = $err if $capture;
2303	0					0	return -1;
2304							}
2305							}
2306							#
2307							# extract the information beginning with each of the main dictionaries
2308							#
2309	37					79	my $i = 0;
2310	37					75	my $num = (scalar @mainDicts) / 2;
2311	37					65	while (@mainDicts) {
2312	70					94	my $dict = shift @mainDicts;
2313	70					94	my $type = shift @mainDicts;
2314	70	50				149	if ($verbose) {
2315	0					0	++$i;
2316	0					0	my $n = scalar(@{$$dict{_tags}});
	0					0
2317	0					0	$et->VPrint(0, "PDF dictionary ($i of $num) with $n entries:\n");
2318							}
2319	70					140	ProcessDict($et, $tagTablePtr, $dict, \%xref, 0, $type);
2320							}
2321							# handle any decryption errors
2322	37	50				83	if ($encrypt) {
2323	0					0	my $err = $$encrypt{_error};
2324	0	0				0	if ($err) {
2325	0					0	$et->Warn($err);
2326	0	0				0	$$capture{Error} = $err if $capture;
2327	0					0	return -1;
2328							}
2329							}
2330	37					229	return 1;
2331							}
2332
2333							#------------------------------------------------------------------------------
2334							# ReadPDF() warning strings for each error return value
2335							my %pdfWarning = (
2336							# -1 is reserved as error return value with no associated warning
2337							-2 => 'Error seeking in file',
2338							-3 => 'Error reading file',
2339							-4 => 'Invalid xref table',
2340							-5 => 'Invalid xref offset',
2341							-6 => 'Error reading xref table',
2342							-7 => 'Error reading trailer',
2343							-8 => 'Error reading main dictionary',
2344							-9 => 'Invalid xref stream in main dictionary',
2345							);
2346
2347							#------------------------------------------------------------------------------
2348							# Extract information from PDF file
2349							# Inputs: 0) ExifTool object reference, 1) dirInfo reference
2350							# Returns: 1 if this was a valid PDF file
2351							sub ProcessPDF($$)
2352							{
2353	37			37	0	70	my ($et, $dirInfo) = @_;
2354
2355	37					57	undef $cryptInfo; # (must not delete after returning so writer can use it)
2356	37					53	undef $cryptStream;
2357	37					45	undef $cryptString;
2358	37					80	my $result = ReadPDF($et, $dirInfo);
2359	37	50				89	if ($result < 0) {
2360	0	0				0	$et->Warn($pdfWarning{$result}) if $pdfWarning{$result};
2361	0					0	$result = 1;
2362							}
2363							# clean up and return
2364	37					59	undef %streamObjs;
2365	37					66	undef %fetched;
2366	37					80	return $result;
2367							}
2368
2369							1; # end
2370
2371
2372							__END__