File Coverage

blib/lib/Image/ExifTool/OOXML.pm
Criterion Covered Total %
statement 104 112 92.8
branch 50 72 69.4
condition 18 31 58.0
subroutine 9 9 100.0
pod 0 3 0.0
total 181 227 79.7


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: OOXML.pm
3             #
4             # Description: Read Office Open XML+ZIP files
5             #
6             # Revisions: 2009/10/31 - P. Harvey Created
7             #------------------------------------------------------------------------------
8              
9             package Image::ExifTool::OOXML;
10              
11 1     1   7 use strict;
  1         1  
  1         1452  
12 1     1   7 use vars qw($VERSION);
  1         2  
  1         61  
13 1     1   8 use Image::ExifTool qw(:DataAccess :Utils);
  1         2  
  1         357  
14 1     1   1046 use Image::ExifTool::XMP;
  1         4  
  1         171  
15 1     1   7 use Image::ExifTool::ZIP;
  1         1  
  1         792  
16              
17             $VERSION = '1.10';
18              
19             # test for recognized OOXML document extensions
20             my %isOOXML = (
21             DOCX => 1, DOCM => 1,
22             DOTX => 1, DOTM => 1,
23             POTX => 1, POTM => 1,
24             PPAX => 1, PPAM => 1,
25             PPSX => 1, PPSM => 1,
26             PPTX => 1, PPTM => 1, THMX => 1,
27             XLAM => 1,
28             XLSX => 1, XLSM => 1, XLSB => 1,
29             XLTX => 1, XLTM => 1,
30             VSDX => 1,
31             );
32              
33             # generate reverse lookup for file type based on MIME
34             my %fileType;
35             {
36             my $type;
37             foreach $type (keys %isOOXML) {
38             $fileType{$Image::ExifTool::mimeType{$type}} = $type;
39             }
40             }
41              
42             # XML attributes to queue
43             my %queuedAttrs;
44             my %queueAttrs = (
45             fmtid => 1,
46             pid => 1,
47             name => 1,
48             );
49              
50             # keep track of items in a vector (to accumulate as a list)
51             my $vectorCount;
52             my @vectorVals;
53              
54             # Office Open XML tags
55             %Image::ExifTool::OOXML::Main = (
56             GROUPS => { 0 => 'XML', 1 => 'XML', 2 => 'Document' },
57             PROCESS_PROC => \&Image::ExifTool::XMP::ProcessXMP,
58             VARS => { ID_FMT => 'none' },
59             NOTES => q{
60             The Office Open XML (OOXML) format was introduced with Microsoft Office 2007
61             and is used by file types such as DOCX, PPTX, XLSX and VSDX. These are
62             essentially ZIP archives containing XML files. The table below lists some
63             tags which have been observed in OOXML documents, but ExifTool will extract
64             any tags found from XML files of the OOXML document properties ("docProps")
65             directory.
66              
67             B
68              
69             1) Structural ZIP tags may be ignored (if desired) with C<--ZIP:all> on the
70             command line.
71              
72             2) Tags may be grouped by their document number in the ZIP archive with the
73             C<-g3> or C<-G3> option.
74             },
75             # These tags all have 1:1 correspondence with FlashPix tags except for:
76             # OOXML FlashPix
77             # --------------- -------------
78             # DocSecurity Security
79             # Application Software
80             # dc:Description Comments
81             # dc:Creator Author
82             Application => { },
83             AppVersion => { },
84             category => { },
85             Characters => { },
86             CharactersWithSpaces => { },
87             CheckedBy => { },
88             Client => { },
89             Company => { },
90             created => {
91             Name => 'CreateDate',
92             Groups => { 2 => 'Time' },
93             Format => 'date',
94             PrintConv => '$self->ConvertDateTime($val)',
95             },
96             createdType => { Hidden => 1, RawConv => 'undef' }, # ignore this XML type name
97             DateCompleted => {
98             Groups => { 2 => 'Time' },
99             Format => 'date',
100             PrintConv => '$self->ConvertDateTime($val)',
101             },
102             Department => { },
103             Destination => { },
104             Disposition => { },
105             Division => { },
106             DocSecurity => {
107             # (http://msdn.microsoft.com/en-us/library/documentformat.openxml.extendedproperties.documentsecurity.aspx)
108             PrintConv => {
109             0 => 'None',
110             1 => 'Password protected',
111             2 => 'Read-only recommended',
112             4 => 'Read-only enforced',
113             8 => 'Locked for annotations',
114             },
115             },
116             DocumentNumber=> { },
117             Editor => { Groups => { 2 => 'Author'} },
118             ForwardTo => { },
119             Group => { },
120             HeadingPairs=> { },
121             HiddenSlides=> { },
122             HyperlinkBase=>{ },
123             HyperlinksChanged => { PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
124             keywords => { },
125             Language => { },
126             lastModifiedBy => { Groups => { 2 => 'Author'} },
127             lastPrinted => {
128             Groups => { 2 => 'Time' },
129             Format => 'date',
130             PrintConv => '$self->ConvertDateTime($val)',
131             },
132             Lines => { },
133             LinksUpToDate=>{ PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
134             Mailstop => { },
135             Manager => { },
136             Matter => { },
137             MMClips => { },
138             modified => {
139             Name => 'ModifyDate',
140             Groups => { 2 => 'Time' },
141             Format => 'date',
142             PrintConv => '$self->ConvertDateTime($val)',
143             },
144             modifiedType=> { Hidden => 1, RawConv => 'undef' }, # ignore this XML type name
145             Notes => { },
146             Office => { },
147             Owner => { Groups => { 2 => 'Author'} },
148             Pages => { },
149             Paragraphs => { },
150             PresentationFormat => { },
151             Project => { },
152             Publisher => { },
153             Purpose => { },
154             ReceivedFrom=> { },
155             RecordedBy => { },
156             RecordedDate=> {
157             Groups => { 2 => 'Time' },
158             Format => 'date',
159             PrintConv => '$self->ConvertDateTime($val)',
160             },
161             Reference => { },
162             revision => { Name => 'RevisionNumber' },
163             ScaleCrop => { PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
164             SharedDoc => { PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
165             Slides => { },
166             Source => { },
167             Status => { },
168             TelephoneNumber => { },
169             Template => { },
170             TitlesOfParts=>{ },
171             TotalTime => {
172             Name => 'TotalEditTime',
173             PrintConv => 'ConvertTimeSpan($val, 60)',
174             },
175             Typist => { },
176             Words => { },
177             );
178              
179             #------------------------------------------------------------------------------
180             # Generate a tag ID for this XML tag
181             # Inputs: 0) tag property name list ref
182             # Returns: tagID and outtermost interesting namespace (or '' if no namespace)
183             sub GetTagID($)
184             {
185 67     67 0 102 my $props = shift;
186 67         75 my ($tag, $prop, $namespace);
187 67         108 foreach $prop (@$props) {
188             # split name into namespace and property name
189             # (Note: namespace can be '' for property qualifiers)
190 180 100       494 my ($ns, $nm) = ($prop =~ /(.*?):(.*)/) ? ($1, $2) : ('', $prop);
191 180 100       271 next if $ns eq 'vt'; # ignore 'vt' properties
192 136 100 100     431 if (defined $tag) {
    100 100        
193 2         4 $tag .= ucfirst($nm); # add to tag name
194             } elsif ($prop ne 'Properties' and $prop ne 'cp:coreProperties' and
195             $prop ne 'property')
196             {
197 39         42 $tag = $nm;
198             # save namespace of first property to contribute to tag name
199 39 50       62 $namespace = $ns unless $namespace;
200             }
201             }
202 67   100     199 return ($tag, $namespace || '');
203             }
204              
205             #------------------------------------------------------------------------------
206             # We found an XMP property name/value
207             # Inputs: 0) ExifTool object ref, 1) tag table ref
208             # 2) reference to array of XMP property names (last is current property)
209             # 3) property value, 4) attribute hash ref (not used here)
210             # Returns: 1 if valid tag was found
211             sub FoundTag($$$$;$)
212             {
213 151     151 0 241 my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
214 151 50       214 return 0 unless @$props;
215 151         369 my $verbose = $et->Options('Verbose');
216              
217 151         231 my $tag = $$props[-1];
218 151 50       209 $et->VPrint(0, " | - Tag '", join('/',@$props), "'\n") if $verbose > 1;
219              
220             # un-escape XML character entities
221 151         282 $val = Image::ExifTool::XMP::UnescapeXML($val);
222             # convert OOXML-escaped characters (eg. "_x0000d_" is a newline)
223 151         186 $val =~ s/_x([0-9a-f]{4})_/Image::ExifTool::PackUTF8(hex($1))/gie;
  0         0  
224             # convert from UTF8 to ExifTool Charset
225 151         318 $val = $et->Decode($val, 'UTF8');
226             # queue this attribute for later if necessary
227 151 100       279 if ($queueAttrs{$tag}) {
228 84         160 $queuedAttrs{$tag} = $val;
229 84         127 return 0;
230             }
231 67         69 my $ns;
232 67         88 ($tag, $ns) = GetTagID($props);
233 67 100 66     235 if (not $tag) {
    100 100        
    100          
    100          
234             # all properties are in ignored namespaces
235             # so 'name' from our queued attributes for the tag
236 28 50       62 my $name = $queuedAttrs{name} or return 0;
237 28         143 $name =~ s/(^| )([a-z])/$1\U$2/g; # start words with uppercase
238 28         47 ($tag = $name) =~ tr/-_a-zA-Z0-9//dc;
239 28 50       53 return 0 unless length $tag;
240 28 100       121 unless ($$tagTablePtr{$tag}) {
241 1         12 my %tagInfo = (
242             Name => $tag,
243             Description => $name,
244             );
245             # format as a date/time value if type is 'vt:filetime'
246 1 50       4 if ($$props[-1] eq 'vt:filetime') {
247             $tagInfo{Groups} = { 2 => 'Time' },
248             $tagInfo{Format} = 'date',
249 0         0 $tagInfo{PrintConv} = '$self->ConvertDateTime($val)';
250             }
251 1 50       4 $et->VPrint(0, " | [adding $tag]\n") if $verbose;
252 1         6 AddTagToTable($tagTablePtr, $tag, \%tagInfo);
253             }
254             } elsif ($tag eq 'xmlns') {
255             # ignore namespaces (for now)
256 2         5 return 0;
257             } elsif (ref $Image::ExifTool::XMP::Main{$ns} eq 'HASH' and
258             $Image::ExifTool::XMP::Main{$ns}{SubDirectory})
259             {
260             # use standard XMP table if it exists
261 4         9 my $table = $Image::ExifTool::XMP::Main{$ns}{SubDirectory}{TagTable};
262 1     1   6 no strict 'refs';
  1         2  
  1         770  
263 4 50 33     19 if ($table and %$table) {
264 4         31 $tagTablePtr = Image::ExifTool::GetTagTable($table);
265             }
266             } elsif (@$props > 2 and grep /^vt:vector$/, @$props) {
267             # handle vector properties (accumulate as lists)
268 7 100       21 if ($$props[-1] eq 'vt:size') {
    100          
    50          
269 2         2 $vectorCount = $val;
270 2         4 undef @vectorVals;
271 2         5 return 0;
272             } elsif ($$props[-1] eq 'vt:baseType') {
273 2         4 return 0; # ignore baseType
274             } elsif ($vectorCount) {
275 3         7 --$vectorCount;
276 3 100       6 if ($vectorCount) {
277 1         3 push @vectorVals, $val;
278 1         5 return 0;
279             }
280 2 100       7 $val = [ @vectorVals, $val ] if @vectorVals;
281             # Note: we will lose any improper-sized vector elements here
282             }
283             }
284             # add any unknown tags to table
285 60 50       108 if ($$tagTablePtr{$tag}) {
286 60         83 my $tagInfo = $$tagTablePtr{$tag};
287 60 50       107 if (ref $tagInfo eq 'HASH') {
288             # reformat date/time values
289 60   100     210 my $fmt = $$tagInfo{Format} || $$tagInfo{Writable} || '';
290 60 100       101 $val = Image::ExifTool::XMP::ConvertXMPDate($val) if $fmt eq 'date';
291             }
292             } else {
293 0 0       0 $et->VPrint(0, " [adding $tag]\n") if $verbose;
294 0         0 AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
295             }
296             # save the tag
297 60         197 $et->HandleTag($tagTablePtr, $tag, $val);
298              
299             # start fresh for next tag
300 60         78 undef $vectorCount;
301 60         94 undef %queuedAttrs;
302              
303 60         128 return 1;
304             }
305              
306             #------------------------------------------------------------------------------
307             # Extract information from an OOXML file
308             # Inputs: 0) ExifTool object reference, 1) dirInfo reference
309             # Returns: 1
310             # Notes: Upon entry to this routine, the file type has already been verified
311             # and the dirInfo hash contains 2 elements unique to this process proc:
312             # MIME - mime type of main document from "[Content_Types].xml"
313             # ZIP - reference to Archive::Zip object for this file
314             sub ProcessDOCX($$)
315             {
316 1     1 0 2 my ($et, $dirInfo) = @_;
317 1         7 my $zip = $$dirInfo{ZIP};
318 1         4 my $tagTablePtr = GetTagTable('Image::ExifTool::OOXML::Main');
319 1   33     5 my $mime = $$dirInfo{MIME} || $Image::ExifTool::mimeType{DOCX};
320              
321             # set the file type ('DOCX' by default)
322 1         3 my $fileType = $fileType{$mime};
323 1 50       2 if ($fileType) {
324             # THMX is a special case because its contents.main MIME types is PPTX
325 1 0 33     2 if ($fileType eq 'PPTX' and $$et{FILE_EXT} and $$et{FILE_EXT} eq 'THMX') {
      0        
326 0         0 $fileType = 'THMX';
327             }
328             } else {
329 0         0 $et->VPrint(0, "Unrecognized MIME type: $mime\n");
330             # get MIME type according to file extension
331 0         0 $fileType = $$et{FILE_EXT};
332             # default to 'DOCX' if this isn't a known OOXML extension
333 0 0 0     0 $fileType = 'DOCX' unless $fileType and $isOOXML{$fileType};
334             }
335 1         6 $et->SetFileType($fileType);
336              
337             # must catch all Archive::Zip warnings
338 1         5 local $SIG{'__WARN__'} = \&Image::ExifTool::ZIP::WarnProc;
339             # extract meta information from all files in ZIP "docProps" directory
340 1         1 my $docNum = 0;
341 1         8 my @members = $zip->members();
342 1         8 my $member;
343 1         1 foreach $member (@members) {
344             # get filename of this ZIP member
345 18         58 my $file = $member->fileName();
346 18 50       151 next unless defined $file;
347 18         64 $et->VPrint(0, "File: $file\n");
348             # set the document number and extract ZIP tags
349 18         33 $$et{DOC_NUM} = ++$docNum;
350 18         47 Image::ExifTool::ZIP::HandleMember($et, $member);
351             # process only XML and JPEG/WMF thumbnail images in "docProps" directory
352 18 100       64 next unless $file =~ m{^docProps/(.*\.xml|(thumbnail\.(jpe?g|wmf)))$}i;
353             # get the file contents (CAREFUL! $buff MUST be local since we hand off a value ref)
354 4         24 my ($buff, $status) = $zip->contents($member);
355 4 50       3928 $status and $et->Warn("Error extracting $file"), next;
356             # extract docProps/thumbnail.(jpg|mwf) as PreviewImage|PreviewMWF
357 4 100       28 if ($file =~ /\.(jpe?g|wmf)$/i) {
358 1 50       7 my $tag = $file =~ /\.wmf$/i ? 'PreviewWMF' : 'PreviewImage';
359 1         5 $et->FoundTag($tag, \$buff);
360 1         4 next;
361             }
362             # process XML files (docProps/app.xml, docProps/core.xml, docProps/custom.xml)
363 3 50       35 my %dirInfo = (
364             DataPt => \$buff,
365             DirLen => length $buff,
366             # (skip over XML header if it exists)
367             DirStart => ($buff =~ /<\?xml\s+.*?\?>/g ? pos($buff) : 0),
368             DataLen => length $buff,
369             XMPParseOpts => { FoundProc => \&FoundTag },
370             );
371 3         17 $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
372 3         15 undef $buff; # (free memory now)
373             }
374 1         2 delete $$et{DOC_NUM};
375 1         6 return 1;
376             }
377              
378             1; # end
379              
380             __END__