File Coverage

blib/lib/Image/ExifTool/OOXML.pm
Criterion Covered Total %
statement 104 112 92.8
branch 49 70 70.0
condition 18 31 58.0
subroutine 9 9 100.0
pod 0 3 0.0
total 180 225 80.0


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: OOXML.pm
3             #
4             # Description: Read Office Open XML+ZIP files
5             #
6             # Revisions: 2009/10/31 - P. Harvey Created
7             #------------------------------------------------------------------------------
8              
9             package Image::ExifTool::OOXML;
10              
11 1     1   6 use strict;
  1         2  
  1         28  
12 1     1   4 use vars qw($VERSION);
  1         1  
  1         47  
13 1     1   5 use Image::ExifTool qw(:DataAccess :Utils);
  1         2  
  1         199  
14 1     1   878 use Image::ExifTool::XMP;
  1         23  
  1         47  
15 1     1   6 use Image::ExifTool::ZIP;
  1         2  
  1         677  
16              
17             $VERSION = '1.08';
18              
19             # test for recognized OOXML document extensions
20             my %isOOXML = (
21             DOCX => 1, DOCM => 1,
22             DOTX => 1, DOTM => 1,
23             POTX => 1, POTM => 1,
24             PPAX => 1, PPAM => 1,
25             PPSX => 1, PPSM => 1,
26             PPTX => 1, PPTM => 1, THMX => 1,
27             XLAM => 1,
28             XLSX => 1, XLSM => 1, XLSB => 1,
29             XLTX => 1, XLTM => 1,
30             );
31              
32             # generate reverse lookup for file type based on MIME
33             my %fileType;
34             {
35             my $type;
36             foreach $type (keys %isOOXML) {
37             $fileType{$Image::ExifTool::mimeType{$type}} = $type;
38             }
39             }
40              
41             # XML attributes to queue
42             my %queuedAttrs;
43             my %queueAttrs = (
44             fmtid => 1,
45             pid => 1,
46             name => 1,
47             );
48              
49             # keep track of items in a vector (to accumulate as a list)
50             my $vectorCount;
51             my @vectorVals;
52              
53             # Office Open XML tags
54             %Image::ExifTool::OOXML::Main = (
55             GROUPS => { 0 => 'XML', 1 => 'XML', 2 => 'Document' },
56             PROCESS_PROC => \&Image::ExifTool::XMP::ProcessXMP,
57             VARS => { NO_ID => 1 },
58             NOTES => q{
59             The Office Open XML (OOXML) format was introduced with Microsoft Office 2007
60             and is used by file types such as DOCX, PPTX and XLSX. These are
61             essentially ZIP archives containing XML files. The table below lists some
62             tags which have been observed in OOXML documents, but ExifTool will extract
63             any tags found from XML files of the OOXML document properties ("docProps")
64             directory.
65              
66             B
67              
68             1) Structural ZIP tags may be ignored (if desired) with C<--ZIP:all> on the
69             command line.
70              
71             2) Tags may be grouped by their document number in the ZIP archive with the
72             C<-g3> or C<-G3> option.
73             },
74             # These tags all have 1:1 correspondence with FlashPix tags except for:
75             # OOXML FlashPix
76             # --------------- -------------
77             # DocSecurity Security
78             # Application Software
79             # dc:Description Comments
80             # dc:Creator Author
81             Application => { },
82             AppVersion => { },
83             category => { },
84             Characters => { },
85             CharactersWithSpaces => { },
86             CheckedBy => { },
87             Client => { },
88             Company => { },
89             created => {
90             Name => 'CreateDate',
91             Groups => { 2 => 'Time' },
92             Format => 'date',
93             PrintConv => '$self->ConvertDateTime($val)',
94             },
95             createdType => { Hidden => 1, RawConv => 'undef' }, # ignore this XML type name
96             DateCompleted => {
97             Groups => { 2 => 'Time' },
98             Format => 'date',
99             PrintConv => '$self->ConvertDateTime($val)',
100             },
101             Department => { },
102             Destination => { },
103             Disposition => { },
104             Division => { },
105             DocSecurity => {
106             # (http://msdn.microsoft.com/en-us/library/documentformat.openxml.extendedproperties.documentsecurity.aspx)
107             PrintConv => {
108             0 => 'None',
109             1 => 'Password protected',
110             2 => 'Read-only recommended',
111             4 => 'Read-only enforced',
112             8 => 'Locked for annotations',
113             },
114             },
115             DocumentNumber=> { },
116             Editor => { Groups => { 2 => 'Author'} },
117             ForwardTo => { },
118             Group => { },
119             HeadingPairs=> { },
120             HiddenSlides=> { },
121             HyperlinkBase=>{ },
122             HyperlinksChanged => { PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
123             keywords => { },
124             Language => { },
125             lastModifiedBy => { Groups => { 2 => 'Author'} },
126             lastPrinted => {
127             Groups => { 2 => 'Time' },
128             Format => 'date',
129             PrintConv => '$self->ConvertDateTime($val)',
130             },
131             Lines => { },
132             LinksUpToDate=>{ PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
133             Mailstop => { },
134             Manager => { },
135             Matter => { },
136             MMClips => { },
137             modified => {
138             Name => 'ModifyDate',
139             Groups => { 2 => 'Time' },
140             Format => 'date',
141             PrintConv => '$self->ConvertDateTime($val)',
142             },
143             modifiedType=> { Hidden => 1, RawConv => 'undef' }, # ignore this XML type name
144             Notes => { },
145             Office => { },
146             Owner => { Groups => { 2 => 'Author'} },
147             Pages => { },
148             Paragraphs => { },
149             PresentationFormat => { },
150             Project => { },
151             Publisher => { },
152             Purpose => { },
153             ReceivedFrom=> { },
154             RecordedBy => { },
155             RecordedDate=> {
156             Groups => { 2 => 'Time' },
157             Format => 'date',
158             PrintConv => '$self->ConvertDateTime($val)',
159             },
160             Reference => { },
161             revision => { Name => 'RevisionNumber' },
162             ScaleCrop => { PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
163             SharedDoc => { PrintConv => { 'false' => 'No', 'true' => 'Yes' } },
164             Slides => { },
165             Source => { },
166             Status => { },
167             TelephoneNumber => { },
168             Template => { },
169             TitlesOfParts=>{ },
170             TotalTime => {
171             Name => 'TotalEditTime',
172             PrintConv => 'ConvertTimeSpan($val, 60)',
173             },
174             Typist => { },
175             Words => { },
176             );
177              
178             #------------------------------------------------------------------------------
179             # Generate a tag ID for this XML tag
180             # Inputs: 0) tag property name list ref
181             # Returns: tagID and outtermost interesting namespace (or '' if no namespace)
182             sub GetTagID($)
183             {
184 67     67 0 82 my $props = shift;
185 67         84 my ($tag, $prop, $namespace);
186 67         101 foreach $prop (@$props) {
187             # split name into namespace and property name
188             # (Note: namespace can be '' for property qualifiers)
189 180 100       520 my ($ns, $nm) = ($prop =~ /(.*?):(.*)/) ? ($1, $2) : ('', $prop);
190 180 100       302 next if $ns eq 'vt'; # ignore 'vt' properties
191 136 100 100     470 if (defined $tag) {
    100 100        
192 2         7 $tag .= ucfirst($nm); # add to tag name
193             } elsif ($prop ne 'Properties' and $prop ne 'cp:coreProperties' and
194             $prop ne 'property')
195             {
196 39         43 $tag = $nm;
197             # save namespace of first property to contribute to tag name
198 39 50       78 $namespace = $ns unless $namespace;
199             }
200             }
201 67   100     215 return ($tag, $namespace || '');
202             }
203              
204             #------------------------------------------------------------------------------
205             # We found an XMP property name/value
206             # Inputs: 0) ExifTool object ref, 1) tag table ref
207             # 2) reference to array of XMP property names (last is current property)
208             # 3) property value, 4) attribute hash ref (not used here)
209             # Returns: 1 if valid tag was found
210             sub FoundTag($$$$;$)
211             {
212 151     151 0 228 my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
213 151 50       241 return 0 unless @$props;
214 151         301 my $verbose = $et->Options('Verbose');
215              
216 151         229 my $tag = $$props[-1];
217 151 50       227 $et->VPrint(0, " | - Tag '", join('/',@$props), "'\n") if $verbose > 1;
218              
219             # un-escape XML character entities
220 151         269 $val = Image::ExifTool::XMP::UnescapeXML($val);
221             # convert OOXML-escaped characters (eg. "_x0000d_" is a newline)
222 151         195 $val =~ s/_x([0-9a-f]{4})_/Image::ExifTool::PackUTF8(hex($1))/gie;
  0         0  
223             # convert from UTF8 to ExifTool Charset
224 151         328 $val = $et->Decode($val, 'UTF8');
225             # queue this attribute for later if necessary
226 151 100       268 if ($queueAttrs{$tag}) {
227 84         124 $queuedAttrs{$tag} = $val;
228 84         138 return 0;
229             }
230 67         76 my $ns;
231 67         103 ($tag, $ns) = GetTagID($props);
232 67 100 66     241 if (not $tag) {
    100 100        
    100          
    100          
233             # all properties are in ignored namespaces
234             # so 'name' from our queued attributes for the tag
235 28 50       54 my $name = $queuedAttrs{name} or return 0;
236 28         117 $name =~ s/(^| )([a-z])/$1\U$2/g; # start words with uppercase
237 28         52 ($tag = $name) =~ tr/-_a-zA-Z0-9//dc;
238 28 50       53 return 0 unless length $tag;
239 28 100       63 unless ($$tagTablePtr{$tag}) {
240 1         4 my %tagInfo = (
241             Name => $tag,
242             Description => $name,
243             );
244             # format as a date/time value if type is 'vt:filetime'
245 1 50       4 if ($$props[-1] eq 'vt:filetime') {
246             $tagInfo{Groups} = { 2 => 'Time' },
247             $tagInfo{Format} = 'date',
248 0         0 $tagInfo{PrintConv} = '$self->ConvertDateTime($val)';
249             }
250 1 50       3 $et->VPrint(0, " | [adding $tag]\n") if $verbose;
251 1         5 AddTagToTable($tagTablePtr, $tag, \%tagInfo);
252             }
253             } elsif ($tag eq 'xmlns') {
254             # ignore namespaces (for now)
255 2         5 return 0;
256             } elsif (ref $Image::ExifTool::XMP::Main{$ns} eq 'HASH' and
257             $Image::ExifTool::XMP::Main{$ns}{SubDirectory})
258             {
259             # use standard XMP table if it exists
260 4         7 my $table = $Image::ExifTool::XMP::Main{$ns}{SubDirectory}{TagTable};
261 1     1   6 no strict 'refs';
  1         1  
  1         636  
262 4 50 33     19 if ($table and %$table) {
263 4         10 $tagTablePtr = Image::ExifTool::GetTagTable($table);
264             }
265             } elsif (@$props > 2 and grep /^vt:vector$/, @$props) {
266             # handle vector properties (accumulate as lists)
267 7 100       22 if ($$props[-1] eq 'vt:size') {
    100          
    50          
268 2         3 $vectorCount = $val;
269 2         3 undef @vectorVals;
270 2         5 return 0;
271             } elsif ($$props[-1] eq 'vt:baseType') {
272 2         6 return 0; # ignore baseType
273             } elsif ($vectorCount) {
274 3         6 --$vectorCount;
275 3 100       7 if ($vectorCount) {
276 1         2 push @vectorVals, $val;
277 1         3 return 0;
278             }
279 2 100       6 $val = [ @vectorVals, $val ] if @vectorVals;
280             # Note: we will lose any improper-sized vector elements here
281             }
282             }
283             # add any unknown tags to table
284 60 50       101 if ($$tagTablePtr{$tag}) {
285 60         82 my $tagInfo = $$tagTablePtr{$tag};
286 60 50       116 if (ref $tagInfo eq 'HASH') {
287             # reformat date/time values
288 60   100     187 my $fmt = $$tagInfo{Format} || $$tagInfo{Writable} || '';
289 60 100       125 $val = Image::ExifTool::XMP::ConvertXMPDate($val) if $fmt eq 'date';
290             }
291             } else {
292 0 0       0 $et->VPrint(0, " [adding $tag]\n") if $verbose;
293 0         0 AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
294             }
295             # save the tag
296 60         166 $et->HandleTag($tagTablePtr, $tag, $val);
297              
298             # start fresh for next tag
299 60         82 undef $vectorCount;
300 60         89 undef %queuedAttrs;
301              
302 60         130 return 1;
303             }
304              
305             #------------------------------------------------------------------------------
306             # Extract information from an OOXML file
307             # Inputs: 0) ExifTool object reference, 1) dirInfo reference
308             # Returns: 1
309             # Notes: Upon entry to this routine, the file type has already been verified
310             # and the dirInfo hash contains 2 elements unique to this process proc:
311             # MIME - mime type of main document from "[Content_Types].xml"
312             # ZIP - reference to Archive::Zip object for this file
313             sub ProcessDOCX($$)
314             {
315 1     1 0 2 my ($et, $dirInfo) = @_;
316 1         2 my $zip = $$dirInfo{ZIP};
317 1         3 my $tagTablePtr = GetTagTable('Image::ExifTool::OOXML::Main');
318 1   33     5 my $mime = $$dirInfo{MIME} || $Image::ExifTool::mimeType{DOCX};
319              
320             # set the file type ('DOCX' by default)
321 1         2 my $fileType = $fileType{$mime};
322 1 50       3 if ($fileType) {
323             # THMX is a special case because its contents.main MIME types is PPTX
324 1 0 33     2 if ($fileType eq 'PPTX' and $$et{FILE_EXT} and $$et{FILE_EXT} eq 'THMX') {
      0        
325 0         0 $fileType = 'THMX';
326             }
327             } else {
328 0         0 $et->VPrint(0, "Unrecognized MIME type: $mime\n");
329             # get MIME type according to file extension
330 0         0 $fileType = $$et{FILE_EXT};
331             # default to 'DOCX' if this isn't a known OOXML extension
332 0 0 0     0 $fileType = 'DOCX' unless $fileType and $isOOXML{$fileType};
333             }
334 1         5 $et->SetFileType($fileType);
335              
336             # must catch all Archive::Zip warnings
337 1         4 local $SIG{'__WARN__'} = \&Image::ExifTool::ZIP::WarnProc;
338             # extract meta information from all files in ZIP "docProps" directory
339 1         2 my $docNum = 0;
340 1         4 my @members = $zip->members();
341 1         7 my $member;
342 1         3 foreach $member (@members) {
343             # get filename of this ZIP member
344 18         45 my $file = $member->fileName();
345 18 50       157 next unless defined $file;
346 18         60 $et->VPrint(0, "File: $file\n");
347             # set the document number and extract ZIP tags
348 18         30 $$et{DOC_NUM} = ++$docNum;
349 18         44 Image::ExifTool::ZIP::HandleMember($et, $member);
350             # process only XML and JPEG/WMF thumbnail images in "docProps" directory
351 18 100       67 next unless $file =~ m{^docProps/(.*\.xml|(thumbnail\.(jpe?g|wmf)))$}i;
352             # get the file contents (CAREFUL! $buff MUST be local since we hand off a value ref)
353 4         13 my ($buff, $status) = $zip->contents($member);
354 4 50       3892 $status and $et->Warn("Error extracting $file"), next;
355             # extract docProps/thumbnail.(jpg|mwf) as PreviewImage|PreviewMWF
356 4 100       23 if ($file =~ /\.(jpe?g|wmf)$/i) {
357 1 50       5 my $tag = $file =~ /\.wmf$/i ? 'PreviewWMF' : 'PreviewImage';
358 1         4 $et->FoundTag($tag, \$buff);
359 1         3 next;
360             }
361             # process XML files (docProps/app.xml, docProps/core.xml, docProps/custom.xml)
362 3         14 my %dirInfo = (
363             DataPt => \$buff,
364             DirLen => length $buff,
365             DataLen => length $buff,
366             XMPParseOpts => {
367             FoundProc => \&FoundTag,
368             },
369             );
370 3         13 $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
371 3         11 undef $buff; # (free memory now)
372             }
373 1         3 delete $$et{DOC_NUM};
374 1         5 return 1;
375             }
376              
377             1; # end
378              
379             __END__