File Coverage

blib/lib/MsOffice/Word/Surgeon.pm
Criterion Covered Total %
statement 60 84 71.4
branch 6 16 37.5
condition 2 6 33.3
subroutine 17 24 70.8
pod 7 10 70.0
total 92 140 65.7


line stmt bran cond sub pod time code
1             package MsOffice::Word::Surgeon;
2 1     1   82989 use 5.24.0;
  1         4  
3 1     1   492 use Moose;
  1         378995  
  1         7  
4 1     1   7026 use MooseX::StrictConstructor;
  1         25321  
  1         3  
5 1     1   8758 use Archive::Zip qw(AZ_OK);
  1         77246  
  1         61  
6 1     1   10 use Encode qw(encode_utf8 decode_utf8);
  1         2  
  1         49  
7 1     1   6 use Carp qw(croak);
  1         2  
  1         46  
8 1     1   446 use MsOffice::Word::Surgeon::Revision;
  1         4  
  1         52  
9 1     1   672 use MsOffice::Word::Surgeon::PackagePart;
  1         4  
  1         132  
10              
11             # syntactic sugar for attributes
12             sub has_lazy ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr")}
13             sub has_inner ($@) {my $attr = shift; has_lazy($attr => @_, init_arg => undef)}
14              
15              
16 1     1   8 use namespace::clean -except => 'meta';
  1         2  
  1         5  
17              
18             our $VERSION = '2.02';
19              
20              
21             #======================================================================
22             # ATTRIBUTES
23             #======================================================================
24              
25             # attributes to the constructor -- either the filename or an existing zip archive
26             has 'docx' => (is => 'ro', isa => 'Str');
27             has_lazy 'zip' => (is => 'ro', isa => 'Archive::Zip');
28              
29             # inner attributes lazily constructed by the module
30             has_inner 'parts' => (is => 'ro', isa => 'HashRef[MsOffice::Word::Surgeon::PackagePart]',
31             traits => ['Hash'], handles => {part => 'get'});
32              
33             has_inner 'document' => (is => 'ro', isa => 'MsOffice::Word::Surgeon::PackagePart',
34             handles => [qw/contents original_contents indented_contents plain_text replace/]);
35             # Note: this attribute is equivalent to $self->part('document'); made into an attribute
36             # for convenience and for automatic delegation of methods through the 'handles' declaration
37              
38             # just a slot for internal storage
39             has 'next_rev_id' => (is => 'bare', isa => 'Num', default => 1, init_arg => undef);
40             # used by the revision() method for creating *::Revision objects -- each instance
41             # gets a fresh value
42              
43              
44             #======================================================================
45             # BUILDING INSTANCES
46             #======================================================================
47              
48              
49             # syntactic sugar for supporting ->new($path) instead of ->new(docx => $path)
50             around BUILDARGS => sub {
51             my $orig = shift;
52             my $class = shift;
53              
54             if ( @_ == 1 && !ref $_[0] ) {
55             return $class->$orig(docx => $_[0]);
56             }
57             else {
58             return $class->$orig(@_);
59             }
60             };
61              
62              
63             # make sure that the constructor got either a 'docx' or a 'zip' attribute
64             sub BUILD {
65 1     1 0 2413 my $self = shift;
66              
67 1         4 my $class = ref $self;
68              
69             $self->{docx} || $self->{zip}
70 1 0 33     5 or croak "$class->new() : need either 'docx' or 'zip' attribute";
71             not ($self->{docx} && $self->{zip})
72 1 50 33     10 or croak "$class->new() : can't have both 'docx' and 'zip' attributes";
73             }
74              
75              
76             #======================================================================
77             # LAZY ATTRIBUTE CONSTRUCTORS
78             #======================================================================
79              
80             sub _zip {
81 1     1   2 my $self = shift;
82              
83 1         10 my $zip = Archive::Zip->new;
84 1 50       50 $zip->read($self->{docx}) == AZ_OK
85             or croak "cannot unzip $self->{docx}";
86              
87 1         5348 return $zip;
88             }
89              
90              
91              
92             sub _parts {
93 1     1   3 my $self = shift;
94              
95             # first create a package part for the main document
96 1         12 my $doc = MsOffice::Word::Surgeon::PackagePart->new(surgeon => $self,
97             part_name => 'document');
98              
99             # gather names of headers and footers related to that document
100 6         36 my @headers_footers = map {$_->{Target} =~ s/\.xml$//r}
101 1 100       2564 grep {$_ && $_->{short_type} =~ /^(header|footer)$/}
  22         95  
102             $doc->relationships->@*;
103              
104             # create package parts for headers and footers and assemble all parts into a hash
105 1         11 my %parts = (document => $doc);
106             $parts{$_} = MsOffice::Word::Surgeon::PackagePart->new(surgeon => $self,
107             part_name => $_)
108 1         9 for @headers_footers;
109              
110 1         12930 return \%parts;
111             }
112              
113              
114 1     1   36 sub _document {shift->part('document')}
115              
116              
117             #======================================================================
118             # ACCESSING OR CHANGING THE INTERNAL STATE
119             #======================================================================
120              
121             sub xml_member {
122 8     8 1 27 my ($self, $member_name, $new_content) = @_;
123              
124 8 50       22 if (! defined $new_content) { # used as a reader
125 8 50       225 my $bytes = $self->zip->contents($member_name)
126             or croak "no zip member for $member_name";
127 8         12121 return decode_utf8($bytes);
128             }
129             else { # used as a writer
130 0         0 my $bytes = encode_utf8($new_content);
131 0         0 return $self->zip->contents($member_name, $bytes);
132             }
133             }
134              
135             sub _content_types {
136 0     0   0 my ($self, $new_content_types) = @_;
137 0         0 return $self->xml_member('[Content_Types].xml', $new_content_types);
138             }
139              
140              
141             sub headers {
142 2     2 1 1148 my ($self) = @_;
143 2         78 return sort {substr($a, 6) <=> substr($b, 6)} grep {/^header/} keys $self->parts->%*;
  6         35  
  14         44  
144             }
145              
146             sub footers {
147 1     1 1 8 my ($self) = @_;
148 1         39 return sort {substr($a, 6) <=> substr($b, 6)} grep {/^footer/} keys $self->parts->%*;
  3         35  
  7         21  
149             }
150              
151             sub new_rev_id {
152 0     0 0 0 my ($self) = @_;
153 0         0 return $self->{next_rev_id}++;
154             }
155              
156              
157              
158             #======================================================================
159             # GENERIC PROPAGATION TO ALL PARTS
160             #======================================================================
161              
162              
163             sub all_parts_do {
164 1     1 1 6 my ($self, $method_name, @args) = @_;
165              
166 1         39 my $parts = $self->parts;
167              
168             # apply the method to each package part
169 1         6 my %result;
170 1         13 $result{$_} = $parts->{$_}->$method_name(@args) foreach keys %$parts;
171              
172              
173 1         13 return \%result;
174             }
175              
176              
177             #======================================================================
178             # CLONING
179             #======================================================================
180              
181             sub clone {
182 0     0 0   my $self = shift;
183              
184             # create a new Zip archive and copy all members to it
185 0           my $new_zip = Archive::Zip->new;
186 0           foreach my $member ($self->zip->members) {
187 0           $new_zip->addMember($member);
188             }
189              
190             # create a new instance of this class
191 0           my $class = ref $self;
192 0           my $clone = $class->new(zip => $new_zip);
193              
194             # other attributes will be recreated lazily within the clone .. not
195             # the most efficient way, but it is easier and safer, otherwise there is
196             # a risk of mixed references
197              
198 0           return $clone;
199             }
200              
201             #======================================================================
202             # SAVING THE FILE
203             #======================================================================
204              
205              
206             sub _update_contents_in_zip {
207 0     0     my $self = shift;
208 0           $_->_update_contents_in_zip foreach values $self->parts->%*;
209             }
210              
211              
212             sub overwrite {
213 0     0 1   my $self = shift;
214              
215 0           $self->_update_contents_in_zip;
216 0 0         $self->zip->overwrite == AZ_OK
217             or croak "error overwriting zip archive " . $self->docx;
218             }
219              
220             sub save_as {
221 0     0 1   my ($self, $docx) = @_;
222              
223 0           $self->_update_contents_in_zip;
224 0 0         $self->zip->writeToFileNamed($docx) == AZ_OK
225             or croak "error writing zip archive to $docx";
226             }
227              
228              
229             #======================================================================
230             # DELEGATION TO OTHER CLASSES
231             #======================================================================
232              
233             sub new_revision {
234 0     0 1   my $self = shift;
235              
236 0           my $revision = MsOffice::Word::Surgeon::Revision->new(rev_id => $self->new_rev_id, @_);
237 0           return $revision->as_xml;
238             }
239              
240              
241             1;
242              
243             __END__
244              
245             =encoding ISO-8859-1
246              
247             =head1 NAME
248              
249             MsOffice::Word::Surgeon - tamper with the guts of Microsoft docx documents, with regexes
250              
251             =head1 SYNOPSIS
252              
253             my $surgeon = MsOffice::Word::Surgeon->new(docx => $filename);
254              
255             # extract plain text
256             my $main_text = $surgeon->document->plain_text;
257             my @header_texts = map {$surgeon->part($_)->plain_text} $surgeon->headers;
258              
259             # anonymize
260             my %alias = ('Claudio MONTEVERDI' => 'A_____', 'Heinrich SCHÜTZ' => 'B_____');
261             my $pattern = join "|", keys %alias;
262             my $replacement_callback = sub {
263             my %args = @_;
264             my $replacement = $surgeon->new_revision(to_delete => $args{matched},
265             to_insert => $alias{$args{matched}},
266             run => $args{run},
267             xml_before => $args{xml_before},
268             );
269             return $replacement;
270             };
271             $surgeon->all_parts_do(replace => qr[$pattern], $replacement_callback);
272              
273             # save the result
274             $surgeon->overwrite; # or ->save_as($new_filename);
275              
276              
277             =head1 DESCRIPTION
278              
279             =head2 Purpose
280              
281             This module supports a few operations for inspecting or modifying contents
282             in Microsoft Word documents in '.docx' format -- therefore the name
283             'surgeon'. Since a surgeon does not give life, there is no support for
284             creating fresh documents; if you have such needs, use one of the other
285             packages listed in the L<SEE ALSO> section -- or use the companion module
286             L<MsOffice::Word::Template>.
287              
288             Some applications for this module are :
289              
290             =over
291              
292             =item *
293              
294             content extraction in plain text format;
295              
296             =item *
297              
298             unlinking fields (equivalent of performing Ctrl-Shift-F9 on the whole document)
299              
300             =item *
301              
302             regex replacements within text, for example for :
303              
304             =over
305              
306             =item *
307              
308             anonymization, i.e. replacement of names or adresses by aliases;
309              
310             =item *
311              
312             templating, i.e. replacement of special markup by contents coming from a data tree
313             (see also L<MsOffice::Word::Template>).
314              
315             =back
316              
317             =item *
318              
319             insertion of generated images (for example barcodes) -- see L<MsOffice::Word::Surgeon::PackagePart/images>;
320              
321             =item *
322              
323             pretty-printing the internal XML structure.
324              
325             =back
326              
327              
328              
329              
330             =head2 Operating mode
331              
332             The format of Microsoft C<.docx> documents is described in
333             L<http://www.ecma-international.org/publications/standards/Ecma-376.htm>
334             and L<http://officeopenxml.com/>. An excellent introduction can be
335             found at L<https://www.toptal.com/xml/an-informal-introduction-to-docx>.
336             Internally, a document is a zipped
337             archive, where the member named C<word/document.xml> stores the main
338             document contents, in XML format.
339              
340             The present module does not parse all details of the whole XML
341             structure because it only focuses on I<text> nodes (those that contain
342             literal text) and I<run> nodes (those that contain text formatting
343             properties). All remaining XML information, for example for
344             representing sections, paragraphs, tables, etc., is stored as opaque
345             XML fragments; these fragments are re-inserted at proper places when
346             reassembling the whole document after having modified some text nodes.
347              
348              
349             =head1 METHODS
350              
351             =head2 Constructor
352              
353             =head3 new
354              
355             my $surgeon = MsOffice::Word::Surgeon->new(docx => $filename);
356             # or simply : ->new($filename);
357              
358             Builds a new surgeon instance, initialized with the contents of the given filename.
359              
360             =head2 Accessors
361              
362             =head3 docx
363              
364             Path to the C<.docx> file
365              
366             =head3 zip
367              
368             Instance of L<Archive::Zip> associated with this file
369              
370             =head3 parts
371              
372             Hashref to L<MsOffice::Word::Surgeon::PackagePart> objects, keyed by their part name in the ZIP file.
373             There is always a C<'document'> part. Currently, other optional parts may be headers and footers.
374             Future versions may include other parts like footnotes or endnotes.
375              
376             =head3 document
377              
378             Shortcut to C<< $surgeon->part('document') >> -- the
379             L<MsOffice::Word::Surgeon::PackagePart> object corresponding to the main document.
380             See the C<PackagePart> documentation for operations on part objects.
381             Besides, the following operations are supported directly as methods to the C<< $surgeon >> object
382             and are automatically delegated to the C<< document >> part :
383             C<contents>, C<original_contents>, C<indented_contents>, C<plain_text>, C<replace>.
384              
385              
386              
387             =head3 headers
388              
389             my @header_parts = $surgeon->headers;
390              
391             Returns the ordered list of names of header members stored in the ZIP file.
392              
393             =head3 footers
394              
395             my @footer_parts = $surgeon->footers;
396              
397             Returns the ordered list of names of footer members stored in the ZIP file.
398              
399              
400             =head2 Other methods
401              
402              
403             =head3 part
404              
405             my $part = $surgeon->part($part_name);
406              
407             Returns the L<MsOffice::Word::Surgeon::PackagePart> object corresponding to the given part name.
408              
409              
410             =head3 all_parts_do
411              
412             my $result = $surgeon->all_parts_do($method_name => %args);
413              
414             Calls the given method on all part objects. Results are accumulated
415             in a hash, with part names as keys to the results. This is mostly
416             used to invoke the L<MsOffice::Word::Surgeon::PackagePart/replace> method, i.e.
417              
418             $surgeon->all_parts_do(replace => qr[$pattern], $replacement_callback, %replacement_args);
419              
420              
421             =head3 xml_member
422              
423             my $xml = $surgeon->xml_member($member_name); # reading
424             # or
425             $surgeon->xml_member($member_name, $new_xml); # writing
426              
427             Reads or writes the given member name in the ZIP file, with utf8 decoding or encoding.
428              
429              
430             =head3 save_as
431              
432             $surgeon->save_as($docx_file);
433              
434             Writes the ZIP archive into the given file.
435              
436              
437             =head3 overwrite
438              
439             $surgeon->overwrite;
440              
441             Writes the updated ZIP archive into the initial file.
442              
443              
444             =head3 new_revision
445              
446             my $xml = $surgeon->new_revision(
447             to_delete => $text_to_delete,
448             to_insert => $text_to_insert,
449             author => $author_string,
450             date => $date_string,
451             run => $run_object,
452             xml_before => $xml_string,
453             );
454              
455             This method is syntactic sugar for instantiating the
456             L<MsOffice::Word::Surgeon::Revision> class and returning
457             XML markup for MsWord revisions (a.k.a. "tracked changes")
458             generated by that class. Users can
459             then manually review those revisions within MsWord and accept or reject
460             them. This is best used in collaboration with the L</replace> method :
461             the replacement callback can call C<< $self->new_revision(...) >> to
462             generate revision marks in the document.
463              
464             Either C<to_delete> or C<to_insert> (or both) must
465             be present. Other parameters are optional. The parameters are :
466              
467             =over
468              
469             =item to_delete
470              
471             The string of text to delete (usually this will be the C<matched> argument
472             passed to the replacement callback).
473              
474             =item to_insert
475              
476             The string of new text to insert.
477              
478             =item author
479              
480             A short string that will be displayed by MsWord as the "author" of this revision.
481              
482             =item date
483              
484             A date (and optional time) in ISO format that will be displayed by
485             MsWord as the date of this revision. The current date and time
486             will be used by default.
487              
488             =item run
489              
490             A reference to the L<MsOffice::Word::Surgeon::Run> object surrounding
491             this revision. The formatting properties of that run will be
492             copied into the C<< <w:r> >> nodes of the deleted and inserted text fragments.
493              
494              
495             =item xml_before
496              
497             An optional XML fragment to be inserted before the C<< <w:t> >> node
498             of the inserted text
499              
500             =back
501              
502              
503             =head1 SEE ALSO
504              
505             The L<https://metacpan.org/pod/Document::OOXML> distribution on CPAN
506             also manipulates C<docx> documents, but with another approach :
507             internally it uses L<XML::LibXML> and XPath expressions for
508             manipulating XML nodes. The API has some intersections with the
509             present module, but there are also some differences : C<Document::OOXML>
510             has more support for styling, while C<MsOffice::Word::Surgeon>
511             has more flexible mechanisms for replacing
512             text fragments.
513              
514              
515             Other programming languages also have packages for dealing with C<docx> documents; here
516             are some references :
517              
518             =over
519              
520             =item L<https://docs.microsoft.com/en-us/office/open-xml/word-processing>
521              
522             The C# Open XML SDK from Microsoft
523              
524             =item L<http://www.ericwhite.com/blog/open-xml-powertools-developer-center/>
525              
526             Additional functionalities built on top of the XML SDK.
527              
528             =item L<https://poi.apache.org>
529              
530             An open source Java library from the Apache foundation.
531              
532             =item L<https://www.docx4java.org/trac/docx4j>
533              
534             Another open source Java library, competitor to Apache POI.
535              
536             =item L<https://phpword.readthedocs.io/en/latest/>
537              
538             A PHP library dealing not only with Microsoft OOXML documents but also
539             with OASIS and RTF formats.
540              
541             =item L<https://pypi.org/project/python-docx/>
542              
543             A Python library, documented at L<https://python-docx.readthedocs.io/en/latest/>.
544              
545             =back
546              
547             As far as I can tell, most of these libraries provide objects and methods that
548             closely reflect the complete XML structure : for example they have classes for
549             paragraphs, styles, fonts, inline shapes, etc.
550              
551             The present module is much simpler but also much more limited : it was optimised
552             for dealing with the text contents and offers no support for presentation or
553             paging features. However, it has the rare advantage of providing an API for
554             regex substitutions within Word documents.
555              
556             The L<MsOffice::Word::Template> module relies on the present module, together with
557             the L<Perl Template Toolkit|Template>, to implement a templating system for Word documents.
558              
559              
560             =head1 AUTHOR
561              
562             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
563              
564             =head1 COPYRIGHT AND LICENSE
565              
566             Copyright 2019-2023 by Laurent Dami.
567              
568             This library is free software; you can redistribute it and/or modify
569             it under the same terms as Perl itself.