File Coverage

blib/lib/MsOffice/Word/Surgeon.pm
Criterion Covered Total %
statement 75 94 79.7
branch 10 20 50.0
condition 2 6 33.3
subroutine 21 26 80.7
pod 7 10 70.0
total 115 156 73.7


line stmt bran cond sub pod time code
1             package MsOffice::Word::Surgeon;
2 4     4   479203 use 5.24.0;
  4         16  
3 4     4   2490 use Moose;
  4         1931103  
  4         57  
4 4     4   34500 use MooseX::StrictConstructor;
  4         134558  
  4         20  
5 4     4   41367 use Moose::Util::TypeConstraints qw(union);
  4         10  
  4         47  
6 4     4   6084 use Archive::Zip qw(AZ_OK);
  4         389294  
  4         256  
7 4     4   40 use Encode qw(encode_utf8 decode_utf8);
  4         13  
  4         215  
8 4     4   20 use Scalar::Util qw(openhandle);
  4         9  
  4         161  
9 4     4   1939 use MsOffice::Word::Surgeon::Carp;
  4         14  
  4         26  
10 4     4   2019 use MsOffice::Word::Surgeon::Revision;
  4         30  
  4         194  
11 4     4   2446 use MsOffice::Word::Surgeon::PackagePart;
  4         73  
  4         722  
12              
13             # syntactic sugar for attributes
14             sub has_lazy ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr")}
15             sub has_inner ($@) {my $attr = shift; has_lazy($attr => @_, init_arg => undef)}
16              
17              
18 4     4   42 use namespace::clean -except => 'meta';
  4         9  
  4         35  
19              
20             our $VERSION = '2.11';
21              
22              
23             #======================================================================
24             # TYPES AND ATTRIBUTES
25             #======================================================================
26              
27             my $DocxSource = union([qw/Str FileHandle/]);
28              
29             # how to access the document
30             has 'docx' => (is => 'ro', isa => $DocxSource); # filename or filehandle, or ..
31             has_lazy 'zip' => (is => 'ro', isa => 'Archive::Zip'); # .. an already opened zip archive
32              
33             # syntax to show embedded fields -- used by PackagePart::replace_field
34             has 'show_embedded_field' => (is => 'ro', isa => 'Str', default => '{%s}');
35              
36             # inner attributes lazily constructed by the module
37             has_inner 'parts' => (is => 'ro', isa => 'HashRef[MsOffice::Word::Surgeon::PackagePart]',
38             traits => ['Hash'], handles => {part => 'get'});
39            
40             has_inner 'document' => (is => 'ro', isa => 'MsOffice::Word::Surgeon::PackagePart',
41             handles => [qw/contents original_contents indented_contents plain_text replace/]);
42             # Note: this attribute is equivalent to $self->part('document'); made into an attribute
43             # for convenience and for automatic delegation of methods through the 'handles' declaration
44              
45             # just a slot for internal storage
46             has 'next_rev_id' => (is => 'bare', isa => 'Num', default => 1, init_arg => undef);
47             # used by the revision() method for creating *::Revision objects -- each instance
48             # gets a fresh value
49              
50              
51             #======================================================================
52             # BUILDING INSTANCES
53             #======================================================================
54              
55              
56             # syntactic sugar for supporting ->new($path) instead of ->new(docx => $path)
57             around BUILDARGS => sub {
58             my $orig = shift;
59             my $class = shift;
60              
61             unshift @_, 'docx' if scalar(@_) % 2 and $DocxSource->check($_[0]);
62              
63             $class->$orig(@_);
64             };
65              
66              
67             # make sure that the constructor got either a 'docx' or a 'zip' attribute
68             sub BUILD {
69 4     4 0 13744 my $self = shift;
70              
71 4         14 my $class = ref $self;
72              
73             $self->{docx} || $self->{zip}
74 4 0 33     25 or croak "$class->new() : need either 'docx' or 'zip' attribute";
75             not ($self->{docx} && $self->{zip})
76 4 50 33     42 or croak "$class->new() : can't have both 'docx' and 'zip' attributes";
77             }
78              
79              
80             #======================================================================
81             # LAZY ATTRIBUTE CONSTRUCTORS
82             #======================================================================
83              
84             sub _zip {
85 4     4   9 my $self = shift;
86              
87 4         148 my $docx = $self->docx;
88 4 100       37 my ($meth, $source_name) = openhandle($docx) ? (readFromFileHandle => 'filehandle')
89             : (read => $docx);
90 4         62 my $zip = Archive::Zip->new;
91 4         234 my $result = $zip->$meth($docx);
92 4 50       22274 $result == AZ_OK or croak "cannot unzip from $source_name";
93              
94 4         189 return $zip;
95             }
96              
97              
98              
99             sub _parts {
100 4     4   11 my $self = shift;
101              
102             # first create a package part for the main document
103 4         69 my $doc = MsOffice::Word::Surgeon::PackagePart->new(surgeon => $self,
104             part_name => 'document');
105              
106             # gather names of headers and footers related to that document
107 24         96 my @headers_footers = map {$_->{Target} =~ s/\.xml$//r}
108 4 100       12601 grep {$_ && $_->{short_type} =~ /^(header|footer)$/}
  136         439  
109             $doc->relationships->@*;
110              
111             # create package parts for headers and footers and assemble all parts into a hash
112 4         17 my %parts = (document => $doc);
113             $parts{$_} = MsOffice::Word::Surgeon::PackagePart->new(surgeon => $self,
114             part_name => $_)
115 4         47 for @headers_footers;
116              
117 4         56793 return \%parts;
118             }
119              
120              
121 4     4   192 sub _document {shift->part('document')}
122              
123              
124             #======================================================================
125             # ACCESSING OR CHANGING THE INTERNAL STATE
126             #======================================================================
127              
128             sub xml_member {
129 14     14 1 41 my ($self, $member_name, $new_content) = @_;
130              
131 14 50       72 if (! defined $new_content) { # used as a reader
132 14 50       469 my $bytes = $self->zip->contents($member_name)
133             or croak "no zip member for $member_name";
134 14         26864 return decode_utf8($bytes);
135             }
136             else { # used as a writer
137 0         0 my $bytes = encode_utf8($new_content);
138 0         0 return $self->zip->contents($member_name, $bytes);
139             }
140             }
141              
142             sub _content_types {
143 0     0   0 my ($self, $new_content_types) = @_;
144 0         0 return $self->xml_member('[Content_Types].xml', $new_content_types);
145             }
146              
147              
148             sub headers {
149 2     2 1 1239 my ($self) = @_;
150 2         115 return sort {substr($a, 6) <=> substr($b, 6)} grep {/^header/} keys $self->parts->%*;
  4         38  
  14         74  
151             }
152              
153             sub footers {
154 1     1 1 3 my ($self) = @_;
155 1         57 return sort {substr($a, 6) <=> substr($b, 6)} grep {/^footer/} keys $self->parts->%*;
  2         16  
  7         22  
156             }
157              
158             sub new_rev_id {
159 0     0 0 0 my ($self) = @_;
160 0         0 return $self->{next_rev_id}++;
161             }
162              
163              
164              
165             #======================================================================
166             # GENERIC PROPAGATION TO ALL PARTS
167             #======================================================================
168              
169              
170             sub all_parts_do {
171 1     1 1 7 my ($self, $method_name, @args) = @_;
172              
173 1         55 my $parts = $self->parts;
174              
175             # apply the method to each package part
176 1         3 my %result;
177 1         15 $result{$_} = $parts->{$_}->$method_name(@args) foreach keys %$parts;
178              
179              
180 1         6 return \%result;
181             }
182              
183              
184             #======================================================================
185             # CLONING
186             #======================================================================
187              
188             sub clone {
189 0     0 0 0 my $self = shift;
190              
191             # create a new Zip archive and copy all members to it
192 0         0 my $new_zip = Archive::Zip->new;
193 0         0 foreach my $member ($self->zip->members) {
194 0         0 $new_zip->addMember($member);
195             }
196              
197             # create a new instance of this class
198 0         0 my $class = ref $self;
199 0         0 my $clone = $class->new(zip => $new_zip);
200              
201             # other attributes will be recreated lazily within the clone .. not
202             # the most efficient way, but it is easier and safer, otherwise there is
203             # a risk of mixed references
204              
205 0         0 return $clone;
206             }
207              
208             #======================================================================
209             # SAVING THE FILE
210             #======================================================================
211              
212              
213             sub _update_contents_in_zip {
214 1     1   1 my $self = shift;
215 1         36 $_->_update_contents_in_zip foreach values $self->parts->%*;
216             }
217              
218              
219             sub overwrite {
220 0     0 1 0 my $self = shift;
221              
222 0         0 $self->_update_contents_in_zip;
223 0 0       0 $self->zip->overwrite == AZ_OK
224             or croak "error overwriting zip archive " . $self->docx;
225             }
226              
227             sub save_as {
228 1     1 1 2540 my ($self, $docx) = @_;
229              
230 1         5 $self->_update_contents_in_zip;
231 1 50       5 my $method = openhandle($docx) ? 'writeToFileHandle' : 'writeToFileNamed';
232 1 50       24 $self->zip->$method($docx) == AZ_OK
233             or croak "error writing zip archive to $docx";
234             }
235              
236              
237             #======================================================================
238             # DELEGATION TO OTHER CLASSES
239             #======================================================================
240              
241             sub new_revision {
242 0     0 1   my $self = shift;
243              
244 0           my $revision = MsOffice::Word::Surgeon::Revision->new(rev_id => $self->new_rev_id, @_);
245 0           return $revision->as_xml;
246             }
247              
248              
249             1;
250              
251             __END__
252              
253             =encoding ISO-8859-1
254              
255             =head1 NAME
256              
257             MsOffice::Word::Surgeon - tamper with the guts of Microsoft docx documents, with regexes
258              
259             =head1 SYNOPSIS
260              
261             my $surgeon = MsOffice::Word::Surgeon->new(docx => $filename);
262              
263             # extract plain text
264             my $main_text = $surgeon->document->plain_text;
265             my @header_texts = map {$surgeon->part($_)->plain_text} $surgeon->headers;
266              
267             # unlink fields
268             $surgeon->document->unlink_fields;
269              
270             # reveal bookmarks
271             $surgeon->document->reveal_bookmarks(color => 'cyan');
272              
273             # anonymize
274             my %alias = ('Claudio MONTEVERDI' => 'A_____', 'Heinrich SCHÜTZ' => 'B_____');
275             my $pattern = join "|", keys %alias;
276             my $replacement_callback = sub {
277             my %args = @_;
278             my $replacement = $surgeon->new_revision(to_delete => $args{matched},
279             to_insert => $alias{$args{matched}},
280             run => $args{run},
281             xml_before => $args{xml_before},
282             );
283             return $replacement;
284             };
285             $surgeon->all_parts_do(replace => qr[$pattern], $replacement_callback);
286              
287             # save the result
288             $surgeon->overwrite; # or ->save_as($new_filename);
289              
290              
291             =head1 DESCRIPTION
292              
293             =head2 Purpose
294              
295             This module supports a few operations for inspecting or modifying contents
296             in Microsoft Word documents in '.docx' format -- therefore the name
297             'surgeon'. Since a surgeon does not give life, there is no support for
298             creating fresh documents; if you have such needs, use one of the other
299             packages listed in the L<SEE ALSO> section -- or use the companion module
300             L<MsOffice::Word::Template>.
301              
302             Some applications for this module are :
303              
304             =over
305              
306             =item *
307              
308             content extraction in plain text format;
309              
310             =item *
311              
312             unlinking fields (equivalent of performing Ctrl-Shift-F9 on the whole document)
313              
314             =item *
315              
316             adding markers at bookmark start and end positions
317              
318             =item *
319              
320             regex replacements within text, for example for :
321              
322             =over
323              
324             =item *
325              
326             anonymization, i.e. replacement of names or addresses by aliases;
327              
328             =item *
329              
330             templating, i.e. replacement of special markup by contents coming from a data tree
331             (see also L<MsOffice::Word::Template>).
332              
333             =back
334              
335             =item *
336              
337             insertion of generated images (for example barcodes) -- see L<MsOffice::Word::Surgeon::PackagePart/images>;
338              
339             =item *
340              
341             pretty-printing the internal XML structure.
342              
343             =back
344              
345              
346              
347             =head2 The C<.docx> format
348              
349             The format of Microsoft C<.docx> documents is described in
350             L<http://www.ecma-international.org/publications/standards/Ecma-376.htm>
351             and L<http://officeopenxml.com/>. An excellent introduction can be
352             found at L<https://www.toptal.com/xml/an-informal-introduction-to-docx>.
353             Another precious source of documentation is L<http://officeopenxml.com/WPcontentOverview.php>.
354             Internally, a document is a zipped
355             archive, where the member named C<word/document.xml> stores the main
356             document contents, in XML format.
357              
358              
359              
360             =head2 Operating mode
361              
362             The present module does not parse all details of the whole XML
363             structure because it only focuses on I<text> nodes (those that contain
364             literal text) and I<run> nodes (those that contain text formatting
365             properties). All remaining XML information, for example for
366             representing sections, paragraphs, tables, etc., is stored as opaque
367             XML fragments; these fragments are re-inserted at proper places when
368             reassembling the whole document after having modified some text nodes.
369              
370              
371              
372             =head1 METHODS
373              
374             =head2 Constructor
375              
376             =head3 new
377              
378             my $surgeon = MsOffice::Word::Surgeon->new(docx => $filename_or_filehandle);
379             # or simply : ->new($filename);
380              
381             Builds a new surgeon instance, initialized with the contents of the given filename or filehandle.
382              
383             =head2 Accessors
384              
385             =head3 docx
386              
387             Path to the C<.docx> file
388              
389             =head3 zip
390              
391             Instance of L<Archive::Zip> associated with this file
392              
393             =head3 parts
394              
395             Hashref to L<MsOffice::Word::Surgeon::PackagePart> objects, keyed by their part name in the ZIP file.
396             There is always a C<'document'> part. Other parts may be headers, footers, footnotes or endnotes.
397              
398             =head3 document
399              
400             Shortcut to C<< $surgeon->part('document') >> -- the
401             L<MsOffice::Word::Surgeon::PackagePart> object corresponding to the main document.
402             See the C<PackagePart> documentation for operations on part objects.
403             Besides, the following operations are supported directly as methods to the C<< $surgeon >> object
404             and are automatically delegated to the C<< document >> part :
405             C<contents>, C<original_contents>, C<indented_contents>, C<plain_text>, C<replace>.
406              
407              
408              
409             =head3 headers
410              
411             my @header_parts = $surgeon->headers;
412              
413             Returns the ordered list of names of header members stored in the ZIP file.
414              
415             =head3 footers
416              
417             my @footer_parts = $surgeon->footers;
418              
419             Returns the ordered list of names of footer members stored in the ZIP file.
420              
421              
422             =head2 Other methods
423              
424              
425             =head3 part
426              
427             my $part = $surgeon->part($part_name);
428              
429             Returns the L<MsOffice::Word::Surgeon::PackagePart> object corresponding to the given part name.
430              
431              
432             =head3 all_parts_do
433              
434             my $result = $surgeon->all_parts_do($method_name => %args);
435              
436             Calls the given method on all part objects. Results are accumulated
437             in a hash, with part names as keys to the results. This is mostly
438             used to invoke the L<MsOffice::Word::Surgeon::PackagePart/replace> method, i.e.
439              
440             $surgeon->all_parts_do(replace => qr[$pattern], $replacement_callback, %replacement_args);
441              
442              
443             =head3 xml_member
444              
445             my $xml = $surgeon->xml_member($member_name); # reading
446             # or
447             $surgeon->xml_member($member_name, $new_xml); # writing
448              
449             Reads or writes the given member name in the ZIP file, with utf8 decoding or encoding.
450              
451              
452             =head3 save_as
453              
454             $surgeon->save_as($docx_file_or_filehandle);
455              
456             Writes the ZIP archive into the given file or filehandle.
457              
458              
459             =head3 overwrite
460              
461             $surgeon->overwrite;
462              
463             Writes the updated ZIP archive into the initial file.
464             If the initial C<docx> was given as a filehandle, use the L</save_as> method instead.
465              
466             =head3 new_revision
467              
468             my $xml = $surgeon->new_revision(
469             to_delete => $text_to_delete,
470             to_insert => $text_to_insert,
471             author => $author_string,
472             date => $date_string,
473             run => $run_object,
474             xml_before => $xml_string,
475             );
476              
477             This method is syntactic sugar for instantiating the
478             L<MsOffice::Word::Surgeon::Revision> class and returning
479             XML markup for MsWord revisions (a.k.a. "tracked changes")
480             generated by that class. Users can
481             then manually review those revisions within MsWord and accept or reject
482             them. This is best used in collaboration with the L</replace> method :
483             the replacement callback can call C<< $self->new_revision(...) >> to
484             generate revision marks in the document.
485              
486             Either C<to_delete> or C<to_insert> (or both) must
487             be present. Other parameters are optional. The parameters are :
488              
489             =over
490              
491             =item to_delete
492              
493             The string of text to delete (usually this will be the C<matched> argument
494             passed to the replacement callback).
495              
496             =item to_insert
497              
498             The string of new text to insert.
499              
500             =item author
501              
502             A short string that will be displayed by MsWord as the "author" of this revision.
503              
504             =item date
505              
506             A date (and optional time) in ISO format that will be displayed by
507             MsWord as the date of this revision. The current date and time
508             will be used by default.
509              
510             =item run
511              
512             A reference to the L<MsOffice::Word::Surgeon::Run> object surrounding
513             this revision. The formatting properties of that run will be
514             copied into the C<< <w:r> >> nodes of the deleted and inserted text fragments.
515              
516              
517             =item xml_before
518              
519             An optional XML fragment to be inserted before the C<< <w:t> >> node
520             of the inserted text
521              
522             =back
523              
524              
525             =head2 Operations on parts
526              
527             See the L<MsOffice::Word::Surgeon::PackagePart> documentation for other
528             operations on package parts, including operations on fields, bookmarks or images.
529              
530             =head1 SEE ALSO
531              
532             The L<https://metacpan.org/pod/Document::OOXML> distribution on CPAN
533             also manipulates C<docx> documents, but with another approach :
534             internally it uses L<XML::LibXML> and XPath expressions for
535             manipulating XML nodes. The API has some intersections with the
536             present module, but there are also some differences : C<Document::OOXML>
537             has more support for styling, while C<MsOffice::Word::Surgeon>
538             has more flexible mechanisms for replacing
539             text fragments.
540              
541              
542             Other programming languages also have packages for dealing with C<docx> documents; here
543             are some references :
544              
545             =over
546              
547             =item L<https://docs.microsoft.com/en-us/office/open-xml/word-processing>
548              
549             The C# Open XML SDK from Microsoft
550              
551             =item L<http://www.ericwhite.com/blog/open-xml-powertools-developer-center/>
552              
553             Additional functionalities built on top of the XML SDK.
554              
555             =item L<https://poi.apache.org>
556              
557             An open source Java library from the Apache foundation.
558              
559             =item L<https://www.docx4java.org/trac/docx4j>
560              
561             Another open source Java library, competitor to Apache POI.
562              
563             =item L<https://phpword.readthedocs.io/en/latest/>
564              
565             A PHP library dealing not only with Microsoft OOXML documents but also
566             with OASIS and RTF formats.
567              
568             =item L<https://pypi.org/project/python-docx/>
569              
570             A Python library, documented at L<https://python-docx.readthedocs.io/en/latest/>.
571              
572             =back
573              
574             As far as I can tell, most of these libraries provide objects and methods that
575             closely reflect the complete XML structure : for example they have classes for
576             paragraphs, styles, fonts, inline shapes, etc.
577              
578             The present module is much simpler but also much more limited : it was optimised
579             for dealing with the text contents and offers no support for presentation or
580             paging features. However, it has the rare advantage of providing an API for
581             regex substitutions within Word documents.
582              
583             The L<MsOffice::Word::Template> module relies on the present module, together with
584             the L<Perl Template Toolkit|Template>, to implement a templating system for Word documents.
585              
586              
587             =head1 AUTHOR
588              
589             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
590              
591             =head1 COPYRIGHT AND LICENSE
592              
593             Copyright 2019-2024 by Laurent Dami.
594              
595             This program is free software, you can redistribute it and/or modify it under the terms of the Artistic License version 2.0.