File Coverage

blib/lib/MsOffice/Word/Surgeon/PackagePart.pm

Criterion	Covered	Total	%
statement	148	193	76.6
branch	16	32	50.0
condition	23	40	57.5
subroutine	31	37	83.7
pod	13	16	81.2
total	231	318	72.6

line	stmt	bran	cond	sub	pod	time	code
1							package MsOffice::Word::Surgeon::PackagePart;
2	1			1		18	use 5.24.0;
	1					4
3	1			1		19	use Moose;
	1					3
	1					7
4	1			1		7069	use MooseX::StrictConstructor;
	1					2
	1					9
5	1			1		3442	use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level);
	1					2
	1					75
6	1			1		504	use MsOffice::Word::Surgeon::Run;
	1					4
	1					45
7	1			1		674	use MsOffice::Word::Surgeon::Text;
	1					2
	1					35
8	1			1		919	use XML::LibXML;
	1					35333
	1					9
9	1			1		196	use List::Util qw(max);
	1					3
	1					76
10	1			1		10	use Carp qw(croak);
	1					7
	1					97
11
12							# syntactic sugar for attributes
13							sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)}
14
15							# constant integers to specify indentation modes -- see L<XML::LibXML>
16	1			1		16	use constant XML_NO_INDENT => 0;
	1					3
	1					72
17	1			1		8	use constant XML_SIMPLE_INDENT => 1;
	1					2
	1					60
18
19	1			1		11	use namespace::clean -except => 'meta';
	1					2
	1					16
20
21							our $VERSION = '2.01';
22
23
24							#======================================================================
25							# ATTRIBUTES
26							#======================================================================
27
28
29							# attributes passed to the constructor
30							has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
31							has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
32
33
34							# attributes constructed by the module -- not received through the constructor
35							has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
36							has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
37							has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
38							has_inner 'images' => (is => 'ro', isa => 'HashRef');
39
40							has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
41
42							#======================================================================
43							# GLOBAL VARIABLES
44							#======================================================================
45
46							# Various regexes for removing uninteresting XML information
47							my %noise_reduction_regexes = (
48							proof_checking => qr(<w:(?:proofErr[^>]+\|noProof/)>),
49							revision_ids => qr(\sw:rsid\w+="[^"]+"),
50							complex_script_bold => qr(<w:bCs/>),
51							page_breaks => qr(<w:lastRenderedPageBreak/>),
52							language => qr(<w:lang w:val="[^/>]+/>),
53							empty_run_props => qr(<w:rPr></w:rPr>),
54							soft_hyphens => qr(<w:softHyphen/>),
55							);
56
57							my @noise_reduction_list = qw/proof_checking revision_ids
58							complex_script_bold page_breaks language
59							empty_run_props soft_hyphens/;
60
61							#======================================================================
62							# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
63							#======================================================================
64
65
66							sub _runs {
67	26			26		50	my $self = shift;
68
69	26					47	state $run_regex = qr[
70							<w:r> # opening tag for the run
71							(?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
72							(.*?) # run contents -- capture in $2
73							</w:r> # closing tag for the run
74							]x;
75
76	26					58	state $txt_regex = qr[
77							<w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
78							(.*?) # text contents -- capture in $1
79							</w:t> # closing tag for text
80							]x;
81
82
83							# split XML content into run fragments
84	26					847	my $contents = $self->contents;
85	26					20284	my @run_fragments = split m[$run_regex], $contents, -1;
86	26					74	my @runs;
87
88							# build internal RUN objects
89							RUN:
90	26					139	while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
91	3052		100			3611652	$run_contents //= '';
92
93							# split XML of this run into text fragmentsn
94	3052					34760	my @txt_fragments = split m[$txt_regex], $run_contents, -1;
95	3052					5959	my @texts;
96
97							# build internal TEXT objects
98							TXT:
99	3052					9404	while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
100	5698	100	66			2540602	next TXT if !$xml_before_text && ( !(defined $txt_contents) \|\| $txt_contents eq '');
			66
101	2874		50			12695	push @texts, MsOffice::Word::Surgeon::Text->new(
			100
102							xml_before => $xml_before_text // '',
103							literal_text => $txt_contents // '',
104							);
105							}
106
107							# assemble TEXT objects into a RUN object
108	3052	100	100			41539	next RUN if !$xml_before_run && !@texts;
109	2893		50			13636	push @runs, MsOffice::Word::Surgeon::Run->new(
			100
110							xml_before => $xml_before_run // '',
111							props => $props // '',
112							inner_texts => \@texts,
113							);
114							}
115
116	26					34342	return \@runs;
117							}
118
119
120							sub _relationships {
121	1			1		5	my $self = shift;
122
123							# xml that describes the relationships for this package part
124	1					4	my $rel_xml = $self->_rels_xml;
125
126							# parse the relationships and assemble into a sparse array indexed by relationship ids
127	1					69	my @relationships;
128	1					22	while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
129	21					49	my %attrs = parse_attrs($1);
130	21		50			100	$attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
131	21					80	($attrs{num} = $attrs{Id}) =~ s[^\D+][];
132	21					103	($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
133	21					156	$relationships[$attrs{num}] = \%attrs;
134							}
135
136	1					41	return \@relationships;
137							}
138
139
140							sub _images {
141	0			0		0	my $self = shift;
142
143							# get relationship ids associated with images
144	0					0	my %rel_image = map {$_->{Id} => $_->{Target}}
145	0	0				0	grep {$_ && $_->{short_type} eq 'image'}
	0					0
146							$self->relationships->@*;
147
148							# get titles and relationship ids of images found within the part contents
149	0					0	my %image;
150	0					0	my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
151							DRAWING:
152	0					0	foreach my $drawing (@drawings) {
153	0	0				0	if ($drawing =~ m[<wp:docPr \s+ (.*?) />
154							.*?
155							<a:blip \s+ r:embed="(\w+)"]x) {
156	0					0	my ($lst_attrs, $rId) = ($1, $2);
157	0					0	my %attrs = parse_attrs($lst_attrs);
158							my $img_id = $attrs{title} \|\| $attrs{descr}
159	0	0	0			0	or next DRAWING;
160
161	0	0				0	$image{$img_id} = "word/$rel_image{$rId}"
162							or die "couldn't find image for relationship '$rId' associated with image '$img_id'";
163							# NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
164							}
165							}
166
167	0					0	return \%image;
168							}
169
170
171	7			7		20	sub _contents {shift->original_contents}
172
173							sub _on_new_contents {
174	72			72		138	my $self = shift;
175
176	72					2638	$self->clear_runs;
177	72					228344	$self->{contents_has_changed} = 1;
178							}
179
180							#======================================================================
181							# METHODS
182							#======================================================================
183
184
185							sub _rels_xml {
186	1			1		4	my ($self, $new_xml) = @_;
187	1					33	my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name;
188	1					35	return $self->surgeon->xml_member($rels_name, $new_xml);
189							}
190
191
192							sub zip_member_name {
193	7			7	0	19	my $self = shift;
194	7					223	return sprintf "word/%s.xml", $self->part_name;
195							}
196
197
198							sub original_contents {
199	7			7	1	13	my $self = shift;
200
201	7					222	return $self->surgeon->xml_member($self->zip_member_name);
202							}
203
204
205							sub image {
206	0			0	0	0	my ($self, $title, $new_image_content) = @_;
207
208							# name of the image file within the zip
209	0		0			0	my $zip_member_name = $self->images->{$title}
210							\|\| ($title =~ /^\d+$/ ? "word/media/image$title.png"
211							: die "couldn't find image '$title'");
212
213							# delegate to Archive::Zip::contents
214	0					0	return $self->surgeon->zip->contents($zip_member_name, $new_image_content);
215							}
216
217
218
219							#======================================================================
220							# CONTENTS RESTITUTION
221							#======================================================================
222
223							sub indented_contents {
224	0			0	1	0	my $self = shift;
225
226	0					0	my $dom = XML::LibXML->load_xml(string => $self->contents);
227	0					0	return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string
228							}
229
230
231							sub plain_text {
232	2			2	1	32	my $self = shift;
233
234							# XML contents
235	2					66	my $txt = $self->contents;
236
237							# replace opening paragraph tags by newlines
238	2					4526	$txt =~ s/(<w:p[ >])/\n$1/g;
239
240							# replace break tags by newlines
241	2					3902	$txt =~ s[<w:br/>][\n]g;
242
243							# replace tab nodes by ASCII tabs
244	2					5236	$txt =~ s/<w:tab[^s][^>]*>/\t/g;
245
246							# remove all remaining XML tags
247	2					15930	$txt =~ s/<[^>]+>//g;
248
249	2					44	return $txt;
250							}
251
252
253
254
255							#======================================================================
256							# MODIFYING CONTENTS
257							#======================================================================
258
259							sub cleanup_XML {
260	15			15	1	85070	my ($self, @merge_args) = @_;
261
262	15					51	$self->reduce_all_noises;
263	15					73	my $names_of_ASK_fields = $self->unlink_fields;
264	15					63	$self->suppress_bookmarks(@$names_of_ASK_fields);
265	15					51	$self->merge_runs(@merge_args);
266							}
267
268							sub noise_reduction_regex {
269	105			105	1	196	my ($self, $regex_name) = @_;
270	105	50				273	my $regex = $noise_reduction_regexes{$regex_name}
271							or croak "->noise_reduction_regex('$regex_name') : unknown regex name";
272	105					224	return $regex;
273							}
274
275							sub reduce_noise {
276	30			30	1	91	my ($self, @noises) = @_;
277
278							# gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
279	30	100				61	my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;
	150					347
280
281							# get contents, apply all regexes, put back the modified contents.
282	30					1049	my $contents = $self->contents;
283	1			1		2755	no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
	1					3
	1					1597
284	30					53086	$contents =~ s/$_/$1/g foreach @regexes;
285	30					1072	$self->contents($contents);
286							}
287
288							sub reduce_all_noises {
289	15			15	1	28	my $self = shift;
290
291	15					50	$self->reduce_noise(@noise_reduction_list);
292							}
293
294							sub suppress_bookmarks {
295	15			15	1	36	my ($self, @names_to_erase) = @_;
296
297							# closure to decide what to do with bookmark contents
298	15					29	my %should_erase_contents = map {($_ => 1)} @names_to_erase;
	2					9
299							my $deal_with_bookmark_text = sub {
300	18			18		54	my ($bookmark_name, $bookmark_contents) = @_;
301	18	100				372	return $should_erase_contents{$bookmark_name} ? "" : $bookmark_contents;
302	15					82	};
303
304							# regex to find bookmarks markup
305	15					28	state $bookmark_rx = qr{
306							<w:bookmarkStart # initial tag
307							.+? w:id="(\d+)" # 'id' attribute, bookmark identifier -- capture 1
308							.+? w:name="([^"]+)" # 'name' attribute -- capture 2
309							.*? /> # end of this tag
310							(.*?) # bookmark contents (may be empty) -- capture 3
311							<w:bookmarkEnd # ending tag
312							\s+ w:id="\1" # same 'id' attribute
313							.*? /> # end of this tag
314							}sx;
315
316							# remove bookmarks markup
317	15					476	my $contents = $self->contents;
318	15					4092	$contents =~ s{$bookmark_rx}{$deal_with_bookmark_text->($2, $3)}eg;
	18					45
319
320							# re-inject the modified contents
321	15					497	$self->contents($contents);
322							}
323
324							sub merge_runs {
325	15			15	1	43	my ($self, %args) = @_;
326
327							# check validity of received args
328	15					32	state $is_valid_arg = {no_caps => 1};
329							$is_valid_arg->{$_} or croak "merge_runs(): invalid arg: $_"
330	15		50			56	foreach keys %args;
331
332	15					29	my @new_runs;
333							# loop over internal "run" objects
334	15					23	foreach my $run (@{$self->runs}) {
	15					472
335
336	1844	100				5147	$run->remove_caps_property if $args{no_caps};
337
338							# check if the current run can be merged with the previous one
339	1844	100	66			52337	if ( !$run->xml_before # no other XML markup between the 2 runs
			100
340							&& @new_runs # there was a previous run
341							&& $new_runs[-1]->props eq $run->props # both runs have the same properties
342							) {
343							# conditions are OK, so merge this run with the previous one
344	277					689	$new_runs[-1]->merge($run);
345							}
346							else {
347							# conditions not OK, just push this run without merging
348	1567					3994	push @new_runs, $run;
349							}
350							}
351
352							# reassemble the whole stuff and inject it as new contents
353	15					64	$self->contents(join "", map {$_->as_xml} @new_runs);
	1567					4116
354							}
355
356
357
358							sub unlink_fields {
359	15			15	1	35	my $self = shift;
360
361							# must find out what are the ASK fields before erasing the markup
362	15					31	state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
363	15					475	my $contents = $self->contents;
364	15					610	my @names_of_ASK_fields = $contents =~ /$ask_field_rx/g;
365
366
367							# regexes to remove field nodes and "field instruction" nodes
368	15					39	state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
369	15					36	state $field_boundary_rx = qr[<w:fldChar
370							(?: [^>]*?/> # ignore all attributes until end of node ..
371							\| # .. or
372							[^>]?>.?</w:fldChar>) # .. ignore node content until closing tag
373							]x; # field boundaries are encoded as "begin" / "separate" / "end"
374	15					27	state $simple_field_rx = qr[</?w:fldSimple[^>]*>];
375
376							# apply the regexes
377	15					48	$self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);
378
379	15					49	return \@names_of_ASK_fields;
380							}
381
382
383							sub replace {
384	11			11	1	61	my ($self, $pattern, $replacement_callback, %replacement_args) = @_;
385
386							# cleanup the XML structure so that replacements work better
387	11					25	my $keep_xml_as_is = delete $replacement_args{keep_xml_as_is};
388	11	100				41	$self->cleanup_XML unless $keep_xml_as_is;
389
390							# check for presences of a special option to avoid modying contents
391	11					168847	my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};
392
393							# apply replacements and generate new XML
394							my $xml = join "",
395	11					401	map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*;
	1049					4084
396
397							# overwrite previous contents
398	11	50				552	$self->contents($xml) unless $dont_overwrite_contents;
399
400	11					111	return $xml;
401							}
402
403
404							sub _update_contents_in_zip { # called for each part before saving the zip file
405	0			0		0	my $self = shift;
406
407							$self->surgeon->xml_member($self->zip_member_name, $self->contents)
408	0	0				0	if $self->{contents_has_changed};
409							}
410
411
412							sub replace_image {
413	0			0	1	0	my ($self, $image_title, $image_PNG_content) = @_;
414
415	0	0				0	my $member_name = $self->images->{$image_title}
416							or die "could not find an image with title: $image_title";
417	0					0	$self->surgeon->zip->contents($member_name, $image_PNG_content);
418							}
419
420
421
422							sub add_image {
423	0			0	1	0	my ($self, $image_PNG_content) = @_;
424
425							# compute a fresh image number and a fresh relationship id
426	0					0	my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
427	0					0	my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members;
	0					0
428	0		0			0	my $last_img_num = max @image_nums // 0;
429	0					0	my $target = sprintf "media/image%d.png", $last_img_num + 1;
430	0					0	my $last_rId_num = $self->relationships->$#*;
431	0					0	my $rId = sprintf "rId%d", $last_rId_num + 1;
432
433							# assemble XML for the new relationship
434	0					0	my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
435	0					0	my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};
436
437							# update the rels member
438	0					0	my $xml = $self->_rels_xml;
439	0					0	$xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
440	0					0	$self->_rels_xml($xml);
441
442							# add the image as a new member into the archive
443	0					0	my $member_name = "word/$target";
444	0					0	$self->surgeon->zip->addString(\$image_PNG_content, $member_name);
445
446							# update the global content_types if it doesn't include PNG
447	0					0	my $ct = $self->surgeon->_content_types;
448	0	0				0	if ($ct !~ /Extension="png"/) {
449	0					0	$ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
450	0					0	$self->surgeon->_content_types($ct);
451							}
452
453							# return the relationship id
454	0					0	return $rId;
455							}
456
457
458
459							#======================================================================
460							# UTILITY FUNCTIONS
461							#======================================================================
462
463
464							sub parse_attrs { # cheap parsing of attribute lists in an XML node
465	21			21	0	56	my ($lst_attrs) = @_;
466
467	21					29	state $attr_pair_regex = qr[
468							([^=\s"'&<>]+) # attribute name
469							\h* = \h* # Eq
470							(?: # attribute value
471							" ([^<"]*) " # .. enclosed in double quotes
472							\|
473							' ([^<']*) ' # .. or enclosed in single quotes
474							)
475							]x;
476
477	21					35	state $entity = {quot => '"', amp => '&', 'lt' => '<', gt => '>'};
478	21					35	state $entity_names = join "\|", keys %$entity;
479
480	21					26	my %attr;
481	21					141	while ($lst_attrs =~ /$attr_pair_regex/g) {
482	69		33			241	my ($name, $val) = ($1, $2 // $3);
483	69					136	$val =~ s/&(entity_names);/$entity->{$1}/eg;
	0					0
484	69					477	$attr{$name} = $val;
485							}
486
487	21					132	return %attr;
488							}
489
490
491							1;
492
493							__END__
494
495							=encoding ISO-8859-1
496
497							=head1 NAME
498
499							MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document
500
501							=head1 DESCRIPTION
502
503							This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single
504							I<package part> within the ZIP package of a C<.docx> document.
505							It is mostly used for the I<document> part, that contains the XML representation of the
506							main document body. However, other parts such as headers, footers, footnotes, etc. have the
507							same internal representation and therefore the same operations can be invoked.
508
509
510							=head1 METHODS
511
512							=head2 new
513
514							my $run = MsOffice::Word::Surgeon::PackagePart->new(
515							surgeon => $surgeon,
516							part_name => $name,
517							);
518
519							Constructor for a new part object. This is called internally from
520							L<MsOffice::Word::Surgeon>; it is not meant to be called directly
521							by clients.
522
523							=head3 Constructor arguments
524
525
526							=over
527
528							=item surgeon
529
530							a weak reference to the main surgeon object
531
532							=item part_name
533
534							ZIP member name of this part
535
536							=back
537
538							=head3 Other attributes
539
540							Other attributes, which are not passed through the constructor but are generated lazily on demand, are :
541
542							=over
543
544							=item contents
545
546							the XML contents of this part
547
548							=item runs
549
550							a decomposition of the XML contents into a collection of
551							L<MsOffice::Word::Surgeon::Run> objects.
552
553							=item relationships
554
555							an arrayref of Office relationships associated with this part. This information comes from
556							a C<.rels> member in the ZIP archive, named after the name of the package part.
557							Array indices correspond to relationship numbers. Array values are hashrefs with
558							keys
559
560							=over
561
562							=item Id
563
564							the full relationship id
565
566							=item num
567
568							the numeric part of C<rId>
569
570							=item Type
571
572							the full reference to the XML schema for this relationship
573
574							=item short_type
575
576							only the last word of the type, e.g. 'image', 'style', etc.
577
578							=item Target
579
580							designation of the target within the ZIP file. The prefix 'word/' must be
581							added for having a complete Zip member name.
582
583							=back
584
585
586
587							=item images
588
589							a hashref of images within this package part. Keys of the hash are image I<alternative texts>.
590							If present, the alternative I<title> will be prefered; otherwise the alternative I<description> will be taken
591							(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display
592							the I<description> field -- see
593							L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669\|MsOffice documentation>).
594
595							Images without alternative text will not be accessible through the current Perl module.
596
597							Values of the hash are zip member names for the corresponding
598							image representations in C<.png> format.
599
600
601							=back
602
603
604							=head2 Contents restitution
605
606							=head3 contents
607
608							Returns a Perl string with the current internal XML representation of the part
609							contents.
610
611							=head3 original_contents
612
613							Returns a Perl string with the XML representation of the
614							part contents, as it was in the ZIP archive before any
615							modification.
616
617							=head3 indented_contents
618
619							Returns an indented version of the XML contents, suitable for inspection in a text editor.
620							This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded
621							byte string, not a Perl string.
622
623							=head3 plain_text
624
625							Returns the text contents of the part, without any markup.
626							Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored.
627
628
629							=head3 runs
630
631							Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of
632							these objects holds an XML fragment; joining all fragments
633							restores the complete document.
634
635							my $contents = join "", map {$_->as_xml} $self->runs;
636
637
638							=head2 Modifying contents
639
640
641							=head3 cleanup_XML
642
643							$part->cleanup_XML;
644
645							Apply several other methods for removing unnecessary nodes within the internal
646							XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>,
647							L</suppress_bookmarks> and L</merge_runs>.
648
649
650							=head3 reduce_noise
651
652							$part->reduce_noise($regex1, $regex2, ...);
653
654							This method is used for removing unnecessary information in the XML
655							markup. It applies the given list of regexes to the whole document,
656							suppressing matches. The final result is put back into
657							C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >>
658							references, or as names of builtin regexes (described below). Regexes
659							are applied to the whole XML contents, not only to run nodes.
660
661
662							=head3 noise_reduction_regex
663
664							my $regex = $part->noise_reduction_regex($regex_name);
665
666							Returns the builtin regex corresponding to the given name.
667							Known regexes are :
668
669							proof_checking => qr(<w:(?:proofErr[^>]+\|noProof/)>),
670							revision_ids => qr(\sw:rsid\w+="[^"]+"),
671							complex_script_bold => qr(<w:bCs/>),
672							page_breaks => qr(<w:lastRenderedPageBreak/>),
673							language => qr(<w:lang w:val="[^/>]+/>),
674							empty_run_props => qr(<w:rPr></w:rPr>),
675							soft_hyphens => qr(<w:softHyphen/>),
676
677							=head3 reduce_all_noises
678
679							$part->reduce_all_noises;
680
681							Applies all regexes from the previous method.
682
683							=head3 unlink_fields
684
685							my $names_of_ASK_fields = $part->unlink_fields;
686
687							Removes all fields from the part, just leaving the current
688							value stored in each field. This is the equivalent of performing Ctrl-Shift-F9
689							on the whole document.
690
691							The return value is an arrayref to a list of names of ASK fields within the document.
692							Such names should then be passed to the L</suppress_bookmarks> method
693							(see below).
694
695
696							=head3 suppress_bookmarks
697
698							$part->suppress_bookmarks(@names_to_erase);
699
700							Removes bookmarks markup in the part. This is useful because
701							MsWord may silently insert bookmarks in unexpected places; therefore
702							some searches within the text may fail because of such bookmarks.
703
704							By default, this method only removes the bookmarks markup, leaving
705							intact the contents of the bookmark. However, when the name of a
706							bookmark belongs to the list C<< @names_to_erase >>, the contents
707							is also removed. Currently this is used for suppressing ASK fields,
708							because such fields contain a bookmark content that is never displayed by MsWord.
709
710
711
712							=head3 merge_runs
713
714							$part->merge_runs(no_caps => 1); # optional arg
715
716							Walks through all runs of text within the document, trying to merge
717							adjacent runs when possible (i.e. when both runs have the same
718							properties, and there is no other XML node inbetween).
719
720							This operation is a prerequisite before performing replace operations, because
721							documents edited in MsWord often have run boundaries across sentences or
722							even in the middle of words; so regex searches can only be successful if those
723							artificial boundaries have been removed.
724
725							If the argument C<< no_caps => 1 >> is present, the merge operation
726							will also convert runs with the C<w:caps> property, putting all letters
727							into uppercase and removing the property; this makes more merges possible.
728
729
730							=head3 replace
731
732							$part->replace($pattern, $replacement, %replacement_args);
733
734							Replaces all occurrences of C<$pattern> regex within the text nodes by the
735							given C<$replacement>. This is not exactly like a search-replace
736							operation performed within MsWord, because the search does not cross boundaries
737							of text nodes. In order to maximize the chances of successful replacements,
738							the L</cleanup_XML> method is automatically called before starting the operation.
739
740							The argument C<$pattern> can be either a string or a reference to a regular expression.
741							It should not contain any capturing parentheses, because that would perturb text
742							splitting operations.
743
744							The argument C<$replacement> can be either a fixed string, or a reference to
745							a callback subroutine that will be called for each match.
746
747
748							The C<< %replacement_args >> hash can be used to pass information to the callback
749							subroutine. That hash will be enriched with three entries :
750
751							=over
752
753							=item matched
754
755							The string that has been matched by C<$pattern>.
756
757							=item run
758
759							The run object in which this text resides.
760
761							=item xml_before
762
763							The XML fragment (possibly empty) found before the matched text .
764
765							=back
766
767							The callback subroutine may return either plain text or structured XML.
768							See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback.
769
770							The following special keys within C<< %replacement_args >> are interpreted by the
771							C<replace()> method itself, and therefore are not passed to the callback subroutine :
772
773							=over
774
775							=item keep_xml_as_is
776
777							if true, no call is made to the L</cleanup_XML> method before performing the replacements
778
779							=item dont_overwrite_contents
780
781							if true, the internal XML contents is not modified in place; the new XML after performing
782							replacements is merely returned to the caller.
783
784							=back
785
786
787							=head3 replace_image
788
789							$part->replace_image($image_alt_text, $image_PNG_content);
790
791							Replaces an existing PNG image by a new image. All features of the old image will
792							be preserved (size, positioning, border, etc.) -- only the image itself will be
793							replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word
794							for this image.
795
796							This operation replaces a ZIP member within the C<.docx> file. If several XML
797							nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several
798							locations, the new image will appear at all locations, even if they do not have the
799							same alternative text -- unfortunately this module currently has no facility for
800							duplicating an existing image into separate instances. So if your intent is to only replace
801							one image, your original document should contain several distinct images, coming from
802							several distinct C<.PNG> file copies.
803
804
805							=head3 add_image
806
807							my $rId = $part->add_image($image_PNG_content);
808
809							Stores the given PNG image within the ZIP file, adds it as a relationship to the
810							current part, and returns the relationship id. This operation is not sufficient
811							to make the image visible in Word : it just stores the image, but you still
812							have to insert a proper C<drawing> node in the contents XML, using the C<$rId>.
813							Future versions of this module may offer helper methods for that purpose;
814							currently it must be done by hand.
815
816
817							=head1 AUTHOR
818
819							Laurent Dami, E<lt>dami AT cpan DOT org<gt>
820
821							=head1 COPYRIGHT AND LICENSE
822
823							Copyright 2019-2022 by Laurent Dami.
824
825							This library is free software; you can redistribute it and/or modify
826							it under the same terms as Perl itself.
827
828
829