File Coverage

blib/lib/MsOffice/Word/Surgeon/PackagePart.pm

Criterion	Covered	Total	%
statement	157	205	76.5
branch	21	44	47.7
condition	24	40	60.0
subroutine	31	37	83.7
pod	13	16	81.2
total	246	342	71.9

line	stmt	bran	cond	sub	pod	time	code
1							package MsOffice::Word::Surgeon::PackagePart;
2	1			1		18	use 5.24.0;
	1					5
3	1			1		18	use Moose;
	1					2
	1					9
4	1			1		6839	use MooseX::StrictConstructor;
	1					2
	1					11
5	1			1		3278	use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level);
	1					5
	1					74
6	1			1		504	use MsOffice::Word::Surgeon::Run;
	1					4
	1					46
7	1			1		616	use MsOffice::Word::Surgeon::Text;
	1					3
	1					34
8	1			1		705	use XML::LibXML;
	1					31826
	1					8
9	1			1		170	use List::Util qw(max);
	1					4
	1					70
10	1			1		6	use Carp qw(croak carp);
	1					3
	1					93
11
12							# syntactic sugar for attributes
13							sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)}
14
15							# constant integers to specify indentation modes -- see L<XML::LibXML>
16	1			1		7	use constant XML_NO_INDENT => 0;
	1					2
	1					56
17	1			1		6	use constant XML_SIMPLE_INDENT => 1;
	1					3
	1					63
18
19	1			1		8	use namespace::clean -except => 'meta';
	1					2
	1					10
20
21							our $VERSION = '2.02';
22
23
24							#======================================================================
25							# ATTRIBUTES
26							#======================================================================
27
28
29							# attributes passed to the constructor
30							has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
31							has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
32
33
34							# attributes constructed by the module -- not received through the constructor
35							has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
36							has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
37							has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
38							has_inner 'images' => (is => 'ro', isa => 'HashRef');
39
40							has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
41							has 'was_cleaned_up' => (is => 'bare', isa => 'Bool', default => 0);
42
43							#======================================================================
44							# GLOBAL VARIABLES
45							#======================================================================
46
47							# Various regexes for removing uninteresting XML information
48							my %noise_reduction_regexes = (
49							proof_checking => qr(<w:(?:proofErr[^>]+\|noProof/)>),
50							revision_ids => qr(\sw:rsid\w+="[^"]+"),
51							complex_script_bold => qr(<w:bCs/>),
52							page_breaks => qr(<w:lastRenderedPageBreak/>),
53							language => qr(<w:lang w:val="[^/>]+/>),
54							empty_run_props => qr(<w:rPr></w:rPr>),
55							soft_hyphens => qr(<w:softHyphen/>),
56							);
57
58							my @noise_reduction_list = qw/proof_checking revision_ids
59							complex_script_bold page_breaks language
60							empty_run_props soft_hyphens/;
61
62							#======================================================================
63							# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
64							#======================================================================
65
66
67							sub _runs {
68	12			12		23	my $self = shift;
69
70	12					24	state $run_regex = qr[
71							<w:r> # opening tag for the run
72							(?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
73							(.*?) # run contents -- capture in $2
74							</w:r> # closing tag for the run
75							]x;
76
77	12					24	state $txt_regex = qr[
78							<w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
79							(.*?) # text contents -- capture in $1
80							</w:t> # closing tag for text
81							]x;
82
83
84							# split XML content into run fragments
85	12					384	my $contents = $self->contents;
86	12					12876	my @run_fragments = split m[$run_regex], $contents, -1;
87	12					35	my @runs;
88
89							# build internal RUN objects
90							RUN:
91	12					60	while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
92	2004		100			2338004	$run_contents //= '';
93
94							# split XML of this run into text fragmentsn
95	2004					22690	my @txt_fragments = split m[$txt_regex], $run_contents, -1;
96	2004					4251	my @texts;
97
98							# build internal TEXT objects
99							TXT:
100	2004					6191	while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
101	3634	100	66			1627409	next TXT if !$xml_before_text && ( !(defined $txt_contents) \|\| $txt_contents eq '');
			66
102	1840		50			7805	push @texts, MsOffice::Word::Surgeon::Text->new(
			100
103							xml_before => $xml_before_text // '',
104							literal_text => $txt_contents // '',
105							);
106							}
107
108							# assemble TEXT objects into a RUN object
109	2004	100	100			41488	next RUN if !$xml_before_run && !@texts;
110	1852		50			8101	push @runs, MsOffice::Word::Surgeon::Run->new(
			100
111							xml_before => $xml_before_run // '',
112							props => $props // '',
113							inner_texts => \@texts,
114							);
115							}
116
117	12					16007	return \@runs;
118							}
119
120
121							sub _relationships {
122	1			1		3	my $self = shift;
123
124							# xml that describes the relationships for this package part
125	1					8	my $rel_xml = $self->_rels_xml;
126
127							# parse the relationships and assemble into a sparse array indexed by relationship ids
128	1					74	my @relationships;
129	1					19	while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
130	21					46	my %attrs = parse_attrs($1);
131	21		50			95	$attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
132	21					79	($attrs{num} = $attrs{Id}) =~ s[^\D+][];
133	21					102	($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
134	21					150	$relationships[$attrs{num}] = \%attrs;
135							}
136
137	1					42	return \@relationships;
138							}
139
140
141							sub _images {
142	0			0		0	my $self = shift;
143
144							# get relationship ids associated with images
145	0					0	my %rel_image = map {$_->{Id} => $_->{Target}}
146	0	0				0	grep {$_ && $_->{short_type} eq 'image'}
	0					0
147							$self->relationships->@*;
148
149							# get titles and relationship ids of images found within the part contents
150	0					0	my %image;
151	0					0	my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
152							DRAWING:
153	0					0	foreach my $drawing (@drawings) {
154	0	0				0	if ($drawing =~ m[<wp:docPr \s+ (.*?) />
155							.*?
156							<a:blip \s+ r:embed="(\w+)"]x) {
157	0					0	my ($lst_attrs, $rId) = ($1, $2);
158	0					0	my %attrs = parse_attrs($lst_attrs);
159							my $img_id = $attrs{title} \|\| $attrs{descr}
160	0	0	0			0	or next DRAWING;
161
162	0	0				0	$image{$img_id} = "word/$rel_image{$rId}"
163							or die "couldn't find image for relationship '$rId' associated with image '$img_id'";
164							# NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
165							}
166							}
167
168	0					0	return \%image;
169							}
170
171
172	7			7		19	sub _contents {shift->original_contents}
173
174							sub _on_new_contents {
175	16			16		31	my $self = shift;
176
177	16					633	$self->clear_runs;
178	16					224038	$self->{contents_has_changed} = 1;
179							}
180
181							#======================================================================
182							# METHODS
183							#======================================================================
184
185
186							sub _rels_xml {
187	1			1		4	my ($self, $new_xml) = @_;
188	1					34	my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name;
189	1					36	return $self->surgeon->xml_member($rels_name, $new_xml);
190							}
191
192
193							sub zip_member_name {
194	7			7	0	15	my $self = shift;
195	7					248	return sprintf "word/%s.xml", $self->part_name;
196							}
197
198
199							sub original_contents {
200	7			7	1	13	my $self = shift;
201
202	7					224	return $self->surgeon->xml_member($self->zip_member_name);
203							}
204
205
206							sub image {
207	0			0	0	0	my ($self, $title, $new_image_content) = @_;
208
209							# name of the image file within the zip
210	0		0			0	my $zip_member_name = $self->images->{$title}
211							\|\| ($title =~ /^\d+$/ ? "word/media/image$title.png"
212							: die "couldn't find image '$title'");
213
214							# delegate to Archive::Zip::contents
215	0					0	return $self->surgeon->zip->contents($zip_member_name, $new_image_content);
216							}
217
218
219
220							#======================================================================
221							# CONTENTS RESTITUTION
222							#======================================================================
223
224							sub indented_contents {
225	0			0	1	0	my $self = shift;
226
227	0					0	my $dom = XML::LibXML->load_xml(string => $self->contents);
228	0					0	return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string
229							}
230
231
232							sub plain_text {
233	2			2	1	30	my $self = shift;
234
235							# XML contents
236	2					67	my $txt = $self->contents;
237
238							# replace opening paragraph tags by newlines
239	2					4535	$txt =~ s/(<w:p[ >])/\n$1/g;
240
241							# replace break tags by newlines
242	2					3857	$txt =~ s[<w:br/>][\n]g;
243
244							# replace tab nodes by ASCII tabs
245	2					5192	$txt =~ s/<w:tab[^s][^>]*>/\t/g;
246
247							# remove all remaining XML tags
248	2					15472	$txt =~ s/<[^>]+>//g;
249
250	2					37	return $txt;
251							}
252
253
254
255
256							#======================================================================
257							# MODIFYING CONTENTS
258							#======================================================================
259
260							sub cleanup_XML {
261	2			2	1	10	my ($self, @merge_args) = @_;
262
263							# avoid doing it twice
264	2	100				11	return if $self->{was_cleaned_up};
265
266							# do the cleanup
267	1					7	$self->reduce_all_noises;
268	1					6	my $names_of_ASK_fields = $self->unlink_fields;
269	1					12	$self->suppress_bookmarks(@$names_of_ASK_fields);
270	1					11	$self->merge_runs(@merge_args);
271
272							# remember it was done
273	1					84971	$self->{was_cleaned_up} = 1;
274							}
275
276							sub noise_reduction_regex {
277	7			7	1	13	my ($self, $regex_name) = @_;
278	7	50				25	my $regex = $noise_reduction_regexes{$regex_name}
279							or croak "->noise_reduction_regex('$regex_name') : unknown regex name";
280	7					15	return $regex;
281							}
282
283							sub reduce_noise {
284	2			2	1	8	my ($self, @noises) = @_;
285
286							# gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
287	2	100				6	my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;
	10					26
288
289							# get contents, apply all regexes, put back the modified contents.
290	2					74	my $contents = $self->contents;
291	1			1		2814	no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
	1					3
	1					1642
292	2					25522	$contents =~ s/$_/$1/g foreach @regexes;
293	2					94	$self->contents($contents);
294							}
295
296							sub reduce_all_noises {
297	1			1	1	2	my $self = shift;
298
299	1					5	$self->reduce_noise(@noise_reduction_list);
300							}
301
302							sub suppress_bookmarks {
303	1			1	1	4	my ($self, @names_to_erase) = @_;
304
305							# closure to decide what to do with bookmark contents
306	1					4	my %should_erase_contents = map {($_ => 1)} @names_to_erase;
	2					11
307							my $deal_with_bookmark_text = sub {
308	17			17		64	my ($bookmark_name, $bookmark_contents) = @_;
309	17	100				698	return $should_erase_contents{$bookmark_name} ? "" : $bookmark_contents;
310	1					8	};
311
312							# regex to find bookmarks markup
313	1					6	state $bookmark_rx = qr{
314							<w:bookmarkStart # initial tag
315							.+? w:id="(\d+)" # 'id' attribute, bookmark identifier -- capture 1
316							.+? w:name="([^"]+)" # 'name' attribute -- capture 2
317							.*? /> # end of this tag
318							(.*?) # bookmark contents (may be empty) -- capture 3
319							<w:bookmarkEnd # ending tag
320							\s+ w:id="\1" # same 'id' attribute
321							.*? /> # end of this tag
322							}sx;
323
324							# remove bookmarks markup
325	1					34	my $contents = $self->contents;
326	1					1428	$contents =~ s{$bookmark_rx}{$deal_with_bookmark_text->($2, $3)}eg;
	17					50
327
328							# re-inject the modified contents
329	1					45	$self->contents($contents);
330							}
331
332							sub merge_runs {
333	1			1	1	5	my ($self, %args) = @_;
334
335							# check validity of received args
336	1					4	state $is_valid_arg = {no_caps => 1};
337	1					6	my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args;
	1					5
338	1	50				5	croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args if @invalid_args;
339
340	1					3	my @new_runs;
341							# loop over internal "run" objects
342	1					2	foreach my $run (@{$self->runs}) {
	1					35
343
344	793	50				2817	$run->remove_caps_property if $args{no_caps};
345
346							# check if the current run can be merged with the previous one
347	793	100	66			22914	if ( !$run->xml_before # no other XML markup between the 2 runs
			100
348							&& @new_runs # there was a previous run
349							&& $new_runs[-1]->props eq $run->props # both runs have the same properties
350							) {
351							# conditions are OK, so merge this run with the previous one
352	272					644	$new_runs[-1]->merge($run);
353							}
354							else {
355							# conditions not OK, just push this run without merging
356	521					1280	push @new_runs, $run;
357							}
358							}
359
360							# reassemble the whole stuff and inject it as new contents
361	1					15	$self->contents(join "", map {$_->as_xml} @new_runs);
	521					1315
362							}
363
364
365
366							sub unlink_fields {
367	1			1	1	3	my $self = shift;
368
369							# must find out what are the ASK fields before erasing the markup
370	1					5	state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
371	1					33	my $contents = $self->contents;
372	1					233	my @names_of_ASK_fields = $contents =~ /$ask_field_rx/g;
373
374
375							# regexes to remove field nodes and "field instruction" nodes
376	1					5	state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
377	1					5	state $field_boundary_rx = qr[<w:fldChar
378							(?: [^>]*?/> # ignore all attributes until end of node ..
379							\| # .. or
380							[^>]?>.?</w:fldChar>) # .. ignore node content until closing tag
381							]x; # field boundaries are encoded as "begin" / "separate" / "end"
382	1					4	state $simple_field_rx = qr[</?w:fldSimple[^>]*>];
383
384							# apply the regexes
385	1					6	$self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);
386
387	1					4	return \@names_of_ASK_fields;
388							}
389
390
391							sub replace {
392	11			11	1	60	my ($self, $pattern, $replacement_callback, %replacement_args) = @_;
393
394							# shared initial string for error messages
395	11					25	my $error_msg = '->replace($pattern, $callback, %args)';
396
397							# default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is'
398	11	50				28	if (delete $replacement_args{keep_xml_as_is}) {
399							not exists $replacement_args{cleanup_XML}
400	0	0				0	or croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'";
401	0					0	carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead";
402	0					0	$replacement_args{cleanup_XML} = 0;
403							}
404							else {
405	11		100			32	$replacement_args{cleanup_XML} //= 1; # default
406							}
407
408							# cleanup the XML structure so that replacements work better
409	11	100				31	if (my $cleanup_args = $replacement_args{cleanup_XML}) {
410	1	50				6	$cleanup_args = {} if ! ref $cleanup_args;
411	1	50				6	ref $cleanup_args eq 'HASH'
412							or croak "$error_msg: arg 'cleanup_XML' should be a hashref";
413	1					6	$self->cleanup_XML(%$cleanup_args);
414							}
415
416							# check for presences of a special option to avoid modying contents
417	11					21	my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};
418
419							# apply replacements and generate new XML
420							my $xml = join "",
421	11					366	map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*;
	1059					4078
422
423							# overwrite previous contents
424	11	50				596	$self->contents($xml) unless $dont_overwrite_contents;
425
426	11					87	return $xml;
427							}
428
429
430							sub _update_contents_in_zip { # called for each part before saving the zip file
431	0			0		0	my $self = shift;
432
433							$self->surgeon->xml_member($self->zip_member_name, $self->contents)
434	0	0				0	if $self->{contents_has_changed};
435							}
436
437
438							sub replace_image {
439	0			0	1	0	my ($self, $image_title, $image_PNG_content) = @_;
440
441	0	0				0	my $member_name = $self->images->{$image_title}
442							or die "could not find an image with title: $image_title";
443	0					0	$self->surgeon->zip->contents($member_name, $image_PNG_content);
444							}
445
446
447
448							sub add_image {
449	0			0	1	0	my ($self, $image_PNG_content) = @_;
450
451							# compute a fresh image number and a fresh relationship id
452	0					0	my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
453	0					0	my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members;
	0					0
454	0		0			0	my $last_img_num = max @image_nums // 0;
455	0					0	my $target = sprintf "media/image%d.png", $last_img_num + 1;
456	0					0	my $last_rId_num = $self->relationships->$#*;
457	0					0	my $rId = sprintf "rId%d", $last_rId_num + 1;
458
459							# assemble XML for the new relationship
460	0					0	my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
461	0					0	my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};
462
463							# update the rels member
464	0					0	my $xml = $self->_rels_xml;
465	0					0	$xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
466	0					0	$self->_rels_xml($xml);
467
468							# add the image as a new member into the archive
469	0					0	my $member_name = "word/$target";
470	0					0	$self->surgeon->zip->addString(\$image_PNG_content, $member_name);
471
472							# update the global content_types if it doesn't include PNG
473	0					0	my $ct = $self->surgeon->_content_types;
474	0	0				0	if ($ct !~ /Extension="png"/) {
475	0					0	$ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
476	0					0	$self->surgeon->_content_types($ct);
477							}
478
479							# return the relationship id
480	0					0	return $rId;
481							}
482
483
484
485							#======================================================================
486							# UTILITY FUNCTIONS
487							#======================================================================
488
489
490							sub parse_attrs { # cheap parsing of attribute lists in an XML node
491	21			21	0	56	my ($lst_attrs) = @_;
492
493	21					34	state $attr_pair_regex = qr[
494							([^=\s"'&<>]+) # attribute name
495							\h* = \h* # Eq
496							(?: # attribute value
497							" ([^<"]*) " # .. enclosed in double quotes
498							\|
499							' ([^<']*) ' # .. or enclosed in single quotes
500							)
501							]x;
502
503	21					33	state $entity = {quot => '"', amp => '&', 'lt' => '<', gt => '>'};
504	21					34	state $entity_names = join "\|", keys %$entity;
505
506	21					29	my %attr;
507	21					130	while ($lst_attrs =~ /$attr_pair_regex/g) {
508	69		33			260	my ($name, $val) = ($1, $2 // $3);
509	69					127	$val =~ s/&(entity_names);/$entity->{$1}/eg;
	0					0
510	69					460	$attr{$name} = $val;
511							}
512
513	21					120	return %attr;
514							}
515
516
517							1;
518
519							__END__
520
521							=encoding ISO-8859-1
522
523							=head1 NAME
524
525							MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document
526
527							=head1 SYNOPSIS
528
529							my $part = $surgeon->document;
530							print $part->plain_text;
531							$part->replace(qr[$pattern], $replacement_callback);
532							$part->replace_image($image_alt_text, $image_PNG_content);
533
534
535							=head1 DESCRIPTION
536
537							This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single
538							I<package part> within the ZIP package of a C<.docx> document.
539							It is mostly used for the I<document> part, that contains the XML representation of the
540							main document body. However, other parts such as headers, footers, footnotes, etc. have the
541							same internal representation and therefore the same operations can be invoked.
542
543
544							=head1 METHODS
545
546							=head2 new
547
548							my $run = MsOffice::Word::Surgeon::PackagePart->new(
549							surgeon => $surgeon,
550							part_name => $name,
551							);
552
553							Constructor for a new part object. This is called internally from
554							L<MsOffice::Word::Surgeon>; it is not meant to be called directly
555							by clients.
556
557							=head3 Constructor arguments
558
559
560							=over
561
562							=item surgeon
563
564							a weak reference to the main surgeon object
565
566							=item part_name
567
568							ZIP member name of this part
569
570							=back
571
572							=head3 Other attributes
573
574							Other attributes, which are not passed through the constructor but are generated lazily on demand, are :
575
576							=over
577
578							=item contents
579
580							the XML contents of this part
581
582							=item runs
583
584							a decomposition of the XML contents into a collection of
585							L<MsOffice::Word::Surgeon::Run> objects.
586
587							=item relationships
588
589							an arrayref of Office relationships associated with this part. This information comes from
590							a C<.rels> member in the ZIP archive, named after the name of the package part.
591							Array indices correspond to relationship numbers. Array values are hashrefs with
592							keys
593
594							=over
595
596							=item Id
597
598							the full relationship id
599
600							=item num
601
602							the numeric part of C<rId>
603
604							=item Type
605
606							the full reference to the XML schema for this relationship
607
608							=item short_type
609
610							only the last word of the type, e.g. 'image', 'style', etc.
611
612							=item Target
613
614							designation of the target within the ZIP file. The prefix 'word/' must be
615							added for having a complete Zip member name.
616
617							=back
618
619
620
621							=item images
622
623							a hashref of images within this package part. Keys of the hash are image I<alternative texts>.
624							If present, the alternative I<title> will be prefered; otherwise the alternative I<description> will be taken
625							(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display
626							the I<description> field -- see
627							L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669\|MsOffice documentation>).
628
629							Images without alternative text will not be accessible through the current Perl module.
630
631							Values of the hash are zip member names for the corresponding
632							image representations in C<.png> format.
633
634
635							=back
636
637
638							=head2 Contents restitution
639
640							=head3 contents
641
642							Returns a Perl string with the current internal XML representation of the part
643							contents.
644
645							=head3 original_contents
646
647							Returns a Perl string with the XML representation of the
648							part contents, as it was in the ZIP archive before any
649							modification.
650
651							=head3 indented_contents
652
653							Returns an indented version of the XML contents, suitable for inspection in a text editor.
654							This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded
655							byte string, not a Perl string.
656
657							=head3 plain_text
658
659							Returns the text contents of the part, without any markup.
660							Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored.
661
662
663							=head3 runs
664
665							Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of
666							these objects holds an XML fragment; joining all fragments
667							restores the complete document.
668
669							my $contents = join "", map {$_->as_xml} $self->runs;
670
671
672							=head2 Modifying contents
673
674
675							=head3 cleanup_XML
676
677							$part->cleanup_XML(%args);
678
679							Apply several other methods for removing unnecessary nodes within the internal
680							XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>,
681							L</suppress_bookmarks> and L</merge_runs>.
682
683							Currently there is only one legal arg :
684
685							=over
686
687							=item C<no_caps>
688
689							If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically
690							called for each run object. As a result, all texts within runs with the C<caps> property are automatically
691							converted to uppercase.
692
693							=back
694
695
696
697							=head3 reduce_noise
698
699							$part->reduce_noise($regex1, $regex2, ...);
700
701							This method is used for removing unnecessary information in the XML
702							markup. It applies the given list of regexes to the whole document,
703							suppressing matches. The final result is put back into
704							C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >>
705							references, or as names of builtin regexes (described below). Regexes
706							are applied to the whole XML contents, not only to run nodes.
707
708
709							=head3 noise_reduction_regex
710
711							my $regex = $part->noise_reduction_regex($regex_name);
712
713							Returns the builtin regex corresponding to the given name.
714							Known regexes are :
715
716							proof_checking => qr(<w:(?:proofErr[^>]+\|noProof/)>),
717							revision_ids => qr(\sw:rsid\w+="[^"]+"),
718							complex_script_bold => qr(<w:bCs/>),
719							page_breaks => qr(<w:lastRenderedPageBreak/>),
720							language => qr(<w:lang w:val="[^/>]+/>),
721							empty_run_props => qr(<w:rPr></w:rPr>),
722							soft_hyphens => qr(<w:softHyphen/>),
723
724							=head3 reduce_all_noises
725
726							$part->reduce_all_noises;
727
728							Applies all regexes from the previous method.
729
730							=head3 unlink_fields
731
732							my $names_of_ASK_fields = $part->unlink_fields;
733
734							Removes all fields from the part, just leaving the current
735							value stored in each field. This is the equivalent of performing Ctrl-Shift-F9
736							on the whole document.
737
738							The return value is an arrayref to a list of names of ASK fields within the document.
739							Such names should then be passed to the L</suppress_bookmarks> method
740							(see below).
741
742
743							=head3 suppress_bookmarks
744
745							$part->suppress_bookmarks(@names_to_erase);
746
747							Removes bookmarks markup in the part. This is useful because
748							MsWord may silently insert bookmarks in unexpected places; therefore
749							some searches within the text may fail because of such bookmarks.
750
751							By default, this method only removes the bookmarks markup, leaving
752							intact the contents of the bookmark. However, when the name of a
753							bookmark belongs to the list C<< @names_to_erase >>, the contents
754							is also removed. Currently this is used for suppressing ASK fields,
755							because such fields contain a bookmark content that is never displayed by MsWord.
756
757
758
759							=head3 merge_runs
760
761							$part->merge_runs(no_caps => 1); # optional arg
762
763							Walks through all runs of text within the document, trying to merge
764							adjacent runs when possible (i.e. when both runs have the same
765							properties, and there is no other XML node inbetween).
766
767							This operation is a prerequisite before performing replace operations, because
768							documents edited in MsWord often have run boundaries across sentences or
769							even in the middle of words; so regex searches can only be successful if those
770							artificial boundaries have been removed.
771
772							If the argument C<< no_caps => 1 >> is present, the merge operation
773							will also convert runs with the C<w:caps> property, putting all letters
774							into uppercase and removing the property; this makes more merges possible.
775
776
777							=head3 replace
778
779							$part->replace($pattern, $replacement, %replacement_args);
780
781							Replaces all occurrences of C<$pattern> regex within the text nodes by the
782							given C<$replacement>. This is not exactly like a search-replace
783							operation performed within MsWord, because the search does not cross boundaries
784							of text nodes. In order to maximize the chances of successful replacements,
785							the L</cleanup_XML> method is automatically called before starting the operation.
786
787							The argument C<$pattern> can be either a string or a reference to a regular expression.
788							It should not contain any capturing parentheses, because that would perturb text
789							splitting operations.
790
791							The argument C<$replacement> can be either a fixed string, or a reference to
792							a callback subroutine that will be called for each match.
793
794
795							The C<< %replacement_args >> hash can be used to pass information to the callback
796							subroutine. That hash will be enriched with three entries :
797
798							=over
799
800							=item matched
801
802							The string that has been matched by C<$pattern>.
803
804							=item run
805
806							The run object in which this text resides.
807
808							=item xml_before
809
810							The XML fragment (possibly empty) found before the matched text .
811
812							=back
813
814							The callback subroutine may return either plain text or structured XML.
815							See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback.
816
817							The following special keys within C<< %replacement_args >> are interpreted by the
818							C<replace()> method itself, and therefore are not passed to the callback subroutine :
819
820							=over
821
822							=item keep_xml_as_is
823
824							if true, no call is made to the L</cleanup_XML> method before performing the replacements
825
826							=item dont_overwrite_contents
827
828							if true, the internal XML contents is not modified in place; the new XML after performing
829							replacements is merely returned to the caller.
830
831							=item cleanup_args
832
833							the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This
834							is typically used as
835
836							$part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]);
837
838							=back
839
840
841							=head3 replace_image
842
843							$part->replace_image($image_alt_text, $image_PNG_content);
844
845							Replaces an existing PNG image by a new image. All features of the old image will
846							be preserved (size, positioning, border, etc.) -- only the image itself will be
847							replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word
848							for this image.
849
850							This operation replaces a ZIP member within the C<.docx> file. If several XML
851							nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several
852							locations, the new image will appear at all locations, even if they do not have the
853							same alternative text -- unfortunately this module currently has no facility for
854							duplicating an existing image into separate instances. So if your intent is to only replace
855							one image, your original document should contain several distinct images, coming from
856							several distinct C<.PNG> file copies.
857
858
859							=head3 add_image
860
861							my $rId = $part->add_image($image_PNG_content);
862
863							Stores the given PNG image within the ZIP file, adds it as a relationship to the
864							current part, and returns the relationship id. This operation is not sufficient
865							to make the image visible in Word : it just stores the image, but you still
866							have to insert a proper C<drawing> node in the contents XML, using the C<$rId>.
867							Future versions of this module may offer helper methods for that purpose;
868							currently it must be done by hand.
869
870
871							=head1 AUTHOR
872
873							Laurent Dami, E<lt>dami AT cpan DOT org<gt>
874
875							=head1 COPYRIGHT AND LICENSE
876
877							Copyright 2019-2023 by Laurent Dami.
878
879							This library is free software; you can redistribute it and/or modify
880							it under the same terms as Perl itself.
881
882
883