File Coverage

blib/lib/MsOffice/Word/Surgeon/PackagePart.pm

Criterion	Covered	Total	%
statement	158	206	76.7
branch	21	44	47.7
condition	24	40	60.0
subroutine	31	37	83.7
pod	13	16	81.2
total	247	343	72.0

line	stmt	bran	cond	sub	pod	time	code
1							package MsOffice::Word::Surgeon::PackagePart;
2	1			1		13	use 5.24.0;
	1					4
3	1			1		13	use Moose;
	1					3
	1					6
4	1			1		6418	use MooseX::StrictConstructor;
	1					2
	1					7
5	1			1		3277	use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level);
	1					5
	1					105
6	1			1		485	use MsOffice::Word::Surgeon::Run;
	1					3
	1					31
7	1			1		430	use MsOffice::Word::Surgeon::Text;
	1					13
	1					34
8	1			1		753	use XML::LibXML;
	1					32756
	1					6
9	1			1		168	use List::Util qw(max);
	1					2
	1					66
10	1			1		7	use Carp qw(croak carp);
	1					2
	1					100
11
12							# syntactic sugar for attributes
13							sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)}
14
15							# constant integers to specify indentation modes -- see L<XML::LibXML>
16	1			1		8	use constant XML_NO_INDENT => 0;
	1					2
	1					56
17	1			1		6	use constant XML_SIMPLE_INDENT => 1;
	1					2
	1					59
18
19	1			1		8	use namespace::clean -except => 'meta';
	1					3
	1					9
20
21							our $VERSION = '2.03';
22
23
24							#======================================================================
25							# ATTRIBUTES
26							#======================================================================
27
28
29							# attributes passed to the constructor
30							has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
31							has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
32
33
34							# attributes constructed by the module -- not received through the constructor
35							has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
36							has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
37							has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
38							has_inner 'images' => (is => 'ro', isa => 'HashRef');
39
40							has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
41							has 'was_cleaned_up' => (is => 'bare', isa => 'Bool', default => 0);
42
43							#======================================================================
44							# GLOBAL VARIABLES
45							#======================================================================
46
47							# Various regexes for removing uninteresting XML information
48							my %noise_reduction_regexes = (
49							proof_checking => qr(<w:(?:proofErr[^>]+\|noProof/)>),
50							revision_ids => qr(\sw:rsid\w+="[^"]+"),
51							complex_script_bold => qr(<w:bCs/>),
52							page_breaks => qr(<w:lastRenderedPageBreak/>),
53							language => qr(<w:lang w:val="[^/>]+/>),
54							empty_run_props => qr(<w:rPr></w:rPr>),
55							soft_hyphens => qr(<w:softHyphen/>),
56							);
57
58							my @noise_reduction_list = qw/proof_checking revision_ids
59							complex_script_bold page_breaks language
60							empty_run_props soft_hyphens/;
61
62							#======================================================================
63							# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
64							#======================================================================
65
66
67							sub _runs {
68	12			12		28	my $self = shift;
69
70	12					31	state $run_regex = qr[
71							<w:r> # opening tag for the run
72							(?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
73							(.*?) # run contents -- capture in $2
74							</w:r> # closing tag for the run
75							]x;
76
77	12					30	state $txt_regex = qr[
78							<w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
79							(.*?) # text contents -- capture in $1
80							</w:t> # closing tag for text
81							]x;
82
83
84							# split XML content into run fragments
85	12					392	my $contents = $self->contents;
86	12					12713	my @run_fragments = split m[$run_regex], $contents, -1;
87	12					36	my @runs;
88
89							# build internal RUN objects
90							RUN:
91	12					61	while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
92	2004		100			2360127	$run_contents //= '';
93
94							# split XML of this run into text fragmentsn
95	2004					23261	my @txt_fragments = split m[$txt_regex], $run_contents, -1;
96	2004					4103	my @texts;
97
98							# build internal TEXT objects
99							TXT:
100	2004					6733	while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
101	3634	100	66			1636291	next TXT if !$xml_before_text && ( !(defined $txt_contents) \|\| $txt_contents eq '');
			66
102	1840		50			7963	push @texts, MsOffice::Word::Surgeon::Text->new(
			100
103							xml_before => $xml_before_text // '',
104							literal_text => $txt_contents // '',
105							);
106							}
107
108							# assemble TEXT objects into a RUN object
109	2004	100	100			42853	next RUN if !$xml_before_run && !@texts;
110	1852		50			8180	push @runs, MsOffice::Word::Surgeon::Run->new(
			100
111							xml_before => $xml_before_run // '',
112							props => $props // '',
113							inner_texts => \@texts,
114							);
115							}
116
117	12					17285	return \@runs;
118							}
119
120
121							sub _relationships {
122	1			1		3	my $self = shift;
123
124							# xml that describes the relationships for this package part
125	1					3	my $rel_xml = $self->_rels_xml;
126
127							# parse the relationships and assemble into a sparse array indexed by relationship ids
128	1					77	my @relationships;
129	1					24	while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
130	21					47	my %attrs = parse_attrs($1);
131	21		50			94	$attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
132	21					76	($attrs{num} = $attrs{Id}) =~ s[^\D+][];
133	21					98	($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
134	21					150	$relationships[$attrs{num}] = \%attrs;
135							}
136
137	1					40	return \@relationships;
138							}
139
140
141							sub _images {
142	0			0		0	my $self = shift;
143
144							# get relationship ids associated with images
145	0					0	my %rel_image = map {$_->{Id} => $_->{Target}}
146	0	0				0	grep {$_ && $_->{short_type} eq 'image'}
	0					0
147							$self->relationships->@*;
148
149							# get titles and relationship ids of images found within the part contents
150	0					0	my %image;
151	0					0	my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
152							DRAWING:
153	0					0	foreach my $drawing (@drawings) {
154	0	0				0	if ($drawing =~ m[<wp:docPr \s+ (.*?) />
155							.*?
156							<a:blip \s+ r:embed="(\w+)"]x) {
157	0					0	my ($lst_attrs, $rId) = ($1, $2);
158	0					0	my %attrs = parse_attrs($lst_attrs);
159							my $img_id = $attrs{title} \|\| $attrs{descr}
160	0	0	0			0	or next DRAWING;
161
162	0	0				0	$image{$img_id} = "word/$rel_image{$rId}"
163							or die "couldn't find image for relationship '$rId' associated with image '$img_id'";
164							# NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
165							}
166							}
167
168	0					0	return \%image;
169							}
170
171
172	7			7		20	sub _contents {shift->original_contents}
173
174							sub _on_new_contents {
175	16			16		34	my $self = shift;
176
177	16					607	$self->clear_runs;
178	16					219154	$self->{contents_has_changed} = 1;
179	16					640	$self->{was_cleaned_up} = 0;
180							}
181
182							#======================================================================
183							# METHODS
184							#======================================================================
185
186
187							sub _rels_xml {
188	1			1		3	my ($self, $new_xml) = @_;
189	1					60	my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name;
190	1					35	return $self->surgeon->xml_member($rels_name, $new_xml);
191							}
192
193
194							sub zip_member_name {
195	7			7	0	14	my $self = shift;
196	7					248	return sprintf "word/%s.xml", $self->part_name;
197							}
198
199
200							sub original_contents {
201	7			7	1	11	my $self = shift;
202
203	7					221	return $self->surgeon->xml_member($self->zip_member_name);
204							}
205
206
207							sub image {
208	0			0	0	0	my ($self, $title, $new_image_content) = @_;
209
210							# name of the image file within the zip
211	0		0			0	my $zip_member_name = $self->images->{$title}
212							\|\| ($title =~ /^\d+$/ ? "word/media/image$title.png"
213							: die "couldn't find image '$title'");
214
215							# delegate to Archive::Zip::contents
216	0					0	return $self->surgeon->zip->contents($zip_member_name, $new_image_content);
217							}
218
219
220
221							#======================================================================
222							# CONTENTS RESTITUTION
223							#======================================================================
224
225							sub indented_contents {
226	0			0	1	0	my $self = shift;
227
228	0					0	my $dom = XML::LibXML->load_xml(string => $self->contents);
229	0					0	return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string
230							}
231
232
233							sub plain_text {
234	2			2	1	30	my $self = shift;
235
236							# XML contents
237	2					68	my $txt = $self->contents;
238
239							# replace opening paragraph tags by newlines
240	2					4575	$txt =~ s/(<w:p[ >])/\n$1/g;
241
242							# replace break tags by newlines
243	2					3982	$txt =~ s[<w:br/>][\n]g;
244
245							# replace tab nodes by ASCII tabs
246	2					5210	$txt =~ s/<w:tab[^s][^>]*>/\t/g;
247
248							# remove all remaining XML tags
249	2					15908	$txt =~ s/<[^>]+>//g;
250
251	2					40	return $txt;
252							}
253
254
255
256
257							#======================================================================
258							# MODIFYING CONTENTS
259							#======================================================================
260
261							sub cleanup_XML {
262	2			2	1	10	my ($self, @merge_args) = @_;
263
264							# avoid doing it twice
265	2	100				8	return if $self->{was_cleaned_up};
266
267							# do the cleanup
268	1					5	$self->reduce_all_noises;
269	1					7	my $names_of_ASK_fields = $self->unlink_fields;
270	1					8	$self->suppress_bookmarks(@$names_of_ASK_fields);
271	1					6	$self->merge_runs(@merge_args);
272
273							# remember it was done
274	1					83302	$self->{was_cleaned_up} = 1;
275							}
276
277							sub noise_reduction_regex {
278	7			7	1	13	my ($self, $regex_name) = @_;
279	7	50				21	my $regex = $noise_reduction_regexes{$regex_name}
280							or croak "->noise_reduction_regex('$regex_name') : unknown regex name";
281	7					18	return $regex;
282							}
283
284							sub reduce_noise {
285	2			2	1	8	my ($self, @noises) = @_;
286
287							# gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
288	2	100				5	my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;
	10					27
289
290							# get contents, apply all regexes, put back the modified contents.
291	2					74	my $contents = $self->contents;
292	1			1		2715	no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
	1					2
	1					1775
293	2					25901	$contents =~ s/$_/$1/g foreach @regexes;
294	2					98	$self->contents($contents);
295							}
296
297							sub reduce_all_noises {
298	1			1	1	3	my $self = shift;
299
300	1					4	$self->reduce_noise(@noise_reduction_list);
301							}
302
303							sub suppress_bookmarks {
304	1			1	1	5	my ($self, @names_to_erase) = @_;
305
306							# closure to decide what to do with bookmark contents
307	1					3	my %should_erase_contents = map {($_ => 1)} @names_to_erase;
	2					10
308							my $deal_with_bookmark_text = sub {
309	17			17		60	my ($bookmark_name, $bookmark_contents) = @_;
310	17	100				733	return $should_erase_contents{$bookmark_name} ? "" : $bookmark_contents;
311	1					10	};
312
313							# regex to find bookmarks markup
314	1					5	state $bookmark_rx = qr{
315							<w:bookmarkStart # initial tag
316							.+? w:id="(\d+)" # 'id' attribute, bookmark identifier -- capture 1
317							.+? w:name="([^"]+)" # 'name' attribute -- capture 2
318							.*? /> # end of this tag
319							(.*?) # bookmark contents (may be empty) -- capture 3
320							<w:bookmarkEnd # ending tag
321							\s+ w:id="\1" # same 'id' attribute
322							.*? /> # end of this tag
323							}sx;
324
325							# remove bookmarks markup
326	1					35	my $contents = $self->contents;
327	1					1453	$contents =~ s{$bookmark_rx}{$deal_with_bookmark_text->($2, $3)}eg;
	17					50
328
329							# re-inject the modified contents
330	1					45	$self->contents($contents);
331							}
332
333							sub merge_runs {
334	1			1	1	8	my ($self, %args) = @_;
335
336							# check validity of received args
337	1					5	state $is_valid_arg = {no_caps => 1};
338	1					7	my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args;
	1					6
339	1	50				4	croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args if @invalid_args;
340
341	1					3	my @new_runs;
342							# loop over internal "run" objects
343	1					3	foreach my $run (@{$self->runs}) {
	1					34
344
345	793	50				3190	$run->remove_caps_property if $args{no_caps};
346
347							# check if the current run can be merged with the previous one
348	793	100	66			23080	if ( !$run->xml_before # no other XML markup between the 2 runs
			100
349							&& @new_runs # there was a previous run
350							&& $new_runs[-1]->props eq $run->props # both runs have the same properties
351							) {
352							# conditions are OK, so merge this run with the previous one
353	272					638	$new_runs[-1]->merge($run);
354							}
355							else {
356							# conditions not OK, just push this run without merging
357	521					1430	push @new_runs, $run;
358							}
359							}
360
361							# reassemble the whole stuff and inject it as new contents
362	1					15	$self->contents(join "", map {$_->as_xml} @new_runs);
	521					1194
363							}
364
365
366
367							sub unlink_fields {
368	1			1	1	2	my $self = shift;
369
370							# must find out what are the ASK fields before erasing the markup
371	1					7	state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
372	1					35	my $contents = $self->contents;
373	1					232	my @names_of_ASK_fields = $contents =~ /$ask_field_rx/g;
374
375
376							# regexes to remove field nodes and "field instruction" nodes
377	1					6	state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
378	1					6	state $field_boundary_rx = qr[<w:fldChar
379							(?: [^>]*?/> # ignore all attributes until end of node ..
380							\| # .. or
381							[^>]?>.?</w:fldChar>) # .. ignore node content until closing tag
382							]x; # field boundaries are encoded as "begin" / "separate" / "end"
383	1					3	state $simple_field_rx = qr[</?w:fldSimple[^>]*>];
384
385							# apply the regexes
386	1					5	$self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);
387
388	1					10	return \@names_of_ASK_fields;
389							}
390
391
392							sub replace {
393	11			11	1	68	my ($self, $pattern, $replacement_callback, %replacement_args) = @_;
394
395							# shared initial string for error messages
396	11					26	my $error_msg = '->replace($pattern, $callback, %args)';
397
398							# default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is'
399	11	50				35	if (delete $replacement_args{keep_xml_as_is}) {
400							not exists $replacement_args{cleanup_XML}
401	0	0				0	or croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'";
402	0					0	carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead";
403	0					0	$replacement_args{cleanup_XML} = 0;
404							}
405							else {
406	11		100			35	$replacement_args{cleanup_XML} //= 1; # default
407							}
408
409							# cleanup the XML structure so that replacements work better
410	11	100				31	if (my $cleanup_args = $replacement_args{cleanup_XML}) {
411	1	50				6	$cleanup_args = {} if ! ref $cleanup_args;
412	1	50				7	ref $cleanup_args eq 'HASH'
413							or croak "$error_msg: arg 'cleanup_XML' should be a hashref";
414	1					5	$self->cleanup_XML(%$cleanup_args);
415							}
416
417							# check for presences of a special option to avoid modying contents
418	11					22	my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};
419
420							# apply replacements and generate new XML
421							my $xml = join "",
422	11					379	map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*;
	1059					3701
423
424							# overwrite previous contents
425	11	50				562	$self->contents($xml) unless $dont_overwrite_contents;
426
427	11					117	return $xml;
428							}
429
430
431							sub _update_contents_in_zip { # called for each part before saving the zip file
432	0			0		0	my $self = shift;
433
434							$self->surgeon->xml_member($self->zip_member_name, $self->contents)
435	0	0				0	if $self->{contents_has_changed};
436							}
437
438
439							sub replace_image {
440	0			0	1	0	my ($self, $image_title, $image_PNG_content) = @_;
441
442	0	0				0	my $member_name = $self->images->{$image_title}
443							or die "could not find an image with title: $image_title";
444	0					0	$self->surgeon->zip->contents($member_name, $image_PNG_content);
445							}
446
447
448
449							sub add_image {
450	0			0	1	0	my ($self, $image_PNG_content) = @_;
451
452							# compute a fresh image number and a fresh relationship id
453	0					0	my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
454	0					0	my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members;
	0					0
455	0		0			0	my $last_img_num = max @image_nums // 0;
456	0					0	my $target = sprintf "media/image%d.png", $last_img_num + 1;
457	0					0	my $last_rId_num = $self->relationships->$#*;
458	0					0	my $rId = sprintf "rId%d", $last_rId_num + 1;
459
460							# assemble XML for the new relationship
461	0					0	my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
462	0					0	my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};
463
464							# update the rels member
465	0					0	my $xml = $self->_rels_xml;
466	0					0	$xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
467	0					0	$self->_rels_xml($xml);
468
469							# add the image as a new member into the archive
470	0					0	my $member_name = "word/$target";
471	0					0	$self->surgeon->zip->addString(\$image_PNG_content, $member_name);
472
473							# update the global content_types if it doesn't include PNG
474	0					0	my $ct = $self->surgeon->_content_types;
475	0	0				0	if ($ct !~ /Extension="png"/) {
476	0					0	$ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
477	0					0	$self->surgeon->_content_types($ct);
478							}
479
480							# return the relationship id
481	0					0	return $rId;
482							}
483
484
485
486							#======================================================================
487							# UTILITY FUNCTIONS
488							#======================================================================
489
490
491							sub parse_attrs { # cheap parsing of attribute lists in an XML node
492	21			21	0	50	my ($lst_attrs) = @_;
493
494	21					37	state $attr_pair_regex = qr[
495							([^=\s"'&<>]+) # attribute name
496							\h* = \h* # Eq
497							(?: # attribute value
498							" ([^<"]*) " # .. enclosed in double quotes
499							\|
500							' ([^<']*) ' # .. or enclosed in single quotes
501							)
502							]x;
503
504	21					36	state $entity = {quot => '"', amp => '&', 'lt' => '<', gt => '>'};
505	21					30	state $entity_names = join "\|", keys %$entity;
506
507	21					28	my %attr;
508	21					130	while ($lst_attrs =~ /$attr_pair_regex/g) {
509	69		33			250	my ($name, $val) = ($1, $2 // $3);
510	69					135	$val =~ s/&(entity_names);/$entity->{$1}/eg;
	0					0
511	69					451	$attr{$name} = $val;
512							}
513
514	21					112	return %attr;
515							}
516
517
518							1;
519
520							__END__
521
522							=encoding ISO-8859-1
523
524							=head1 NAME
525
526							MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document
527
528							=head1 SYNOPSIS
529
530							my $part = $surgeon->document;
531							print $part->plain_text;
532							$part->replace(qr[$pattern], $replacement_callback);
533							$part->replace_image($image_alt_text, $image_PNG_content);
534
535
536							=head1 DESCRIPTION
537
538							This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single
539							I<package part> within the ZIP package of a C<.docx> document.
540							It is mostly used for the I<document> part, that contains the XML representation of the
541							main document body. However, other parts such as headers, footers, footnotes, etc. have the
542							same internal representation and therefore the same operations can be invoked.
543
544
545							=head1 METHODS
546
547							=head2 new
548
549							my $run = MsOffice::Word::Surgeon::PackagePart->new(
550							surgeon => $surgeon,
551							part_name => $name,
552							);
553
554							Constructor for a new part object. This is called internally from
555							L<MsOffice::Word::Surgeon>; it is not meant to be called directly
556							by clients.
557
558							=head3 Constructor arguments
559
560
561							=over
562
563							=item surgeon
564
565							a weak reference to the main surgeon object
566
567							=item part_name
568
569							ZIP member name of this part
570
571							=back
572
573							=head3 Other attributes
574
575							Other attributes, which are not passed through the constructor but are generated lazily on demand, are :
576
577							=over
578
579							=item contents
580
581							the XML contents of this part
582
583							=item runs
584
585							a decomposition of the XML contents into a collection of
586							L<MsOffice::Word::Surgeon::Run> objects.
587
588							=item relationships
589
590							an arrayref of Office relationships associated with this part. This information comes from
591							a C<.rels> member in the ZIP archive, named after the name of the package part.
592							Array indices correspond to relationship numbers. Array values are hashrefs with
593							keys
594
595							=over
596
597							=item Id
598
599							the full relationship id
600
601							=item num
602
603							the numeric part of C<rId>
604
605							=item Type
606
607							the full reference to the XML schema for this relationship
608
609							=item short_type
610
611							only the last word of the type, e.g. 'image', 'style', etc.
612
613							=item Target
614
615							designation of the target within the ZIP file. The prefix 'word/' must be
616							added for having a complete Zip member name.
617
618							=back
619
620
621
622							=item images
623
624							a hashref of images within this package part. Keys of the hash are image I<alternative texts>.
625							If present, the alternative I<title> will be prefered; otherwise the alternative I<description> will be taken
626							(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display
627							the I<description> field -- see
628							L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669\|MsOffice documentation>).
629
630							Images without alternative text will not be accessible through the current Perl module.
631
632							Values of the hash are zip member names for the corresponding
633							image representations in C<.png> format.
634
635
636							=back
637
638
639							=head2 Contents restitution
640
641							=head3 contents
642
643							Returns a Perl string with the current internal XML representation of the part
644							contents.
645
646							=head3 original_contents
647
648							Returns a Perl string with the XML representation of the
649							part contents, as it was in the ZIP archive before any
650							modification.
651
652							=head3 indented_contents
653
654							Returns an indented version of the XML contents, suitable for inspection in a text editor.
655							This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded
656							byte string, not a Perl string.
657
658							=head3 plain_text
659
660							Returns the text contents of the part, without any markup.
661							Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored.
662
663
664							=head3 runs
665
666							Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of
667							these objects holds an XML fragment; joining all fragments
668							restores the complete document.
669
670							my $contents = join "", map {$_->as_xml} $self->runs;
671
672
673							=head2 Modifying contents
674
675
676							=head3 cleanup_XML
677
678							$part->cleanup_XML(%args);
679
680							Apply several other methods for removing unnecessary nodes within the internal
681							XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>,
682							L</suppress_bookmarks> and L</merge_runs>.
683
684							Currently there is only one legal arg :
685
686							=over
687
688							=item C<no_caps>
689
690							If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically
691							called for each run object. As a result, all texts within runs with the C<caps> property are automatically
692							converted to uppercase.
693
694							=back
695
696
697
698							=head3 reduce_noise
699
700							$part->reduce_noise($regex1, $regex2, ...);
701
702							This method is used for removing unnecessary information in the XML
703							markup. It applies the given list of regexes to the whole document,
704							suppressing matches. The final result is put back into
705							C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >>
706							references, or as names of builtin regexes (described below). Regexes
707							are applied to the whole XML contents, not only to run nodes.
708
709
710							=head3 noise_reduction_regex
711
712							my $regex = $part->noise_reduction_regex($regex_name);
713
714							Returns the builtin regex corresponding to the given name.
715							Known regexes are :
716
717							proof_checking => qr(<w:(?:proofErr[^>]+\|noProof/)>),
718							revision_ids => qr(\sw:rsid\w+="[^"]+"),
719							complex_script_bold => qr(<w:bCs/>),
720							page_breaks => qr(<w:lastRenderedPageBreak/>),
721							language => qr(<w:lang w:val="[^/>]+/>),
722							empty_run_props => qr(<w:rPr></w:rPr>),
723							soft_hyphens => qr(<w:softHyphen/>),
724
725							=head3 reduce_all_noises
726
727							$part->reduce_all_noises;
728
729							Applies all regexes from the previous method.
730
731							=head3 unlink_fields
732
733							my $names_of_ASK_fields = $part->unlink_fields;
734
735							Removes all fields from the part, just leaving the current
736							value stored in each field. This is the equivalent of performing Ctrl-Shift-F9
737							on the whole document.
738
739							The return value is an arrayref to a list of names of ASK fields within the document.
740							Such names should then be passed to the L</suppress_bookmarks> method
741							(see below).
742
743
744							=head3 suppress_bookmarks
745
746							$part->suppress_bookmarks(@names_to_erase);
747
748							Removes bookmarks markup in the part. This is useful because
749							MsWord may silently insert bookmarks in unexpected places; therefore
750							some searches within the text may fail because of such bookmarks.
751
752							By default, this method only removes the bookmarks markup, leaving
753							intact the contents of the bookmark. However, when the name of a
754							bookmark belongs to the list C<< @names_to_erase >>, the contents
755							is also removed. Currently this is used for suppressing ASK fields,
756							because such fields contain a bookmark content that is never displayed by MsWord.
757
758
759
760							=head3 merge_runs
761
762							$part->merge_runs(no_caps => 1); # optional arg
763
764							Walks through all runs of text within the document, trying to merge
765							adjacent runs when possible (i.e. when both runs have the same
766							properties, and there is no other XML node inbetween).
767
768							This operation is a prerequisite before performing replace operations, because
769							documents edited in MsWord often have run boundaries across sentences or
770							even in the middle of words; so regex searches can only be successful if those
771							artificial boundaries have been removed.
772
773							If the argument C<< no_caps => 1 >> is present, the merge operation
774							will also convert runs with the C<w:caps> property, putting all letters
775							into uppercase and removing the property; this makes more merges possible.
776
777
778							=head3 replace
779
780							$part->replace($pattern, $replacement, %replacement_args);
781
782							Replaces all occurrences of C<$pattern> regex within the text nodes by the
783							given C<$replacement>. This is not exactly like a search-replace
784							operation performed within MsWord, because the search does not cross boundaries
785							of text nodes. In order to maximize the chances of successful replacements,
786							the L</cleanup_XML> method is automatically called before starting the operation.
787
788							The argument C<$pattern> can be either a string or a reference to a regular expression.
789							It should not contain any capturing parentheses, because that would perturb text
790							splitting operations.
791
792							The argument C<$replacement> can be either a fixed string, or a reference to
793							a callback subroutine that will be called for each match.
794
795
796							The C<< %replacement_args >> hash can be used to pass information to the callback
797							subroutine. That hash will be enriched with three entries :
798
799							=over
800
801							=item matched
802
803							The string that has been matched by C<$pattern>.
804
805							=item run
806
807							The run object in which this text resides.
808
809							=item xml_before
810
811							The XML fragment (possibly empty) found before the matched text .
812
813							=back
814
815							The callback subroutine may return either plain text or structured XML.
816							See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback.
817
818							The following special keys within C<< %replacement_args >> are interpreted by the
819							C<replace()> method itself, and therefore are not passed to the callback subroutine :
820
821							=over
822
823							=item keep_xml_as_is
824
825							if true, no call is made to the L</cleanup_XML> method before performing the replacements
826
827							=item dont_overwrite_contents
828
829							if true, the internal XML contents is not modified in place; the new XML after performing
830							replacements is merely returned to the caller.
831
832							=item cleanup_args
833
834							the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This
835							is typically used as
836
837							$part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]);
838
839							=back
840
841
842							=head3 replace_image
843
844							$part->replace_image($image_alt_text, $image_PNG_content);
845
846							Replaces an existing PNG image by a new image. All features of the old image will
847							be preserved (size, positioning, border, etc.) -- only the image itself will be
848							replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word
849							for this image.
850
851							This operation replaces a ZIP member within the C<.docx> file. If several XML
852							nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several
853							locations, the new image will appear at all locations, even if they do not have the
854							same alternative text -- unfortunately this module currently has no facility for
855							duplicating an existing image into separate instances. So if your intent is to only replace
856							one image, your original document should contain several distinct images, coming from
857							several distinct C<.PNG> file copies.
858
859
860							=head3 add_image
861
862							my $rId = $part->add_image($image_PNG_content);
863
864							Stores the given PNG image within the ZIP file, adds it as a relationship to the
865							current part, and returns the relationship id. This operation is not sufficient
866							to make the image visible in Word : it just stores the image, but you still
867							have to insert a proper C<drawing> node in the contents XML, using the C<$rId>.
868							Future versions of this module may offer helper methods for that purpose;
869							currently it must be done by hand.
870
871
872							=head1 AUTHOR
873
874							Laurent Dami, E<lt>dami AT cpan DOT org<gt>
875
876							=head1 COPYRIGHT AND LICENSE
877
878							Copyright 2019-2023 by Laurent Dami.
879
880							This library is free software; you can redistribute it and/or modify
881							it under the same terms as Perl itself.
882
883
884