line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package MsOffice::Word::Surgeon::PackagePart; |
2
|
1
|
|
|
1
|
|
13
|
use 5.24.0; |
|
1
|
|
|
|
|
4
|
|
3
|
1
|
|
|
1
|
|
13
|
use Moose; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
6
|
|
4
|
1
|
|
|
1
|
|
6418
|
use MooseX::StrictConstructor; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
5
|
1
|
|
|
1
|
|
3277
|
use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level); |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
105
|
|
6
|
1
|
|
|
1
|
|
485
|
use MsOffice::Word::Surgeon::Run; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
7
|
1
|
|
|
1
|
|
430
|
use MsOffice::Word::Surgeon::Text; |
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
34
|
|
8
|
1
|
|
|
1
|
|
753
|
use XML::LibXML; |
|
1
|
|
|
|
|
32756
|
|
|
1
|
|
|
|
|
6
|
|
9
|
1
|
|
|
1
|
|
168
|
use List::Util qw(max); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
66
|
|
10
|
1
|
|
|
1
|
|
7
|
use Carp qw(croak carp); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
100
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# syntactic sugar for attributes |
13
|
|
|
|
|
|
|
sub has_inner ($@) {
  # Declare an "inner" attribute: built lazily by the method "_<name>" and
  # impossible to supply through the constructor (init_arg => undef).
  # The fixed options come last, so they override any same-named option
  # given by the caller.
  my ($attr_name, @spec) = @_;

  return has(
    $attr_name => @spec,
    lazy       => 1,
    builder    => "_$attr_name",
    init_arg   => undef,
  );
}
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# constant integers to specify indentation modes -- see L<XML::LibXML> |
16
|
1
|
|
|
1
|
|
8
|
use constant XML_NO_INDENT => 0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
56
|
|
17
|
1
|
|
|
1
|
|
6
|
use constant XML_SIMPLE_INDENT => 1; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
59
|
|
18
|
|
|
|
|
|
|
|
19
|
1
|
|
|
1
|
|
8
|
use namespace::clean -except => 'meta'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
9
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '2.03'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#====================================================================== |
25
|
|
|
|
|
|
|
# ATTRIBUTES |
26
|
|
|
|
|
|
|
#====================================================================== |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# attributes passed to the constructor |
30
|
|
|
|
|
|
|
has 'surgeon' => (
  is       => 'ro',
  isa      => 'MsOffice::Word::Surgeon',
  required => 1,
  weak_ref => 1,      # weak reference back to the owning surgeon object
);

has 'part_name' => (
  is       => 'ro',
  isa      => 'Str',
  required => 1,
);

# attributes constructed by the module -- not received through the constructor;
# each one is built lazily by the method "_<attribute name>" (see has_inner)
has_inner 'contents' => (
  is      => 'rw',
  isa     => 'Str',
  trigger => \&_on_new_contents,
);

has_inner 'runs' => (
  is      => 'ro',
  isa     => 'ArrayRef',
  clearer => 'clear_runs',
);

has_inner 'relationships' => (
  is  => 'ro',
  isa => 'ArrayRef',
);

has_inner 'images' => (
  is  => 'ro',
  isa => 'HashRef',
);

# internal flags without accessors ('bare'); the methods of this class read
# and write them directly in the object hash
has 'contents_has_changed' => (
  is      => 'bare',
  isa     => 'Bool',
  default => 0,
);

has 'was_cleaned_up' => (
  is      => 'bare',
  isa     => 'Bool',
  default => 0,
);
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
#====================================================================== |
44
|
|
|
|
|
|
|
# GLOBAL VARIABLES |
45
|
|
|
|
|
|
|
#====================================================================== |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Various regexes for removing uninteresting XML information |
48
|
|
|
|
|
|
|
my %noise_reduction_regexes = (
  proof_checking      => qr{<w:(?:proofErr[^>]+|noProof/)>},   # spelling/grammar markers
  revision_ids        => qr{\sw:rsid\w+="[^"]+"},              # w:rsid* attributes
  complex_script_bold => qr{<w:bCs/>},
  page_breaks         => qr{<w:lastRenderedPageBreak/>},
  language            => qr{<w:lang w:val="[^/>]+/>},
  empty_run_props     => qr{<w:rPr></w:rPr>},
  soft_hyphens        => qr{<w:softHyphen/>},
);

# names of the builtin regexes, in the order in which reduce_all_noises() applies them
my @noise_reduction_list = qw(
  proof_checking
  revision_ids
  complex_script_bold
  page_breaks
  language
  empty_run_props
  soft_hyphens
);
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
#====================================================================== |
63
|
|
|
|
|
|
|
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS |
64
|
|
|
|
|
|
|
#====================================================================== |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# Lazy builder for the 'runs' attribute.
# Splits the XML contents of this part into MsOffice::Word::Surgeon::Run
# objects; each run holds the XML found before it, its properties, and a list
# of MsOffice::Word::Surgeon::Text objects. Returns an arrayref of runs.
sub _runs {
  my $self = shift;

  # only runs whose opening tag is exactly <w:r> are recognized
  state $run_regex = qr[
    <w:r>                             # opening tag for the run
    (?:<w:rPr>(.*?)</w:rPr>)?         # run properties -- capture in $1
    (.*?)                             # run contents -- capture in $2
    </w:r>                            # closing tag for the run
  ]x;

  state $txt_regex = qr[
    <w:t(?:\ xml:space="preserve")?>  # opening tag for the text contents
    (.*?)                             # text contents -- capture in $1
    </w:t>                            # closing tag for text
  ]x;

  # split XML content into run fragments. Because the regex has 2 capture
  # groups, the list is a sequence of triples (xml before, props, contents);
  # limit -1 keeps trailing empty fields.
  my @run_fragments = split m[$run_regex], $self->contents, -1;
  my @runs;

  # build internal RUN objects
 RUN:
  while (@run_fragments) {
    my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3;
    $run_contents //= '';      # the last triple is usually incomplete

    # split XML of this run into text fragments : pairs (xml before, text)
    my @txt_fragments = split m[$txt_regex], $run_contents, -1;
    my @texts;

    # build internal TEXT objects
   TXT:
    while (@txt_fragments) {
      my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2;

      # skip fragments that carry neither markup nor text
      my $has_text = defined $txt_contents && $txt_contents ne '';
      next TXT unless $xml_before_text || $has_text;

      push @texts, MsOffice::Word::Surgeon::Text->new(
        xml_before   => $xml_before_text // '',
        literal_text => $txt_contents    // '',
      );
    }

    # assemble TEXT objects into a RUN object, unless there is nothing to keep
    next RUN unless $xml_before_run || @texts;
    push @runs, MsOffice::Word::Surgeon::Run->new(
      xml_before  => $xml_before_run // '',
      props       => $props          // '',
      inner_texts => \@texts,
    );
  }

  return \@runs;
}
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Lazy builder for the 'relationships' attribute.
# Parses the .rels XML associated with this part and returns a sparse
# arrayref indexed by relationship number; each entry is a hashref holding
# the XML attributes of the <Relationship> node plus the computed keys
# 'num' and 'short_type'.
sub _relationships {
  my $self = shift;

  # xml that describes the relationships for this package part
  my $rels_source = $self->_rels_xml;

  my @by_num;
  while ($rels_source =~ m[<Relationship\s+(.*?)/>]g) {
    my %rel = parse_attrs($1);

    for (qw/Id Type Target/) {
      $rel{$_} or croak "missing attribute '$_' in <Relationship> node";
    }

    $rel{num}        = $rel{Id}   =~ s[^\D+][]r;   # Id without its leading non-digits
    $rel{short_type} = $rel{Type} =~ s[^.*/][]r;   # last path segment of the type URI
    $by_num[$rel{num}] = \%rel;
  }

  return \@by_num;
}
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Lazy builder for the 'images' attribute.
# Returns a hashref mapping the alternative text of each image found in the
# part contents (attribute 'title', or else 'descr', of the <wp:docPr> node)
# to the ZIP member name of that image. Drawings without alternative text
# are ignored. Dies if a drawing refers to a relationship id that is not
# declared as an image relationship for this part.
sub _images {
  my $self = shift;

  # get relationship ids associated with images (the relationships array is
  # sparse, hence the test for empty slots)
  my %rel_image = map  {$_->{Id} => $_->{Target}}
                  grep {$_ && $_->{short_type} eq 'image'}
                  $self->relationships->@*;

  # get titles and relationship ids of images found within the part contents
  my %image;
  my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
 DRAWING:
  foreach my $drawing (@drawings) {
    $drawing =~ m[<wp:docPr \s+ (.*?) />
                  .*?
                  <a:blip \s+ r:embed="(\w+)"]x
      or next DRAWING;
    my ($lst_attrs, $rId) = ($1, $2);

    my %attrs  = parse_attrs($lst_attrs);
    my $img_id = $attrs{title} || $attrs{descr}
      or next DRAWING;

    # The target must be checked BEFORE building the member name : the
    # interpolated string "word/..." is always true, so testing it could
    # never detect a missing relationship.
    my $target = $rel_image{$rId}
      or die "couldn't find image for relationship '$rId' associated with image '$img_id'";

    # NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
    $image{$img_id} = "word/$target";
  }

  return \%image;
}
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
172
|
7
|
|
|
7
|
|
20
|
# Lazy builder for the 'contents' attribute : starts from the XML stored in the ZIP archive.
sub _contents {
  my $self = shift;
  return $self->original_contents;
}
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Trigger of the 'contents' attribute : called whenever new contents are set.
# The cached runs no longer match the XML, so they are cleared; the part is
# marked as modified and as not cleaned up.
sub _on_new_contents {
  my ($self) = @_;

  $self->clear_runs;
  @{$self}{qw/contents_has_changed/} = (1);
  $self->{was_cleaned_up} = 0;
}
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
#====================================================================== |
183
|
|
|
|
|
|
|
# METHODS |
184
|
|
|
|
|
|
|
#====================================================================== |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Reads or writes the XML of the relationships member associated with this
# part ("word/_rels/<part_name>.xml.rels"). Both arguments are always handed
# over to the surgeon's xml_member() method, even when $new_xml is undef.
sub _rels_xml {
  my ($self, $new_xml) = @_;

  my $rels_name = 'word/_rels/' . $self->part_name . '.xml.rels';
  my $surgeon   = $self->surgeon;

  return $surgeon->xml_member($rels_name, $new_xml);
}
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# Returns the name of the ZIP member holding this part : "word/<part_name>.xml".
sub zip_member_name {
  my ($self) = @_;

  my $part_name = $self->part_name;
  return sprintf "word/%s.xml", $part_name;
}
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
# Returns what the surgeon's xml_member() method gives for the ZIP member of
# this part (called with the member name only, i.e. as a reader).
sub original_contents {
  my ($self) = @_;

  my $surgeon = $self->surgeon;
  return $surgeon->xml_member($self->zip_member_name);
}
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# Reads or replaces the content of an image.
#   $title             : key in the 'images' hash, or an integer N designating
#                        the member "word/media/imageN.png"
#   $new_image_content : optional new content, handed over as is
# Dies if $title is neither a known image key nor an integer.
sub image {
  my ($self, $title, $new_image_content) = @_;

  # name of the image file within the zip
  my $zip_member_name = $self->images->{$title};
  if (!$zip_member_name) {
    $title =~ /^\d+$/
      or die "couldn't find image '$title'";
    $zip_member_name = "word/media/image$title.png";
  }

  # delegate to Archive::Zip::contents
  my $zip = $self->surgeon->zip;
  return $zip->contents($zip_member_name, $new_image_content);
}
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
#====================================================================== |
222
|
|
|
|
|
|
|
# CONTENTS RESTITUTION |
223
|
|
|
|
|
|
|
#====================================================================== |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Returns the XML contents of this part, parsed and re-serialized by
# XML::LibXML with simple indentation. The result of toString() is a bytes
# sequence, not a Perl string.
sub indented_contents {
  my ($self) = @_;

  return XML::LibXML->load_xml(string => $self->contents)
                    ->toString(XML_SIMPLE_INDENT);
}
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Returns the contents of this part with all XML tags removed. A newline is
# inserted before each paragraph opening tag, <w:br/> becomes a newline and
# <w:tab...> nodes become ASCII tabs.
sub plain_text {
  my ($self) = @_;

  my $text = $self->contents;
  for ($text) {
    s/(<w:p[ >])/\n$1/g;        # newline in front of each opening paragraph tag
    s[<w:br/>][\n]g;            # break tags become newlines
    s/<w:tab[^s][^>]*>/\t/g;    # tab nodes become tabs; [^s] leaves <w:tabs> alone
    s/<[^>]+>//g;               # remove all remaining XML tags
  }

  return $text;
}
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
#====================================================================== |
258
|
|
|
|
|
|
|
# MODIFYING CONTENTS |
259
|
|
|
|
|
|
|
#====================================================================== |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Applies all cleanup operations in sequence : noise reduction, unlinking of
# fields, suppression of bookmarks (erasing the contents of bookmarks named
# after ASK fields) and merging of runs. @merge_args are passed to merge_runs().
# Does nothing if the cleanup was already done on the current contents.
sub cleanup_XML {
  my ($self, @merge_args) = @_;

  # avoid doing it twice
  return if $self->{was_cleaned_up};

  # do the cleanup
  $self->reduce_all_noises;
  my $ask_field_names = $self->unlink_fields;
  $self->suppress_bookmarks($ask_field_names->@*);
  $self->merge_runs(@merge_args);

  # remember it was done. This must come last : the trigger on 'contents'
  # resets the flag each time new contents are injected.
  $self->{was_cleaned_up} = 1;
}
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# Returns the builtin noise reduction regex registered under $regex_name;
# croaks if there is no regex of that name.
sub noise_reduction_regex {
  my ($self, $regex_name) = @_;

  return $noise_reduction_regexes{$regex_name}
      || croak "->noise_reduction_regex('$regex_name') : unknown regex name";
}
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# Applies a list of regexes to the contents of this part : each match is
# replaced by the content of its first capture group, or by nothing if the
# regex has no capture group. Members of @noises are either compiled regexes
# or names of builtin regexes.
sub reduce_noise {
  my ($self, @noises) = @_;

  # gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
  my @regexes;
  foreach my $noise (@noises) {
    push @regexes, ref $noise eq 'Regexp' ? $noise
                                          : $self->noise_reduction_regex($noise);
  }

  # get contents, apply all regexes, put back the modified contents.
  my $xml = $self->contents;
  foreach my $regex (@regexes) {
    no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
    $xml =~ s/$regex/$1/g;
  }

  return $self->contents($xml);
}
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Applies every builtin noise reduction regex, in the order of @noise_reduction_list.
sub reduce_all_noises {
  my ($self) = @_;

  return $self->reduce_noise(@noise_reduction_list);
}
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# Removes bookmark markup (<w:bookmarkStart .../> ... <w:bookmarkEnd .../>
# pairs sharing the same w:id) from the contents of this part. The text
# between the two tags is kept, except for bookmarks whose name is listed in
# @names_to_erase : for those, the text is removed as well.
sub suppress_bookmarks {
  my ($self, @names_to_erase) = @_;

  # lookup table of bookmark names whose contents must be erased
  my %should_erase_contents;
  $should_erase_contents{$_} = 1 foreach @names_to_erase;

  # regex to find bookmarks markup
  state $bookmark_rx = qr{
     <w:bookmarkStart         # initial tag
       .+? w:id="(\d+)"       # 'id' attribute, bookmark identifier -- capture 1
       .+? w:name="([^"]+)"   # 'name' attribute                    -- capture 2
       .*? />                 # end of this tag
       (.*?)                  # bookmark contents (may be empty)    -- capture 3
     <w:bookmarkEnd           # ending tag
       \s+ w:id="\1"          # same 'id' attribute
       .*? />                 # end of this tag
    }sx;

  # remove bookmarks markup : each bookmark is replaced either by its
  # contents or by an empty string
  my $xml = $self->contents;
  $xml =~ s{$bookmark_rx}{$should_erase_contents{$2} ? "" : $3}eg;

  # re-inject the modified contents
  return $self->contents($xml);
}
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# Merges adjacent runs that have identical properties and no XML markup
# between them, then injects the reassembled XML as new contents.
# Accepted args :
#   no_caps => true value : call remove_caps_property() on every run first
# Croaks on any other argument name.
sub merge_runs {
  my ($self, %args) = @_;

  # check validity of received args
  state $is_valid_arg = {no_caps => 1};
  if (my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args) {
    croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args;
  }

  # loop over internal "run" objects
  my @merged;
 RUN:
  foreach my $run ($self->runs->@*) {
    $run->remove_caps_property if $args{no_caps};

    # a run can be merged with the previous one if ...
    my $is_mergeable = !$run->xml_before                   # .. no other XML markup lies between the 2 runs
                    && @merged                             # .. there was a previous run
                    && $merged[-1]->props eq $run->props;  # .. both runs have the same properties

    if (!$is_mergeable) {
      push @merged, $run;
      next RUN;
    }
    $merged[-1]->merge($run);
  }

  # reassemble the whole stuff and inject it as new contents
  my $xml = join "", map {$_->as_xml} @merged;
  return $self->contents($xml);
}
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# Removes field markup from the contents of this part : field instruction
# nodes (<w:instrText>), field boundaries (<w:fldChar>) and simple field tags
# (<w:fldSimple>). Returns an arrayref with the names of the ASK fields found
# before the markup was erased.
sub unlink_fields {
  my ($self) = @_;

  # regex to spot ASK fields, capturing the word that follows "ASK"
  state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];

  # regexes to remove field nodes and "field instruction" nodes
  state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
  state $field_boundary_rx        = qr[<w:fldChar
                                         (?:  [^>]*?/>                 # ignore all attributes until end of node ..
                                            |                          # .. or
                                              [^>]*?>.*?</w:fldChar>)  # .. ignore node content until closing tag
                                      ]x;   # field boundaries are encoded as  "begin" / "separate" / "end"
  state $simple_field_rx          = qr[</?w:fldSimple[^>]*>];

  # must find out what are the ASK fields before erasing the markup
  my @ask_names = $self->contents =~ /$ask_field_rx/g;

  # apply the regexes
  $self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);

  return \@ask_names;
}
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# Applies a replacement to every run of this part and returns the resulting XML.
#   $pattern, $replacement_callback : handed over to the replace() method of each run
#   %replacement_args               : also handed over to each run, after processing of
#     cleanup_XML             => false, true, or hashref of args for cleanup_XML()
#                                (default : 1, i.e. cleanup without args)
#     keep_xml_as_is          => deprecated way of saying cleanup_XML => 0
#     dont_overwrite_contents => if true, the contents of the part are left untouched
# Croaks if both 'keep_xml_as_is' and 'cleanup_XML' are given, or if
# 'cleanup_XML' is a reference other than a hashref.
sub replace {
  my ($self, $pattern, $replacement_callback, %replacement_args) = @_;

  # shared initial string for error messages
  my $error_msg = '->replace($pattern, $callback, %args)';

  # default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is'
  if (!delete $replacement_args{keep_xml_as_is}) {
    $replacement_args{cleanup_XML} //= 1; # default
  }
  else {
    croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'"
      if exists $replacement_args{cleanup_XML};
    carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead";
    $replacement_args{cleanup_XML} = 0;
  }

  # cleanup the XML structure so that replacements work better
  my $cleanup_args = $replacement_args{cleanup_XML};
  if ($cleanup_args) {
    $cleanup_args = {} unless ref $cleanup_args;   # plain true value : no args
    croak "$error_msg: arg 'cleanup_XML' should be a hashref"
      if ref $cleanup_args ne 'HASH';
    $self->cleanup_XML(%$cleanup_args);
  }

  # check for presence of a special option to avoid modifying contents
  my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};

  # apply replacements and generate new XML
  my @fragments = map {$_->replace($pattern, $replacement_callback, %replacement_args)}
                      $self->runs->@*;
  my $xml = join "", @fragments;

  # overwrite previous contents
  $self->contents($xml) if !$dont_overwrite_contents;

  return $xml;
}
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# Called for each part before saving the zip file : if the contents of this
# part were modified, hands them over to the surgeon's xml_member() method
# under the ZIP member name of this part.
sub _update_contents_in_zip {
  my ($self) = @_;

  if ($self->{contents_has_changed}) {
    $self->surgeon->xml_member($self->zip_member_name, $self->contents);
  }
}
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
# Replaces the content of the image registered under $image_title in the
# 'images' hash by $image_PNG_content. Dies if there is no such image.
sub replace_image {
  my ($self, $image_title, $image_PNG_content) = @_;

  my $member_name = $self->images->{$image_title};
  die "could not find an image with title: $image_title"
    unless $member_name;

  my $zip = $self->surgeon->zip;
  return $zip->contents($member_name, $image_PNG_content);
}
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# Adds a new PNG image to the ZIP archive and declares it as an image
# relationship of this part.
#   $image_PNG_content : content of the new image member
# Returns the relationship id ("rIdN") of the new image.
# Side effects : a new member "word/media/imageN.png" is added to the zip,
# the .rels XML of this part gets a new <Relationship> node, the cached
# 'relationships' array is extended accordingly, and a default content type
# for the "png" extension is added to the global content types if missing.
sub add_image {
  my ($self, $image_PNG_content) = @_;

  # compute a fresh image number. The 0 is passed as an extra member of the
  # list : writing "max @image_nums // 0" would apply '//' to the array in
  # scalar context and yield the NUMBER of images instead of the highest
  # image number.
  my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
  my @image_nums    = map {$_->fileName =~ /(\d+)/} @image_members;
  my $last_img_num  = max(@image_nums, 0);
  my $target        = sprintf "media/image%d.png", $last_img_num + 1;

  # compute a fresh relationship id
  my $relationships = $self->relationships;
  my $rId_num       = $relationships->$#* + 1;
  my $rId           = sprintf "rId%d", $rId_num;

  # assemble XML for the new relationship
  my $type        = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
  my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};

  # update the rels member
  my $xml = $self->_rels_xml;
  $xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
  $self->_rels_xml($xml);

  # keep the cached relationships in sync with the rels member, so that a
  # later call to add_image() does not hand out the same relationship id again
  $relationships->[$rId_num] = {
    Id         => $rId,
    Type       => $type,
    Target     => $target,
    num        => $rId_num,
    short_type => 'image',
  };

  # add the image as a new member into the archive
  my $member_name = "word/$target";
  $self->surgeon->zip->addString(\$image_PNG_content, $member_name);

  # update the global content_types if it doesn't include PNG
  my $ct = $self->surgeon->_content_types;
  if ($ct !~ /Extension="png"/) {
    $ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
    $self->surgeon->_content_types($ct);
  }

  # return the relationship id
  return $rId;
}
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
#====================================================================== |
487
|
|
|
|
|
|
|
# UTILITY FUNCTIONS |
488
|
|
|
|
|
|
|
#====================================================================== |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
# Cheap parsing of the attribute list of an XML node.
#   $lst_attrs : string of the form  name1="value1" name2='value2' ...
# Returns a flat list of (name => value) pairs, suitable for assignment to a
# hash. Predefined XML entities (&quot; &amp; &lt; &gt; &apos;) within values
# are decoded; numeric character references are left as they are.
sub parse_attrs {
  my ($lst_attrs) = @_;

  state $attr_pair_regex = qr[
     ([^=\s"'&<>]+)     # attribute name
     \h* = \h*          # Eq
     (?:                # attribute value
        " ([^<"]*) "    # .. enclosed in double quotes
      |
        ' ([^<']*) '    # .. or enclosed in single quotes
     )
   ]x;

  state $entity       = {quot => '"', amp => '&', lt => '<', gt => '>', apos => "'"};
  state $entity_names = join "|", keys %$entity;

  my %attr;
  while ($lst_attrs =~ /$attr_pair_regex/g) {
    my ($name, $val) = ($1, $2 // $3);

    # decode entities in a single pass, so that "&amp;lt;" yields "&lt;" and
    # not "<". The variable must be interpolated here : a bare "entity_names"
    # would only match that literal word.
    $val =~ s/&($entity_names);/$entity->{$1}/g;

    $attr{$name} = $val;
  }

  return %attr;
}
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
1; |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
__END__ |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
=head1 NAME |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=head1 SYNOPSIS |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
my $part = $surgeon->document; |
531
|
|
|
|
|
|
|
print $part->plain_text; |
532
|
|
|
|
|
|
|
$part->replace(qr[$pattern], $replacement_callback); |
533
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=head1 DESCRIPTION |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single |
539
|
|
|
|
|
|
|
I<package part> within the ZIP package of a C<.docx> document. |
540
|
|
|
|
|
|
|
It is mostly used for the I<document> part, that contains the XML representation of the |
541
|
|
|
|
|
|
|
main document body. However, other parts such as headers, footers, footnotes, etc. have the |
542
|
|
|
|
|
|
|
same internal representation and therefore the same operations can be invoked. |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=head1 METHODS |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=head2 new |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
my $part = MsOffice::Word::Surgeon::PackagePart->new( |
550
|
|
|
|
|
|
|
surgeon => $surgeon, |
551
|
|
|
|
|
|
|
part_name => $name, |
552
|
|
|
|
|
|
|
); |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
Constructor for a new part object. This is called internally from |
555
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon>; it is not meant to be called directly |
556
|
|
|
|
|
|
|
by clients. |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=head3 Constructor arguments |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=over |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=item surgeon |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
a weak reference to the main surgeon object |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=item part_name |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
short name of this part; the ZIP member name is derived from it as C<word/$part_name.xml> |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=back |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=head3 Other attributes |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
Other attributes, which are not passed through the constructor but are generated lazily on demand, are : |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=over |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=item contents |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
the XML contents of this part |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
=item runs |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
a decomposition of the XML contents into a collection of |
586
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon::Run> objects. |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=item relationships |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
an arrayref of Office relationships associated with this part. This information comes from |
591
|
|
|
|
|
|
|
a C<.rels> member in the ZIP archive, named after the name of the package part. |
592
|
|
|
|
|
|
|
Array indices correspond to relationship numbers. Array values are hashrefs with |
593
|
|
|
|
|
|
|
keys |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
=over |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
=item Id |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
the full relationship id |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
=item num |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
the numeric part of C<rId> |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=item Type |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
the full reference to the XML schema for this relationship |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
=item short_type |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
only the last word of the type, e.g. 'image', 'style', etc. |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=item Target |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
designation of the target within the ZIP file. The prefix 'word/' must be |
616
|
|
|
|
|
|
|
added for having a complete Zip member name. |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=back |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=item images |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
a hashref of images within this package part. Keys of the hash are image I<alternative texts>. |
625
|
|
|
|
|
|
|
If present, the alternative I<title> will be preferred; otherwise the alternative I<description> will be taken |
626
|
|
|
|
|
|
|
(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display |
627
|
|
|
|
|
|
|
the I<description> field -- see |
628
|
|
|
|
|
|
|
L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669|MsOffice documentation>). |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
Images without alternative text will not be accessible through the current Perl module. |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
Values of the hash are zip member names for the corresponding |
633
|
|
|
|
|
|
|
image representations in C<.png> format. |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=back |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=head2 Contents restitution |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=head3 contents |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
Returns a Perl string with the current internal XML representation of the part |
644
|
|
|
|
|
|
|
contents. |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
=head3 original_contents |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
Returns a Perl string with the XML representation of the |
649
|
|
|
|
|
|
|
part contents, as it was in the ZIP archive before any |
650
|
|
|
|
|
|
|
modification. |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=head3 indented_contents |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
Returns an indented version of the XML contents, suitable for inspection in a text editor. |
655
|
|
|
|
|
|
|
This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded |
656
|
|
|
|
|
|
|
byte string, not a Perl string. |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=head3 plain_text |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
Returns the text contents of the part, without any markup. |
661
|
|
|
|
|
|
|
Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored. |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=head3 runs |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of |
667
|
|
|
|
|
|
|
these objects holds an XML fragment; joining all fragments |
668
|
|
|
|
|
|
|
restores the complete document. |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
my $contents = join "", map {$_->as_xml} $self->runs; |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
=head2 Modifying contents |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=head3 cleanup_XML |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
$part->cleanup_XML(%args); |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
Apply several other methods for removing unnecessary nodes within the internal |
681
|
|
|
|
|
|
|
XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>, |
682
|
|
|
|
|
|
|
L</suppress_bookmarks> and L</merge_runs>. |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
Currently there is only one legal arg : |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
=over |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
=item C<no_caps> |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically |
691
|
|
|
|
|
|
|
called for each run object. As a result, all texts within runs with the C<caps> property are automatically |
692
|
|
|
|
|
|
|
converted to uppercase. |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
=back |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
=head3 reduce_noise |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
$part->reduce_noise($regex1, $regex2, ...); |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
This method is used for removing unnecessary information in the XML |
703
|
|
|
|
|
|
|
markup. It applies the given list of regexes to the whole document, |
704
|
|
|
|
|
|
|
suppressing matches. The final result is put back into |
705
|
|
|
|
|
|
|
C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >> |
706
|
|
|
|
|
|
|
references, or as names of builtin regexes (described below). Regexes |
707
|
|
|
|
|
|
|
are applied to the whole XML contents, not only to run nodes. |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head3 noise_reduction_regex |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
my $regex = $part->noise_reduction_regex($regex_name); |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
Returns the builtin regex corresponding to the given name. |
715
|
|
|
|
|
|
|
Known regexes are : |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>), |
718
|
|
|
|
|
|
|
revision_ids => qr(\sw:rsid\w+="[^"]+"), |
719
|
|
|
|
|
|
|
complex_script_bold => qr(<w:bCs/>), |
720
|
|
|
|
|
|
|
page_breaks => qr(<w:lastRenderedPageBreak/>), |
721
|
|
|
|
|
|
|
language => qr(<w:lang w:val="[^/>]+/>), |
722
|
|
|
|
|
|
|
empty_run_props => qr(<w:rPr></w:rPr>), |
723
|
|
|
|
|
|
|
soft_hyphens => qr(<w:softHyphen/>), |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
=head3 reduce_all_noises |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
$part->reduce_all_noises; |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
Applies all regexes from the previous method. |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=head3 unlink_fields |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
my $names_of_ASK_fields = $part->unlink_fields; |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
Removes all fields from the part, just leaving the current |
736
|
|
|
|
|
|
|
value stored in each field. This is the equivalent of performing Ctrl-Shift-F9 |
737
|
|
|
|
|
|
|
on the whole document. |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
The return value is an arrayref to a list of names of ASK fields within the document. |
740
|
|
|
|
|
|
|
Such names should then be passed to the L</suppress_bookmarks> method |
741
|
|
|
|
|
|
|
(see below). |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=head3 suppress_bookmarks |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
$part->suppress_bookmarks(@names_to_erase); |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
Removes bookmarks markup in the part. This is useful because |
749
|
|
|
|
|
|
|
MsWord may silently insert bookmarks in unexpected places; therefore |
750
|
|
|
|
|
|
|
some searches within the text may fail because of such bookmarks. |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
By default, this method only removes the bookmarks markup, leaving |
753
|
|
|
|
|
|
|
intact the contents of the bookmark. However, when the name of a |
754
|
|
|
|
|
|
|
bookmark belongs to the list C<< @names_to_erase >>, the contents |
755
|
|
|
|
|
|
|
is also removed. Currently this is used for suppressing ASK fields, |
756
|
|
|
|
|
|
|
because such fields contain a bookmark content that is never displayed by MsWord. |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=head3 merge_runs |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
$part->merge_runs(no_caps => 1); # optional arg |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
Walks through all runs of text within the document, trying to merge |
765
|
|
|
|
|
|
|
adjacent runs when possible (i.e. when both runs have the same |
766
|
|
|
|
|
|
|
properties, and there is no other XML node in between). |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
This operation is a prerequisite before performing replace operations, because |
769
|
|
|
|
|
|
|
documents edited in MsWord often have run boundaries across sentences or |
770
|
|
|
|
|
|
|
even in the middle of words; so regex searches can only be successful if those |
771
|
|
|
|
|
|
|
artificial boundaries have been removed. |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
If the argument C<< no_caps => 1 >> is present, the merge operation |
774
|
|
|
|
|
|
|
will also convert runs with the C<w:caps> property, putting all letters |
775
|
|
|
|
|
|
|
into uppercase and removing the property; this makes more merges possible. |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
=head3 replace |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, %replacement_args); |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
Replaces all occurrences of C<$pattern> regex within the text nodes by the |
783
|
|
|
|
|
|
|
given C<$replacement>. This is not exactly like a search-replace |
784
|
|
|
|
|
|
|
operation performed within MsWord, because the search does not cross boundaries |
785
|
|
|
|
|
|
|
of text nodes. In order to maximize the chances of successful replacements, |
786
|
|
|
|
|
|
|
the L</cleanup_XML> method is automatically called before starting the operation. |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
The argument C<$pattern> can be either a string or a reference to a regular expression. |
789
|
|
|
|
|
|
|
It should not contain any capturing parentheses, because that would perturb text |
790
|
|
|
|
|
|
|
splitting operations. |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
The argument C<$replacement> can be either a fixed string, or a reference to |
793
|
|
|
|
|
|
|
a callback subroutine that will be called for each match. |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
The C<< %replacement_args >> hash can be used to pass information to the callback |
797
|
|
|
|
|
|
|
subroutine. That hash will be enriched with three entries : |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
=over |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
=item matched |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
The string that has been matched by C<$pattern>. |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
=item run |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
The run object in which this text resides. |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
=item xml_before |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
The XML fragment (possibly empty) found before the matched text. |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
=back |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
The callback subroutine may return either plain text or structured XML. |
816
|
|
|
|
|
|
|
See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback. |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
The following special keys within C<< %replacement_args >> are interpreted by the |
819
|
|
|
|
|
|
|
C<replace()> method itself, and therefore are not passed to the callback subroutine : |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=over |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
=item keep_xml_as_is |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
if true, no call is made to the L</cleanup_XML> method before performing the replacements |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
=item dont_overwrite_contents |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
if true, the internal XML contents is not modified in place; the new XML after performing |
830
|
|
|
|
|
|
|
replacements is merely returned to the caller. |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
=item cleanup_args |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This |
835
|
|
|
|
|
|
|
is typically used as |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]); |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=back |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
=head3 replace_image |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
Replaces an existing PNG image by a new image. All features of the old image will |
847
|
|
|
|
|
|
|
be preserved (size, positioning, border, etc.) -- only the image itself will be |
848
|
|
|
|
|
|
|
replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word |
849
|
|
|
|
|
|
|
for this image. |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
This operation replaces a ZIP member within the C<.docx> file. If several XML |
852
|
|
|
|
|
|
|
nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several |
853
|
|
|
|
|
|
|
locations, the new image will appear at all locations, even if they do not have the |
854
|
|
|
|
|
|
|
same alternative text -- unfortunately this module currently has no facility for |
855
|
|
|
|
|
|
|
duplicating an existing image into separate instances. So if your intent is to only replace |
856
|
|
|
|
|
|
|
one image, your original document should contain several distinct images, coming from |
857
|
|
|
|
|
|
|
several distinct C<.PNG> file copies. |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=head3 add_image |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
my $rId = $part->add_image($image_PNG_content); |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
Stores the given PNG image within the ZIP file, adds it as a relationship to the |
865
|
|
|
|
|
|
|
current part, and returns the relationship id. This operation is not sufficient |
866
|
|
|
|
|
|
|
to make the image visible in Word : it just stores the image, but you still |
867
|
|
|
|
|
|
|
have to insert a proper C<drawing> node in the contents XML, using the C<$rId>. |
868
|
|
|
|
|
|
|
Future versions of this module may offer helper methods for that purpose; |
869
|
|
|
|
|
|
|
currently it must be done by hand. |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
=head1 AUTHOR |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
Laurent Dami, E<lt>dami AT cpan DOT org<gt> |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
Copyright 2019-2023 by Laurent Dami. |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
881
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
|