line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package MsOffice::Word::Surgeon::PackagePart; |
2
|
1
|
|
|
1
|
|
18
|
use 5.24.0; |
|
1
|
|
|
|
|
5
|
|
3
|
1
|
|
|
1
|
|
18
|
use Moose; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
9
|
|
4
|
1
|
|
|
1
|
|
6839
|
use MooseX::StrictConstructor; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
11
|
|
5
|
1
|
|
|
1
|
|
3278
|
use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level); |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
74
|
|
6
|
1
|
|
|
1
|
|
504
|
use MsOffice::Word::Surgeon::Run; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
46
|
|
7
|
1
|
|
|
1
|
|
616
|
use MsOffice::Word::Surgeon::Text; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
8
|
1
|
|
|
1
|
|
705
|
use XML::LibXML; |
|
1
|
|
|
|
|
31826
|
|
|
1
|
|
|
|
|
8
|
|
9
|
1
|
|
|
1
|
|
170
|
use List::Util qw(max); |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
70
|
|
10
|
1
|
|
|
1
|
|
6
|
use Carp qw(croak carp); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
93
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Syntactic sugar for "inner" attributes, i.e. attributes that are not
# received through the constructor (init_arg => undef) but are built lazily
# by a private builder method named after the attribute ("_$attr").
sub has_inner ($@) {
    my ($attr, @spec) = @_;
    has($attr => @spec, lazy => 1, builder => "_$attr", init_arg => undef);
}
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# constant integers to specify indentation modes -- see L<XML::LibXML> |
16
|
1
|
|
|
1
|
|
7
|
use constant XML_NO_INDENT => 0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
56
|
|
17
|
1
|
|
|
1
|
|
6
|
use constant XML_SIMPLE_INDENT => 1; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
63
|
|
18
|
|
|
|
|
|
|
|
19
|
1
|
|
|
1
|
|
8
|
use namespace::clean -except => 'meta'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '2.02'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#====================================================================== |
25
|
|
|
|
|
|
|
# ATTRIBUTES |
26
|
|
|
|
|
|
|
#====================================================================== |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# attributes passed to the constructor
has 'surgeon' => (
    is       => 'ro',
    isa      => 'MsOffice::Word::Surgeon',
    required => 1,
    weak_ref => 1,    # the surgeon owns its parts, so avoid a reference cycle
);
has 'part_name' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# attributes constructed by the module -- not received through the constructor;
# each one is built lazily by the private method "_<attribute name>"
has_inner 'contents' => (
    is      => 'rw',
    isa     => 'Str',
    trigger => \&_on_new_contents,    # invalidates the cached runs
);
has_inner 'runs' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    clearer => 'clear_runs',
);
has_inner 'relationships' => (
    is  => 'ro',
    isa => 'ArrayRef',
);
has_inner 'images' => (
    is  => 'ro',
    isa => 'HashRef',
);

# internal flags without accessors ('bare'); they are read and written
# directly as hash slots of the object
has 'contents_has_changed' => (
    is      => 'bare',
    isa     => 'Bool',
    default => 0,
);
has 'was_cleaned_up' => (
    is      => 'bare',
    isa     => 'Bool',
    default => 0,
);
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
#====================================================================== |
44
|
|
|
|
|
|
|
# GLOBAL VARIABLES |
45
|
|
|
|
|
|
|
#====================================================================== |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Builtin regexes for removing uninteresting XML information, indexed by name.
# They are looked up by name in noise_reduction_regex() and applied by
# reduce_noise(), which deletes every match.
my %noise_reduction_regexes = (
    proof_checking      => qr(<w:(?:proofErr[^>]+|noProof/)>),
    revision_ids        => qr(\sw:rsid\w+="[^"]+"),
    complex_script_bold => qr(<w:bCs/>),
    page_breaks         => qr(<w:lastRenderedPageBreak/>),
    language            => qr(<w:lang w:val="[^/>]+/>),
    empty_run_props     => qr(<w:rPr></w:rPr>),
    soft_hyphens        => qr(<w:softHyphen/>),
);

# Names of the builtin regexes, in the order in which reduce_all_noises()
# applies them.
my @noise_reduction_list = qw(
    proof_checking
    revision_ids
    complex_script_bold
    page_breaks
    language
    empty_run_props
    soft_hyphens
);
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
#====================================================================== |
63
|
|
|
|
|
|
|
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS |
64
|
|
|
|
|
|
|
#====================================================================== |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# Builder for the lazy 'runs' attribute : decomposes the XML contents into
# an arrayref of MsOffice::Word::Surgeon::Run objects, each of them holding
# a list of MsOffice::Word::Surgeon::Text objects.
sub _runs {
    my ($self) = @_;

    state $run_regex = qr[
      <w:r>                             # opening tag for the run
      (?:<w:rPr>(.*?)</w:rPr>)?         # run properties -- capture in $1
      (.*?)                             # run contents -- capture in $2
      </w:r>                            # closing tag for the run
    ]x;

    state $txt_regex = qr[
      <w:t(?:\ xml:space="preserve")?>  # opening tag for the text contents
      (.*?)                             # text contents -- capture in $1
      </w:t>                            # closing tag for text
    ]x;

    # split XML content into run fragments. Because the regex has 2 capture
    # groups, the resulting list is a sequence of triples
    # (xml before the run, run properties, run contents).
    my @run_pieces = split m[$run_regex], $self->contents, -1;

    # build internal RUN objects
    my @run_objects;
  RUN:
    while (my ($xml_before_run, $props, $run_contents) = splice @run_pieces, 0, 3) {
        $run_contents //= '';

        # split XML of this run into text fragments : a sequence of pairs
        # (xml before the text node, literal text)
        my @txt_pieces = split m[$txt_regex], $run_contents, -1;

        # build internal TEXT objects, ignoring pairs that carry nothing
        my @text_objects;
      TXT:
        while (my ($xml_before_text, $txt_contents) = splice @txt_pieces, 0, 2) {
            my $has_text = defined $txt_contents && $txt_contents ne '';
            next TXT unless $xml_before_text || $has_text;

            push @text_objects, MsOffice::Word::Surgeon::Text->new(
                xml_before   => $xml_before_text // '',
                literal_text => $txt_contents    // '',
            );
        }

        # assemble TEXT objects into a RUN object, unless there is nothing to hold
        next RUN unless $xml_before_run || @text_objects;
        push @run_objects, MsOffice::Word::Surgeon::Run->new(
            xml_before  => $xml_before_run // '',
            props       => $props          // '',
            inner_texts => \@text_objects,
        );
    }

    return \@run_objects;
}
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Builder for the lazy 'relationships' attribute : parses the .rels XML of
# this part into a sparse array indexed by the numeric part of relationship
# ids. Each entry is a hashref holding the attributes of the <Relationship>
# node, plus the computed keys 'num' and 'short_type'.
sub _relationships {
    my ($self) = @_;

    # xml that describes the relationships for this package part
    my $rels_xml = $self->_rels_xml;

    my @rels_by_num;
    while ($rels_xml =~ m[<Relationship\s+(.*?)/>]g) {
        my %attrs = parse_attrs($1);

        # all three attributes are mandatory
        foreach my $required (qw/Id Type Target/) {
            $attrs{$required}
                or croak "missing attribute '$required' in <Relationship> node";
        }

        $attrs{num}        = $attrs{Id}   =~ s[^\D+][]r;   # strip the leading non-digits of the id
        $attrs{short_type} = $attrs{Type} =~ s[^.*/][]r;   # last path segment of the type URI
        $rels_by_num[$attrs{num}] = \%attrs;
    }

    return \@rels_by_num;
}
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Builder for the lazy 'images' attribute : returns a hashref mapping the
# alternative text of each image found in the part contents (its 'title',
# or else its 'descr') to the name of the corresponding ZIP member.
# Drawings without alternative text are ignored. Dies if a drawing refers to
# a relationship id that is not a known image relationship.
sub _images {
    my $self = shift;

    # get relationship ids associated with images
    my %rel_image = map  {$_->{Id} => $_->{Target}}
                    grep {$_ && $_->{short_type} eq 'image'}
                    $self->relationships->@*;

    # get titles and relationship ids of images found within the part contents
    my %image;
    my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
  DRAWING:
    foreach my $drawing (@drawings) {
        if ($drawing =~ m[<wp:docPr \s+ (.*?) />
                          .*?
                          <a:blip \s+ r:embed="(\w+)"]x) {
            my ($lst_attrs, $rId) = ($1, $2);
            my %attrs  = parse_attrs($lst_attrs);
            my $img_id = $attrs{title} || $attrs{descr}
                or next DRAWING;

            # The target must be checked BEFORE prepending the prefix : the
            # interpolated string "word/..." is always true, so testing it
            # could never detect a missing relationship.
            my $target = $rel_image{$rId}
                or die "couldn't find image for relationship '$rId' associated with image '$img_id'";

            # NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
            $image{$img_id} = "word/$target";
        }
    }

    return \%image;
}
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
172
|
7
|
|
|
7
|
|
19
|
# Builder for the lazy 'contents' attribute : starts from the XML as it is
# stored in the ZIP archive.
sub _contents {
    my ($self) = @_;
    return $self->original_contents;
}
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Trigger called whenever the 'contents' attribute receives a new value.
sub _on_new_contents {
    my ($self) = @_;

    # the cached decomposition into runs no longer matches the contents
    $self->clear_runs;

    # remember that this part will need to be written back into the ZIP
    $self->{contents_has_changed} = 1;
}
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
#====================================================================== |
182
|
|
|
|
|
|
|
# METHODS |
183
|
|
|
|
|
|
|
#====================================================================== |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# Reads or writes the XML of the relationships member associated with this
# part. Both arguments are handed over to the surgeon's xml_member() method;
# $new_xml is optional.
sub _rels_xml {
    my ($self, $new_xml) = @_;

    my $rels_member = "word/_rels/" . $self->part_name . ".xml.rels";
    return $self->surgeon->xml_member($rels_member, $new_xml);
}
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Returns the name of the ZIP member that holds this part, derived from
# the part name.
sub zip_member_name {
    my ($self) = @_;
    return "word/" . $self->part_name . ".xml";
}
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# Returns the XML of this part as obtained from the surgeon's xml_member()
# method for the ZIP member of this part.
sub original_contents {
    my ($self) = @_;

    my $member_name = $self->zip_member_name;
    return $self->surgeon->xml_member($member_name);
}
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# Reads or replaces the content of an image. $title is either an alternative
# text registered in the 'images' attribute, or a plain number N designating
# the member "word/media/imageN.png". Dies if the image cannot be found.
sub image {
    my ($self, $title, $new_image_content) = @_;

    # name of the image file within the zip
    my $zip_member_name = $self->images->{$title};
    if (!$zip_member_name) {
        $title =~ /^\d+$/
            or die "couldn't find image '$title'";
        $zip_member_name = "word/media/image$title.png";
    }

    # delegate to Archive::Zip::contents
    my $zip = $self->surgeon->zip;
    return $zip->contents($zip_member_name, $new_image_content);
}
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
#====================================================================== |
221
|
|
|
|
|
|
|
# CONTENTS RESTITUTION |
222
|
|
|
|
|
|
|
#====================================================================== |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# Returns an indented version of the XML contents, produced by parsing the
# contents with XML::LibXML and serializing them again.
sub indented_contents {
    my ($self) = @_;

    # the result of toString() is a bytes sequence, not a Perl string
    return XML::LibXML->load_xml(string => $self->contents)
                      ->toString(XML_SIMPLE_INDENT);
}
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# Returns the text of the part without any XML markup : a newline is emitted
# for each paragraph start and each break, a tab for each tab node, and all
# other tags are dropped.
sub plain_text {
    my ($self) = @_;

    return $self->contents
        =~ s/(<w:p[ >])/\n$1/gr          # insert a newline before opening paragraph tags
        =~ s[<w:br/>][\n]gr              # replace break tags by newlines
        =~ s/<w:tab[^s][^>]*>/\t/gr      # replace tab nodes by ASCII tabs
        =~ s/<[^>]+>//gr;                # remove all remaining XML tags
}
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
#====================================================================== |
257
|
|
|
|
|
|
|
# MODIFYING CONTENTS |
258
|
|
|
|
|
|
|
#====================================================================== |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Applies all cleanup operations in sequence : noise reduction, unlinking of
# fields, suppression of bookmarks, merging of runs. Arguments are passed to
# merge_runs(). The cleanup is performed at most once per object.
sub cleanup_XML {
    my ($self, @merge_args) = @_;

    # avoid doing it twice
    return if $self->{was_cleaned_up};

    # do the cleanup; bookmarks named after ASK fields get their contents erased
    $self->reduce_all_noises;
    my $ask_field_names = $self->unlink_fields;
    $self->suppress_bookmarks($ask_field_names->@*);
    $self->merge_runs(@merge_args);

    # remember it was done
    $self->{was_cleaned_up} = 1;
}
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# Returns the builtin noise reduction regex registered under $regex_name;
# croaks if there is no such name.
sub noise_reduction_regex {
    my ($self, $regex_name) = @_;

    my $regex = $noise_reduction_regexes{$regex_name};
    croak "->noise_reduction_regex('$regex_name') : unknown regex name"
        unless $regex;

    return $regex;
}
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# Removes from the contents every fragment matched by the given noises.
# Each noise is either a compiled regex or the name of a builtin regex.
# If a regex has a capture group, the matched fragment is replaced by the
# content of the first group instead of being deleted.
sub reduce_noise {
    my ($self, @noises) = @_;

    # gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
    my @regexes;
    foreach my $noise (@noises) {
        push @regexes, ref $noise eq 'Regexp' ? $noise
                                              : $self->noise_reduction_regex($noise);
    }

    # get contents, apply all regexes, put back the modified contents.
    my $xml = $self->contents;
    no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
    foreach my $regex (@regexes) {
        $xml =~ s/$regex/$1/g;
    }
    $self->contents($xml);
}
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Applies all builtin noise reduction regexes, in their predefined order.
sub reduce_all_noises {
    my ($self) = @_;
    return $self->reduce_noise(@noise_reduction_list);
}
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# Removes the markup of all bookmarks. The text enclosed in a bookmark is
# kept, except for bookmarks whose name is listed in @names_to_erase : for
# those the enclosed text is removed as well.
sub suppress_bookmarks {
    my ($self, @names_to_erase) = @_;

    # regex to find bookmarks markup
    state $bookmark_rx = qr{
       <w:bookmarkStart         # initial tag
         .+? w:id="(\d+)"       # 'id' attribute, bookmark identifier -- capture 1
         .+? w:name="([^"]+)"   # 'name' attribute                    -- capture 2
         .*? />                 # end of this tag
         (.*?)                  # bookmark contents (may be empty)    -- capture 3
       <w:bookmarkEnd           # ending tag
         \s+ w:id="\1"          # same 'id' attribute
         .*? />                 # end of this tag
      }sx;

    # lookup table to decide what to do with bookmark contents
    my %erase_contents_of;
    $erase_contents_of{$_} = 1 foreach @names_to_erase;

    # remove bookmarks markup, keeping or dropping the enclosed contents
    my $xml = $self->contents;
    $xml =~ s{$bookmark_rx}{ $erase_contents_of{$2} ? "" : $3 }eg;

    # re-inject the modified contents
    $self->contents($xml);
}
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
# Merges adjacent runs that have identical properties and no XML markup in
# between, then stores the reassembled XML as new contents.
# The only accepted argument is 'no_caps' : when true, the caps property is
# removed from every run before comparing properties. Croaks on any other
# argument name.
sub merge_runs {
    my ($self, %args) = @_;

    # check validity of received args
    state $is_valid_arg = {no_caps => 1};
    my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args;
    if (@invalid_args) {
        croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args;
    }

    # loop over internal "run" objects
    my @merged;
    foreach my $run ($self->runs->@*) {

        $run->remove_caps_property if $args{no_caps};

        # the current run can be merged with the previous one if ..
        my $can_merge = !$run->xml_before                    # .. no other XML markup between the 2 runs
                     && @merged                              # .. there was a previous run
                     && $merged[-1]->props eq $run->props;   # .. both runs have the same properties

        if ($can_merge) {
            $merged[-1]->merge($run);
        }
        else {
            push @merged, $run;
        }
    }

    # reassemble the whole stuff and inject it as new contents
    my $xml = join "", map {$_->as_xml} @merged;
    $self->contents($xml);
}
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
# Removes field markup (field instructions, field boundaries, simple field
# tags) from the contents, through reduce_noise().
# Returns an arrayref with the names found in ASK field instructions.
sub unlink_fields {
    my ($self) = @_;

    # regex to find the names used in ASK fields
    state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];

    # regexes to remove field nodes and "field instruction" nodes
    state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
    state $field_boundary_rx        = qr[<w:fldChar
                                           (?:  [^>]*?/>                 # ignore all attributes until end of node ..
                                              |                          # .. or
                                                [^>]*?>.*?</w:fldChar>)  # .. ignore node content until closing tag
                                        ]x;   # field boundaries are encoded as  "begin" / "separate" / "end"
    state $simple_field_rx          = qr[</?w:fldSimple[^>]*>];

    # must find out what are the ASK fields before erasing the markup
    my @ask_field_names = $self->contents =~ /$ask_field_rx/g;

    # apply the regexes
    $self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);

    return \@ask_field_names;
}
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# Applies a replacement on every run of this part and returns the resulting
# XML. Recognized entries in %replacement_args :
#   cleanup_XML             : true by default; a hashref is passed as arguments
#                             to cleanup_XML(); a false value skips the cleanup
#   keep_xml_as_is          : deprecated equivalent of cleanup_XML => 0
#   dont_overwrite_contents : if true, the contents of the part are left untouched
# Remaining arguments are passed to the replace() method of each run.
sub replace {
    my ($self, $pattern, $replacement_callback, %replacement_args) = @_;

    # shared initial string for error messages
    my $error_msg = '->replace($pattern, $callback, %args)';

    # default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is'
    my $keep_xml_as_is = delete $replacement_args{keep_xml_as_is};
    if (!$keep_xml_as_is) {
        $replacement_args{cleanup_XML} //= 1; # default
    }
    else {
        croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'"
            if exists $replacement_args{cleanup_XML};
        carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead";
        $replacement_args{cleanup_XML} = 0;
    }

    # cleanup the XML structure so that replacements work better
    my $cleanup_args = $replacement_args{cleanup_XML};
    if ($cleanup_args) {
        ref $cleanup_args or $cleanup_args = {};   # a plain true value means "no specific args"
        croak "$error_msg: arg 'cleanup_XML' should be a hashref"
            unless ref $cleanup_args eq 'HASH';
        $self->cleanup_XML(%$cleanup_args);
    }

    # check for the presence of a special option to avoid modifying contents
    my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};

    # apply replacements and generate new XML
    my @xml_fragments;
    foreach my $run ($self->runs->@*) {
        push @xml_fragments, $run->replace($pattern, $replacement_callback, %replacement_args);
    }
    my $xml = join "", @xml_fragments;

    # overwrite previous contents
    $self->contents($xml) unless $dont_overwrite_contents;

    return $xml;
}
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Called for each part before saving the zip file : hands the current
# contents over to the surgeon's xml_member() method, but only if the
# contents were modified.
sub _update_contents_in_zip {
    my ($self) = @_;

    if ($self->{contents_has_changed}) {
        $self->surgeon->xml_member($self->zip_member_name, $self->contents);
    }
}
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Replaces the content of the image registered under alternative text
# $image_title. Dies if there is no such image.
sub replace_image {
    my ($self, $image_title, $image_PNG_content) = @_;

    my $member_name = $self->images->{$image_title};
    die "could not find an image with title: $image_title"
        unless $member_name;

    my $zip = $self->surgeon->zip;
    $zip->contents($member_name, $image_PNG_content);
}
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
# Adds a new PNG image to the ZIP archive, together with a new relationship
# in the rels member of this part, and makes sure that the global content
# types declare the "png" extension.
# Returns the relationship id of the new image (a string like "rId12").
sub add_image {
    my ($self, $image_PNG_content) = @_;

    # compute a fresh image number and a fresh relationship id
    my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
    my @image_nums    = map {$_->fileName =~ /(\d+)/} @image_members;

    # Parentheses are required here : without them, "max @image_nums // 0"
    # is parsed as max(scalar(@image_nums) // 0), which yields the NUMBER of
    # images instead of the highest image number, and therefore could pick
    # the name of an already existing member. max() returns undef on an
    # empty list, hence the default to 0.
    my $last_img_num  = max(@image_nums) // 0;
    my $target        = sprintf "media/image%d.png", $last_img_num + 1;
    my $last_rId_num  = $self->relationships->$#*;
    my $rId           = sprintf "rId%d", $last_rId_num + 1;

    # assemble XML for the new relationship
    my $type          = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
    my $new_rel_xml   = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};

    # update the rels member
    my $xml = $self->_rels_xml;
    $xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
    $self->_rels_xml($xml);

    # add the image as a new member into the archive
    my $member_name = "word/$target";
    $self->surgeon->zip->addString(\$image_PNG_content, $member_name);

    # update the global content_types if it doesn't include PNG
    my $ct = $self->surgeon->_content_types;
    if ($ct !~ /Extension="png"/) {
        $ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
        $self->surgeon->_content_types($ct);
    }

    # return the relationship id
    return $rId;
}
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
#====================================================================== |
486
|
|
|
|
|
|
|
# UTILITY FUNCTIONS |
487
|
|
|
|
|
|
|
#====================================================================== |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# Cheap parsing of the attribute list of an XML node.
# Receives the raw text of the attributes (what stands between the node name
# and the closing of the tag) and returns a flat list of (name => value)
# pairs, suitable for assignment to a hash. Values may be enclosed in double
# or in single quotes; the five predefined XML entities are decoded.
sub parse_attrs {
    my ($lst_attrs) = @_;

    state $attr_pair_regex = qr[
       ([^=\s"'&<>]+)     # attribute name
       \h* = \h*          # Eq
       (?:                # attribute value
          " ([^<"]*) "    # .. enclosed in double quotes
         |
          ' ([^<']*) '    # .. or enclosed in single quotes
       )
      ]x;

    # predefined XML entities, and a regex matching any reference to them
    state $entity       = {quot => '"', apos => "'", amp => '&', lt => '<', gt => '>'};
    state $entity_names = join "|", keys %$entity;
    state $entity_regex = qr/&($entity_names);/;

    my %attr;
    while ($lst_attrs =~ /$attr_pair_regex/g) {
        my ($name, $val) = ($1, $2 // $3);

        # decode entities in one single pass, so that the result of a
        # replacement is never decoded a second time ("&amp;lt;" => "&lt;")
        $val =~ s/$entity_regex/$entity->{$1}/g;

        $attr{$name} = $val;
    }

    return %attr;
}
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
1; |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
__END__ |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head1 NAME |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
=head1 SYNOPSIS |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
my $part = $surgeon->document; |
530
|
|
|
|
|
|
|
print $part->plain_text; |
531
|
|
|
|
|
|
|
$part->replace(qr[$pattern], $replacement_callback); |
532
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=head1 DESCRIPTION |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single |
538
|
|
|
|
|
|
|
I<package part> within the ZIP package of a C<.docx> document. |
539
|
|
|
|
|
|
|
It is mostly used for the I<document> part, that contains the XML representation of the |
540
|
|
|
|
|
|
|
main document body. However, other parts such as headers, footers, footnotes, etc. have the |
541
|
|
|
|
|
|
|
same internal representation and therefore the same operations can be invoked. |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 METHODS |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
=head2 new |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
my $part = MsOffice::Word::Surgeon::PackagePart->new( |
549
|
|
|
|
|
|
|
surgeon => $surgeon, |
550
|
|
|
|
|
|
|
part_name => $name, |
551
|
|
|
|
|
|
|
); |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Constructor for a new part object. This is called internally from |
554
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon>; it is not meant to be called directly |
555
|
|
|
|
|
|
|
by clients. |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=head3 Constructor arguments |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=over |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=item surgeon |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
a weak reference to the main surgeon object |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=item part_name |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
name of this part; the ZIP member name is derived from it as C<< word/<part_name>.xml >> |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=back |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=head3 Other attributes |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
Other attributes, which are not passed through the constructor but are generated lazily on demand, are : |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=over |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=item contents |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
the XML contents of this part |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=item runs |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
a decomposition of the XML contents into a collection of |
585
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon::Run> objects. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=item relationships |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
an arrayref of Office relationships associated with this part. This information comes from |
590
|
|
|
|
|
|
|
a C<.rels> member in the ZIP archive, named after the name of the package part. |
591
|
|
|
|
|
|
|
Array indices correspond to relationship numbers. Array values are hashrefs with |
592
|
|
|
|
|
|
|
keys |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=over |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=item Id |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
the full relationship id |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=item num |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
the numeric part of C<rId> |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
=item Type |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
the full reference to the XML schema for this relationship |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
=item short_type |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
only the last word of the type, e.g. 'image', 'style', etc. |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=item Target |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
designation of the target within the ZIP file. The prefix 'word/' must be |
615
|
|
|
|
|
|
|
added for having a complete Zip member name. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
=back |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
=item images |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
a hashref of images within this package part. Keys of the hash are image I<alternative texts>. |
624
|
|
|
|
|
|
|
If present, the alternative I<title> will be preferred; otherwise the alternative I<description> will be taken |
625
|
|
|
|
|
|
|
(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display |
626
|
|
|
|
|
|
|
the I<description> field -- see |
627
|
|
|
|
|
|
|
L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669|MsOffice documentation>). |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
Images without alternative text will not be accessible through the current Perl module. |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
Values of the hash are zip member names for the corresponding |
632
|
|
|
|
|
|
|
image representations in C<.png> format. |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
=back |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=head2 Contents restitution |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=head3 contents |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
Returns a Perl string with the current internal XML representation of the part |
643
|
|
|
|
|
|
|
contents. |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
=head3 original_contents |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
Returns a Perl string with the XML representation of the |
648
|
|
|
|
|
|
|
part contents, as it was in the ZIP archive before any |
649
|
|
|
|
|
|
|
modification. |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=head3 indented_contents |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
Returns an indented version of the XML contents, suitable for inspection in a text editor. |
654
|
|
|
|
|
|
|
This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded |
655
|
|
|
|
|
|
|
byte string, not a Perl string. |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
=head3 plain_text |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
Returns the text contents of the part, without any markup. |
660
|
|
|
|
|
|
|
Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored. |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=head3 runs |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of |
666
|
|
|
|
|
|
|
these objects holds an XML fragment; joining all fragments |
667
|
|
|
|
|
|
|
restores the complete document. |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
my $contents = join "", map {$_->as_xml} $self->runs; |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=head2 Modifying contents |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=head3 cleanup_XML |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
$part->cleanup_XML(%args); |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
Apply several other methods for removing unnecessary nodes within the internal |
680
|
|
|
|
|
|
|
XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>, |
681
|
|
|
|
|
|
|
L</suppress_bookmarks> and L</merge_runs>. |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
Currently there is only one legal arg : |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
=over |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=item C<no_caps> |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically |
690
|
|
|
|
|
|
|
called for each run object. As a result, all texts within runs with the C<caps> property are automatically |
691
|
|
|
|
|
|
|
converted to uppercase. |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=back |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
=head3 reduce_noise |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
$part->reduce_noise($regex1, $regex2, ...); |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
This method is used for removing unnecessary information in the XML |
702
|
|
|
|
|
|
|
markup. It applies the given list of regexes to the whole document, |
703
|
|
|
|
|
|
|
suppressing matches. The final result is put back into |
704
|
|
|
|
|
|
|
C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >> |
705
|
|
|
|
|
|
|
references, or as names of builtin regexes (described below). Regexes |
706
|
|
|
|
|
|
|
are applied to the whole XML contents, not only to run nodes. |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=head3 noise_reduction_regex |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
my $regex = $part->noise_reduction_regex($regex_name); |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
Returns the builtin regex corresponding to the given name. |
714
|
|
|
|
|
|
|
Known regexes are : |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>), |
717
|
|
|
|
|
|
|
revision_ids => qr(\sw:rsid\w+="[^"]+"), |
718
|
|
|
|
|
|
|
complex_script_bold => qr(<w:bCs/>), |
719
|
|
|
|
|
|
|
page_breaks => qr(<w:lastRenderedPageBreak/>), |
720
|
|
|
|
|
|
|
language => qr(<w:lang w:val="[^/>]+/>), |
721
|
|
|
|
|
|
|
empty_run_props => qr(<w:rPr></w:rPr>), |
722
|
|
|
|
|
|
|
soft_hyphens => qr(<w:softHyphen/>), |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
=head3 reduce_all_noises |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
$part->reduce_all_noises; |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
Applies all regexes from the previous method. |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=head3 unlink_fields |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
my $names_of_ASK_fields = $part->unlink_fields; |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
Removes all fields from the part, just leaving the current |
735
|
|
|
|
|
|
|
value stored in each field. This is the equivalent of performing Ctrl-Shift-F9 |
736
|
|
|
|
|
|
|
on the whole document. |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
The return value is an arrayref to a list of names of ASK fields within the document. |
739
|
|
|
|
|
|
|
Such names should then be passed to the L</suppress_bookmarks> method |
740
|
|
|
|
|
|
|
(see below). |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
=head3 suppress_bookmarks |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
$part->suppress_bookmarks(@names_to_erase); |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
Removes bookmarks markup in the part. This is useful because |
748
|
|
|
|
|
|
|
MsWord may silently insert bookmarks in unexpected places; therefore |
749
|
|
|
|
|
|
|
some searches within the text may fail because of such bookmarks. |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
By default, this method only removes the bookmarks markup, leaving |
752
|
|
|
|
|
|
|
intact the contents of the bookmark. However, when the name of a |
753
|
|
|
|
|
|
|
bookmark belongs to the list C<< @names_to_erase >>, the contents |
754
|
|
|
|
|
|
|
is also removed. Currently this is used for suppressing ASK fields, |
755
|
|
|
|
|
|
|
because such fields contain a bookmark content that is never displayed by MsWord. |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
=head3 merge_runs |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
$part->merge_runs(no_caps => 1); # optional arg |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
Walks through all runs of text within the document, trying to merge |
764
|
|
|
|
|
|
|
adjacent runs when possible (i.e. when both runs have the same |
765
|
|
|
|
|
|
|
properties, and there is no other XML node in between). |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
This operation is a prerequisite before performing replace operations, because |
768
|
|
|
|
|
|
|
documents edited in MsWord often have run boundaries across sentences or |
769
|
|
|
|
|
|
|
even in the middle of words; so regex searches can only be successful if those |
770
|
|
|
|
|
|
|
artificial boundaries have been removed. |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
If the argument C<< no_caps => 1 >> is present, the merge operation |
773
|
|
|
|
|
|
|
will also convert runs with the C<w:caps> property, putting all letters |
774
|
|
|
|
|
|
|
into uppercase and removing the property; this makes more merges possible. |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=head3 replace |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, %replacement_args); |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
Replaces all occurrences of C<$pattern> regex within the text nodes by the |
782
|
|
|
|
|
|
|
given C<$replacement>. This is not exactly like a search-replace |
783
|
|
|
|
|
|
|
operation performed within MsWord, because the search does not cross boundaries |
784
|
|
|
|
|
|
|
of text nodes. In order to maximize the chances of successful replacements, |
785
|
|
|
|
|
|
|
the L</cleanup_XML> method is automatically called before starting the operation. |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
The argument C<$pattern> can be either a string or a reference to a regular expression. |
788
|
|
|
|
|
|
|
It should not contain any capturing parentheses, because that would perturb text |
789
|
|
|
|
|
|
|
splitting operations. |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
The argument C<$replacement> can be either a fixed string, or a reference to |
792
|
|
|
|
|
|
|
a callback subroutine that will be called for each match. |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
The C<< %replacement_args >> hash can be used to pass information to the callback |
796
|
|
|
|
|
|
|
subroutine. That hash will be enriched with three entries : |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=over |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
=item matched |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
The string that has been matched by C<$pattern>. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=item run |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
The run object in which this text resides. |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=item xml_before |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
The XML fragment (possibly empty) found before the matched text. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=back |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
The callback subroutine may return either plain text or structured XML. |
815
|
|
|
|
|
|
|
See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback. |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
The following special keys within C<< %replacement_args >> are interpreted by the |
818
|
|
|
|
|
|
|
C<replace()> method itself, and therefore are not passed to the callback subroutine : |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
=over |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
=item keep_xml_as_is |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
if true, no call is made to the L</cleanup_XML> method before performing the replacements |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
=item dont_overwrite_contents |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
if true, the internal XML contents is not modified in place; the new XML after performing |
829
|
|
|
|
|
|
|
replacements is merely returned to the caller. |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=item cleanup_args |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This |
834
|
|
|
|
|
|
|
is typically used as |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]); |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
=back |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=head3 replace_image |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
Replaces an existing PNG image by a new image. All features of the old image will |
846
|
|
|
|
|
|
|
be preserved (size, positioning, border, etc.) -- only the image itself will be |
847
|
|
|
|
|
|
|
replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word |
848
|
|
|
|
|
|
|
for this image. |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
This operation replaces a ZIP member within the C<.docx> file. If several XML |
851
|
|
|
|
|
|
|
nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several |
852
|
|
|
|
|
|
|
locations, the new image will appear at all locations, even if they do not have the |
853
|
|
|
|
|
|
|
same alternative text -- unfortunately this module currently has no facility for |
854
|
|
|
|
|
|
|
duplicating an existing image into separate instances. So if your intent is to only replace |
855
|
|
|
|
|
|
|
one image, your original document should contain several distinct images, coming from |
856
|
|
|
|
|
|
|
several distinct C<.PNG> file copies. |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
=head3 add_image |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
my $rId = $part->add_image($image_PNG_content); |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
Stores the given PNG image within the ZIP file, adds it as a relationship to the |
864
|
|
|
|
|
|
|
current part, and returns the relationship id. This operation is not sufficient |
865
|
|
|
|
|
|
|
to make the image visible in Word : it just stores the image, but you still |
866
|
|
|
|
|
|
|
have to insert a proper C<drawing> node in the contents XML, using the C<$rId>. |
867
|
|
|
|
|
|
|
Future versions of this module may offer helper methods for that purpose; |
868
|
|
|
|
|
|
|
currently it must be done by hand. |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
=head1 AUTHOR |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
Laurent Dami, E<lt>dami AT cpan DOT orgE<gt> |
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
Copyright 2019-2023 by Laurent Dami. |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
880
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
|