line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package MsOffice::Word::Surgeon::PackagePart; |
2
|
1
|
|
|
1
|
|
18
|
use 5.24.0; |
|
1
|
|
|
|
|
4
|
|
3
|
1
|
|
|
1
|
|
19
|
use Moose; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
4
|
1
|
|
|
1
|
|
7069
|
use MooseX::StrictConstructor; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
9
|
|
5
|
1
|
|
|
1
|
|
3442
|
use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
75
|
|
6
|
1
|
|
|
1
|
|
504
|
use MsOffice::Word::Surgeon::Run; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
45
|
|
7
|
1
|
|
|
1
|
|
674
|
use MsOffice::Word::Surgeon::Text; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
35
|
|
8
|
1
|
|
|
1
|
|
919
|
use XML::LibXML; |
|
1
|
|
|
|
|
35333
|
|
|
1
|
|
|
|
|
9
|
|
9
|
1
|
|
|
1
|
|
196
|
use List::Util qw(max); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
76
|
|
10
|
1
|
|
|
1
|
|
10
|
use Carp qw(croak); |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
97
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Syntactic sugar for "inner" attributes: declares attribute $attr with the
# given options, forcing it to be lazy, built by the method "_$attr", and
# not settable through the constructor (init_arg => undef).
sub has_inner ($@) {
  my ($attr, @options) = @_;
  return has($attr => @options, lazy => 1, builder => "_$attr", init_arg => undef);
}
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# constant integers to specify indentation modes -- see L<XML::LibXML> |
16
|
1
|
|
|
1
|
|
16
|
use constant XML_NO_INDENT => 0; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
72
|
|
17
|
1
|
|
|
1
|
|
8
|
use constant XML_SIMPLE_INDENT => 1; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
60
|
|
18
|
|
|
|
|
|
|
|
19
|
1
|
|
|
1
|
|
11
|
use namespace::clean -except => 'meta'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
16
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '2.01'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#====================================================================== |
25
|
|
|
|
|
|
|
# ATTRIBUTES |
26
|
|
|
|
|
|
|
#====================================================================== |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# attributes passed to the constructor
has 'surgeon' => (
  is       => 'ro',
  isa      => 'MsOffice::Word::Surgeon',
  required => 1,
  weak_ref => 1,                 # held as a weak reference
);
has 'part_name' => (
  is       => 'ro',
  isa      => 'Str',
  required => 1,
);

# attributes constructed by the module -- not received through the constructor.
# Each one is lazily built by the method of the same name prefixed by '_'.
has_inner 'contents' => (
  is      => 'rw',
  isa     => 'Str',
  trigger => \&_on_new_contents, # invoked whenever new contents are assigned
);
has_inner 'runs' => (
  is      => 'ro',
  isa     => 'ArrayRef',
  clearer => 'clear_runs',
);
has_inner 'relationships' => (
  is  => 'ro',
  isa => 'ArrayRef',
);
has_inner 'images' => (
  is  => 'ro',
  isa => 'HashRef',
);

# flag without accessor; read and written directly in the object hash
has 'contents_has_changed' => (
  is      => 'bare',
  isa     => 'Bool',
  default => 0,
);
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
#====================================================================== |
43
|
|
|
|
|
|
|
# GLOBAL VARIABLES |
44
|
|
|
|
|
|
|
#====================================================================== |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Builtin regexes for removing uninteresting XML information,
# indexed by the names accepted by noise_reduction_regex()
my %noise_reduction_regexes = (
  proof_checking      => qr(<w:(?:proofErr[^>]+|noProof/)>),
  revision_ids        => qr(\sw:rsid\w+="[^"]+"),
  complex_script_bold => qr(<w:bCs/>),
  page_breaks         => qr(<w:lastRenderedPageBreak/>),
  language            => qr(<w:lang w:val="[^/>]+/>),
  empty_run_props     => qr(<w:rPr></w:rPr>),
  soft_hyphens        => qr(<w:softHyphen/>),
);

# Names of the builtin regexes, in the order in which
# reduce_all_noises() applies them
my @noise_reduction_list = (
  'proof_checking',
  'revision_ids',
  'complex_script_bold',
  'page_breaks',
  'language',
  'empty_run_props',
  'soft_hyphens',
);
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
#====================================================================== |
62
|
|
|
|
|
|
|
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS |
63
|
|
|
|
|
|
|
#====================================================================== |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Builder for the 'runs' attribute : decomposes the XML contents into an
# arrayref of MsOffice::Word::Surgeon::Run objects, each of them holding
# MsOffice::Word::Surgeon::Text objects for its text nodes.
sub _runs {
  my $self = shift;

  state $run_regex = qr[
   <w:r>                             # opening tag for the run
   (?:<w:rPr>(.*?)</w:rPr>)?         # run properties -- capture in $1
   (.*?)                             # run contents -- capture in $2
   </w:r>                            # closing tag for the run
  ]x;

  state $txt_regex = qr[
     <w:t(?:\ xml:space="preserve")?>  # opening tag for the text contents
     (.*?)                             # text contents -- capture in $1
     </w:t>                            # closing tag for text
  ]x;

  # split the XML contents into triples (xml before the run, props, run contents);
  # the last triple may be incomplete
  my @fragments = split m[$run_regex], $self->contents, -1;
  my @runs;

  RUN:
  while (my ($before_run, $run_props, $run_body) = splice @fragments, 0, 3) {
    $run_body //= '';

    # split the XML of this run into pairs (xml before the text, literal text)
    my @pieces = split m[$txt_regex], $run_body, -1;
    my @texts;

    TXT:
    while (my ($before_txt, $literal) = splice @pieces, 0, 2) {
      # ignore pairs that carry neither markup nor text
      my $has_text = defined $literal && $literal ne '';
      next TXT unless $before_txt || $has_text;

      push @texts, MsOffice::Word::Surgeon::Text->new(
        xml_before   => $before_txt // '',
        literal_text => $literal    // '',
      );
    }

    # ignore runs that carry neither preceding markup nor text objects
    next RUN unless $before_run || @texts;

    push @runs, MsOffice::Word::Surgeon::Run->new(
      xml_before  => $before_run // '',
      props       => $run_props  // '',
      inner_texts => \@texts,
    );
  }

  return \@runs;
}
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# Builder for the 'relationships' attribute : parses the <Relationship> nodes
# of the rels XML into a sparse arrayref, indexed by the numeric part of
# the relationship ids. Croaks if a node lacks Id, Type or Target.
sub _relationships {
  my $self = shift;

  # xml that describes the relationships for this package part
  my $rels_xml = $self->_rels_xml;

  my @relationships;
  foreach my $attr_list ($rels_xml =~ m[<Relationship\s+(.*?)/>]g) {
    my %attrs = parse_attrs($attr_list);

    foreach my $required (qw/Id Type Target/) {
      croak "missing attribute '$required' in <Relationship> node" unless $attrs{$required};
    }

    $attrs{num}        = $attrs{Id}   =~ s[^\D+][]r;   # Id without its non-digit prefix
    $attrs{short_type} = $attrs{Type} =~ s[^.*/][]r;   # last path segment of the Type
    $relationships[$attrs{num}] = \%attrs;
  }

  return \@relationships;
}
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Builder for the 'images' attribute : returns a hashref mapping the
# alternative text of each image found in the part contents (title if
# present, otherwise description) to the name of its zip member.
# Drawings without alternative text are skipped. Dies if a drawing refers
# to a relationship id which is not an image relationship of this part.
sub _images {
  my $self = shift;

  # get relationship ids associated with images
  my %rel_image = map  {$_->{Id} => $_->{Target}}
                  grep {$_ && $_->{short_type} eq 'image'}
                  $self->relationships->@*;

  # get titles and relationship ids of images found within the part contents
  my %image;
  my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
  DRAWING:
  foreach my $drawing (@drawings) {
    if ($drawing =~ m[<wp:docPr \s+ (.*?) />
                      .*?
                      <a:blip \s+ r:embed="(\w+)"]x) {
      my ($lst_attrs, $rId) = ($1, $2);
      my %attrs = parse_attrs($lst_attrs);
      my $img_id = $attrs{title} || $attrs{descr}
        or next DRAWING;

      # The lookup must be tested on its own : an interpolated string like
      # "word/$target" is always true, so testing it would never detect
      # a missing relationship.
      my $target = $rel_image{$rId}
        or die "couldn't find image for relationship '$rId' associated with image '$img_id'";

      # NOTE: targets in the rels XML miss the "word/" prefix, so it is added here.
      $image{$img_id} = "word/$target";
    }
  }

  return \%image;
}
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
|
171
|
7
|
|
|
7
|
|
20
|
# Builder for the 'contents' attribute : starts from the original contents
sub _contents {
  my $self = shift;
  return $self->original_contents;
}
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# Trigger on the 'contents' attribute : discards the runs computed from
# the previous contents, and records that the contents were modified.
sub _on_new_contents {
  my ($self) = @_;

  $self->clear_runs;
  $self->{contents_has_changed} = 1;
}
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
#====================================================================== |
181
|
|
|
|
|
|
|
# METHODS |
182
|
|
|
|
|
|
|
#====================================================================== |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# Accessor to the XML of the relationships member associated with this part.
# Both the member name and $new_xml (possibly undef) are handed over to
# the surgeon's xml_member() method, whose result is returned.
sub _rels_xml {
  my ($self, $new_xml) = @_;

  my $rels_name = "word/_rels/" . $self->part_name . ".xml.rels";
  my $surgeon   = $self->surgeon;
  return $surgeon->xml_member($rels_name, $new_xml);
}
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Returns the name of the zip member for this part : "word/<part_name>.xml"
sub zip_member_name {
  my ($self) = @_;
  return "word/" . $self->part_name . ".xml";
}
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Returns what the surgeon's xml_member() method yields for the zip member
# of this part (called with the member name as sole argument)
sub original_contents {
  my ($self) = @_;

  my $member_name = $self->zip_member_name;
  return $self->surgeon->xml_member($member_name);
}
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# Accessor to the contents of an image. $title is either a key of the
# 'images' hash, or a plain number N designating "word/media/imageN.png".
# Dies if $title is neither. The zip member name and $new_image_content
# (possibly undef) are passed to the contents() method of the zip archive.
sub image {
  my ($self, $title, $new_image_content) = @_;

  # name of the image file within the zip
  my $zip_member_name = $self->images->{$title};
  if (!$zip_member_name) {
    $title =~ /^\d+$/ or die "couldn't find image '$title'";
    $zip_member_name = "word/media/image$title.png";
  }

  # delegate to Archive::Zip::contents
  my $zip = $self->surgeon->zip;
  return $zip->contents($zip_member_name, $new_image_content);
}
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
#====================================================================== |
220
|
|
|
|
|
|
|
# CONTENTS RESTITUTION |
221
|
|
|
|
|
|
|
#====================================================================== |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# Returns the XML contents after parsing by XML::LibXML and serialization
# in XML_SIMPLE_INDENT mode.
sub indented_contents {
  my ($self) = @_;

  # the result of toString() is a sequence of bytes, not a Perl string
  return XML::LibXML->load_xml(string => $self->contents)
                    ->toString(XML_SIMPLE_INDENT);
}
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# Returns the contents of the part with all XML tags removed; paragraph
# starts and breaks become newlines, tab nodes become tab characters.
sub plain_text {
  my ($self) = @_;

  return $self->contents
    =~ s/(<w:p[ >])/\n$1/gr        # insert a newline before opening paragraph tags
    =~ s[<w:br/>][\n]gr            # replace break tags by newlines
    =~ s/<w:tab[^s][^>]*>/\t/gr    # replace tab nodes by ASCII tabs
    =~ s/<[^>]+>//gr;              # remove all remaining XML tags
}
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
#====================================================================== |
256
|
|
|
|
|
|
|
# MODIFYING CONTENTS |
257
|
|
|
|
|
|
|
#====================================================================== |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
# Successively applies reduce_all_noises, unlink_fields, suppress_bookmarks
# (on the names returned by unlink_fields) and merge_runs (with @merge_args).
# Returns the result of merge_runs.
sub cleanup_XML {
  my ($self, @merge_args) = @_;

  $self->reduce_all_noises;

  # names of ASK fields; the bookmarks bearing those names get their contents erased
  my @ask_field_names = $self->unlink_fields->@*;
  $self->suppress_bookmarks(@ask_field_names);

  return $self->merge_runs(@merge_args);
}
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# Returns the builtin regex registered under $regex_name;
# croaks if there is no such regex.
sub noise_reduction_regex {
  my ($self, $regex_name) = @_;

  my $regex = $noise_reduction_regexes{$regex_name};
  croak "->noise_reduction_regex('$regex_name') : unknown regex name" unless $regex;

  return $regex;
}
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# Applies a list of regexes to the whole contents, replacing every match by
# its first capture group (i.e. by nothing if the regex has no capture
# group), and stores the result as new contents.
# Members of @noises are either compiled regexes or names of builtin regexes.
sub reduce_noise {
  my ($self, @noises) = @_;

  # resolve names of builtin regexes; compiled regexes are taken as they are
  my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;

  my $xml = $self->contents;
  {
    no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
    foreach my $regex (@regexes) {
      $xml =~ s/$regex/$1/g;
    }
  }

  return $self->contents($xml);
}
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# Applies reduce_noise() with the complete list of builtin regex names
sub reduce_all_noises {
  my ($self) = @_;
  return $self->reduce_noise(@noise_reduction_list);
}
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# Removes the markup of bookmarks (start and end tags bearing the same id).
# The XML between both tags is kept, except for the bookmarks whose name
# belongs to @names_to_erase : for those the inner XML is erased too.
sub suppress_bookmarks {
  my ($self, @names_to_erase) = @_;

  # set of bookmark names for which the contents must be erased
  my %erase_contents_of;
  @erase_contents_of{@names_to_erase} = (1) x @names_to_erase;

  # regex to find bookmarks markup
  state $bookmark_rx = qr{
     <w:bookmarkStart         # initial tag
       .+? w:id="(\d+)"       # 'id' attribute, bookmark identifier -- capture 1
       .+? w:name="([^"]+)"   # 'name' attribute                    -- capture 2
       .*? />                 # end of this tag
       (.*?)                  # bookmark contents (may be empty)    -- capture 3
     <w:bookmarkEnd           # ending tag
       \s+ w:id="\1"          # same 'id' attribute
       .*? />                 # end of this tag
    }sx;

  # replace each bookmark either by its inner XML or by nothing
  my $xml = $self->contents;
  $xml =~ s{$bookmark_rx}{ $erase_contents_of{$2} ? "" : $3 }eg;

  # re-inject the modified contents
  return $self->contents($xml);
}
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
# Merges every run into the previous one when there is no XML markup
# between them and both have the same properties; then rebuilds the contents
# from the resulting runs.
# The only accepted option is 'no_caps' : if true, remove_caps_property()
# is called on each run before the comparison. Croaks on any other option.
sub merge_runs {
  my ($self, %args) = @_;

  # check validity of received args
  state $is_valid_arg = {no_caps => 1};
  foreach my $arg_name (keys %args) {
    croak "merge_runs(): invalid arg: $arg_name" unless $is_valid_arg->{$arg_name};
  }

  # loop over internal "run" objects
  my @merged;
  foreach my $run ($self->runs->@*) {
    $run->remove_caps_property if $args{no_caps};

    my $can_be_merged = !$run->xml_before                      # no other XML markup between the 2 runs
                     && @merged                                # there was a previous run
                     && $merged[-1]->props eq $run->props;     # both runs have the same properties

    if ($can_be_merged) {
      $merged[-1]->merge($run);
    }
    else {
      push @merged, $run;
    }
  }

  # reassemble the whole stuff and inject it as new contents
  my $xml = join "", map {$_->as_xml} @merged;
  return $self->contents($xml);
}
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
# Removes field markup (field instructions, field boundaries, simple field
# tags) through reduce_noise().
# Returns an arrayref of the names found after the ASK keyword in field
# instructions; the names are collected before the markup is erased.
sub unlink_fields {
  my ($self) = @_;

  # regexes to remove field nodes and "field instruction" nodes
  state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
  state $field_boundary_rx        = qr[<w:fldChar
                                         (?:  [^>]*?/>                 # ignore all attributes until end of node ..
                                            |                          # .. or
                                              [^>]*?>.*?</w:fldChar>)  # .. ignore node content until closing tag
                                      ]x;   # field boundaries are encoded as  "begin" / "separate" / "end"
  state $simple_field_rx          = qr[</?w:fldSimple[^>]*>];

  # must find out what are the ASK fields before erasing the markup
  state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
  my @ask_field_names = $self->contents =~ /$ask_field_rx/g;

  # apply the regexes
  $self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);

  return \@ask_field_names;
}
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# Calls replace() on every run with the given pattern, callback and
# remaining arguments, and returns the concatenation of the results.
# Two options are consumed here and not passed to the runs :
#   keep_xml_as_is          : if true, skip the initial call to cleanup_XML
#   dont_overwrite_contents : if true, do not store the result as new contents
sub replace {
  my ($self, $pattern, $replacement_callback, %replacement_args) = @_;

  # extract the options which are meant for the present method
  my ($keep_xml_as_is, $dont_overwrite_contents)
    = delete @replacement_args{qw/keep_xml_as_is dont_overwrite_contents/};

  # cleanup the XML structure so that replacements work better
  if (!$keep_xml_as_is) {
    $self->cleanup_XML;
  }

  # apply replacements and generate new XML
  my @new_fragments = map {$_->replace($pattern, $replacement_callback, %replacement_args)}
                          $self->runs->@*;
  my $xml = join "", @new_fragments;

  # overwrite previous contents
  if (!$dont_overwrite_contents) {
    $self->contents($xml);
  }

  return $xml;
}
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# Called for each part before saving the zip file : if the contents have
# changed, passes them to the surgeon's xml_member() method together with
# the zip member name of this part.
sub _update_contents_in_zip {
  my ($self) = @_;

  if ($self->{contents_has_changed}) {
    $self->surgeon->xml_member($self->zip_member_name, $self->contents);
  }
}
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# Passes $image_PNG_content to the contents() method of the zip archive,
# for the zip member registered under $image_title in the 'images' hash.
# Dies if there is no image with that title.
sub replace_image {
  my ($self, $image_title, $image_PNG_content) = @_;

  my $member_name = $self->images->{$image_title};
  die "could not find an image with title: $image_title" unless $member_name;

  return $self->surgeon->zip->contents($member_name, $image_PNG_content);
}
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# Adds a new PNG image to the archive and declares it in the relationships
# of this part. Returns the relationship id of the new image.
#   - the image is stored as "word/media/image<N>.png", where N is one more
#     than the highest number found in names of existing image members;
#   - the relationship id is "rId<M>", where M is one more than the last
#     index of the 'relationships' array;
#   - a default content type for the "png" extension is added to the global
#     content types if none is present.
sub add_image {
  my ($self, $image_PNG_content) = @_;

  my $zip = $self->surgeon->zip;

  # compute a fresh image number. Parentheses around the args to max() are
  # mandatory : "max @image_nums // 0" would put the array in scalar context
  # and yield the *count* of images instead of the highest number.
  my @image_members = $zip->membersMatching(qr[^word/media/image]);
  my @image_nums    = map {$_->fileName =~ /(\d+)/} @image_members;
  my $last_img_num  = max(@image_nums) // 0;
  my $target        = sprintf "media/image%d.png", $last_img_num + 1;

  # compute a fresh relationship id
  my $relationships = $self->relationships;
  my $rId_num       = $relationships->$#* + 1;
  my $rId           = sprintf "rId%d", $rId_num;

  # assemble XML for the new relationship
  my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
  my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};

  # update the rels member
  my $xml = $self->_rels_xml;
  $xml =~ s{</Relationships>}{$new_rel_xml</Relationships>};
  $self->_rels_xml($xml);

  # the 'relationships' attribute is computed only once; so the new entry
  # must also be recorded there, otherwise the next call to add_image()
  # would hand out the same relationship id again
  $relationships->[$rId_num] = {
    Id         => $rId,
    Type       => $type,
    Target     => $target,
    num        => $rId_num,
    short_type => 'image',
  };

  # add the image as a new member into the archive
  my $member_name = "word/$target";
  $zip->addString(\$image_PNG_content, $member_name);

  # update the global content_types if it doesn't include PNG
  my $ct = $self->surgeon->_content_types;
  if ($ct !~ /Extension="png"/) {
    $ct =~ s{(<Types[^>]+>)}{$1<Default Extension="png" ContentType="image/png"/>};
    $self->surgeon->_content_types($ct);
  }

  # return the relationship id
  return $rId;
}
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
#====================================================================== |
460
|
|
|
|
|
|
|
# UTILITY FUNCTIONS |
461
|
|
|
|
|
|
|
#====================================================================== |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# Cheap parsing of the attribute list of an XML node.
# Receives the text of the attributes, e.g. q{Id="rId1" Target="foo.xml"};
# returns a list of (name => value) pairs, where the predefined XML entities
# (&quot; &amp; &lt; &gt; &apos;) within values have been decoded.
# This is a plain function, not a method.
sub parse_attrs {
  my ($lst_attrs) = @_;
  $lst_attrs //= '';

  state $attr_pair_regex = qr[
     ([^=\s"'&<>]+)     # attribute name
     \h* = \h*          # Eq
     (?:                # attribute value
        " ([^<"]*) "    # .. enclosed in double quotes
      |
        ' ([^<']*) '    # .. or enclosed in single quotes
     )
   ]x;

  state $entity       = {quot => '"', amp => '&', lt => '<', gt => '>', apos => "'"};
  state $entity_names = join "|", keys %$entity;
  state $entity_regex = qr/&($entity_names);/;   # the list of names must be interpolated here

  my %attr;
  while ($lst_attrs =~ /$attr_pair_regex/g) {
    my ($name, $val) = ($1, $2 // $3);

    # single pass, so that "&amp;lt;" is properly decoded into "&lt;"
    $val =~ s/$entity_regex/$entity->{$1}/g;

    $attr{$name} = $val;
  }

  return %attr;
}
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
1; |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
__END__ |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=head1 NAME |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=head1 DESCRIPTION |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single |
504
|
|
|
|
|
|
|
I<package part> within the ZIP package of a C<.docx> document. |
505
|
|
|
|
|
|
|
It is mostly used for the I<document> part, that contains the XML representation of the |
506
|
|
|
|
|
|
|
main document body. However, other parts such as headers, footers, footnotes, etc. have the |
507
|
|
|
|
|
|
|
same internal representation and therefore the same operations can be invoked. |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=head1 METHODS |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=head2 new |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
my $part = MsOffice::Word::Surgeon::PackagePart->new( |
515
|
|
|
|
|
|
|
surgeon => $surgeon, |
516
|
|
|
|
|
|
|
part_name => $name, |
517
|
|
|
|
|
|
|
); |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
Constructor for a new part object. This is called internally from |
520
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon>; it is not meant to be called directly |
521
|
|
|
|
|
|
|
by clients. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head3 Constructor arguments |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=over |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=item surgeon |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
a weak reference to the main surgeon object |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=item part_name |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
short name of this part; the ZIP member name is derived from it as C<word/$part_name.xml> |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=back |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=head3 Other attributes |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
Other attributes, which are not passed through the constructor but are generated lazily on demand, are : |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=over |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=item contents |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
the XML contents of this part |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=item runs |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
a decomposition of the XML contents into a collection of |
551
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon::Run> objects. |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=item relationships |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
an arrayref of Office relationships associated with this part. This information comes from |
556
|
|
|
|
|
|
|
a C<.rels> member in the ZIP archive, named after the name of the package part. |
557
|
|
|
|
|
|
|
Array indices correspond to relationship numbers. Array values are hashrefs with |
558
|
|
|
|
|
|
|
keys |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=over |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=item Id |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
the full relationship id |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=item num |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
the numeric part of C<rId> |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=item Type |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
the full reference to the XML schema for this relationship |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=item short_type |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
only the last word of the type, e.g. 'image', 'style', etc. |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=item Target |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
designation of the target within the ZIP file. The prefix 'word/' must be |
581
|
|
|
|
|
|
|
added for having a complete Zip member name. |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
=back |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=item images |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
a hashref of images within this package part. Keys of the hash are image I<alternative texts>. |
590
|
|
|
|
|
|
|
If present, the alternative I<title> will be preferred; otherwise the alternative I<description> will be taken |
591
|
|
|
|
|
|
|
(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display |
592
|
|
|
|
|
|
|
the I<description> field -- see |
593
|
|
|
|
|
|
|
L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669|MsOffice documentation>). |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
Images without alternative text will not be accessible through the current Perl module. |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
Values of the hash are zip member names for the corresponding |
598
|
|
|
|
|
|
|
image representations in C<.png> format. |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
=back |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
=head2 Contents restitution |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=head3 contents |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
Returns a Perl string with the current internal XML representation of the part |
609
|
|
|
|
|
|
|
contents. |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
=head3 original_contents |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
Returns a Perl string with the XML representation of the |
614
|
|
|
|
|
|
|
part contents, as it was in the ZIP archive before any |
615
|
|
|
|
|
|
|
modification. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
=head3 indented_contents |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
Returns an indented version of the XML contents, suitable for inspection in a text editor. |
620
|
|
|
|
|
|
|
This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded |
621
|
|
|
|
|
|
|
byte string, not a Perl string. |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=head3 plain_text |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
Returns the text contents of the part, without any markup. |
626
|
|
|
|
|
|
|
Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored. |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
=head3 runs |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
Returns a reference to an array of L<MsOffice::Word::Surgeon::Run> objects. Each of |
632
|
|
|
|
|
|
|
these objects holds an XML fragment; joining all fragments |
633
|
|
|
|
|
|
|
restores the complete document. |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
my $contents = join "", map {$_->as_xml} $self->runs->@*; |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=head2 Modifying contents |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=head3 cleanup_XML |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
$part->cleanup_XML; |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
Applies several other methods for removing unnecessary nodes within the internal |
646
|
|
|
|
|
|
|
XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>, |
647
|
|
|
|
|
|
|
L</suppress_bookmarks> and L</merge_runs>. |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head3 reduce_noise |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
$part->reduce_noise($regex1, $regex2, ...); |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
This method is used for removing unnecessary information in the XML |
655
|
|
|
|
|
|
|
markup. It applies the given list of regexes to the whole document, |
656
|
|
|
|
|
|
|
suppressing matches. The final result is put back into |
657
|
|
|
|
|
|
|
C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >> |
658
|
|
|
|
|
|
|
references, or as names of builtin regexes (described below). Regexes |
659
|
|
|
|
|
|
|
are applied to the whole XML contents, not only to run nodes. |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=head3 noise_reduction_regex |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
my $regex = $part->noise_reduction_regex($regex_name); |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
Returns the builtin regex corresponding to the given name. |
667
|
|
|
|
|
|
|
Known regexes are : |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>), |
670
|
|
|
|
|
|
|
revision_ids => qr(\sw:rsid\w+="[^"]+"), |
671
|
|
|
|
|
|
|
complex_script_bold => qr(<w:bCs/>), |
672
|
|
|
|
|
|
|
page_breaks => qr(<w:lastRenderedPageBreak/>), |
673
|
|
|
|
|
|
|
language => qr(<w:lang w:val="[^/>]+/>), |
674
|
|
|
|
|
|
|
empty_run_props => qr(<w:rPr></w:rPr>), |
675
|
|
|
|
|
|
|
soft_hyphens => qr(<w:softHyphen/>), |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=head3 reduce_all_noises |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
$part->reduce_all_noises; |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
Applies all regexes from the previous method. |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=head3 unlink_fields |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
my $names_of_ASK_fields = $part->unlink_fields; |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
Removes all fields from the part, just leaving the current |
688
|
|
|
|
|
|
|
value stored in each field. This is the equivalent of performing Ctrl-Shift-F9 |
689
|
|
|
|
|
|
|
on the whole document. |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
The return value is an arrayref to a list of names of ASK fields within the document. |
692
|
|
|
|
|
|
|
Such names should then be passed to the L</suppress_bookmarks> method |
693
|
|
|
|
|
|
|
(see below). |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
=head3 suppress_bookmarks |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
$part->suppress_bookmarks(@names_to_erase); |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
Removes bookmarks markup in the part. This is useful because |
701
|
|
|
|
|
|
|
MsWord may silently insert bookmarks in unexpected places; therefore |
702
|
|
|
|
|
|
|
some searches within the text may fail because of such bookmarks. |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
By default, this method only removes the bookmarks markup, leaving |
705
|
|
|
|
|
|
|
intact the contents of the bookmark. However, when the name of a |
706
|
|
|
|
|
|
|
bookmark belongs to the list C<< @names_to_erase >>, the contents |
707
|
|
|
|
|
|
|
is also removed. Currently this is used for suppressing ASK fields, |
708
|
|
|
|
|
|
|
because such fields contain bookmark content that is never displayed by MsWord. |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
=head3 merge_runs |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
$part->merge_runs(no_caps => 1); # optional arg |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
Walks through all runs of text within the document, trying to merge |
717
|
|
|
|
|
|
|
adjacent runs when possible (i.e. when both runs have the same |
718
|
|
|
|
|
|
|
properties, and there is no other XML node in between). |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
This operation is a prerequisite before performing replace operations, because |
721
|
|
|
|
|
|
|
documents edited in MsWord often have run boundaries across sentences or |
722
|
|
|
|
|
|
|
even in the middle of words; so regex searches can only be successful if those |
723
|
|
|
|
|
|
|
artificial boundaries have been removed. |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
If the argument C<< no_caps => 1 >> is present, the merge operation |
726
|
|
|
|
|
|
|
will also convert runs with the C<w:caps> property, putting all letters |
727
|
|
|
|
|
|
|
into uppercase and removing the property; this makes more merges possible. |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=head3 replace |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, %replacement_args); |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
Replaces all occurrences of C<$pattern> regex within the text nodes by the |
735
|
|
|
|
|
|
|
given C<$replacement>. This is not exactly like a search-replace |
736
|
|
|
|
|
|
|
operation performed within MsWord, because the search does not cross boundaries |
737
|
|
|
|
|
|
|
of text nodes. In order to maximize the chances of successful replacements, |
738
|
|
|
|
|
|
|
the L</cleanup_XML> method is automatically called before starting the operation. |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
The argument C<$pattern> can be either a string or a reference to a regular expression. |
741
|
|
|
|
|
|
|
It should not contain any capturing parentheses, because that would perturb text |
742
|
|
|
|
|
|
|
splitting operations. |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
The argument C<$replacement> can be either a fixed string, or a reference to |
745
|
|
|
|
|
|
|
a callback subroutine that will be called for each match. |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
The C<< %replacement_args >> hash can be used to pass information to the callback |
749
|
|
|
|
|
|
|
subroutine. That hash will be enriched with three entries : |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
=over |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=item matched |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
The string that has been matched by C<$pattern>. |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
=item run |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
The run object in which this text resides. |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=item xml_before |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
The XML fragment (possibly empty) found before the matched text. |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=back |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
The callback subroutine may return either plain text or structured XML. |
768
|
|
|
|
|
|
|
See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback. |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
The following special keys within C<< %replacement_args >> are interpreted by the |
771
|
|
|
|
|
|
|
C<replace()> method itself, and therefore are not passed to the callback subroutine : |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=over |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=item keep_xml_as_is |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
If true, no call is made to the L</cleanup_XML> method before performing the replacements. |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=item dont_overwrite_contents |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
If true, the internal XML contents is not modified in place; the new XML after performing |
782
|
|
|
|
|
|
|
replacements is merely returned to the caller. |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
=back |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
=head3 replace_image |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
Replaces an existing PNG image by a new image. All features of the old image will |
792
|
|
|
|
|
|
|
be preserved (size, positioning, border, etc.) -- only the image itself will be |
793
|
|
|
|
|
|
|
replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word |
794
|
|
|
|
|
|
|
for this image. |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
This operation replaces a ZIP member within the C<.docx> file. If several XML |
797
|
|
|
|
|
|
|
nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several |
798
|
|
|
|
|
|
|
locations, the new image will appear at all locations, even if they do not have the |
799
|
|
|
|
|
|
|
same alternative text -- unfortunately this module currently has no facility for |
800
|
|
|
|
|
|
|
duplicating an existing image into separate instances. So if your intent is to only replace |
801
|
|
|
|
|
|
|
one image, your original document should contain several distinct images, coming from |
802
|
|
|
|
|
|
|
several distinct C<.PNG> file copies. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
=head3 add_image |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
my $rId = $part->add_image($image_PNG_content); |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
Stores the given PNG image within the ZIP file, adds it as a relationship to the |
810
|
|
|
|
|
|
|
current part, and returns the relationship id. This operation is not sufficient |
811
|
|
|
|
|
|
|
to make the image visible in Word : it just stores the image, but you still |
812
|
|
|
|
|
|
|
have to insert a proper C<drawing> node in the contents XML, using the C<$rId>. |
813
|
|
|
|
|
|
|
Future versions of this module may offer helper methods for that purpose; |
814
|
|
|
|
|
|
|
currently it must be done by hand. |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
=head1 AUTHOR |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
Laurent Dami, E<lt>dami AT cpan DOT orgE<gt> |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
Copyright 2019-2022 by Laurent Dami. |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
826
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
|