| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package MsOffice::Word::Surgeon::PackagePart; |
|
2
|
4
|
|
|
4
|
|
71
|
use 5.24.0; |
|
|
4
|
|
|
|
|
14
|
|
|
3
|
4
|
|
|
4
|
|
18
|
use Moose; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
25
|
|
|
4
|
4
|
|
|
4
|
|
23464
|
use MooseX::StrictConstructor; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
32
|
|
|
5
|
4
|
|
|
4
|
|
11696
|
use MsOffice::Word::Surgeon::Carp; |
|
|
4
|
|
|
|
|
9
|
|
|
|
4
|
|
|
|
|
37
|
|
|
6
|
4
|
|
|
4
|
|
267
|
use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level parse_attrs decode_entities encode_entities); |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
319
|
|
|
7
|
4
|
|
|
4
|
|
1987
|
use MsOffice::Word::Surgeon::Run; |
|
|
4
|
|
|
|
|
54
|
|
|
|
4
|
|
|
|
|
219
|
|
|
8
|
4
|
|
|
4
|
|
2251
|
use MsOffice::Word::Surgeon::Text; |
|
|
4
|
|
|
|
|
31
|
|
|
|
4
|
|
|
|
|
180
|
|
|
9
|
4
|
|
|
4
|
|
2216
|
use MsOffice::Word::Surgeon::Field; |
|
|
4
|
|
|
|
|
40
|
|
|
|
4
|
|
|
|
|
211
|
|
|
10
|
4
|
|
|
4
|
|
2639
|
use MsOffice::Word::Surgeon::BookmarkBoundary; |
|
|
4
|
|
|
|
|
35
|
|
|
|
4
|
|
|
|
|
173
|
|
|
11
|
4
|
|
|
4
|
|
3462
|
use XML::LibXML ();; |
|
|
4
|
|
|
|
|
147608
|
|
|
|
4
|
|
|
|
|
189
|
|
|
12
|
4
|
|
|
4
|
|
35
|
use List::Util qw(max); |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
290
|
|
|
13
|
4
|
|
|
4
|
|
2014
|
use match::simple qw(match); |
|
|
4
|
|
|
|
|
30632
|
|
|
|
4
|
|
|
|
|
31
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# syntactic sugar for attributes |
|
16
|
|
|
|
|
|
|
sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)} |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# constant integers to specify indentation modes -- see L<XML::LibXML> |
|
20
|
4
|
|
|
4
|
|
1327
|
use constant XML_NO_INDENT => 0; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
275
|
|
|
21
|
4
|
|
|
4
|
|
23
|
use constant XML_SIMPLE_INDENT => 1; |
|
|
4
|
|
|
|
|
9
|
|
|
|
4
|
|
|
|
|
197
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
4
|
|
|
4
|
|
39
|
use namespace::clean -except => 'meta'; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
46
|
|
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
#====================================================================== |
|
26
|
|
|
|
|
|
|
# ATTRIBUTES |
|
27
|
|
|
|
|
|
|
#====================================================================== |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# attributes passed to the constructor |
|
30
|
|
|
|
|
|
|
has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1); |
|
31
|
|
|
|
|
|
|
has 'part_name' => (is => 'ro', isa => 'Str', required => 1); |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# attributes constructed by the module -- not received through the constructor |
|
34
|
|
|
|
|
|
|
has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents); |
|
35
|
|
|
|
|
|
|
has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs'); |
|
36
|
|
|
|
|
|
|
has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef'); |
|
37
|
|
|
|
|
|
|
has_inner 'images' => (is => 'ro', isa => 'HashRef'); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0); |
|
40
|
|
|
|
|
|
|
has 'was_cleaned_up' => (is => 'bare', isa => 'Bool', default => 0); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
#====================================================================== |
|
43
|
|
|
|
|
|
|
# GLOBAL VARIABLES |
|
44
|
|
|
|
|
|
|
#====================================================================== |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Various regexes for removing uninteresting XML information |
|
47
|
|
|
|
|
|
|
my %noise_reduction_regexes = ( |
|
48
|
|
|
|
|
|
|
proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>), |
|
49
|
|
|
|
|
|
|
revision_ids => qr(\sw:rsid\w+="[^"]+"), |
|
50
|
|
|
|
|
|
|
complex_script_bold => qr(<w:bCs/>), |
|
51
|
|
|
|
|
|
|
page_breaks => qr(<w:lastRenderedPageBreak/>), |
|
52
|
|
|
|
|
|
|
language => qr(<w:lang w:val="[^/>]+/>), |
|
53
|
|
|
|
|
|
|
empty_run_props => qr(<w:rPr></w:rPr>), |
|
54
|
|
|
|
|
|
|
soft_hyphens => qr(<w:softHyphen/>), |
|
55
|
|
|
|
|
|
|
); |
|
56
|
|
|
|
|
|
|
my @noise_reduction_list = qw/proof_checking revision_ids |
|
57
|
|
|
|
|
|
|
complex_script_bold page_breaks language |
|
58
|
|
|
|
|
|
|
empty_run_props soft_hyphens/; |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
#====================================================================== |
|
61
|
|
|
|
|
|
|
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS |
|
62
|
|
|
|
|
|
|
#====================================================================== |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub _runs { |
|
66
|
14
|
|
|
14
|
|
25
|
my $self = shift; |
|
67
|
|
|
|
|
|
|
|
|
68
|
14
|
|
|
|
|
24
|
state $run_regex = qr[ |
|
69
|
|
|
|
|
|
|
<w:r> # opening tag for the run |
|
70
|
|
|
|
|
|
|
(?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1 |
|
71
|
|
|
|
|
|
|
(.*?) # run contents -- capture in $2 |
|
72
|
|
|
|
|
|
|
</w:r> # closing tag for the run |
|
73
|
|
|
|
|
|
|
]x; |
|
74
|
|
|
|
|
|
|
|
|
75
|
14
|
|
|
|
|
26
|
state $txt_regex = qr[ |
|
76
|
|
|
|
|
|
|
<w:t(?:\ xml:space="preserve")?> # opening tag for the text contents |
|
77
|
|
|
|
|
|
|
(.*?) # text contents -- capture in $1 |
|
78
|
|
|
|
|
|
|
</w:t> # closing tag for text |
|
79
|
|
|
|
|
|
|
]x; |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# split XML content into run fragments |
|
82
|
14
|
|
|
|
|
661
|
my $contents = $self->contents; |
|
83
|
14
|
|
|
|
|
14031
|
my @run_fragments = split m[$run_regex], $contents, -1; # -1 : don't strip trailing items |
|
84
|
14
|
|
|
|
|
43
|
my @runs; |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# build internal RUN objects |
|
87
|
|
|
|
|
|
|
RUN: |
|
88
|
14
|
|
|
|
|
78
|
while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) { |
|
89
|
2727
|
|
100
|
|
|
4148001
|
$run_contents //= ''; |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# split XML of this run into text fragmentsn |
|
92
|
2727
|
|
|
|
|
41887
|
my @txt_fragments = split m[$txt_regex], $run_contents, -1; # -1 : don't strip trailing items |
|
93
|
2727
|
|
|
|
|
5596
|
my @texts; |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# build internal TEXT objects |
|
96
|
|
|
|
|
|
|
TXT: |
|
97
|
2727
|
|
|
|
|
9746
|
while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) { |
|
98
|
5307
|
100
|
100
|
|
|
2919452
|
next TXT if !$xml_before_text && !length($txt_contents); |
|
99
|
2686
|
|
100
|
|
|
9763
|
$_ //= '' for $xml_before_text, $txt_contents; |
|
100
|
2686
|
|
|
|
|
12233
|
decode_entities($txt_contents); |
|
101
|
2686
|
|
|
|
|
10090
|
push @texts, MsOffice::Word::Surgeon::Text->new(xml_before => $xml_before_text, |
|
102
|
|
|
|
|
|
|
literal_text => $txt_contents); |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# assemble TEXT objects into a RUN object |
|
106
|
2727
|
100
|
100
|
|
|
57144
|
next RUN if !$xml_before_run && !@texts; |
|
107
|
2686
|
|
100
|
|
|
10263
|
$_ //= '' for $xml_before_run, $props; |
|
108
|
2686
|
|
|
|
|
10600
|
push @runs, MsOffice::Word::Surgeon::Run->new(xml_before => $xml_before_run, |
|
109
|
|
|
|
|
|
|
props => $props, |
|
110
|
|
|
|
|
|
|
inner_texts => \@texts); |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
14
|
|
|
|
|
21020
|
return \@runs; |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub _relationships { |
|
118
|
4
|
|
|
4
|
|
11
|
my $self = shift; |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# xml that describes the relationships for this package part |
|
121
|
4
|
|
|
|
|
22
|
my $rel_xml = $self->_rels_xml; |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# parse the relationships and assemble into a sparse array indexed by relationship ids |
|
124
|
4
|
|
|
|
|
76
|
my @relationships; |
|
125
|
4
|
|
|
|
|
78
|
while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) { |
|
126
|
132
|
|
|
|
|
307
|
my %attrs = parse_attrs($1); |
|
127
|
132
|
|
50
|
|
|
581
|
$attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/; |
|
128
|
132
|
|
|
|
|
519
|
($attrs{num} = $attrs{Id}) =~ s[^\D+][]; |
|
129
|
132
|
|
|
|
|
581
|
($attrs{short_type} = $attrs{Type}) =~ s[^.*/][]; |
|
130
|
132
|
|
|
|
|
947
|
$relationships[$attrs{num}] = \%attrs; |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
4
|
|
|
|
|
212
|
return \@relationships; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub _images { |
|
138
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# get relationship ids associated with images |
|
141
|
0
|
|
|
|
|
0
|
my %rel_image = map {$_->{Id} => $_->{Target}} |
|
142
|
0
|
0
|
|
|
|
0
|
grep {$_ && $_->{short_type} eq 'image'} |
|
|
0
|
|
|
|
|
0
|
|
|
143
|
|
|
|
|
|
|
$self->relationships->@*; |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# get titles and relationship ids of images found within the part contents |
|
146
|
0
|
|
|
|
|
0
|
my %image; |
|
147
|
0
|
|
|
|
|
0
|
my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g; |
|
148
|
|
|
|
|
|
|
DRAWING: |
|
149
|
0
|
|
|
|
|
0
|
foreach my $drawing (@drawings) { |
|
150
|
0
|
0
|
|
|
|
0
|
if ($drawing =~ m[<wp:docPr \s+ (.*?) /> |
|
151
|
|
|
|
|
|
|
.*? |
|
152
|
|
|
|
|
|
|
<a:blip \s+ r:embed="(\w+)"]x) { |
|
153
|
0
|
|
|
|
|
0
|
my ($lst_attrs, $rId) = ($1, $2); |
|
154
|
0
|
|
|
|
|
0
|
my %attrs = parse_attrs($lst_attrs); |
|
155
|
|
|
|
|
|
|
my $img_id = $attrs{title} || $attrs{descr} |
|
156
|
0
|
0
|
0
|
|
|
0
|
or next DRAWING; |
|
157
|
|
|
|
|
|
|
|
|
158
|
0
|
0
|
|
|
|
0
|
$image{$img_id} = "word/$rel_image{$rId}" |
|
159
|
|
|
|
|
|
|
or die "couldn't find image for relationship '$rId' associated with image '$img_id'"; |
|
160
|
|
|
|
|
|
|
# NOTE: targets in the rels XML miss the "word/" prefix, I don't know why. |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
0
|
|
|
|
|
0
|
return \%image; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
|
|
168
|
10
|
|
|
10
|
|
34
|
sub _contents {shift->original_contents} |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub _on_new_contents { |
|
171
|
23
|
|
|
23
|
|
42
|
my $self = shift; |
|
172
|
|
|
|
|
|
|
|
|
173
|
23
|
|
|
|
|
1026
|
$self->clear_runs; |
|
174
|
23
|
|
|
|
|
263712
|
$self->{contents_has_changed} = 1; |
|
175
|
23
|
|
|
|
|
1283
|
$self->{was_cleaned_up} = 0; |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
#====================================================================== |
|
179
|
|
|
|
|
|
|
# GENERAL METHODS |
|
180
|
|
|
|
|
|
|
#====================================================================== |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub _rels_xml { # rw accessor |
|
184
|
4
|
|
|
4
|
|
11
|
my ($self, $new_xml) = @_; |
|
185
|
4
|
|
|
|
|
253
|
my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name; |
|
186
|
4
|
|
|
|
|
182
|
return $self->surgeon->xml_member($rels_name, $new_xml); |
|
187
|
|
|
|
|
|
|
} |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
sub zip_member_name { |
|
191
|
10
|
|
|
10
|
0
|
20
|
my $self = shift; |
|
192
|
10
|
|
|
|
|
382
|
return sprintf "word/%s.xml", $self->part_name; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub original_contents { |
|
197
|
10
|
|
|
10
|
1
|
20
|
my $self = shift; |
|
198
|
|
|
|
|
|
|
|
|
199
|
10
|
|
|
|
|
373
|
return $self->surgeon->xml_member($self->zip_member_name); |
|
200
|
|
|
|
|
|
|
} |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
#====================================================================== |
|
204
|
|
|
|
|
|
|
# CONTENTS RESTITUTION |
|
205
|
|
|
|
|
|
|
#====================================================================== |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub indented_contents { |
|
208
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
209
|
|
|
|
|
|
|
|
|
210
|
0
|
|
|
|
|
0
|
my $dom = XML::LibXML->load_xml(string => $self->contents); |
|
211
|
0
|
|
|
|
|
0
|
return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string |
|
212
|
|
|
|
|
|
|
} |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
sub plain_text { |
|
216
|
4
|
|
|
4
|
1
|
45
|
my $self = shift; |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# XML contents |
|
219
|
4
|
|
|
|
|
165
|
my $txt = $self->contents; |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# replace opening paragraph tags by newlines |
|
222
|
4
|
|
|
|
|
2446
|
$txt =~ s/(<w:p[ >])/\n$1/g; |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# replace break tags by newlines |
|
225
|
4
|
|
|
|
|
1517
|
$txt =~ s[<w:br/>][\n]g; |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# replace tab nodes by ASCII tabs |
|
228
|
4
|
|
|
|
|
3101
|
$txt =~ s/<w:tab[^s][^>]*>/\t/g; |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# remove all remaining XML tags |
|
231
|
4
|
|
|
|
|
24078
|
$txt =~ s/<[^>]+>//g; |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# decode entities |
|
234
|
4
|
|
|
|
|
46
|
decode_entities($txt); |
|
235
|
|
|
|
|
|
|
|
|
236
|
4
|
|
|
|
|
129
|
return $txt; |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
#====================================================================== |
|
241
|
|
|
|
|
|
|
# MODIFYING CONTENTS |
|
242
|
|
|
|
|
|
|
#====================================================================== |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
sub cleanup_XML { |
|
245
|
3
|
|
|
3
|
1
|
10
|
my ($self, @merge_args) = @_; |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# avoid doing it twice |
|
248
|
3
|
100
|
|
|
|
14
|
return if $self->{was_cleaned_up}; |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# start the cleanup |
|
251
|
2
|
|
|
|
|
10
|
$self->reduce_all_noises; |
|
252
|
|
|
|
|
|
|
|
|
253
|
2
|
|
|
|
|
72
|
my $contents = $self->contents; |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# unlink fields, suppress bookmarks, merge runs |
|
256
|
2
|
|
|
|
|
10
|
$self->unlink_fields; |
|
257
|
2
|
|
|
|
|
481
|
$self->suppress_bookmarks; |
|
258
|
2
|
|
|
|
|
4559
|
$self->merge_runs(@merge_args); |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# flag the fact that the cleanup was done |
|
261
|
2
|
|
|
|
|
175891
|
$self->{was_cleaned_up} = 1; |
|
262
|
|
|
|
|
|
|
} |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
sub reduce_noise { |
|
265
|
2
|
|
|
2
|
1
|
11
|
my ($self, @noises) = @_; |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# gather regexes to apply, given either directly as regex refs, or as names of builtin regexes |
|
268
|
2
|
50
|
|
|
|
6
|
my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises; |
|
|
14
|
|
|
|
|
32
|
|
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# get contents, apply all regexes, put back the modified contents. |
|
271
|
2
|
|
|
|
|
90
|
my $contents = $self->contents; |
|
272
|
4
|
|
|
4
|
|
9167
|
no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef |
|
|
4
|
|
|
|
|
9
|
|
|
|
4
|
|
|
|
|
13241
|
|
|
273
|
2
|
|
|
|
|
8367
|
$contents =~ s/$_/$1/g foreach @regexes; |
|
274
|
2
|
|
|
|
|
113
|
$self->contents($contents); |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub noise_reduction_regex { |
|
278
|
14
|
|
|
14
|
1
|
21
|
my ($self, $regex_name) = @_; |
|
279
|
14
|
50
|
|
|
|
39
|
my $regex = $noise_reduction_regexes{$regex_name} |
|
280
|
|
|
|
|
|
|
or croak "->noise_reduction_regex('$regex_name') : unknown regex name"; |
|
281
|
14
|
|
|
|
|
28
|
return $regex; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
sub reduce_all_noises { |
|
285
|
2
|
|
|
2
|
1
|
4
|
my $self = shift; |
|
286
|
|
|
|
|
|
|
|
|
287
|
2
|
|
|
|
|
11
|
$self->reduce_noise(@noise_reduction_list); |
|
288
|
|
|
|
|
|
|
} |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
sub merge_runs { |
|
292
|
2
|
|
|
2
|
1
|
9
|
my ($self, %args) = @_; |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# check validity of received args |
|
295
|
2
|
|
|
|
|
7
|
state $is_valid_arg = {no_caps => 1}; |
|
296
|
2
|
|
|
|
|
9
|
my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args; |
|
|
1
|
|
|
|
|
5
|
|
|
297
|
2
|
50
|
|
|
|
8
|
croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args if @invalid_args; |
|
298
|
|
|
|
|
|
|
|
|
299
|
2
|
|
|
|
|
6
|
my @new_runs; |
|
300
|
|
|
|
|
|
|
# loop over internal "run" objects |
|
301
|
2
|
|
|
|
|
4
|
foreach my $run (@{$self->runs}) { |
|
|
2
|
|
|
|
|
133
|
|
|
302
|
|
|
|
|
|
|
|
|
303
|
1098
|
100
|
|
|
|
4081
|
$run->remove_caps_property if $args{no_caps}; |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# check if the current run can be merged with the previous one |
|
306
|
1098
|
100
|
66
|
|
|
50646
|
if ( !$run->xml_before # no other XML markup between the 2 runs |
|
|
|
|
100
|
|
|
|
|
|
307
|
|
|
|
|
|
|
&& @new_runs # there was a previous run |
|
308
|
|
|
|
|
|
|
&& $new_runs[-1]->props eq $run->props # both runs have the same properties |
|
309
|
|
|
|
|
|
|
) { |
|
310
|
|
|
|
|
|
|
# conditions are OK, so merge this run with the previous one |
|
311
|
54
|
|
|
|
|
181
|
$new_runs[-1]->merge($run); |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
else { |
|
314
|
|
|
|
|
|
|
# conditions not OK, just push this run without merging |
|
315
|
1044
|
|
|
|
|
3281
|
push @new_runs, $run; |
|
316
|
|
|
|
|
|
|
} |
|
317
|
|
|
|
|
|
|
} |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# reassemble the whole stuff and inject it as new contents |
|
320
|
2
|
|
|
|
|
17
|
$self->contents(join "", map {$_->as_xml} @new_runs); |
|
|
1044
|
|
|
|
|
3251
|
|
|
321
|
|
|
|
|
|
|
} |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
sub replace { |
|
324
|
12
|
|
|
12
|
1
|
58
|
my ($self, $pattern, $replacement_callback, %replacement_args) = @_; |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# shared initial string for error messages |
|
327
|
12
|
|
|
|
|
23
|
my $error_msg = '->replace($pattern, $callback, %args)'; |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is' |
|
330
|
12
|
50
|
|
|
|
53
|
if (delete $replacement_args{keep_xml_as_is}) { |
|
331
|
|
|
|
|
|
|
not exists $replacement_args{cleanup_XML} |
|
332
|
0
|
0
|
|
|
|
0
|
or croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'"; |
|
333
|
0
|
|
|
|
|
0
|
carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead"; |
|
334
|
0
|
|
|
|
|
0
|
$replacement_args{cleanup_XML} = 0; |
|
335
|
|
|
|
|
|
|
} |
|
336
|
|
|
|
|
|
|
else { |
|
337
|
12
|
|
100
|
|
|
45
|
$replacement_args{cleanup_XML} //= 1; # default |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# cleanup the XML structure so that replacements work better |
|
341
|
12
|
100
|
|
|
|
42
|
if (my $cleanup_args = $replacement_args{cleanup_XML}) { |
|
342
|
2
|
50
|
|
|
|
11
|
$cleanup_args = {} if ! ref $cleanup_args; |
|
343
|
2
|
50
|
|
|
|
8
|
ref $cleanup_args eq 'HASH' |
|
344
|
|
|
|
|
|
|
or croak "$error_msg: arg 'cleanup_XML' should be a hashref"; |
|
345
|
2
|
|
|
|
|
10
|
$self->cleanup_XML(%$cleanup_args); |
|
346
|
|
|
|
|
|
|
} |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
# check for presences of a special option to avoid modying contents |
|
349
|
12
|
|
|
|
|
25
|
my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents}; |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# apply replacements and generate new XML |
|
352
|
|
|
|
|
|
|
my $xml = join "", |
|
353
|
12
|
|
|
|
|
467
|
map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*; |
|
|
1588
|
|
|
|
|
5774
|
|
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# overwrite previous contents |
|
356
|
12
|
50
|
|
|
|
760
|
$self->contents($xml) unless $dont_overwrite_contents; |
|
357
|
|
|
|
|
|
|
|
|
358
|
12
|
|
|
|
|
107
|
return $xml; |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
sub _update_contents_in_zip { # called for each part before saving the zip file |
|
362
|
7
|
|
|
7
|
|
8
|
my $self = shift; |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
$self->surgeon->xml_member($self->zip_member_name, $self->contents) |
|
365
|
7
|
50
|
|
|
|
16
|
if $self->{contents_has_changed}; |
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
#====================================================================== |
|
371
|
|
|
|
|
|
|
# OPERATIONS ON BOOKMARKS |
|
372
|
|
|
|
|
|
|
#====================================================================== |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
sub bookmark_boundaries { |
|
375
|
3
|
|
|
3
|
1
|
13
|
my ($self) = @_; |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# regex to find bookmark tags |
|
378
|
3
|
|
|
|
|
9
|
state $bookmark_rx = qr{ |
|
379
|
|
|
|
|
|
|
( # $1: the whole tag |
|
380
|
|
|
|
|
|
|
<w:bookmark(Start|End) # $2: kind of bookmark boundary |
|
381
|
|
|
|
|
|
|
\h* ([^>]*?) # $3: node attributes |
|
382
|
|
|
|
|
|
|
/> # end of tag |
|
383
|
|
|
|
|
|
|
) # end of capture 1 |
|
384
|
|
|
|
|
|
|
}sx; |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# split the whole xml according to the regex. Captured groups are also added to the list |
|
387
|
3
|
|
|
|
|
111
|
my @xml_chunks = split /$bookmark_rx/, $self->contents; |
|
388
|
3
|
|
|
|
|
10
|
my $final_xml = pop @xml_chunks; |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
# walk through the list of fragments and build BookmarkBoundary objects |
|
391
|
3
|
|
|
|
|
7
|
my @bookmark_boundaries; |
|
392
|
3
|
|
|
|
|
16
|
while (my @chunk = splice @xml_chunks, 0, 4) { |
|
393
|
79
|
|
|
|
|
127523
|
my %bkmk_args; @bkmk_args{qw/xml_before node_xml kind attrs/} = @chunk; |
|
|
79
|
|
|
|
|
302
|
|
|
394
|
79
|
|
50
|
|
|
321
|
my %attrs = parse_attrs(delete $bkmk_args{attrs} // ""); |
|
395
|
79
|
|
|
|
|
197
|
$bkmk_args{id} = $attrs{'w:id'}; |
|
396
|
79
|
100
|
|
|
|
172
|
$bkmk_args{name} = $attrs{'w:name'} if $attrs{'w:name'}; |
|
397
|
79
|
|
|
|
|
315
|
push @bookmark_boundaries, MsOffice::Word::Surgeon::BookmarkBoundary->new(%bkmk_args); |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
|
|
400
|
3
|
50
|
|
|
|
5665
|
return wantarray ? (\@bookmark_boundaries, $final_xml) : \@bookmark_boundaries; |
|
401
|
|
|
|
|
|
|
} |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
sub suppress_bookmarks { |
|
406
|
2
|
|
|
2
|
1
|
8
|
my ($self, %options) = @_; |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
# check if options are valid and supply defaults |
|
409
|
2
|
|
|
|
|
6
|
my @invalid_opt = grep {!/^(full_range|markup_only)$/} keys %options; |
|
|
0
|
|
|
|
|
0
|
|
|
410
|
2
|
50
|
|
|
|
8
|
croak "suppress_bookmarks: invalid options: " . join(", ", @invalid_opt) if @invalid_opt; |
|
411
|
2
|
50
|
|
|
|
17
|
%options = (markup_only => qr/./) if ! keys %options; |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
# parse bookmark boundaries |
|
414
|
2
|
|
|
|
|
7
|
my ($bookmark_boundaries, $final_xml) = $self->bookmark_boundaries; |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
# loop on bookmark boundaries |
|
417
|
2
|
|
|
|
|
6
|
my %boundary_ix_by_id; |
|
418
|
2
|
|
|
|
|
12
|
while (my ($ix, $boundary) = each @$bookmark_boundaries) { |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
# for starting boundaries, just remember the starting index |
|
421
|
37
|
100
|
|
|
|
1970
|
if ($boundary->kind eq 'Start') { |
|
|
|
50
|
|
|
|
|
|
|
422
|
18
|
|
|
|
|
1289
|
$boundary_ix_by_id{$boundary->id} = $ix; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# for ending boundaries, do the suppression |
|
426
|
|
|
|
|
|
|
elsif ($boundary->kind eq 'End') { |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# try to find the corresponding bookmarkStart node. |
|
429
|
19
|
|
|
|
|
1000
|
my $start_ix = $boundary_ix_by_id{$boundary->id}; |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# if not found, this is because the start was within a field that has been erased. So just clear the bookmarkEnd |
|
432
|
19
|
100
|
|
|
|
57
|
if (!defined $start_ix) { |
|
433
|
1
|
|
|
|
|
52
|
$boundary->node_xml(""); |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# if found, do the normal suppression |
|
437
|
|
|
|
|
|
|
else { |
|
438
|
18
|
|
|
|
|
43
|
my $bookmark_start = $bookmark_boundaries->[$start_ix]; |
|
439
|
18
|
|
|
|
|
2272
|
my $bookmark_name = $bookmark_start->name; |
|
440
|
18
|
|
|
|
|
196
|
my $should_erase_markup = match($bookmark_name, $options{markup_only}); |
|
441
|
18
|
|
|
|
|
200
|
my $should_erase_range = match($bookmark_name, $options{full_range}); |
|
442
|
18
|
50
|
33
|
|
|
71
|
if ($should_erase_markup || $should_erase_range) { |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# erase markup (start and end bookmarks) |
|
445
|
18
|
|
|
|
|
1017
|
$_->node_xml("") for $boundary, $bookmark_start; |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
# if required, also erase inner range |
|
448
|
18
|
50
|
|
|
|
116
|
if ($should_erase_range) { |
|
449
|
0
|
|
|
|
|
0
|
for my $erase_ix ($start_ix+1 .. $ix) { |
|
450
|
0
|
|
|
|
|
0
|
my $inner_boundary = $bookmark_boundaries->[$erase_ix]; |
|
451
|
0
|
0
|
|
|
|
0
|
!$inner_boundary->node_xml |
|
452
|
|
|
|
|
|
|
or die "cannot erase contents of bookmark '$bookmark_name' " |
|
453
|
|
|
|
|
|
|
. "because it contains the start of bookmark '". $inner_boundary->name . "'"; |
|
454
|
0
|
|
|
|
|
0
|
$inner_boundary->xml_before(""); |
|
455
|
|
|
|
|
|
|
} |
|
456
|
|
|
|
|
|
|
} |
|
457
|
|
|
|
|
|
|
} |
|
458
|
|
|
|
|
|
|
} |
|
459
|
|
|
|
|
|
|
} |
|
460
|
|
|
|
|
|
|
} |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# re-build the whole XML from all remaining fragments, and inject it back |
|
463
|
2
|
|
|
|
|
114
|
my $new_contents = join "", (map {$_->xml_before, $_->node_xml} @$bookmark_boundaries), $final_xml; |
|
|
37
|
|
|
|
|
2103
|
|
|
464
|
2
|
|
|
|
|
144
|
$self->contents($new_contents); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
sub reveal_bookmarks { |
|
469
|
1
|
|
|
1
|
1
|
4
|
my ($self, @marking_args) = @_; |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
# auxiliary objects |
|
472
|
1
|
|
|
|
|
16
|
my $marker = MsOffice::Word::Surgeon::PackagePart::_BookmarkMarker->new(@marking_args); |
|
473
|
1
|
|
|
|
|
6
|
my $paragraph_tracker = MsOffice::Word::Surgeon::PackagePart::_ParaTracker->new; |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
# parse bookmark boundaries |
|
476
|
1
|
|
|
|
|
4
|
my ($bookmark_boundaries, $final_xml) = $self->bookmark_boundaries; |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
# loop on bookmark boundaries |
|
479
|
1
|
|
|
|
|
3
|
my @bookmark_name_by_id; |
|
480
|
1
|
|
|
|
|
3
|
foreach my $boundary (@$bookmark_boundaries) { |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# count opening and closing paragraphs in xml before this node |
|
483
|
42
|
|
|
|
|
1043
|
$paragraph_tracker->count_paragraphs($boundary->xml_before); |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# add visible runs before or after bookmark nodes |
|
486
|
42
|
100
|
|
|
|
964
|
if ($boundary->kind eq 'Start') { |
|
|
|
50
|
|
|
|
|
|
|
487
|
21
|
|
|
|
|
503
|
$bookmark_name_by_id[$boundary->id] = $boundary->name; |
|
488
|
21
|
|
|
|
|
490
|
$boundary->prepend_xml($paragraph_tracker->maybe_add_paragraph($marker->mark($boundary->name, 0))); |
|
489
|
|
|
|
|
|
|
} |
|
490
|
|
|
|
|
|
|
elsif ($boundary->kind eq 'End') { |
|
491
|
21
|
|
|
|
|
449
|
my $bookmark_name = $bookmark_name_by_id[$boundary->id]; |
|
492
|
21
|
|
|
|
|
30
|
$boundary->append_xml($paragraph_tracker->maybe_add_paragraph($marker->mark($bookmark_name, 1))); |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
} |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
# re-build the whole XML and inject it back |
|
497
|
1
|
|
|
|
|
3
|
my $new_contents = join "", (map {$_->xml_before, $_->node_xml} @$bookmark_boundaries), $final_xml; |
|
|
42
|
|
|
|
|
954
|
|
|
498
|
1
|
|
|
|
|
41
|
$self->contents($new_contents); |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
#====================================================================== |
|
503
|
|
|
|
|
|
|
# OPERATIONS ON FIELDS |
|
504
|
|
|
|
|
|
|
#====================================================================== |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
sub fields { |
|
507
|
3
|
|
|
3
|
1
|
7
|
my ($self) = @_; |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# regex to find field nodes |
|
510
|
3
|
|
|
|
|
52
|
state $field_rx = qr{ |
|
511
|
|
|
|
|
|
|
< w:fld # initial prefix for a field node |
|
512
|
|
|
|
|
|
|
(Simple|Char) # $1 : distinguish between simple fields and complex fields |
|
513
|
|
|
|
|
|
|
\h* ([^>]*?) # $2 : node attributes |
|
514
|
|
|
|
|
|
|
(?: # either .. |
|
515
|
|
|
|
|
|
|
/> # .. the end of an empty XML element |
|
516
|
|
|
|
|
|
|
| # or .. |
|
517
|
|
|
|
|
|
|
> # .. the end of the opening tag |
|
518
|
|
|
|
|
|
|
(.*?) # .. $3: some node content |
|
519
|
|
|
|
|
|
|
</w:fld\g1> # .. the closing tag |
|
520
|
|
|
|
|
|
|
) |
|
521
|
|
|
|
|
|
|
}sx; |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
# split the whole xml according to the regex. Captured groups are also added to the list |
|
524
|
3
|
|
|
|
|
147
|
my @xml_chunks = split /$field_rx/, $self->contents; |
|
525
|
3
|
|
|
|
|
16
|
my $final_xml = pop @xml_chunks; |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# walk through the list of fragments and build a stack of field objects |
|
528
|
3
|
|
|
|
|
7
|
my @field_stack; |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
NODE: |
|
531
|
3
|
|
|
|
|
21
|
while (my @chunk = splice @xml_chunks, 0, 4) { |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# initialize a node hash |
|
534
|
144
|
|
|
|
|
3856
|
my %node; @node{qw/xml_before field_kind attrs node_content/} = @chunk; |
|
|
144
|
|
|
|
|
691
|
|
|
535
|
144
|
|
100
|
|
|
984
|
$node{$_} //= "" for qw/xml_before field_kind attrs node_content/; |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
# node attributes |
|
538
|
144
|
|
|
|
|
481
|
my %attrs = parse_attrs($node{attrs}); |
|
539
|
|
|
|
|
|
|
|
|
540
|
144
|
50
|
|
|
|
563
|
if ($node{field_kind} eq 'Simple') { |
|
|
|
50
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
# for a simple field, all information is within the XML node |
|
542
|
|
|
|
|
|
|
push @field_stack, MsOffice::Word::Surgeon::Field->new( |
|
543
|
|
|
|
|
|
|
xml_before => $node{xml_before}, |
|
544
|
|
|
|
|
|
|
code => $attrs{'w:instr'}, |
|
545
|
|
|
|
|
|
|
result => $node{node_content}, |
|
546
|
0
|
|
|
|
|
0
|
); |
|
547
|
|
|
|
|
|
|
} |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
elsif ($node{field_kind} eq 'Char') { |
|
550
|
|
|
|
|
|
|
# for a complex field, we need an auxiliary subroutine to handle the begin/separate/end parts |
|
551
|
144
|
|
|
|
|
408
|
_handle_fldChar_node(\@field_stack, \%node, \%attrs); |
|
552
|
|
|
|
|
|
|
} |
|
553
|
|
|
|
|
|
|
|
|
554
|
144
|
|
|
|
|
90836
|
$self->_maybe_embed_last_field(\@field_stack); |
|
555
|
|
|
|
|
|
|
} |
|
556
|
|
|
|
|
|
|
|
|
557
|
3
|
50
|
|
|
|
22
|
return wantarray ? (\@field_stack, $final_xml) : \@field_stack; |
|
558
|
|
|
|
|
|
|
} |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
sub replace_fields { |
|
562
|
3
|
|
|
3
|
1
|
9
|
my ($self, $field_replacer) = @_; |
|
563
|
|
|
|
|
|
|
|
|
564
|
3
|
|
|
|
|
14
|
my ($fields, $final_xml) = $self->fields; |
|
565
|
3
|
|
|
|
|
10
|
my @xml_parts = map {$_->xml_before, $field_replacer->($_)} @$fields; |
|
|
16
|
|
|
|
|
551
|
|
|
566
|
|
|
|
|
|
|
|
|
567
|
3
|
|
|
|
|
1489
|
$self->contents(join "", @xml_parts, $final_xml); |
|
568
|
|
|
|
|
|
|
} |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
sub reveal_fields { |
|
572
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# replace all fields by a textual representatio of their "code" part |
|
575
|
1
|
|
|
8
|
|
7
|
my $revealer = sub {my $code = shift->code; encode_entities($code); return "<w:t>{$code}</w:t>"}; |
|
|
8
|
|
|
|
|
382
|
|
|
|
8
|
|
|
|
|
77
|
|
|
|
8
|
|
|
|
|
30
|
|
|
576
|
1
|
|
|
|
|
24
|
$self->replace_fields($revealer); |
|
577
|
|
|
|
|
|
|
} |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
sub unlink_fields { |
|
581
|
2
|
|
|
2
|
1
|
6
|
my $self = shift; |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
# replace all fields by just their "result" part (in other words, ignore the "code" part). |
|
584
|
|
|
|
|
|
|
# ASK fields return an empty string (because they have a special treatment in Word, where |
|
585
|
|
|
|
|
|
|
# their 'result' part is hidden, unlike all other fields. |
|
586
|
|
|
|
|
|
|
my $unlinker = sub { |
|
587
|
8
|
|
|
8
|
|
12
|
my $field = shift; |
|
588
|
8
|
100
|
|
|
|
152
|
return $field->type eq 'ASK' ? '' : $field->result; |
|
589
|
2
|
|
|
|
|
12
|
}; |
|
590
|
2
|
|
|
|
|
9
|
$self->replace_fields($unlinker); |
|
591
|
|
|
|
|
|
|
} |
|
592
|
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
# below: auxiliary methods or subroutines for field handling |
|
595
|
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
sub _decode_instr_text { |
|
597
|
50
|
|
|
50
|
|
87
|
my ($xml) = @_; |
|
598
|
|
|
|
|
|
|
|
|
599
|
50
|
|
|
|
|
434
|
my @instr_text = $xml =~ m{<w:instrText.*?>(.*?)</w:instrText>}g; |
|
600
|
50
|
|
|
|
|
135
|
my $instr = join "", @instr_text; |
|
601
|
50
|
|
|
|
|
161
|
decode_entities($instr); |
|
602
|
50
|
|
|
|
|
258
|
return $instr; |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
sub _handle_fldChar_node { |
|
606
|
144
|
|
|
144
|
|
273
|
my ($field_stack, $node, $attrs) = @_; |
|
607
|
|
|
|
|
|
|
|
|
608
|
144
|
|
|
|
|
284
|
my $fldChar_type = $attrs->{"w:fldCharType"}; |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
# if this is the beginning a of a field : push a new field object on top of the stack |
|
611
|
144
|
100
|
|
|
|
289
|
if ($fldChar_type eq 'begin') { |
|
612
|
|
|
|
|
|
|
push @$field_stack, MsOffice::Word::Surgeon::Field->new( |
|
613
|
|
|
|
|
|
|
xml_before => $node->{xml_before}, |
|
614
|
48
|
|
|
|
|
244
|
code => '', |
|
615
|
|
|
|
|
|
|
result => '', |
|
616
|
|
|
|
|
|
|
status => "begin", |
|
617
|
|
|
|
|
|
|
); |
|
618
|
|
|
|
|
|
|
} |
|
619
|
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
# otherwise this is the continuation of the current field (eiter "separate" or "end") : update it |
|
621
|
|
|
|
|
|
|
else { |
|
622
|
96
|
50
|
|
|
|
234
|
my $current_field = $field_stack->[-1] |
|
623
|
|
|
|
|
|
|
or croak qq{met <w:fldChar w:fldCharType="$fldChar_type"> but there is no current field}; |
|
624
|
96
|
|
|
|
|
3490
|
my $current_status = $current_field->status; |
|
625
|
|
|
|
|
|
|
|
|
626
|
96
|
100
|
|
|
|
256
|
if ($current_status eq "begin") { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
627
|
48
|
|
|
|
|
114
|
$current_field->append_to_code(_decode_instr_text($node->{xml_before})); |
|
628
|
|
|
|
|
|
|
} |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
elsif ($current_status eq "separate") { |
|
631
|
48
|
50
|
|
|
|
105
|
$fldChar_type eq "end" |
|
632
|
|
|
|
|
|
|
or croak qq{after a "separate" node, w:fldCharType cannot be "$fldChar_type"}; |
|
633
|
48
|
|
|
|
|
218
|
$current_field->append_to_result($node->{xml_before}); |
|
634
|
|
|
|
|
|
|
} |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
elsif ($current_status eq "end") { |
|
637
|
0
|
|
|
|
|
0
|
croak qq{met <w:fldChar w:fldCharType="$fldChar_type"> but last field is not open}; |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
|
|
641
|
96
|
|
|
|
|
3373
|
$current_field->status($fldChar_type); |
|
642
|
|
|
|
|
|
|
} |
|
643
|
|
|
|
|
|
|
} |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
sub _maybe_embed_last_field { |
|
646
|
144
|
|
|
144
|
|
274
|
my ($self, $field_stack) = @_; |
|
647
|
|
|
|
|
|
|
|
|
648
|
144
|
|
|
|
|
254
|
my $last_field = $field_stack->[-1]; |
|
649
|
144
|
|
|
|
|
246
|
my $prev_field = $field_stack->[-2]; |
|
650
|
|
|
|
|
|
|
|
|
651
|
144
|
100
|
66
|
|
|
5544
|
if ($last_field && $prev_field && $last_field->status eq 'end') { |
|
|
|
|
100
|
|
|
|
|
|
652
|
|
|
|
|
|
|
|
|
653
|
46
|
|
|
|
|
1533
|
my $prev_status = $prev_field->status; |
|
654
|
|
|
|
|
|
|
|
|
655
|
46
|
100
|
|
|
|
203
|
if ($prev_status eq 'begin') { |
|
|
|
100
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# the last field is embedded within the "code" part of the previous field |
|
657
|
2
|
|
|
|
|
68
|
$prev_field->append_to_code(_decode_instr_text($last_field->xml_before) |
|
658
|
|
|
|
|
|
|
. sprintf $self->surgeon->show_embedded_field, $last_field->code); |
|
659
|
2
|
|
|
|
|
18
|
pop @$field_stack; |
|
660
|
|
|
|
|
|
|
} |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
elsif ($prev_status eq 'separate') { |
|
663
|
|
|
|
|
|
|
# the last field is embedded within the "result" part of the previous field |
|
664
|
30
|
|
|
|
|
1027
|
$prev_field->append_to_result($last_field->xml_before . $last_field->result); |
|
665
|
30
|
|
|
|
|
234
|
pop @$field_stack; |
|
666
|
|
|
|
|
|
|
} |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
# elsif ($prev_status eq 'end') : $last_field is an independend field, just leave it on top of stack |
|
669
|
|
|
|
|
|
|
} |
|
670
|
|
|
|
|
|
|
} |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
#====================================================================== |
|
674
|
|
|
|
|
|
|
# OPERATIONS ON IMAGES |
|
675
|
|
|
|
|
|
|
#====================================================================== |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
sub replace_image { |
|
679
|
0
|
|
|
0
|
1
|
0
|
my ($self, $image_title, $image_PNG_content) = @_; |
|
680
|
|
|
|
|
|
|
|
|
681
|
0
|
0
|
|
|
|
0
|
my $member_name = $self->images->{$image_title} |
|
682
|
|
|
|
|
|
|
or die "could not find an image with title: $image_title"; |
|
683
|
0
|
|
|
|
|
0
|
$self->surgeon->zip->contents($member_name, $image_PNG_content); |
|
684
|
|
|
|
|
|
|
} |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
sub add_image { |
|
689
|
0
|
|
|
0
|
1
|
0
|
my ($self, $image_PNG_content) = @_; |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# compute a fresh image number and a fresh relationship id |
|
692
|
0
|
|
|
|
|
0
|
my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]); |
|
693
|
0
|
|
|
|
|
0
|
my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members; |
|
|
0
|
|
|
|
|
0
|
|
|
694
|
0
|
|
0
|
|
|
0
|
my $last_img_num = max @image_nums // 0; |
|
695
|
0
|
|
|
|
|
0
|
my $target = sprintf "media/image%d.png", $last_img_num + 1; |
|
696
|
0
|
|
|
|
|
0
|
my $last_rId_num = $self->relationships->$#*; |
|
697
|
0
|
|
|
|
|
0
|
my $rId = sprintf "rId%d", $last_rId_num + 1; |
|
698
|
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
# assemble XML for the new relationship |
|
700
|
0
|
|
|
|
|
0
|
my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"; |
|
701
|
0
|
|
|
|
|
0
|
my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>}; |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
# update the rels member |
|
704
|
0
|
|
|
|
|
0
|
my $xml = $self->_rels_xml; |
|
705
|
0
|
|
|
|
|
0
|
$xml =~ s[</Relationships>][$new_rel_xml</Relationships>]; |
|
706
|
0
|
|
|
|
|
0
|
$self->_rels_xml($xml); |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
# add the image as a new member into the archive |
|
709
|
0
|
|
|
|
|
0
|
my $member_name = "word/$target"; |
|
710
|
0
|
|
|
|
|
0
|
$self->surgeon->zip->addString(\$image_PNG_content, $member_name); |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
# update the global content_types if it doesn't include PNG |
|
713
|
0
|
|
|
|
|
0
|
my $ct = $self->surgeon->_content_types; |
|
714
|
0
|
0
|
|
|
|
0
|
if ($ct !~ /Extension="png"/) { |
|
715
|
0
|
|
|
|
|
0
|
$ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>]; |
|
716
|
0
|
|
|
|
|
0
|
$self->surgeon->_content_types($ct); |
|
717
|
|
|
|
|
|
|
} |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
# return the relationship id |
|
720
|
0
|
|
|
|
|
0
|
return $rId; |
|
721
|
|
|
|
|
|
|
} |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
#====================================================================== |
|
727
|
|
|
|
|
|
|
# INTERNAL CLASS FOR TRACKING PARAGRAPHS |
|
728
|
|
|
|
|
|
|
#====================================================================== |
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
package # hide from PAUSE |
|
731
|
|
|
|
|
|
|
MsOffice::Word::Surgeon::PackagePart::_ParaTracker; |
|
732
|
4
|
|
|
4
|
|
31
|
use strict; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
148
|
|
|
733
|
4
|
|
|
4
|
|
18
|
use warnings; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
1222
|
|
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
sub new { |
|
736
|
1
|
|
|
1
|
|
2
|
my $class = shift; |
|
737
|
1
|
|
|
|
|
2
|
my $nb_para = 0; |
|
738
|
1
|
|
|
|
|
2
|
bless \$nb_para, $class; |
|
739
|
|
|
|
|
|
|
} |
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
sub count_paragraphs { |
|
742
|
42
|
|
|
42
|
|
53
|
my ($self, $xml) = @_; |
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
# count opening and closing paragraph nodes |
|
745
|
42
|
|
|
|
|
120
|
while ($xml =~ m[<(/)?w:p.*?(/)?>]g) { |
|
746
|
794
|
100
|
|
|
|
1086
|
next if $2; # self-ending node -- doesn't change the number of paragraphs |
|
747
|
777
|
100
|
|
|
|
3839
|
$$self += $1 ? -1 : +1; |
|
748
|
|
|
|
|
|
|
} |
|
749
|
|
|
|
|
|
|
} |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
sub maybe_add_paragraph { |
|
752
|
42
|
|
|
42
|
|
53
|
my ($self, $xml) = @_; |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# add paragraph nodes only if the ParaTracker is currently outside of any paragraph |
|
755
|
42
|
|
|
|
|
46
|
my $is_outside_para = !$$self; |
|
756
|
42
|
50
|
33
|
|
|
109
|
return $is_outside_para && $xml ? "<w:p>$xml</w:p>" : $xml; |
|
757
|
|
|
|
|
|
|
}; |
|
758
|
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
#====================================================================== |
|
761
|
|
|
|
|
|
|
# INTERNAL CLASS FOR INTRODUCING BOOKMARK MARKERS |
|
762
|
|
|
|
|
|
|
#====================================================================== |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
package # hide from PAUSE |
|
765
|
|
|
|
|
|
|
MsOffice::Word::Surgeon::PackagePart::_BookmarkMarker; |
|
766
|
4
|
|
|
4
|
|
32
|
use strict; |
|
|
4
|
|
|
|
|
14
|
|
|
|
4
|
|
|
|
|
85
|
|
|
767
|
4
|
|
|
4
|
|
15
|
use warnings; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
224
|
|
|
768
|
4
|
|
|
4
|
|
23
|
use MsOffice::Word::Surgeon::Utils qw(encode_entities); |
|
|
4
|
|
|
|
|
6
|
|
|
|
4
|
|
|
|
|
234
|
|
|
769
|
4
|
|
|
4
|
|
21
|
use Carp qw(croak carp); |
|
|
4
|
|
|
|
|
6
|
|
|
|
4
|
|
|
|
|
1468
|
|
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
sub new { |
|
772
|
1
|
|
|
1
|
|
3
|
my $class = shift; |
|
773
|
1
|
|
|
|
|
4
|
my %self = @_; |
|
774
|
|
|
|
|
|
|
|
|
775
|
1
|
|
50
|
|
|
3
|
$self{color} //= "yellow"; |
|
776
|
1
|
|
50
|
|
|
7
|
$self{props} //= qq{<w:highlight w:val="%s"/>}; |
|
777
|
1
|
|
50
|
|
|
4
|
$self{start} //= "<%s>"; |
|
778
|
1
|
|
50
|
|
|
6
|
$self{end} //= "</%s>"; |
|
779
|
1
|
50
|
|
|
|
4
|
$self{ignore} = qr/^_/ if not exists $self{ignore}; |
|
780
|
|
|
|
|
|
|
|
|
781
|
1
|
50
|
|
|
|
6
|
$self{color} =~ m{ ^( black | blue | cyan | darkBlue | darkCyan | |
|
782
|
|
|
|
|
|
|
darkGray | darkGreen | darkMagenta | darkRed | darkYellow | green | |
|
783
|
|
|
|
|
|
|
lightGray | magenta | none | red | white | yellow )$}x |
|
784
|
|
|
|
|
|
|
or carp "invalid color : $self{color}"; |
|
785
|
|
|
|
|
|
|
|
|
786
|
1
|
|
|
|
|
2
|
bless \%self, $class; |
|
787
|
|
|
|
|
|
|
} |
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
sub mark { |
|
790
|
42
|
|
|
42
|
|
62
|
my ($self, $bookmark_name, $is_end_node) = @_; |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
# some bookmarks are just ignored |
|
793
|
|
|
|
|
|
|
return "" |
|
794
|
42
|
100
|
66
|
|
|
210
|
if $self->{ignore} and $bookmark_name =~ $self->{ignore}; |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
# build the visible text |
|
797
|
4
|
|
|
4
|
|
29
|
no warnings 'redundant'; # because sprintf templates may decide not to use their arguments |
|
|
4
|
|
|
|
|
11
|
|
|
|
4
|
|
|
|
|
571
|
|
|
798
|
12
|
100
|
|
|
|
24
|
my $sprintf_node = $is_end_node ? $self->{end} : $self->{start}; |
|
799
|
12
|
|
|
|
|
24
|
my $text = sprintf $sprintf_node, $bookmark_name; |
|
800
|
12
|
|
|
|
|
21
|
my $props = sprintf $self->{props}, $self->{color}; |
|
801
|
12
|
|
|
|
|
49
|
encode_entities($text); |
|
802
|
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
# full xml for a visible run before or after the boookmark node |
|
804
|
12
|
|
|
|
|
28
|
return "<w:r><w:rPr>$props</w:rPr><w:t>$text</w:t></w:r>"; |
|
805
|
|
|
|
|
|
|
} |
|
806
|
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
1; |
|
809
|
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
__END__ |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
|
813
|
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
=head1 NAME |
|
815
|
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document |
|
817
|
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
819
|
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
my $part = $surgeon->document; |
|
821
|
|
|
|
|
|
|
print $part->plain_text; |
|
822
|
|
|
|
|
|
|
$part->replace(qr[$pattern], $replacement_callback); |
|
823
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
|
824
|
|
|
|
|
|
|
$part->unlink_fields; |
|
825
|
|
|
|
|
|
|
$part->reveal_bookmarks; |
|
826
|
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
829
|
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single |
|
831
|
|
|
|
|
|
|
I<package part> within the ZIP package of a C<.docx> document. |
|
832
|
|
|
|
|
|
|
It is mostly used for the I<document> part, that contains the XML representation of the |
|
833
|
|
|
|
|
|
|
main document body. However, other parts such as headers, footers, footnotes, etc. have the |
|
834
|
|
|
|
|
|
|
same internal representation and therefore the same operations can be invoked. |
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=head1 METHODS |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=head2 new |
|
840
|
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
my $part = MsOffice::Word::Surgeon::PackagePart->new( |
|
842
|
|
|
|
|
|
|
surgeon => $surgeon, |
|
843
|
|
|
|
|
|
|
part_name => $name, |
|
844
|
|
|
|
|
|
|
); |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
Constructor for a new part object. This is called internally from |
|
847
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon>; it is not meant to be called directly |
|
848
|
|
|
|
|
|
|
by clients. |
|
849
|
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
=head3 Constructor arguments |
|
851
|
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
=over |
|
854
|
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
=item surgeon |
|
856
|
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
a weak reference to the main surgeon object |
|
858
|
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
=item part_name |
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
ZIP member name of this part |
|
862
|
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
=back |
|
864
|
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head3 Other attributes |
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
Other attributes, not passed through the constructor but generated lazily on demand, are : |
|
868
|
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
=over |
|
870
|
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
=item contents |
|
872
|
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
the XML contents of this part |
|
874
|
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
=item runs |
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
a decomposition of the XML contents into a collection of |
|
878
|
|
|
|
|
|
|
L<MsOffice::Word::Surgeon::Run> objects. |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
=item relationships |
|
881
|
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
an arrayref of Office relationships associated with this part. This information comes from |
|
883
|
|
|
|
|
|
|
a C<.rels> member in the ZIP archive, named after the name of the package part. |
|
884
|
|
|
|
|
|
|
Array indices correspond to relationship numbers. Array values are hashrefs with |
|
885
|
|
|
|
|
|
|
keys |
|
886
|
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
=over |
|
888
|
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
=item Id |
|
890
|
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
the full relationship id |
|
892
|
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
=item num |
|
894
|
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
the numeric part of C<rId> |
|
896
|
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
=item Type |
|
898
|
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
the full reference to the XML schema for this relationship |
|
900
|
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
=item short_type |
|
902
|
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
only the last word of the type, e.g. 'image', 'style', etc. |
|
904
|
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=item Target |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
designation of the target within the ZIP file. The prefix 'word/' must be |
|
908
|
|
|
|
|
|
|
added for having a complete Zip member name. |
|
909
|
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
=back |
|
911
|
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
=item images |
|
915
|
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
a hashref of images within this package part. Keys of the hash are image I<alternative texts>. |
|
917
|
|
|
|
|
|
|
If present, the alternative I<title> will be preferred; otherwise the alternative I<description> will be taken |
|
918
|
|
|
|
|
|
|
(note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display |
|
919
|
|
|
|
|
|
|
the I<description> field -- see |
|
920
|
|
|
|
|
|
|
L<MsOffice documentation|https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669>). |
|
921
|
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
Images without alternative text will not be accessible through the current Perl module. |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
Values of the hash are zip member names for the corresponding |
|
925
|
|
|
|
|
|
|
image representations in C<.png> format. |
|
926
|
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
=back |
|
929
|
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
=head2 Contents restitution |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
=head3 contents |
|
934
|
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
Returns a Perl string with the current internal XML representation of the part |
|
936
|
|
|
|
|
|
|
contents. |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
=head3 original_contents |
|
939
|
|
|
|
|
|
|
|
|
940
|
|
|
|
|
|
|
Returns a Perl string with the XML representation of the |
|
941
|
|
|
|
|
|
|
part contents, as it was in the ZIP archive before any |
|
942
|
|
|
|
|
|
|
modification. |
|
943
|
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=head3 indented_contents |
|
945
|
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
Returns an indented version of the XML contents, suitable for inspection in a text editor. |
|
947
|
|
|
|
|
|
|
This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded |
|
948
|
|
|
|
|
|
|
byte string, not a Perl string. |
|
949
|
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
=head3 plain_text |
|
951
|
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
Returns the text contents of the part, without any markup. |
|
953
|
|
|
|
|
|
|
Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored. |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
=head3 runs |
|
957
|
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of |
|
959
|
|
|
|
|
|
|
these objects holds an XML fragment; joining all fragments |
|
960
|
|
|
|
|
|
|
restores the complete document. |
|
961
|
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
my $contents = join "", map {$_->as_xml} $self->runs; |
|
963
|
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
=head2 Modifying contents |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
=head3 cleanup_XML |
|
969
|
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
$part->cleanup_XML(%args); |
|
971
|
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
Apply several other methods for removing unnecessary nodes within the internal |
|
973
|
|
|
|
|
|
|
XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>, |
|
974
|
|
|
|
|
|
|
L</suppress_bookmarks> and L</merge_runs>. |
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
Currently there is only one legal arg : |
|
977
|
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
=over |
|
979
|
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
=item C<no_caps> |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically |
|
983
|
|
|
|
|
|
|
called for each run object. As a result, all texts within runs with the C<caps> property are automatically |
|
984
|
|
|
|
|
|
|
converted to uppercase. |
|
985
|
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
=back |
|
987
|
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
=head3 reduce_noise |
|
991
|
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
$part->reduce_noise($regex1, $regex2, ...); |
|
993
|
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
This method is used for removing unnecessary information in the XML |
|
995
|
|
|
|
|
|
|
markup. It applies the given list of regexes to the whole document, |
|
996
|
|
|
|
|
|
|
suppressing matches. The final result is put back into |
|
997
|
|
|
|
|
|
|
C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >> |
|
998
|
|
|
|
|
|
|
references, or as names of builtin regexes (described below). Regexes |
|
999
|
|
|
|
|
|
|
are applied to the whole XML contents, not only to run nodes. |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
=head3 noise_reduction_regex |
|
1003
|
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
my $regex = $part->noise_reduction_regex($regex_name); |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
Returns the builtin regex corresponding to the given name. |
|
1007
|
|
|
|
|
|
|
Known regexes are : |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>), |
|
1010
|
|
|
|
|
|
|
revision_ids => qr(\sw:rsid\w+="[^"]+"), |
|
1011
|
|
|
|
|
|
|
complex_script_bold => qr(<w:bCs/>), |
|
1012
|
|
|
|
|
|
|
page_breaks => qr(<w:lastRenderedPageBreak/>), |
|
1013
|
|
|
|
|
|
|
language => qr(<w:lang w:val="[^/>]+/>), |
|
1014
|
|
|
|
|
|
|
empty_run_props => qr(<w:rPr></w:rPr>), |
|
1015
|
|
|
|
|
|
|
soft_hyphens => qr(<w:softHyphen/>), |
|
1016
|
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
=head3 reduce_all_noises |
|
1018
|
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
$part->reduce_all_noises; |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
Applies all regexes from the previous method. |
|
1022
|
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
=head3 merge_runs |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
$part->merge_runs(no_caps => 1); # optional arg |
|
1027
|
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
Walks through all runs of text within the document, trying to merge |
|
1029
|
|
|
|
|
|
|
adjacent runs when possible (i.e. when both runs have the same |
|
1030
|
|
|
|
|
|
|
properties, and there is no other XML node inbetween). |
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
This operation is a prerequisite before performing replace operations, because |
|
1033
|
|
|
|
|
|
|
documents edited in MsWord often have run boundaries across sentences or |
|
1034
|
|
|
|
|
|
|
even in the middle of words; so regex searches can only be successful if those |
|
1035
|
|
|
|
|
|
|
artificial boundaries have been removed. |
|
1036
|
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
If the argument C<< no_caps => 1 >> is present, the merge operation |
|
1038
|
|
|
|
|
|
|
will also convert runs with the C<w:caps> property, putting all letters |
|
1039
|
|
|
|
|
|
|
into uppercase and removing the property; this makes more merges possible. |
|
1040
|
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
=head3 replace |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, %replacement_args); |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
Replaces all occurrences of C<$pattern> regex within the text nodes by the |
|
1048
|
|
|
|
|
|
|
given C<$replacement>. This is not exactly like a search-replace |
|
1049
|
|
|
|
|
|
|
operation performed within MsWord, because the search does not cross boundaries |
|
1050
|
|
|
|
|
|
|
of text nodes. In order to maximize the chances of successful replacements, |
|
1051
|
|
|
|
|
|
|
the L</cleanup_XML> method is automatically called before starting the operation. |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
The argument C<$pattern> can be either a string or a reference to a regular expression. |
|
1054
|
|
|
|
|
|
|
It should not contain any capturing parentheses, because that would perturb text |
|
1055
|
|
|
|
|
|
|
splitting operations. |
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
The argument C<$replacement> can be either a fixed string, or a reference to |
|
1058
|
|
|
|
|
|
|
a callback subroutine that will be called for each match. |
|
1059
|
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
The C<< %replacement_args >> hash can be used to pass information to the callback |
|
1062
|
|
|
|
|
|
|
subroutine. That hash will be enriched with three entries : |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
=over |
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=item matched |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
The string that has been matched by C<$pattern>. |
|
1069
|
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
=item run |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
The run object in which this text resides. |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
=item xml_before |
|
1075
|
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
The XML fragment (possibly empty) found before the matched text . |
|
1077
|
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
=back |
|
1079
|
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
The callback subroutine may return either plain text or structured XML. |
|
1081
|
|
|
|
|
|
|
See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback. |
|
1082
|
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
The following special keys within C<< %replacement_args >> are interpreted by the |
|
1084
|
|
|
|
|
|
|
C<replace()> method itself, and therefore are not passed to the callback subroutine : |
|
1085
|
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
=over |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
=item keep_xml_as_is |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
if true, no call is made to the L</cleanup_XML> method before performing the replacements |
|
1091
|
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
=item dont_overwrite_contents |
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
if true, the internal XML contents is not modified in place; the new XML after performing |
|
1095
|
|
|
|
|
|
|
replacements is merely returned to the caller. |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
=item cleanup_args |
|
1098
|
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This |
|
1100
|
|
|
|
|
|
|
is typically used as |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
$part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]); |
|
1103
|
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
=back |
|
1105
|
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
=head2 Operations on bookmarks |
|
1108
|
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
=head3 bookmark_boundaries |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
my $boundaries = part->bookmark_boundaries; |
|
1113
|
|
|
|
|
|
|
my ($boundaries, $final_xml) = part->bookmark_boundaries; |
|
1114
|
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
Parses the XML content to discover bookmark boundaries. |
|
1116
|
|
|
|
|
|
|
In scalar context, returns an arrayref of L<MsOffice::Word::Surgeon::BookmarkBoundary> objects. |
|
1117
|
|
|
|
|
|
|
In list context, returns the arrayref followed by a plain string containing the final XML fragment. |
|
1118
|
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
=head3 suppress_bookmarks |
|
1122
|
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
$part->suppress_bookmarks(full_range => [qw/foo bar/], markup_only => qr/^_/); |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
Suppresses bookmarks according to the specified options : |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
=over |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
=item full_range |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
For bookmark names matching this option, the bookmark will be fully |
|
1132
|
|
|
|
|
|
|
suppressed (not only the start and end markers, but also any |
|
1133
|
|
|
|
|
|
|
content inbetween). |
|
1134
|
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=item markup_only |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
For bookmark names matching this option, start and end markers |
|
1139
|
|
|
|
|
|
|
are suppressed, but the inner content remains. |
|
1140
|
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
=back |
|
1142
|
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
Options may be specified as lists of strings, or regexes, or coderefs ... anything suitable |
|
1144
|
|
|
|
|
|
|
to be compared through L<match::simple>. In absence of any options, the default |
|
1145
|
|
|
|
|
|
|
is C<< markup_only => qr/./ >>, meaning that all bookmarks markup is suppressed. |
|
1146
|
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
Removing bookmarks is useful because |
|
1148
|
|
|
|
|
|
|
MsWord may silently insert bookmarks in unexpected places; therefore |
|
1149
|
|
|
|
|
|
|
some searches within the text may fail because of such bookmarks. |
|
1150
|
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
The C<full_range> option is especially convenient for removing bookmarks associated |
|
1152
|
|
|
|
|
|
|
with ASK fields. Such bookmarks contain ranges of text that are |
|
1153
|
|
|
|
|
|
|
never displayed by MsWord. |
|
1154
|
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=head3 reveal_bookmarks |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
$part->reveal_bookmarks(color => 'green'); |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
Usually bookmarks boundaries in MsWord are not visible; the only way to have a visual clue is to turn on |
|
1161
|
|
|
|
|
|
|
an option in |
|
1162
|
|
|
|
|
|
|
L<Advanced / Show document content / Show bookmarks|https://support.microsoft.com/en-gb/office/troubleshoot-bookmarks-9cad566f-913d-49c6-8d37-c21e0e8d6db0> -- but this only displays where bookmarks start and end, without the names of the bookmarks. |
|
1163
|
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
The C<reveal_bookmarks()> method will insert a visible run before each bookmark start and after each bookmark end, showing |
|
1165
|
|
|
|
|
|
|
the bookmark name. This is an interesting tool for documenting where bookmarks are located in an existing document. |
|
1166
|
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
Options to this method are : |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=over |
|
1170
|
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
=item color |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
The highlighting color for visible marks. This should be a valid |
|
1174
|
|
|
|
|
|
|
highlighting color, i.e black, blue, cyan, darkBlue, darkCyan, |
|
1175
|
|
|
|
|
|
|
darkGray, darkGreen, darkMagenta, darkRed, darkYellow, green, |
|
1176
|
|
|
|
|
|
|
lightGray, magenta, none, red, white or yellow. Default is yellow. |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
=item props |
|
1179
|
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
A string in C<sprintf> format for building the XML to be inserted in C<< <w:rPr> >> node |
|
1181
|
|
|
|
|
|
|
when displaying bookmarks marks, i.e. the style for displaying such marks. |
|
1182
|
|
|
|
|
|
|
The default is just a highlighting property : C<< <w:highlight w:val="%s"/> >>. |
|
1183
|
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
=item start |
|
1185
|
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
A string in C<sprintf> format for generating text before a bookmark start. |
|
1187
|
|
|
|
|
|
|
Default is C<< <%s> >>. |
|
1188
|
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
=item end |
|
1190
|
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
A string in C<sprintf> format for generating text after a bookmark end. |
|
1192
|
|
|
|
|
|
|
Default is C<< </%s> >>. |
|
1193
|
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
=item ignore |
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
A regexp for deciding which bookmarks will not be revealed. Default is C<< qr/^_/ >>, |
|
1197
|
|
|
|
|
|
|
because bookmarks with an initial underscore are usually technical bookmarks inserted |
|
1198
|
|
|
|
|
|
|
automatically by MsWord, such as C<_GoBack> or C<_Toc53196147>. |
|
1199
|
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
=back |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
=head2 Operations on fields |
|
1205
|
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
=head3 fields |
|
1207
|
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
my $fields = part->fields; |
|
1209
|
|
|
|
|
|
|
my ($fields, $final_xml) = part->fields; |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
Parses the XML content to discover MsWord fields. |
|
1212
|
|
|
|
|
|
|
In scalar context, returns an arrayref of L<MsOffice::Word::Surgeon::Field> objects. |
|
1213
|
|
|
|
|
|
|
In list context, returns the arrayref followed by a plain string containing the final XML fragment. |
|
1214
|
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
=head3 replace_fields |
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
my $field_replacer = sub {my ($code, $result) = @_; return "...";}; |
|
1220
|
|
|
|
|
|
|
$part->replace_fields($field_replacer); |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
Replaces MsWord fields by the product of the C<< $field_replacer >> callback. |
|
1223
|
|
|
|
|
|
|
The callback receives two arguments : |
|
1224
|
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
=over |
|
1226
|
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
=item C<$code> |
|
1228
|
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
A plain string containing the field's full code instruction, i.e a keyword followed by optional arguments and switches, |
|
1230
|
|
|
|
|
|
|
including initial and final spaces. Embedded fields are represented in curly braces, like for example |
|
1231
|
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
C<< IF { DOCPROPERTY foo } = "bar" "is bar" "is not bar" >>. |
|
1233
|
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
=item C<$result> |
|
1235
|
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
An XML fragment containing the current value for the field. |
|
1237
|
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
=back |
|
1239
|
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
The callback should return an XML fragment suitable to be inserted within an MsWord I<run>. |
|
1241
|
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
=head3 reveal_fields |
|
1244
|
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
$part->reveal_fields; |
|
1246
|
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
Replaces each field with a textual representation of its code instruction, embedded in curly braces. |
|
1248
|
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
=head3 unlink_fields |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
$part->unlink_fields; |
|
1253
|
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
Replaces each field with its current result, i.e removing the code instruction. |
|
1255
|
|
|
|
|
|
|
This is the equivalent of performing Ctrl-Shift-F9 in MsWord on the whole document. |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
=head2 Operations on images |
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
=head3 replace_image |
|
1263
|
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
$part->replace_image($image_alt_text, $image_PNG_content); |
|
1265
|
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
Replaces an existing PNG image by a new image. All features of the old image will |
|
1267
|
|
|
|
|
|
|
be preserved (size, positioning, border, etc.) -- only the image itself will be |
|
1268
|
|
|
|
|
|
|
replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word |
|
1269
|
|
|
|
|
|
|
for this image. |
|
1270
|
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
This operation replaces a ZIP member within the C<.docx> file. If several XML |
|
1272
|
|
|
|
|
|
|
nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several |
|
1273
|
|
|
|
|
|
|
locations, the new image will appear at all locations, even if they do not have the |
|
1274
|
|
|
|
|
|
|
same alternative text -- unfortunately this module currently has no facility for |
|
1275
|
|
|
|
|
|
|
duplicating an existing image into separate instances. So if your intent is to only replace |
|
1276
|
|
|
|
|
|
|
one instance of the image, your original document should contain several distinct copies |
|
1277
|
|
|
|
|
|
|
of the C<.PNG> file. |
|
1278
|
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=head3 add_image |
|
1281
|
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
my $rId = $part->add_image($image_PNG_content); |
|
1283
|
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
Stores the given PNG image within the ZIP file, adds it as a relationship to the |
|
1285
|
|
|
|
|
|
|
current part, and returns the relationship id. This operation is not sufficient |
|
1286
|
|
|
|
|
|
|
to make the image visible in Word : it just stores the image, but you still |
|
1287
|
|
|
|
|
|
|
have to insert a proper C<drawing> node in the contents XML, using the C<$rId>. |
|
1288
|
|
|
|
|
|
|
Future versions of this module may offer helper methods for that purpose; |
|
1289
|
|
|
|
|
|
|
currently it must be done by hand. |
|
1290
|
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
=head1 AUTHOR |
|
1293
|
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
Laurent Dami, E<lt>dami AT cpan DOT org<gt> |
|
1295
|
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
Copyright 2019-2024 by Laurent Dami. |
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
This program is free software, you can redistribute it and/or modify it under the terms of the Artistic License version 2.0. |
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
|