line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Extract::Word;
|
2
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
106837
|
use strict;
|
|
4
|
|
|
|
|
12
|
|
|
4
|
|
|
|
|
168
|
|
4
|
4
|
|
|
4
|
|
22
|
use warnings;
|
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
191
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = 0.02;
|
7
|
|
|
|
|
|
|
|
8
|
4
|
|
|
4
|
|
20
|
use base qw(Exporter);
|
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
677
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @EXPORT_OK = qw(get_all_text);
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
#use Smart::Comments;
|
13
|
|
|
|
|
|
|
|
14
|
4
|
|
|
4
|
|
23
|
use Carp;
|
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
387
|
|
15
|
4
|
|
|
4
|
|
10400
|
use Encode;
|
|
4
|
|
|
|
|
60512
|
|
|
4
|
|
|
|
|
588
|
|
16
|
4
|
|
|
4
|
|
3864
|
use POSIX;
|
|
4
|
|
|
|
|
35835
|
|
|
4
|
|
|
|
|
28
|
|
17
|
4
|
|
|
4
|
|
26931
|
use OLE::Storage_Lite;
|
|
4
|
|
|
|
|
118927
|
|
|
4
|
|
|
|
|
186
|
|
18
|
4
|
|
|
4
|
|
41
|
use IO::File;
|
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
1084
|
|
19
|
4
|
|
|
4
|
|
22
|
use Scalar::Util;
|
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
7677
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub new {
|
22
|
10
|
|
|
10
|
1
|
1214
|
my ($this, @options) = @_;
|
23
|
10
|
|
33
|
|
|
75
|
my $class = ref($this) || $this;
|
24
|
|
|
|
|
|
|
|
25
|
10
|
|
|
|
|
21
|
my $self = { };
|
26
|
10
|
|
|
|
|
24
|
bless $self, $class;
|
27
|
10
|
|
|
|
|
34
|
_initialize($self, @options);
|
28
|
10
|
|
|
|
|
45
|
return $self;
|
29
|
|
|
|
|
|
|
}
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub _initialize {
|
32
|
10
|
|
|
10
|
|
23
|
my ($self, @options) = @_;
|
33
|
10
|
|
|
|
|
19
|
my $value = shift(@options);
|
34
|
10
|
50
|
|
|
|
34
|
if (@options) {
|
35
|
0
|
|
|
|
|
0
|
carp("Ignored additional parameters to constructor");
|
36
|
|
|
|
|
|
|
}
|
37
|
10
|
50
|
|
|
|
475
|
if (Scalar::Util::openhandle($value)) {
|
|
|
50
|
|
|
|
|
|
38
|
0
|
|
|
|
|
0
|
$self->{_fh} = $value;
|
39
|
|
|
|
|
|
|
} elsif (-e $value) {
|
40
|
10
|
|
|
|
|
91
|
my $oIo = IO::File->new();
|
41
|
10
|
50
|
|
|
|
418
|
$oIo->open($value, "<") or croak("Can't open $value: $!");
|
42
|
10
|
|
|
|
|
767
|
binmode($oIo);
|
43
|
10
|
|
|
|
|
48
|
$self->{_fh} = $oIo;
|
44
|
|
|
|
|
|
|
} else {
|
45
|
0
|
|
|
|
|
0
|
croak("Invalid parameter to constructor: $value should be a file handle or file name");
|
46
|
|
|
|
|
|
|
}
|
47
|
10
|
|
|
|
|
32
|
_extract_stream($self);
|
48
|
|
|
|
|
|
|
}
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _compare_ranges {
|
51
|
0
|
|
|
0
|
|
0
|
my ($range1, $range2) = @_;
|
52
|
0
|
|
|
|
|
0
|
return ($range1->[0] <=> $range2->[0]);
|
53
|
|
|
|
|
|
|
}
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub _extract_stream {
|
56
|
10
|
|
|
10
|
|
23
|
my ($self) = @_;
|
57
|
|
|
|
|
|
|
|
58
|
10
|
|
|
|
|
21
|
my $fh = $self->{_fh};
|
59
|
10
|
|
|
|
|
96
|
my $ofs = OLE::Storage_Lite->new($fh);
|
60
|
10
|
|
|
|
|
117
|
my $name = encode("UCS-2LE", "WordDocument");
|
61
|
10
|
|
|
|
|
19170
|
my @pps = $ofs->getPpsSearch([$name], 1, 1);
|
62
|
10
|
50
|
|
|
|
23250
|
croak("This does not seem to be a Word document") unless (@pps);
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# OK, at this stage, we have the word stream. Now we need to start reading from it.
|
65
|
10
|
|
|
|
|
33
|
my $data = $pps[0]->{Data};
|
66
|
10
|
|
|
|
|
34
|
$self->{_data} = $data;
|
67
|
|
|
|
|
|
|
|
68
|
10
|
|
|
|
|
35
|
my $magic = unpack("v", substr($data, 0x0000, 2));
|
69
|
10
|
50
|
|
|
|
66
|
croak(sprintf("This does not seem to be a Word document, but it is pretending to be one: %x", $magic)) unless ($magic == 0xa5ec);
|
70
|
|
|
|
|
|
|
|
71
|
10
|
|
|
|
|
26
|
my $flags = unpack("v", substr($data, 0x000A, 2));
|
72
|
10
|
50
|
|
|
|
96
|
my $table = ($flags & 0x0200) ? "1Table" : "0Table";
|
73
|
10
|
|
|
|
|
41
|
$table = encode("UCS-2LE", $table);
|
74
|
|
|
|
|
|
|
|
75
|
10
|
|
|
|
|
289
|
@pps = $ofs->getPpsSearch([$table], 1, 1);
|
76
|
10
|
50
|
|
|
|
20899
|
confess("Internal error: could not locate table stream") unless (@pps);
|
77
|
|
|
|
|
|
|
|
78
|
10
|
|
|
|
|
28
|
$table = $pps[0]->{Data};
|
79
|
10
|
|
|
|
|
25
|
$self->{_table} = $table;
|
80
|
|
|
|
|
|
|
|
81
|
10
|
|
|
|
|
27
|
my $fcMin = unpack("V", substr($data, 0x0018, 4));
|
82
|
10
|
|
|
|
|
21
|
my $ccpText = unpack("V", substr($data, 0x004c, 4));
|
83
|
10
|
|
|
|
|
23
|
my $ccpFtn = unpack("V", substr($data, 0x0050, 4));
|
84
|
10
|
|
|
|
|
22
|
my $ccpHdd = unpack("V", substr($data, 0x0054, 4));
|
85
|
10
|
|
|
|
|
27
|
my $ccpAtn = unpack("V", substr($data, 0x005c, 4));
|
86
|
|
|
|
|
|
|
|
87
|
10
|
|
|
|
|
20
|
$self->{_fcMin} = $fcMin;
|
88
|
10
|
|
|
|
|
22
|
$self->{_ccpText} = $ccpText;
|
89
|
10
|
|
|
|
|
16
|
$self->{_ccpFtn} = $ccpFtn;
|
90
|
10
|
|
|
|
|
19
|
$self->{_ccpHdd} = $ccpHdd;
|
91
|
10
|
|
|
|
|
27
|
$self->{_ccpAtn} = $ccpAtn;
|
92
|
|
|
|
|
|
|
|
93
|
10
|
|
|
|
|
22
|
my $charPLC = unpack("V", substr($data, 0x00fa, 4));
|
94
|
10
|
|
|
|
|
23
|
my $charPlcSize = unpack("V", substr($data, 0x00fe, 4));
|
95
|
10
|
|
|
|
|
21
|
my $parPLC = unpack("V", substr($data, 0x0102, 4));
|
96
|
10
|
|
|
|
|
20
|
my $parPlcSize = unpack("V", substr($data, 0x0106, 4));
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# get the location of the piece table
|
99
|
10
|
|
|
|
|
23
|
my $complexOffset = unpack("V", substr($data, 0x01a2, 4));
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
### fcMin: $fcMin
|
102
|
|
|
|
|
|
|
### ccpText: $ccpText
|
103
|
|
|
|
|
|
|
### ccpFtn: $ccpFtn
|
104
|
|
|
|
|
|
|
### ccpHdd: $ccpHdd
|
105
|
|
|
|
|
|
|
### ccpAtn: $ccpAtn
|
106
|
|
|
|
|
|
|
### end: $ccpText + $ccpFtn + $ccpHdd + $ccpAtn
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# Read character positioning data positions
|
109
|
10
|
|
|
|
|
32
|
my $fcPlcfBteChpx = unpack("V", substr($data, 0x0fa, 4));
|
110
|
10
|
|
|
|
|
20
|
my $lcbPlcfBteChpx = unpack("V", substr($data, 0x0fe, 4));
|
111
|
10
|
|
|
|
|
23
|
$self->{_fcPlcfBteChpx} = $fcPlcfBteChpx;
|
112
|
10
|
|
|
|
|
22
|
$self->{_lcbPlcfBteChpx} = $lcbPlcfBteChpx;
|
113
|
|
|
|
|
|
|
|
114
|
10
|
|
|
|
|
35
|
_get_bookmarks($self);
|
115
|
|
|
|
|
|
|
|
116
|
10
|
|
|
|
|
36
|
my @pieces = _find_text(\$table, $complexOffset);
|
117
|
10
|
|
|
|
|
29
|
@pieces = sort { $a->{start} <=> $b->{start} } @pieces;
|
|
12
|
|
|
|
|
25
|
|
118
|
|
|
|
|
|
|
|
119
|
10
|
|
|
|
|
34
|
_get_text(\$data, \@pieces);
|
120
|
|
|
|
|
|
|
|
121
|
10
|
|
|
|
|
109
|
$self->{_pieces} = \@pieces;
|
122
|
|
|
|
|
|
|
}
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
sub _get_bookmarks {
|
125
|
10
|
|
|
10
|
|
15
|
my ($self) = @_;
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# Now to look for bookmark information
|
128
|
10
|
|
|
|
|
65
|
my $fcSttbfBkmk = unpack("V", substr($self->{_data}, 0x0142, 4));
|
129
|
10
|
|
|
|
|
24
|
my $lcbSttbfBkmk = unpack("V", substr($self->{_data}, 0x0146, 4));
|
130
|
10
|
|
|
|
|
23
|
my $fcPlcfBkf = unpack("V", substr($self->{_data}, 0x014a, 4));
|
131
|
10
|
|
|
|
|
23
|
my $lcbPlcfBkf = unpack("V", substr($self->{_data}, 0x014e, 4));
|
132
|
10
|
|
|
|
|
21
|
my $fcPlcfBkl = unpack("V", substr($self->{_data}, 0x0152, 4));
|
133
|
10
|
|
|
|
|
22
|
my $lcbPlcfBkl = unpack("V", substr($self->{_data}, 0x0156, 4));
|
134
|
|
|
|
|
|
|
### fcSttbfBkmk: $fcSttbfBkmk
|
135
|
|
|
|
|
|
|
### lcbSttbfBkmk: $lcbSttbfBkmk
|
136
|
|
|
|
|
|
|
### fcPlcfBkf: $fcPlcfBkf
|
137
|
|
|
|
|
|
|
### lcbPlcfBkf: $lcbPlcfBkf
|
138
|
|
|
|
|
|
|
### fcPlcfBkl: $fcPlcfBkl
|
139
|
|
|
|
|
|
|
### lcbPlcfBkl: $lcbPlcfBkl
|
140
|
|
|
|
|
|
|
|
141
|
10
|
100
|
|
|
|
32
|
return if ($lcbSttbfBkmk == 0);
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Read the bookmark name block
|
144
|
2
|
|
|
|
|
10
|
my $sttbfBkmk = substr($self->{_table}, $fcSttbfBkmk, $lcbSttbfBkmk);
|
145
|
2
|
|
|
|
|
5
|
my $plcfBkf = substr($self->{_table}, $fcPlcfBkf, $lcbPlcfBkf);
|
146
|
2
|
|
|
|
|
5
|
my $plcfBkl = substr($self->{_table}, $fcPlcfBkl, $lcbPlcfBkl);
|
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Now we can read the bookmark names
|
149
|
|
|
|
|
|
|
|
150
|
2
|
|
|
|
|
13
|
my $fcExtend = unpack("v", substr($sttbfBkmk, 0, 2));
|
151
|
2
|
|
|
|
|
3
|
my $cData = unpack("v", substr($sttbfBkmk, 2, 2));
|
152
|
2
|
|
|
|
|
4
|
my $cbExtra = unpack("v", substr($sttbfBkmk, 4, 2));
|
153
|
2
|
50
|
|
|
|
6
|
confess("Internal error: unexpected single-byte bookmark data") unless ($fcExtend == 0xffff);
|
154
|
|
|
|
|
|
|
|
155
|
2
|
|
|
|
|
2
|
my $offset = 6;
|
156
|
2
|
|
|
|
|
3
|
my $index = 0;
|
157
|
2
|
|
|
|
|
5
|
my %bookmarks = ();
|
158
|
2
|
|
|
|
|
6
|
while($offset < $lcbSttbfBkmk) {
|
159
|
70
|
|
|
|
|
111
|
my $length = unpack("v", substr($sttbfBkmk, $offset, 2));
|
160
|
70
|
|
|
|
|
66
|
$length = $length * 2;
|
161
|
70
|
|
|
|
|
106
|
my $string = substr($sttbfBkmk, $offset + 2, $length);
|
162
|
70
|
|
|
|
|
132
|
my $cpStart = unpack("V", substr($plcfBkf, $index * 4, 4));
|
163
|
70
|
|
|
|
|
97
|
my $cpEnd = unpack("V", substr($plcfBkl, $index * 4, 4));
|
164
|
70
|
|
|
|
|
149
|
$string = Encode::decode("UCS-2LE", $string);
|
165
|
|
|
|
|
|
|
### field name: $string
|
166
|
|
|
|
|
|
|
### position: $cpStart
|
167
|
|
|
|
|
|
|
### position: $cpEnd
|
168
|
70
|
|
|
|
|
1321
|
$bookmarks{$string} = {start => $cpStart, end => $cpEnd};
|
169
|
70
|
|
|
|
|
234
|
$offset += $length + 2;
|
170
|
70
|
|
|
|
|
143
|
$index++;
|
171
|
|
|
|
|
|
|
}
|
172
|
|
|
|
|
|
|
|
173
|
2
|
|
|
|
|
8
|
$self->{_bookmarks} = \%bookmarks;
|
174
|
|
|
|
|
|
|
}
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub _get_piece {
|
177
|
17
|
|
|
17
|
|
34
|
my ($dataref, $piece) = @_;
|
178
|
|
|
|
|
|
|
|
179
|
17
|
|
|
|
|
35
|
my $pstart = $piece->{start};
|
180
|
17
|
|
|
|
|
24
|
my $ptotLength = $piece->{totLength};
|
181
|
17
|
|
|
|
|
44
|
my $pfilePos = $piece->{filePos};
|
182
|
17
|
|
|
|
|
21
|
my $punicode = $piece->{unicode};
|
183
|
|
|
|
|
|
|
|
184
|
17
|
|
|
|
|
24
|
my $pend = $pstart + $ptotLength;
|
185
|
17
|
|
|
|
|
23
|
my $textStart = $pfilePos;
|
186
|
17
|
|
|
|
|
23
|
my $textEnd = $textStart + ($pend - $pstart);
|
187
|
|
|
|
|
|
|
|
188
|
17
|
100
|
|
|
|
40
|
if ($punicode) {
|
189
|
|
|
|
|
|
|
### Adding ucs2 text...
|
190
|
|
|
|
|
|
|
### Start: $textStart
|
191
|
|
|
|
|
|
|
### End: $textEnd
|
192
|
|
|
|
|
|
|
### Length: $textEnd - $textStart
|
193
|
|
|
|
|
|
|
### Bytes: $ptotLength
|
194
|
8
|
|
|
|
|
20
|
$piece->{text} = _add_unicode_text($textStart, $textEnd, $dataref);
|
195
|
8
|
|
|
|
|
15
|
return;
|
196
|
|
|
|
|
|
|
} else {
|
197
|
|
|
|
|
|
|
### Adding iso8869 text...
|
198
|
|
|
|
|
|
|
### Start: $textStart
|
199
|
|
|
|
|
|
|
### End: $textEnd
|
200
|
|
|
|
|
|
|
### Length: $textEnd - $textStart
|
201
|
|
|
|
|
|
|
### Bytes: $ptotLength
|
202
|
9
|
|
|
|
|
31
|
$piece->{text} = _add_text($textStart, $textEnd, $dataref);
|
203
|
9
|
|
|
|
|
21
|
return;
|
204
|
|
|
|
|
|
|
}
|
205
|
|
|
|
|
|
|
}
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub _get_text {
|
208
|
10
|
|
|
10
|
|
18
|
my ($dataref, $piecesref) = @_;
|
209
|
|
|
|
|
|
|
|
210
|
10
|
|
|
|
|
24
|
my @pieces = @$piecesref;
|
211
|
10
|
|
|
|
|
19
|
my @result = ();
|
212
|
10
|
|
|
|
|
17
|
my $index = 1;
|
213
|
10
|
|
|
|
|
12
|
my $position = 0;
|
214
|
|
|
|
|
|
|
|
215
|
10
|
|
|
|
|
22
|
foreach my $piece (@pieces) {
|
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
### piece: $index++
|
218
|
|
|
|
|
|
|
### position: $position
|
219
|
17
|
|
|
|
|
37
|
$piece->{position} = $position;
|
220
|
|
|
|
|
|
|
|
221
|
17
|
|
|
|
|
51
|
_get_piece($dataref, $piece);
|
222
|
17
|
|
|
|
|
27
|
my $segment = $piece->{text};
|
223
|
17
|
|
|
|
|
32
|
push @result, $segment;
|
224
|
17
|
|
|
|
|
267
|
my $length = length($segment);
|
225
|
17
|
|
|
|
|
31
|
$piece->{length} = $length;
|
226
|
17
|
|
|
|
|
47
|
$piece->{endPosition} = $position + $length;
|
227
|
|
|
|
|
|
|
|
228
|
17
|
|
|
|
|
58
|
$position += $length;
|
229
|
|
|
|
|
|
|
}
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
### End position: $position
|
232
|
10
|
|
|
|
|
25
|
return;
|
233
|
|
|
|
|
|
|
}
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub _add_unicode_text {
|
236
|
8
|
|
|
8
|
|
12
|
my ($textStart, $textEnd, $dataref) = @_;
|
237
|
|
|
|
|
|
|
|
238
|
8
|
|
|
|
|
23
|
my $string = substr($$dataref, $textStart, 2*($textEnd - $textStart));
|
239
|
|
|
|
|
|
|
|
240
|
8
|
|
|
|
|
28
|
my $perl_string = Encode::decode("UCS-2LE", $string);
|
241
|
8
|
|
|
|
|
224
|
return $perl_string;
|
242
|
|
|
|
|
|
|
}
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
sub _add_text {
|
245
|
9
|
|
|
9
|
|
15
|
my ($textStart, $textEnd, $dataref) = @_;
|
246
|
|
|
|
|
|
|
|
247
|
9
|
|
|
|
|
107
|
my $string = substr($$dataref, $textStart, $textEnd - $textStart);
|
248
|
|
|
|
|
|
|
|
249
|
9
|
|
|
|
|
35
|
my $perl_string = Encode::decode("iso-8859-1", $string);
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
# See the conversion table for FcCompressed structures. Note that these
|
252
|
|
|
|
|
|
|
# should not affect positions, as these are characters now, not bytes
|
253
|
4
|
|
|
4
|
|
4261
|
$perl_string =~ tr[\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9f][\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{0178}];
|
|
4
|
|
|
|
|
39
|
|
|
4
|
|
|
|
|
53
|
|
|
9
|
|
|
|
|
2427
|
|
254
|
|
|
|
|
|
|
|
255
|
9
|
|
|
|
|
38
|
return $perl_string;
|
256
|
|
|
|
|
|
|
}
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub _get_chunks {
|
259
|
0
|
|
|
0
|
|
0
|
my ($start, $length, $piecesref) = @_;
|
260
|
0
|
|
|
|
|
0
|
my @result = ();
|
261
|
0
|
|
|
|
|
0
|
my $end = $start + $length;
|
262
|
|
|
|
|
|
|
|
263
|
0
|
|
|
|
|
0
|
foreach my $piece (@$piecesref) {
|
264
|
0
|
|
|
|
|
0
|
my ($pstart, $ptotLength, $pfilePos, $punicode) = @$piece;
|
265
|
0
|
|
|
|
|
0
|
my $pend = $pstart + $ptotLength;
|
266
|
0
|
0
|
|
|
|
0
|
if ($pstart < $end) {
|
267
|
0
|
0
|
|
|
|
0
|
if ($start < $pend) {
|
268
|
0
|
|
|
|
|
0
|
push @result, $piece;
|
269
|
|
|
|
|
|
|
}
|
270
|
|
|
|
|
|
|
} else {
|
271
|
0
|
|
|
|
|
0
|
last;
|
272
|
|
|
|
|
|
|
}
|
273
|
|
|
|
|
|
|
}
|
274
|
|
|
|
|
|
|
|
275
|
0
|
|
|
|
|
0
|
return @result;
|
276
|
|
|
|
|
|
|
}
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
sub _find_text {
|
279
|
10
|
|
|
10
|
|
18
|
my ($tableref, $pos) = @_;
|
280
|
|
|
|
|
|
|
|
281
|
10
|
|
|
|
|
16
|
my @pieces = ();
|
282
|
|
|
|
|
|
|
|
283
|
10
|
|
|
|
|
41
|
while(unpack("C", substr($$tableref, $pos, 1)) == 1) {
|
284
|
0
|
|
|
|
|
0
|
$pos++;
|
285
|
0
|
|
|
|
|
0
|
my $skip = unpack("v", substr($$tableref, $pos, 2));
|
286
|
|
|
|
|
|
|
# print STDERR sprintf("Skipping %d\n", $skip);
|
287
|
0
|
|
|
|
|
0
|
$pos += 2 + $skip;
|
288
|
|
|
|
|
|
|
}
|
289
|
|
|
|
|
|
|
|
290
|
10
|
50
|
|
|
|
35
|
if (unpack("C", substr($$tableref, $pos, 1)) != 2) {
|
291
|
0
|
|
|
|
|
0
|
confess("Internal error: ccorrupted Word file");
|
292
|
|
|
|
|
|
|
} else {
|
293
|
10
|
|
|
|
|
28
|
my $pieceTableSize = unpack("V", substr($$tableref, ++$pos, 4));
|
294
|
|
|
|
|
|
|
# print STDERR sprintf("pieceTableSize: %d\n", $pieceTableSize);
|
295
|
|
|
|
|
|
|
|
296
|
10
|
|
|
|
|
15
|
$pos += 4;
|
297
|
10
|
|
|
|
|
21
|
my $pieces = ($pieceTableSize - 4) / 12;
|
298
|
|
|
|
|
|
|
# print STDERR sprintf("pieces: %d\n", $pieces);
|
299
|
10
|
|
|
|
|
15
|
my $start = 0;
|
300
|
|
|
|
|
|
|
|
301
|
10
|
|
|
|
|
38
|
for (my $x = 0; $x < $pieces; $x++) {
|
302
|
17
|
|
|
|
|
53
|
my $filePos = unpack("V", substr($$tableref, $pos + (($pieces + 1) * 4) + ($x * 8) + 2, 4));
|
303
|
17
|
|
|
|
|
23
|
my $unicode = 0;
|
304
|
17
|
100
|
|
|
|
36
|
if (($filePos & 0x40000000) == 0) {
|
305
|
8
|
|
|
|
|
12
|
$unicode = 1;
|
306
|
|
|
|
|
|
|
} else {
|
307
|
9
|
|
|
|
|
12
|
$unicode = 0;
|
308
|
9
|
|
|
|
|
19
|
$filePos &= ~(0x40000000); #gives me FC in doc stream
|
309
|
9
|
|
|
|
|
14
|
$filePos /= 2;
|
310
|
|
|
|
|
|
|
}
|
311
|
|
|
|
|
|
|
# print STDERR sprintf("filePos: %x\n", $filePos);
|
312
|
17
|
|
|
|
|
36
|
my $lStart = unpack("V", substr($$tableref, $pos + ($x * 4), 4));
|
313
|
17
|
|
|
|
|
36
|
my $lEnd = unpack("V", substr($$tableref, $pos + (($x + 1) * 4), 4));
|
314
|
17
|
|
|
|
|
25
|
my $totLength = $lEnd - $lStart;
|
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# print STDERR "lStart: $lStart; lEnd: $lEnd\n";
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# print STDERR ("Piece: " . (1 + $x) . ", start=" . $start
|
319
|
|
|
|
|
|
|
# . ", len=" . $totLength . ", phys=" . $filePos
|
320
|
|
|
|
|
|
|
# . ", uni=" .$unicode . "\n");
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# TextPiece piece = new TextPiece(start, totLength, filePos, unicode);
|
323
|
|
|
|
|
|
|
# start = start + totLength;
|
324
|
|
|
|
|
|
|
# text.add(piece);
|
325
|
|
|
|
|
|
|
|
326
|
17
|
|
|
|
|
74
|
push @pieces, {start => $start,
|
327
|
|
|
|
|
|
|
totLength => $totLength,
|
328
|
|
|
|
|
|
|
filePos => $filePos,
|
329
|
|
|
|
|
|
|
unicode => $unicode};
|
330
|
17
|
100
|
|
|
|
68
|
$start = $start + (($unicode) ? $totLength/2 : $totLength);
|
331
|
|
|
|
|
|
|
}
|
332
|
|
|
|
|
|
|
}
|
333
|
10
|
|
|
|
|
32
|
return @pieces;
|
334
|
|
|
|
|
|
|
}
|
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub _get_piece_index {
|
337
|
172
|
|
|
172
|
|
174
|
my ($self, $position) = @_;
|
338
|
172
|
50
|
|
|
|
317
|
confess("Internal error: invalid position") if (! defined($position));
|
339
|
172
|
|
|
|
|
297
|
my $index = 0;
|
340
|
172
|
|
|
|
|
175
|
foreach my $piece (@{$self->{_pieces}}) {
|
|
172
|
|
|
|
|
326
|
|
341
|
805
|
100
|
|
|
|
1400
|
return $index if ($position <= $piece->{endPosition});
|
342
|
633
|
|
|
|
|
565
|
$index++;
|
343
|
|
|
|
|
|
|
}
|
344
|
|
|
|
|
|
|
}
|
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
sub _get_text_range {
|
347
|
86
|
|
|
86
|
|
100
|
my ($self, $start, $end) = @_;
|
348
|
|
|
|
|
|
|
|
349
|
86
|
|
|
|
|
110
|
my $pieces = $self->{_pieces};
|
350
|
86
|
|
|
|
|
151
|
my $start_piece = _get_piece_index($self, $start);
|
351
|
86
|
|
|
|
|
143
|
my $end_piece = _get_piece_index($self, $end);
|
352
|
86
|
|
|
|
|
123
|
my @result = ();
|
353
|
86
|
|
|
|
|
184
|
for(my $i = $start_piece; $i <= $end_piece; $i++) {
|
354
|
101
|
|
|
|
|
116
|
my $piece = $pieces->[$i];
|
355
|
101
|
100
|
|
|
|
196
|
my $xstart = ($i == $start_piece) ? $start - $piece->{position} : 0;
|
356
|
101
|
100
|
|
|
|
150
|
my $xend = ($i == $end_piece) ? $end - $piece->{position} : $piece->{endPosition};
|
357
|
101
|
|
|
|
|
683
|
push @result, substr($piece->{text}, $xstart, $xend - $xstart);
|
358
|
|
|
|
|
|
|
}
|
359
|
|
|
|
|
|
|
|
360
|
86
|
|
|
|
|
500
|
return join("", @result);
|
361
|
|
|
|
|
|
|
}
|
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
sub get_bookmarks {
|
364
|
10
|
|
|
10
|
1
|
1816
|
my ($self, $filter) = @_;
|
365
|
10
|
|
|
|
|
21
|
my $bookmarks = $self->{_bookmarks};
|
366
|
10
|
|
|
|
|
93
|
my @bookmark_names = sort keys %$bookmarks;
|
367
|
10
|
|
|
|
|
25
|
foreach my $name (@bookmark_names) {
|
368
|
72
|
|
|
|
|
127
|
my $bookmark = $bookmarks->{$name};
|
369
|
72
|
100
|
|
|
|
140
|
next if (exists($bookmark->{value}));
|
370
|
70
|
|
|
|
|
79
|
my $start = $bookmark->{start};
|
371
|
70
|
|
|
|
|
74
|
my $end = $bookmark->{end};
|
372
|
70
|
|
|
|
|
134
|
my $value = _get_text_range($self, $start - 1, $end);
|
373
|
70
|
100
|
|
|
|
174
|
if (substr($value, 0, 1) ne chr(19)) {
|
374
|
1
|
|
|
|
|
4
|
$value = substr($value, 1);
|
375
|
|
|
|
|
|
|
}
|
376
|
70
|
|
|
|
|
164
|
$bookmark->{value} = $value;
|
377
|
|
|
|
|
|
|
### name: $name
|
378
|
|
|
|
|
|
|
### value: $value
|
379
|
|
|
|
|
|
|
}
|
380
|
|
|
|
|
|
|
|
381
|
10
|
|
|
|
|
25
|
return { map { ($_ => _filter($bookmarks->{$_}->{value}, $filter) ) } @bookmark_names };
|
|
72
|
|
|
|
|
193
|
|
382
|
|
|
|
|
|
|
}
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
sub get_body {
|
385
|
3
|
|
|
3
|
1
|
14
|
my ($self, $filter) = @_;
|
386
|
3
|
|
|
|
|
8
|
my $start = 0;
|
387
|
3
|
|
|
|
|
13
|
return _filter(_get_text_range($self, $start, $start + $self->{_ccpText}), $filter);
|
388
|
|
|
|
|
|
|
}
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub get_footnotes {
|
391
|
2
|
|
|
2
|
1
|
5
|
my ($self, $filter) = @_;
|
392
|
2
|
|
|
|
|
4
|
my $start = $self->{_ccpText};
|
393
|
2
|
|
|
|
|
6
|
return _filter(_get_text_range($self, $start, $start + $self->{_ccpFtn}), $filter);
|
394
|
|
|
|
|
|
|
}
|
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
sub get_headers {
|
397
|
2
|
|
|
2
|
1
|
8
|
my ($self, $filter) = @_;
|
398
|
2
|
|
|
|
|
5
|
my $start = $self->{_ccpText} + $self->{_ccpFtn};
|
399
|
2
|
|
|
|
|
7
|
return _filter(_get_text_range($self, $start, $start + $self->{_ccpHdd}), $filter);
|
400
|
|
|
|
|
|
|
}
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
sub get_annotations {
|
403
|
2
|
|
|
2
|
1
|
4
|
my ($self, $filter) = @_;
|
404
|
2
|
|
|
|
|
4
|
my $start = $self->{_ccpText} + $self->{_ccpFtn} + $self->{_ccpHdd};
|
405
|
2
|
|
|
|
|
10
|
return _filter(_get_text_range($self, $start, $start + $self->{_ccpAtn}), $filter);
|
406
|
|
|
|
|
|
|
}
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
sub get_text {
|
409
|
1
|
|
|
1
|
1
|
2
|
my ($self, $filter) = @_;
|
410
|
1
|
|
|
|
|
3
|
return $self->get_body($filter) .
|
411
|
|
|
|
|
|
|
$self->get_footnotes($filter) .
|
412
|
|
|
|
|
|
|
$self->get_headers($filter) .
|
413
|
|
|
|
|
|
|
$self->get_annotations($filter);
|
414
|
|
|
|
|
|
|
}
|
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub _filter {
|
417
|
81
|
|
|
81
|
|
107
|
my ($text, $filter) = @_;
|
418
|
81
|
100
|
|
|
|
125
|
if (! defined($filter)) {
|
|
|
50
|
|
|
|
|
|
419
|
79
|
|
|
|
|
486
|
$text =~ tr/\x02\x05\x08//d;
|
420
|
79
|
|
|
|
|
505
|
$text =~ tr/\x{2018}\x{2019}\x{201c}\x{201d}\x{0007}\x{000d}\x{2002}\x{2003}\x{2012}\x{2013}\x{2014}/''""\t\n \-\-\-/;
|
421
|
79
|
|
|
|
|
546
|
$text =~ s{\cS(?:[^\cT]*\cT)([^\cU]*)\cU}{$1}g;
|
422
|
79
|
|
|
|
|
319
|
$text =~ s{\cS(?:[^\cU]*\cU)}{}g;
|
423
|
79
|
|
|
|
|
343
|
$text =~ s{[\cJ\cM]}{\n}g;
|
424
|
|
|
|
|
|
|
} elsif ($filter eq ':raw') {
|
425
|
|
|
|
|
|
|
# Do nothing
|
426
|
|
|
|
|
|
|
} else {
|
427
|
0
|
|
|
|
|
0
|
croak("Invalid filter type: $filter");
|
428
|
|
|
|
|
|
|
}
|
429
|
81
|
|
|
|
|
256
|
return $text;
|
430
|
|
|
|
|
|
|
}
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
sub get_all_text {
|
433
|
7
|
|
|
7
|
1
|
13199
|
my ($file) = @_;
|
434
|
|
|
|
|
|
|
|
435
|
7
|
|
|
|
|
54
|
my $instance = __PACKAGE__->new($file);
|
436
|
|
|
|
|
|
|
|
437
|
7
|
|
|
|
|
28
|
$instance->get_bookmarks();
|
438
|
7
|
|
|
|
|
38
|
return _get_text_range($instance, 0, $instance->{_ccpText} +
|
439
|
|
|
|
|
|
|
$instance->{_ccpFtn} +
|
440
|
|
|
|
|
|
|
$instance->{_ccpHdd} +
|
441
|
|
|
|
|
|
|
$instance->{_ccpAtn});
|
442
|
|
|
|
|
|
|
}
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
1;
|
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 NAME
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Text::Extract::Word - Extract text from Word files
|
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# object-based interface
|
453
|
|
|
|
|
|
|
use Text::Extract::Word;
|
454
|
|
|
|
|
|
|
my $file = Text::Extract::Word->new("test1.doc");
|
455
|
|
|
|
|
|
|
my $text = $file->get_text();
|
456
|
|
|
|
|
|
|
my $body = $file->get_body();
|
457
|
|
|
|
|
|
|
my $footnotes = $file->get_footnotes();
|
458
|
|
|
|
|
|
|
my $headers = $file->get_headers();
|
459
|
|
|
|
|
|
|
my $annotations = $file->get_annotations();
|
460
|
|
|
|
|
|
|
my $bookmarks = $file->get_bookmarks();
|
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# specify :raw if you don't want the text cleaned
|
463
|
|
|
|
|
|
|
my $raw = $file->get_text(':raw');
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# legacy interface
|
466
|
|
|
|
|
|
|
use Text::Extract::Word qw(get_all_text);
|
467
|
|
|
|
|
|
|
my $text = get_all_text("test1.doc");
|
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
This simple module allows the textual contents to be extracted from a Word file.
|
472
|
|
|
|
|
|
|
The code was ported from Java code, originally part of the Apache POE project, but
|
473
|
|
|
|
|
|
|
extensive code changes were made internally.
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head1 OBJECT-BASED INTERFACE
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=head2 Text::Extract::Word->new($input);
|
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Passed either a file name or an open file handle, this constructor returns an
|
480
|
|
|
|
|
|
|
instance that can be used to query the file contents.
|
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=head1 METHODS
|
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
All the query methods accept an optional filter argument that can take the value
|
485
|
|
|
|
|
|
|
':raw' -- if this is passed the original Word file contents will be returned without
|
486
|
|
|
|
|
|
|
any attempt to clean the text.
|
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
The default filter attempts to remove Word internal characters used to identify
|
489
|
|
|
|
|
|
|
fields (including field instructions), and translate common Unicode 'fancy' quotes
|
490
|
|
|
|
|
|
|
into more conventional ISO-8859-1 equivalents, for ease of processing. Table cell
|
491
|
|
|
|
|
|
|
markers are also translated into tabs, and paragraph marks into Perl newlines.
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head2 get_body([$filter])
|
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
Returns the text for the main body of the Word document. This excludes headers,
|
496
|
|
|
|
|
|
|
footers, and annotations.
|
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=head2 get_headers([$filter])
|
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
Returns the header and footer texts for the Word document, as a single scalar
|
501
|
|
|
|
|
|
|
string.
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=head2 get_footnotes([$filter])
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
Returns the footnote and endnode texts for the Word document, as a single scalar
|
506
|
|
|
|
|
|
|
string.
|
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head2 get_annotations([$filter])
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Returns the annotation texts for the Word document, as a single scalar
|
511
|
|
|
|
|
|
|
string.
|
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
=head2 get_text([$filter])
|
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
Returns the concatenated text from the body, headers, footnotes, and annotations
|
516
|
|
|
|
|
|
|
of the the Word document, as a single scalar string.
|
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
=head2 get_bookmarks([$filter])
|
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
Returns the bookmark texts for the Word document, as a hash reference. The keys
|
521
|
|
|
|
|
|
|
in the hash are the bookmark names (Word requires that these are unique) and
|
522
|
|
|
|
|
|
|
the values are the filtered bookmark texts.
|
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
This method can be used to get Word form text data out of a Word file. All text fields
|
525
|
|
|
|
|
|
|
in a Word form will normally be labelled as bookmarks, and will be returned by this
|
526
|
|
|
|
|
|
|
method. Non-textual form fields (including drop-downs) will not be returned, as these
|
527
|
|
|
|
|
|
|
are not labelled as bookmarks.
|
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=head1 FUNCTIONS
|
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=head2 get_all_text($filename)
|
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
The only function exportable by this module, when called on a file name, returns the
|
534
|
|
|
|
|
|
|
raw text contents of the Word file. The contents are returned as UTF-8 encoded text.
|
535
|
|
|
|
|
|
|
This is unfiltered, for compatibility with previous versions of the module.
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head1 TODO
|
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=over 4
|
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=item *
|
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
handle non-textual form fields
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=back
|
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=head1 BUGS
|
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=over 4
|
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=item *
|
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
support for legacy Word - the module does not extract text from Word version 6 or earlier
|
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=back
|
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=head1 SEE ALSO
|
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
L also has a script C (Let's Have a Look at Word) which extracts
|
560
|
|
|
|
|
|
|
text from Word files. This is simply a much smaller module with lighter dependencies,
|
561
|
|
|
|
|
|
|
using L for its storage management.
|
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=head1 AUTHOR
|
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Stuart Watt, stuart@morungos.com
|
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=head1 COPYRIGHT
|
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
Copyright (c) 2010 Stuart Watt. All rights reserved.
|
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=cut
|
572
|
|
|
|
|
|
|
|