line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Text::FromAny |
2
|
|
|
|
|
|
|
# A module to read pure text from a vareiety of formats |
3
|
|
|
|
|
|
|
# Copyright Eskild Hustvedt 2010 |
4
|
|
|
|
|
|
|
# for Portu Media & Communications |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# This library is free software; you can redistribute it and/or modify |
7
|
|
|
|
|
|
|
# it under the terms of either: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# a) the GNU General Public License as published by the Free |
10
|
|
|
|
|
|
|
# Software Foundation; either version 3, or (at your option) any |
11
|
|
|
|
|
|
|
# later version, or |
12
|
|
|
|
|
|
|
# b) the "Artistic License" which comes with this Kit. |
13
|
|
|
|
|
|
|
# |
14
|
|
|
|
|
|
|
# This library is distributed in the hope that it will be useful, |
15
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
16
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either |
17
|
|
|
|
|
|
|
# the GNU General Public License or the Artistic License for more details. |
18
|
|
|
|
|
|
|
# |
19
|
|
|
|
|
|
|
# You should have received a copy of the Artistic License |
20
|
|
|
|
|
|
|
# in the file named "COPYING.artistic". If not, I'll be glad to provide one. |
21
|
|
|
|
|
|
|
# |
22
|
|
|
|
|
|
|
# You should also have received a copy of the GNU General Public License |
23
|
|
|
|
|
|
|
# along with this library in the file named "COPYING.gpl". If not, |
24
|
|
|
|
|
|
|
# see . |
25
|
|
|
|
|
|
|
package Text::FromAny; |
26
|
2
|
|
|
2
|
|
4066
|
use Any::Moose; |
|
2
|
|
|
|
|
114500
|
|
|
2
|
|
|
|
|
13
|
|
27
|
2
|
|
|
2
|
|
1064
|
use Carp qw(carp croak); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
136
|
|
28
|
2
|
|
|
2
|
|
2784
|
use Try::Tiny; |
|
2
|
|
|
|
|
1779
|
|
|
2
|
|
|
|
|
107
|
|
29
|
2
|
|
|
2
|
|
3143
|
use Text::Extract::Word qw(get_all_text); |
|
2
|
|
|
|
|
148441
|
|
|
2
|
|
|
|
|
164
|
|
30
|
2
|
|
|
2
|
|
902
|
use OpenOffice::OODoc 2.101; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
use File::LibMagic; |
32
|
|
|
|
|
|
|
use Archive::Zip; |
33
|
|
|
|
|
|
|
use RTF::TEXT::Converter; |
34
|
|
|
|
|
|
|
use HTML::FormatText::WithLinks; |
35
|
|
|
|
|
|
|
use File::Spec::Functions; |
36
|
|
|
|
|
|
|
use CAM::PDF; |
37
|
|
|
|
|
|
|
use CAM::PDF::PageText; |
38
|
|
|
|
|
|
|
use IPC::Open3 qw(open3); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
our $VERSION = '0.21'; |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
has 'file' => ( |
43
|
|
|
|
|
|
|
is => 'ro', |
44
|
|
|
|
|
|
|
isa => 'Str', |
45
|
|
|
|
|
|
|
required => 1, |
46
|
|
|
|
|
|
|
); |
47
|
|
|
|
|
|
|
has 'allowGuess' => ( |
48
|
|
|
|
|
|
|
is => 'rw', |
49
|
|
|
|
|
|
|
isa => 'Str', |
50
|
|
|
|
|
|
|
default => 1, |
51
|
|
|
|
|
|
|
); |
52
|
|
|
|
|
|
|
has 'allowExternal' => ( |
53
|
|
|
|
|
|
|
is => 'rw', |
54
|
|
|
|
|
|
|
isa => 'Str', |
55
|
|
|
|
|
|
|
default => 0, |
56
|
|
|
|
|
|
|
); |
57
|
|
|
|
|
|
|
has '_fileType' => ( |
58
|
|
|
|
|
|
|
is => 'ro', |
59
|
|
|
|
|
|
|
isa => 'Maybe[Str]', |
60
|
|
|
|
|
|
|
builder => '_getType', |
61
|
|
|
|
|
|
|
lazy => 1, |
62
|
|
|
|
|
|
|
); |
63
|
|
|
|
|
|
|
has '_pdfToText' => ( |
64
|
|
|
|
|
|
|
is => 'ro', |
65
|
|
|
|
|
|
|
isa => 'Bool', |
66
|
|
|
|
|
|
|
builder => '_checkPdfToText', |
67
|
|
|
|
|
|
|
lazy => 1 |
68
|
|
|
|
|
|
|
); |
69
|
|
|
|
|
|
|
has '_content' => ( |
70
|
|
|
|
|
|
|
is => 'rw', |
71
|
|
|
|
|
|
|
); |
72
|
|
|
|
|
|
|
has '_readState' => ( |
73
|
|
|
|
|
|
|
is => 'rw', |
74
|
|
|
|
|
|
|
isa => 'Maybe[Str]', |
75
|
|
|
|
|
|
|
); |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Ensure file exists during construction |
78
|
|
|
|
|
|
|
sub BUILD |
79
|
|
|
|
|
|
|
{ |
80
|
|
|
|
|
|
|
my $self = shift; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
if(not -e $self->file) |
83
|
|
|
|
|
|
|
{ |
84
|
|
|
|
|
|
|
croak($self->file.': does not exist'); |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
elsif(not -r $self->file) |
87
|
|
|
|
|
|
|
{ |
88
|
|
|
|
|
|
|
croak($self->file.': is not readable'); |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
elsif(not -f $self->file) |
91
|
|
|
|
|
|
|
{ |
92
|
|
|
|
|
|
|
croak($self->file.': is not a normal file'); |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# Get the text string representing the contents of the file. |
97
|
|
|
|
|
|
|
# Returns undef if the format is unknown or unsupported |
98
|
|
|
|
|
|
|
sub text |
99
|
|
|
|
|
|
|
{ |
100
|
|
|
|
|
|
|
my $self = shift; |
101
|
|
|
|
|
|
|
my $ftype = $self->detectedType; |
102
|
|
|
|
|
|
|
my $text = $self->_getRead(); |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
if(defined $text) |
105
|
|
|
|
|
|
|
{ |
106
|
|
|
|
|
|
|
return $text; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
if(not defined $ftype) |
109
|
|
|
|
|
|
|
{ |
110
|
|
|
|
|
|
|
return undef; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
try |
114
|
|
|
|
|
|
|
{ |
115
|
|
|
|
|
|
|
if ($ftype eq 'pdf') |
116
|
|
|
|
|
|
|
{ |
117
|
|
|
|
|
|
|
$text = $self->_getFromPDF(); |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
elsif($ftype eq 'doc') |
120
|
|
|
|
|
|
|
{ |
121
|
|
|
|
|
|
|
$text = $self->_getFromDoc(); |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
elsif($ftype eq 'odt') |
124
|
|
|
|
|
|
|
{ |
125
|
|
|
|
|
|
|
$text = $self->_getFromODT(); |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
elsif($ftype eq 'sxw') |
128
|
|
|
|
|
|
|
{ |
129
|
|
|
|
|
|
|
$text = $self->_getFromSXW(); |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
elsif($ftype eq 'txt') |
132
|
|
|
|
|
|
|
{ |
133
|
|
|
|
|
|
|
$text = $self->_getFromRaw(); |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
elsif($ftype eq 'rtf') |
136
|
|
|
|
|
|
|
{ |
137
|
|
|
|
|
|
|
$text = $self->_getFromRTF(); |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
elsif($ftype eq 'docx') |
140
|
|
|
|
|
|
|
{ |
141
|
|
|
|
|
|
|
$text = $self->_getFromDocx(); |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
elsif($ftype eq 'html') |
144
|
|
|
|
|
|
|
{ |
145
|
|
|
|
|
|
|
$text = $self->_getFromHTML(); |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
elsif(defined $ftype) |
148
|
|
|
|
|
|
|
{ |
149
|
|
|
|
|
|
|
die("Text::FromAny: Unknown detected filetype: $ftype\n"); |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
if(defined $text) |
153
|
|
|
|
|
|
|
{ |
154
|
|
|
|
|
|
|
$text =~ s/(\r|\f)//g; |
155
|
|
|
|
|
|
|
$self->_content($text); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
catch |
159
|
|
|
|
|
|
|
{ |
160
|
|
|
|
|
|
|
$text = undef; |
161
|
|
|
|
|
|
|
}; |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
$self->_setRead($text); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
return $text; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# Returns the detected filetype. |
169
|
|
|
|
|
|
|
# This is defined as a method because it should not be accepted as a |
170
|
|
|
|
|
|
|
# construction parameters. |
171
|
|
|
|
|
|
|
sub detectedType |
172
|
|
|
|
|
|
|
{ |
173
|
|
|
|
|
|
|
my $self = shift; |
174
|
|
|
|
|
|
|
return $self->_fileType; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# Retrieve text from a PDF file |
178
|
|
|
|
|
|
|
sub _getFromPDF |
179
|
|
|
|
|
|
|
{ |
180
|
|
|
|
|
|
|
my $self = shift; |
181
|
|
|
|
|
|
|
my $text = $self->_getFromPDF_CAMPDF(); |
182
|
|
|
|
|
|
|
if ($text =~ /(\w|\d)/) |
183
|
|
|
|
|
|
|
{ |
184
|
|
|
|
|
|
|
return $text; |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
my $pdftotext = $self->_getFromPDF_pdftotext; |
187
|
|
|
|
|
|
|
if ($pdftotext) |
188
|
|
|
|
|
|
|
{ |
189
|
|
|
|
|
|
|
return $pdftotext; |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
return $text; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# Retrieve text from a PDF file using CAM::PDF |
195
|
|
|
|
|
|
|
sub _getFromPDF_CAMPDF |
196
|
|
|
|
|
|
|
{ |
197
|
|
|
|
|
|
|
my $self = shift; |
198
|
|
|
|
|
|
|
my $f = CAM::PDF->new($self->file); |
199
|
|
|
|
|
|
|
my $text = ''; |
200
|
|
|
|
|
|
|
foreach(1..$f->numPages()) |
201
|
|
|
|
|
|
|
{ |
202
|
|
|
|
|
|
|
my $page = $f->getPageContentTree($_); |
203
|
|
|
|
|
|
|
$text .= CAM::PDF::PageText->render($page); |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
return $text; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# Retrieve text from a PDF file using pdftotext (if we are allowed to, and it |
209
|
|
|
|
|
|
|
# is available) |
210
|
|
|
|
|
|
|
sub _getFromPDF_pdftotext |
211
|
|
|
|
|
|
|
{ |
212
|
|
|
|
|
|
|
my $self = shift; |
213
|
|
|
|
|
|
|
if(not $self->allowExternal or not $self->_pdfToText) |
214
|
|
|
|
|
|
|
{ |
215
|
|
|
|
|
|
|
return; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
my $content = ''; |
218
|
|
|
|
|
|
|
try |
219
|
|
|
|
|
|
|
{ |
220
|
|
|
|
|
|
|
my $pid = open3(my $in, my $out, my $err, 'pdftotext','-layout','-enc','UTF-8',$self->file,'-') or die("Failed to open3() pdftotext: $!\n"); |
221
|
|
|
|
|
|
|
while(<$out>) |
222
|
|
|
|
|
|
|
{ |
223
|
|
|
|
|
|
|
$content .= $_; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
close($in) if $in; |
226
|
|
|
|
|
|
|
close($out) if $out; |
227
|
|
|
|
|
|
|
close($err) if $err; |
228
|
|
|
|
|
|
|
waitpid($pid,0); |
229
|
|
|
|
|
|
|
my $status = $? >> 8; |
230
|
|
|
|
|
|
|
if ($status != 0) |
231
|
|
|
|
|
|
|
{ |
232
|
|
|
|
|
|
|
$content = ''; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
}; |
235
|
|
|
|
|
|
|
return $content; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# Check if pdftotext is installed |
239
|
|
|
|
|
|
|
sub _checkPdfToText |
240
|
|
|
|
|
|
|
{ |
241
|
|
|
|
|
|
|
foreach (split /:/, $ENV{PATH}) |
242
|
|
|
|
|
|
|
{ |
243
|
|
|
|
|
|
|
my $f = catfile($_,'pdftotext'); |
244
|
|
|
|
|
|
|
if (-x $f and not -d $f) |
245
|
|
|
|
|
|
|
{ |
246
|
|
|
|
|
|
|
return 1; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
return 0; |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# Retrieve text from a msword .doc file |
253
|
|
|
|
|
|
|
sub _getFromDoc |
254
|
|
|
|
|
|
|
{ |
255
|
|
|
|
|
|
|
my $self = shift; |
256
|
|
|
|
|
|
|
my $text = get_all_text($self->file); |
257
|
|
|
|
|
|
|
$text =~ s/(\r|\r\n)/\n/g; |
258
|
|
|
|
|
|
|
$text =~ s/\n$//; |
259
|
|
|
|
|
|
|
return $text; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
# Retrieve text from an "Office Open XML" file |
263
|
|
|
|
|
|
|
sub _getFromDocx |
264
|
|
|
|
|
|
|
{ |
265
|
|
|
|
|
|
|
my $self = shift; |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
my $xml = $self->_readFileInZIP('word/document.xml'); |
268
|
|
|
|
|
|
|
return if not defined $xml; |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# Strip formatting newlines in the XML |
271
|
|
|
|
|
|
|
$xml =~ s/\n//g; |
272
|
|
|
|
|
|
|
# Convert XML newlines to real ones |
273
|
|
|
|
|
|
|
if(not $xml =~ s/]*w:rsidRDefault[^>]+>/\n/g) |
274
|
|
|
|
|
|
|
{ |
275
|
|
|
|
|
|
|
$xml =~ s/<\/w:p>/\n/g; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
# Remove tags |
278
|
|
|
|
|
|
|
$xml =~ s/<[^>]+>//g; |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
return $xml; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# Retrieve text from an Open Document text file |
284
|
|
|
|
|
|
|
sub _getFromODT |
285
|
|
|
|
|
|
|
{ |
286
|
|
|
|
|
|
|
my $self = shift; |
287
|
|
|
|
|
|
|
my $doc = odfText(file => $self->file); |
288
|
|
|
|
|
|
|
my $xml; |
289
|
|
|
|
|
|
|
open(my $out,'>',\$xml); |
290
|
|
|
|
|
|
|
$doc->getBody->print($out); |
291
|
|
|
|
|
|
|
close($out); |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
return $self->_getFromODT_SXW_XML($xml); |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Retrieve text from a legacy OpenOffice.org writer text file |
297
|
|
|
|
|
|
|
sub _getFromSXW |
298
|
|
|
|
|
|
|
{ |
299
|
|
|
|
|
|
|
my $self = shift; |
300
|
|
|
|
|
|
|
my $xml = $self->_readFileInZIP('content.xml'); |
301
|
|
|
|
|
|
|
return $self->_getFromODT_SXW_XML($xml); |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
# Retrieve text from an RTF file |
305
|
|
|
|
|
|
|
sub _getFromRTF |
306
|
|
|
|
|
|
|
{ |
307
|
|
|
|
|
|
|
my $self = shift; |
308
|
|
|
|
|
|
|
my $file = $self->file; |
309
|
|
|
|
|
|
|
my $text = ''; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
# RTF::TEXT::Converter spews some errors to STDERR that we don't need, |
312
|
|
|
|
|
|
|
# so we silence it |
313
|
|
|
|
|
|
|
local *STDERR; |
314
|
|
|
|
|
|
|
open(STDERR,'>','/dev/null'); |
315
|
|
|
|
|
|
|
try |
316
|
|
|
|
|
|
|
{ |
317
|
|
|
|
|
|
|
my $p = RTF::TEXT::Converter->new( output => \$text ); |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
open(my $in, '<', $file); |
320
|
|
|
|
|
|
|
$p->parse_stream($in); |
321
|
|
|
|
|
|
|
close($in); |
322
|
|
|
|
|
|
|
}; |
323
|
|
|
|
|
|
|
return $text; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# Get the contents of a cleartext file |
327
|
|
|
|
|
|
|
sub _getFromRaw |
328
|
|
|
|
|
|
|
{ |
329
|
|
|
|
|
|
|
my $self = shift; |
330
|
|
|
|
|
|
|
open(my $in,'<',$self->file) or carp("Failed to open ".$self->file.": ".$!); |
331
|
|
|
|
|
|
|
return if not $in; |
332
|
|
|
|
|
|
|
local $/ = undef; |
333
|
|
|
|
|
|
|
my $text = <$in>; |
334
|
|
|
|
|
|
|
close($in); |
335
|
|
|
|
|
|
|
return $text; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# Retrieve text from a HTML file |
339
|
|
|
|
|
|
|
sub _getFromHTML |
340
|
|
|
|
|
|
|
{ |
341
|
|
|
|
|
|
|
my $self = shift; |
342
|
|
|
|
|
|
|
my $formatText = HTML::FormatText::WithLinks->new( |
343
|
|
|
|
|
|
|
before_link => '', |
344
|
|
|
|
|
|
|
after_link => '', |
345
|
|
|
|
|
|
|
unique_links => 1, |
346
|
|
|
|
|
|
|
footnote => '%l', |
347
|
|
|
|
|
|
|
); |
348
|
|
|
|
|
|
|
my $text = $formatText->parse_file($self->file); |
349
|
|
|
|
|
|
|
# Remove additional formatting added by HTML::FormatText::WithLinks |
350
|
|
|
|
|
|
|
my $result = ''; |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# Remove whitespace prefix on each line |
353
|
|
|
|
|
|
|
foreach my $l (split(/\n/,$text)) |
354
|
|
|
|
|
|
|
{ |
355
|
|
|
|
|
|
|
$l =~ s/^ {1,4}//; |
356
|
|
|
|
|
|
|
$result .= $l."\n"; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# Remove newline padding at the end |
360
|
|
|
|
|
|
|
$result =~ s/\n+$//g; |
361
|
|
|
|
|
|
|
return $result; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# Simple regex cleaner and formatted for ODT and SXW |
365
|
|
|
|
|
|
|
sub _getFromODT_SXW_XML |
366
|
|
|
|
|
|
|
{ |
367
|
|
|
|
|
|
|
my $self = shift; |
368
|
|
|
|
|
|
|
my $xml = shift; |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Strip formatting newlines in the XML |
371
|
|
|
|
|
|
|
$xml =~ s/\n//g; |
372
|
|
|
|
|
|
|
# Strip first text:p |
373
|
|
|
|
|
|
|
$xml =~ s/]*>//; |
374
|
|
|
|
|
|
|
# Convert XML newlines to real ones |
375
|
|
|
|
|
|
|
$xml =~ s/]*>/\n/g; |
376
|
|
|
|
|
|
|
# Remove tags |
377
|
|
|
|
|
|
|
$xml =~ s/<[^>]*>//g; |
378
|
|
|
|
|
|
|
return $xml; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Read a single file contained in a zipfile and return its contents (or undef) |
382
|
|
|
|
|
|
|
sub _readFileInZIP |
383
|
|
|
|
|
|
|
{ |
384
|
|
|
|
|
|
|
my $self = shift; |
385
|
|
|
|
|
|
|
my $file = shift; |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
my $contents; |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
try |
390
|
|
|
|
|
|
|
{ |
391
|
|
|
|
|
|
|
my $zip = Archive::Zip->new(); |
392
|
|
|
|
|
|
|
$zip->read($self->file); |
393
|
|
|
|
|
|
|
$contents = $zip->contents($file); |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
catch |
396
|
|
|
|
|
|
|
{ |
397
|
|
|
|
|
|
|
$contents = undef; |
398
|
|
|
|
|
|
|
}; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
return $contents; |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# Returns a filetype, one of: |
404
|
|
|
|
|
|
|
# pdf => PDF |
405
|
|
|
|
|
|
|
# odt => OpenDocument text |
406
|
|
|
|
|
|
|
# sxw => Legacy OpenOffice.org Writer |
407
|
|
|
|
|
|
|
# doc => msword |
408
|
|
|
|
|
|
|
# docx => "Open XML" |
409
|
|
|
|
|
|
|
# rtf => RTF |
410
|
|
|
|
|
|
|
# txt => Cleartext |
411
|
|
|
|
|
|
|
# |
412
|
|
|
|
|
|
|
# undef => Unable to detect/unsupported |
413
|
|
|
|
|
|
|
sub _getType |
414
|
|
|
|
|
|
|
{ |
415
|
|
|
|
|
|
|
my $self = shift; |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
my $type = $self->_getTypeFromMIME(); |
418
|
|
|
|
|
|
|
if ($type) |
419
|
|
|
|
|
|
|
{ |
420
|
|
|
|
|
|
|
return $type; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
$type = $self->_getTypeFromMagicDesc(); |
424
|
|
|
|
|
|
|
if ($type) |
425
|
|
|
|
|
|
|
{ |
426
|
|
|
|
|
|
|
return $type; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
$type = $self->_guessType(); |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
return $type; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
# Get the filetype based upon the mimetype |
435
|
|
|
|
|
|
|
sub _getTypeFromMIME |
436
|
|
|
|
|
|
|
{ |
437
|
|
|
|
|
|
|
my $self = shift; |
438
|
|
|
|
|
|
|
my $type; |
439
|
|
|
|
|
|
|
my %mimeMap = ( |
440
|
|
|
|
|
|
|
'application/pdf' => 'pdf', |
441
|
|
|
|
|
|
|
'application/msword' => 'doc', |
442
|
|
|
|
|
|
|
'application/vnd.ms-office' => 'doc', |
443
|
|
|
|
|
|
|
'application/vnd.oasis.opendocument.text' => 'odt', |
444
|
|
|
|
|
|
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx', |
445
|
|
|
|
|
|
|
'application/vnd.sun.xml.writer' => 'sxw', |
446
|
|
|
|
|
|
|
'text/plain' => 'txt', |
447
|
|
|
|
|
|
|
'text/html' => 'html', |
448
|
|
|
|
|
|
|
'text/rtf' => 'rtf', |
449
|
|
|
|
|
|
|
'application/xhtml+xml' => 'html', |
450
|
|
|
|
|
|
|
); |
451
|
|
|
|
|
|
|
try |
452
|
|
|
|
|
|
|
{ |
453
|
|
|
|
|
|
|
my $mime = File::LibMagic->new(); |
454
|
|
|
|
|
|
|
$type = $mime->checktype_filename($self->file); |
455
|
|
|
|
|
|
|
if ($type) |
456
|
|
|
|
|
|
|
{ |
457
|
|
|
|
|
|
|
chomp($type); |
458
|
|
|
|
|
|
|
$type =~ s/;.*//g; |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
}; |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Try to get mimetype from the zip |
463
|
|
|
|
|
|
|
if(defined $type && $type eq 'application/zip') |
464
|
|
|
|
|
|
|
{ |
465
|
|
|
|
|
|
|
$type = $self->_readFileInZIP('mimetype'); |
466
|
|
|
|
|
|
|
if ($type) |
467
|
|
|
|
|
|
|
{ |
468
|
|
|
|
|
|
|
$type =~ s/;.*//g; |
469
|
|
|
|
|
|
|
chomp($type); |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
if (defined $type && $mimeMap{$type}) |
474
|
|
|
|
|
|
|
{ |
475
|
|
|
|
|
|
|
return $mimeMap{$type}; |
476
|
|
|
|
|
|
|
} |
477
|
|
|
|
|
|
|
return; |
478
|
|
|
|
|
|
|
} |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# Get the filetype based upon the magic file description |
481
|
|
|
|
|
|
|
sub _getTypeFromMagicDesc |
482
|
|
|
|
|
|
|
{ |
483
|
|
|
|
|
|
|
my $self = shift; |
484
|
|
|
|
|
|
|
my $type; |
485
|
|
|
|
|
|
|
my %descrMap = ( |
486
|
|
|
|
|
|
|
'^OpenOffice\.org.+Writer.+' => 'sxw', |
487
|
|
|
|
|
|
|
'^OpenDocument text$' => 'odt', |
488
|
|
|
|
|
|
|
'^PDF document.+$' => 'pdf', |
489
|
|
|
|
|
|
|
); |
490
|
|
|
|
|
|
|
try |
491
|
|
|
|
|
|
|
{ |
492
|
|
|
|
|
|
|
my $mime = File::LibMagic->new(); |
493
|
|
|
|
|
|
|
my $descr = $mime->describe_filename($self->file); |
494
|
|
|
|
|
|
|
if ($descr) |
495
|
|
|
|
|
|
|
{ |
496
|
|
|
|
|
|
|
foreach my $r(keys(%descrMap)) |
497
|
|
|
|
|
|
|
{ |
498
|
|
|
|
|
|
|
if ($descr =~ /$r/) |
499
|
|
|
|
|
|
|
{ |
500
|
|
|
|
|
|
|
$type = $descrMap{$r}; |
501
|
|
|
|
|
|
|
last; |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
}; |
507
|
|
|
|
|
|
|
return $type; |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
# Guess the file type |
511
|
|
|
|
|
|
|
sub _guessType |
512
|
|
|
|
|
|
|
{ |
513
|
|
|
|
|
|
|
my $self = shift; |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
return if not $self->allowGuess; |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
my @guess = qw(sxw odt txt docx); |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
foreach my $e (@guess) |
520
|
|
|
|
|
|
|
{ |
521
|
|
|
|
|
|
|
if ($self->file =~ /\.$e$/) |
522
|
|
|
|
|
|
|
{ |
523
|
|
|
|
|
|
|
return $e; |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
return; |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
# Saves "read" status in the object, so that we know for later reference |
530
|
|
|
|
|
|
|
# if we need to re-read the file. |
531
|
|
|
|
|
|
|
sub _setRead |
532
|
|
|
|
|
|
|
{ |
533
|
|
|
|
|
|
|
my $self = shift; |
534
|
|
|
|
|
|
|
my $text = shift; |
535
|
|
|
|
|
|
|
if(defined $text) |
536
|
|
|
|
|
|
|
{ |
537
|
|
|
|
|
|
|
$self->_content($text); |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
$self->_readState($self->_getStateString); |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# Retrieves the read file content as long as the read state equals the |
543
|
|
|
|
|
|
|
# previous read state, otherwise returns undef |
544
|
|
|
|
|
|
|
sub _getRead |
545
|
|
|
|
|
|
|
{ |
546
|
|
|
|
|
|
|
my $self = shift; |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
if ($self->_readState && $self->_readState eq $self->_getStateString) |
549
|
|
|
|
|
|
|
{ |
550
|
|
|
|
|
|
|
return $self->_content; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
return; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
# Retrieves the 'state string'. This is a string representation of |
556
|
|
|
|
|
|
|
# the internal state in the object that might have some effect on how |
557
|
|
|
|
|
|
|
# text gets read. |
558
|
|
|
|
|
|
|
# |
559
|
|
|
|
|
|
|
# Ie. if allowExternal or allowGuess has changed since we last read |
560
|
|
|
|
|
|
|
# a file, we read it again. |
561
|
|
|
|
|
|
|
sub _getStateString |
562
|
|
|
|
|
|
|
{ |
563
|
|
|
|
|
|
|
my $self = shift; |
564
|
|
|
|
|
|
|
my $readState = join('-',$self->allowExternal,$self->allowGuess); |
565
|
|
|
|
|
|
|
return $readState; |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
569
|
|
|
|
|
|
|
1; |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
__END__ |