File Coverage

blib/lib/Text/FromAny.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				# Text::FromAny
2				# A module to read pure text from a vareiety of formats
3				# Copyright Eskild Hustvedt 2010
4				# for Portu Media & Communications
5				#
6				# This library is free software; you can redistribute it and/or modify
7				# it under the terms of either:
8				#
9				# a) the GNU General Public License as published by the Free
10				# Software Foundation; either version 3, or (at your option) any
11				# later version, or
12				# b) the "Artistic License" which comes with this Kit.
13				#
14				# This library is distributed in the hope that it will be useful,
15				# but WITHOUT ANY WARRANTY; without even the implied warranty of
16				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either
17				# the GNU General Public License or the Artistic License for more details.
18				#
19				# You should have received a copy of the Artistic License
20				# in the file named "COPYING.artistic". If not, I'll be glad to provide one.
21				#
22				# You should also have received a copy of the GNU General Public License
23				# along with this library in the file named "COPYING.gpl". If not,
24				# see .
25				package Text::FromAny;
26	2	2	4066	use Any::Moose;
	2		114500
	2		13
27	2	2	1064	use Carp qw(carp croak);
	2		4
	2		136
28	2	2	2784	use Try::Tiny;
	2		1779
	2		107
29	2	2	3143	use Text::Extract::Word qw(get_all_text);
	2		148441
	2		164
30	2	2	902	use OpenOffice::OODoc 2.101;
	0
	0
31				use File::LibMagic;
32				use Archive::Zip;
33				use RTF::TEXT::Converter;
34				use HTML::FormatText::WithLinks;
35				use File::Spec::Functions;
36				use CAM::PDF;
37				use CAM::PDF::PageText;
38				use IPC::Open3 qw(open3);
39
40				our $VERSION = '0.21';
41
42				has 'file' => (
43				is => 'ro',
44				isa => 'Str',
45				required => 1,
46				);
47				has 'allowGuess' => (
48				is => 'rw',
49				isa => 'Str',
50				default => 1,
51				);
52				has 'allowExternal' => (
53				is => 'rw',
54				isa => 'Str',
55				default => 0,
56				);
57				has '_fileType' => (
58				is => 'ro',
59				isa => 'Maybe[Str]',
60				builder => '_getType',
61				lazy => 1,
62				);
63				has '_pdfToText' => (
64				is => 'ro',
65				isa => 'Bool',
66				builder => '_checkPdfToText',
67				lazy => 1
68				);
69				has '_content' => (
70				is => 'rw',
71				);
72				has '_readState' => (
73				is => 'rw',
74				isa => 'Maybe[Str]',
75				);
76
77				# Ensure file exists during construction
78				sub BUILD
79				{
80				my $self = shift;
81
82				if(not -e $self->file)
83				{
84				croak($self->file.': does not exist');
85				}
86				elsif(not -r $self->file)
87				{
88				croak($self->file.': is not readable');
89				}
90				elsif(not -f $self->file)
91				{
92				croak($self->file.': is not a normal file');
93				}
94				}
95
96				# Get the text string representing the contents of the file.
97				# Returns undef if the format is unknown or unsupported
98				sub text
99				{
100				my $self = shift;
101				my $ftype = $self->detectedType;
102				my $text = $self->_getRead();
103
104				if(defined $text)
105				{
106				return $text;
107				}
108				if(not defined $ftype)
109				{
110				return undef;
111				}
112
113				try
114				{
115				if ($ftype eq 'pdf')
116				{
117				$text = $self->_getFromPDF();
118				}
119				elsif($ftype eq 'doc')
120				{
121				$text = $self->_getFromDoc();
122				}
123				elsif($ftype eq 'odt')
124				{
125				$text = $self->_getFromODT();
126				}
127				elsif($ftype eq 'sxw')
128				{
129				$text = $self->_getFromSXW();
130				}
131				elsif($ftype eq 'txt')
132				{
133				$text = $self->_getFromRaw();
134				}
135				elsif($ftype eq 'rtf')
136				{
137				$text = $self->_getFromRTF();
138				}
139				elsif($ftype eq 'docx')
140				{
141				$text = $self->_getFromDocx();
142				}
143				elsif($ftype eq 'html')
144				{
145				$text = $self->_getFromHTML();
146				}
147				elsif(defined $ftype)
148				{
149				die("Text::FromAny: Unknown detected filetype: $ftype\n");
150				}
151
152				if(defined $text)
153				{
154				$text =~ s/(\r\|\f)//g;
155				$self->_content($text);
156				}
157				}
158				catch
159				{
160				$text = undef;
161				};
162
163				$self->_setRead($text);
164
165				return $text;
166				}
167
168				# Returns the detected filetype.
169				# This is defined as a method because it should not be accepted as a
170				# construction parameters.
171				sub detectedType
172				{
173				my $self = shift;
174				return $self->_fileType;
175				}
176
177				# Retrieve text from a PDF file
178				sub _getFromPDF
179				{
180				my $self = shift;
181				my $text = $self->_getFromPDF_CAMPDF();
182				if ($text =~ /(\w\|\d)/)
183				{
184				return $text;
185				}
186				my $pdftotext = $self->_getFromPDF_pdftotext;
187				if ($pdftotext)
188				{
189				return $pdftotext;
190				}
191				return $text;
192				}
193
194				# Retrieve text from a PDF file using CAM::PDF
195				sub _getFromPDF_CAMPDF
196				{
197				my $self = shift;
198				my $f = CAM::PDF->new($self->file);
199				my $text = '';
200				foreach(1..$f->numPages())
201				{
202				my $page = $f->getPageContentTree($_);
203				$text .= CAM::PDF::PageText->render($page);
204				}
205				return $text;
206				}
207
208				# Retrieve text from a PDF file using pdftotext (if we are allowed to, and it
209				# is available)
210				sub _getFromPDF_pdftotext
211				{
212				my $self = shift;
213				if(not $self->allowExternal or not $self->_pdfToText)
214				{
215				return;
216				}
217				my $content = '';
218				try
219				{
220				my $pid = open3(my $in, my $out, my $err, 'pdftotext','-layout','-enc','UTF-8',$self->file,'-') or die("Failed to open3() pdftotext: $!\n");
221				while(<$out>)
222				{
223				$content .= $_;
224				}
225				close($in) if $in;
226				close($out) if $out;
227				close($err) if $err;
228				waitpid($pid,0);
229				my $status = $? >> 8;
230				if ($status != 0)
231				{
232				$content = '';
233				}
234				};
235				return $content;
236				}
237
238				# Check if pdftotext is installed
239				sub _checkPdfToText
240				{
241				foreach (split /:/, $ENV{PATH})
242				{
243				my $f = catfile($_,'pdftotext');
244				if (-x $f and not -d $f)
245				{
246				return 1;
247				}
248				}
249				return 0;
250				}
251
252				# Retrieve text from a msword .doc file
253				sub _getFromDoc
254				{
255				my $self = shift;
256				my $text = get_all_text($self->file);
257				$text =~ s/(\r\|\r\n)/\n/g;
258				$text =~ s/\n$//;
259				return $text;
260				}
261
262				# Retrieve text from an "Office Open XML" file
263				sub _getFromDocx
264				{
265				my $self = shift;
266
267				my $xml = $self->_readFileInZIP('word/document.xml');
268				return if not defined $xml;
269
270				# Strip formatting newlines in the XML
271				$xml =~ s/\n//g;
272				# Convert XML newlines to real ones
273				if(not $xml =~ s/]*w:rsidRDefault[^>]+>/\n/g)
274				{
275				$xml =~ s/<\/w:p>/\n/g;
276				}
277				# Remove tags
278				$xml =~ s/<[^>]+>//g;
279
280				return $xml;
281				}
282
283				# Retrieve text from an Open Document text file
284				sub _getFromODT
285				{
286				my $self = shift;
287				my $doc = odfText(file => $self->file);
288				my $xml;
289				open(my $out,'>',\$xml);
290				$doc->getBody->print($out);
291				close($out);
292
293				return $self->_getFromODT_SXW_XML($xml);
294				}
295
296				# Retrieve text from a legacy OpenOffice.org writer text file
297				sub _getFromSXW
298				{
299				my $self = shift;
300				my $xml = $self->_readFileInZIP('content.xml');
301				return $self->_getFromODT_SXW_XML($xml);
302				}
303
304				# Retrieve text from an RTF file
305				sub _getFromRTF
306				{
307				my $self = shift;
308				my $file = $self->file;
309				my $text = '';
310
311				# RTF::TEXT::Converter spews some errors to STDERR that we don't need,
312				# so we silence it
313				local *STDERR;
314				open(STDERR,'>','/dev/null');
315				try
316				{
317				my $p = RTF::TEXT::Converter->new( output => \$text );
318
319				open(my $in, '<', $file);
320				$p->parse_stream($in);
321				close($in);
322				};
323				return $text;
324				}
325
326				# Get the contents of a cleartext file
327				sub _getFromRaw
328				{
329				my $self = shift;
330				open(my $in,'<',$self->file) or carp("Failed to open ".$self->file.": ".$!);
331				return if not $in;
332				local $/ = undef;
333				my $text = <$in>;
334				close($in);
335				return $text;
336				}
337
338				# Retrieve text from a HTML file
339				sub _getFromHTML
340				{
341				my $self = shift;
342				my $formatText = HTML::FormatText::WithLinks->new(
343				before_link => '',
344				after_link => '',
345				unique_links => 1,
346				footnote => '%l',
347				);
348				my $text = $formatText->parse_file($self->file);
349				# Remove additional formatting added by HTML::FormatText::WithLinks
350				my $result = '';
351
352				# Remove whitespace prefix on each line
353				foreach my $l (split(/\n/,$text))
354				{
355				$l =~ s/^ {1,4}//;
356				$result .= $l."\n";
357				}
358
359				# Remove newline padding at the end
360				$result =~ s/\n+$//g;
361				return $result;
362				}
363
364				# Simple regex cleaner and formatted for ODT and SXW
365				sub _getFromODT_SXW_XML
366				{
367				my $self = shift;
368				my $xml = shift;
369
370				# Strip formatting newlines in the XML
371				$xml =~ s/\n//g;
372				# Strip first text:p
373				$xml =~ s/]*>//;
374				# Convert XML newlines to real ones
375				$xml =~ s/]*>/\n/g;
376				# Remove tags
377				$xml =~ s/<[^>]*>//g;
378				return $xml;
379				}
380
381				# Read a single file contained in a zipfile and return its contents (or undef)
382				sub _readFileInZIP
383				{
384				my $self = shift;
385				my $file = shift;
386
387				my $contents;
388
389				try
390				{
391				my $zip = Archive::Zip->new();
392				$zip->read($self->file);
393				$contents = $zip->contents($file);
394				}
395				catch
396				{
397				$contents = undef;
398				};
399
400				return $contents;
401				}
402
403				# Returns a filetype, one of:
404				# pdf => PDF
405				# odt => OpenDocument text
406				# sxw => Legacy OpenOffice.org Writer
407				# doc => msword
408				# docx => "Open XML"
409				# rtf => RTF
410				# txt => Cleartext
411				#
412				# undef => Unable to detect/unsupported
413				sub _getType
414				{
415				my $self = shift;
416
417				my $type = $self->_getTypeFromMIME();
418				if ($type)
419				{
420				return $type;
421				}
422
423				$type = $self->_getTypeFromMagicDesc();
424				if ($type)
425				{
426				return $type;
427				}
428
429				$type = $self->_guessType();
430
431				return $type;
432				}
433
434				# Get the filetype based upon the mimetype
435				sub _getTypeFromMIME
436				{
437				my $self = shift;
438				my $type;
439				my %mimeMap = (
440				'application/pdf' => 'pdf',
441				'application/msword' => 'doc',
442				'application/vnd.ms-office' => 'doc',
443				'application/vnd.oasis.opendocument.text' => 'odt',
444				'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
445				'application/vnd.sun.xml.writer' => 'sxw',
446				'text/plain' => 'txt',
447				'text/html' => 'html',
448				'text/rtf' => 'rtf',
449				'application/xhtml+xml' => 'html',
450				);
451				try
452				{
453				my $mime = File::LibMagic->new();
454				$type = $mime->checktype_filename($self->file);
455				if ($type)
456				{
457				chomp($type);
458				$type =~ s/;.*//g;
459				}
460				};
461
462				# Try to get mimetype from the zip
463				if(defined $type && $type eq 'application/zip')
464				{
465				$type = $self->_readFileInZIP('mimetype');
466				if ($type)
467				{
468				$type =~ s/;.*//g;
469				chomp($type);
470				}
471				}
472
473				if (defined $type && $mimeMap{$type})
474				{
475				return $mimeMap{$type};
476				}
477				return;
478				}
479
480				# Get the filetype based upon the magic file description
481				sub _getTypeFromMagicDesc
482				{
483				my $self = shift;
484				my $type;
485				my %descrMap = (
486				'^OpenOffice\.org.+Writer.+' => 'sxw',
487				'^OpenDocument text$' => 'odt',
488				'^PDF document.+$' => 'pdf',
489				);
490				try
491				{
492				my $mime = File::LibMagic->new();
493				my $descr = $mime->describe_filename($self->file);
494				if ($descr)
495				{
496				foreach my $r(keys(%descrMap))
497				{
498				if ($descr =~ /$r/)
499				{
500				$type = $descrMap{$r};
501				last;
502				}
503
504				}
505				}
506				};
507				return $type;
508				}
509
510				# Guess the file type
511				sub _guessType
512				{
513				my $self = shift;
514
515				return if not $self->allowGuess;
516
517				my @guess = qw(sxw odt txt docx);
518
519				foreach my $e (@guess)
520				{
521				if ($self->file =~ /\.$e$/)
522				{
523				return $e;
524				}
525				}
526				return;
527				}
528
529				# Saves "read" status in the object, so that we know for later reference
530				# if we need to re-read the file.
531				sub _setRead
532				{
533				my $self = shift;
534				my $text = shift;
535				if(defined $text)
536				{
537				$self->_content($text);
538				}
539				$self->_readState($self->_getStateString);
540				}
541
542				# Retrieves the read file content as long as the read state equals the
543				# previous read state, otherwise returns undef
544				sub _getRead
545				{
546				my $self = shift;
547
548				if ($self->_readState && $self->_readState eq $self->_getStateString)
549				{
550				return $self->_content;
551				}
552				return;
553				}
554
555				# Retrieves the 'state string'. This is a string representation of
556				# the internal state in the object that might have some effect on how
557				# text gets read.
558				#
559				# Ie. if allowExternal or allowGuess has changed since we last read
560				# a file, we read it again.
561				sub _getStateString
562				{
563				my $self = shift;
564				my $readState = join('-',$self->allowExternal,$self->allowGuess);
565				return $readState;
566				}
567
568				__PACKAGE__->meta->make_immutable;
569				1;
570
571				__END__