| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package SWISH::Filters::Pdf2HTML; |
|
2
|
1
|
|
|
1
|
|
789
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
3
|
1
|
|
|
1
|
|
5
|
use vars qw( $VERSION @ISA ); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
540
|
|
|
4
|
|
|
|
|
|
|
$VERSION = '0.190'; |
|
5
|
|
|
|
|
|
|
@ISA = ('SWISH::Filters::Base'); |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Constructor.
#
# Blesses a fresh hash into $class whose only initial entry is "mimetypes",
# a one-element list holding a regex that matches "application/pdf".
# The object is then passed through set_programs() with the names of the two
# external programs this filter uses; whatever set_programs() returns is
# handed back to the caller unchanged.
sub new {
    my $class = $_[0];

    my @handled_types = ( qr!application/pdf! );
    my $filter        = bless { mimetypes => \@handled_types }, $class;

    my @required_programs = ( 'pdftotext', 'pdfinfo' );
    return $filter->set_programs(@required_programs);
}
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# Convert a PDF document into a small HTML page.
#
# Arguments:
#   $self - the filter object; mywarn, get_pdf_headers, format_meta_headers,
#           escapeXML and get_pdf_content_ref are called on it.
#   $doc  - the document object; user_data, meta_data, fetch_filename and
#           set_content_type are called on it.
#
# Returns a two-element list: a reference to the generated HTML text, and the
# merged metadata hash reference.
#
# Side effect: the document's content type is set to 'text/html'.
sub filter {
    my ( $self, $doc ) = @_;

    # Which metadata key supplies the page title.  When user_data is a hash,
    # its {pdf}{title_tag} entry is used as-is (a missing or false entry
    # therefore disables the title); otherwise the key defaults to 'title'.
    my $user_data = $doc->user_data;
    my $title_tag
        = ref $user_data eq 'HASH'
        ? $user_data->{pdf}{title_tag}
        : 'title';

    my $user_meta = $doc->meta_data || {};
    my $file      = $doc->fetch_filename;

    $self->mywarn("Pdf2HTML handling $file");

    my $metadata = $self->get_pdf_headers($file);

    # merge pdf meta with meta we inherited, preferring user meta
    $metadata->{$_} = $user_meta->{$_} for keys %$user_meta;
    my $headers = $self->format_meta_headers($metadata);

    if ( $title_tag && exists $metadata->{$title_tag} ) {
        my $title = $self->escapeXML( $metadata->{$title_tag} );

        # The title goes into the HTML head, so it must be a <title> element
        # rather than bare text.
        $headers = "<title>$title</title>\n" . $headers;
    }

    # Check for encrypted content

    my $content_ref;

    # patch provided by Martial Chartoire
    #
    # Skip text extraction when the "encrypted" value says yes and lists
    # copy:no.  The match uses ".*" so other permission flags may sit between
    # "yes" and "copy:no", and "\b" so "copy:no" may be the last flag.
    # NOTE(review): assumes the value looks like
    # "yes (print:yes copy:no change:no ...)" -- confirm against the
    # pdfinfo version in use.
    if (   $metadata->{encrypted}
        && $metadata->{encrypted} =~ /yes.*\bcopy:no\b/i )
    {
        $content_ref = \'';
    }
    else {
        $content_ref = $self->get_pdf_content_ref($file);
    }

    # update the document's content type
    $doc->set_content_type('text/html');

    # NOTE(review): this HTML skeleton is a reconstruction -- the markup was
    # missing from the reviewed copy of this sub.  Confirm it against the
    # released module before relying on the exact tag layout.
    my $txt = <<"EOF";
<html>
<head>
$headers
</head>
<body>
<pre>
$$content_ref
</pre>
</body>
</html>
EOF

    return ( \$txt, $metadata );
}
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Collect "Name: value" pairs from the output of run_pdfinfo() for $file.
#
# Arguments:
#   $self - the filter object; run_pdfinfo is called on it.
#   $file - file name passed straight through to run_pdfinfo.
#
# Returns a hash reference.  Each key is the lower-cased field name with
# spaces turned into underscores; each value is the rest of the line after
# the colon and its following whitespace.  Lines that do not look like
# "Name: value" are ignored.  An empty hash reference is returned when
# run_pdfinfo yields a false value.
sub get_pdf_headers {
    my ( $self, $file ) = @_;

    # We need a file name to pass to the pdf conversion programs

    my $raw_info = $self->run_pdfinfo($file);
    return {} unless $raw_info;

    my %value_of;

LINE:
    foreach my $line ( split /\n/, $raw_info ) {
        next LINE unless $line =~ /^\s*([^:]+):\s+(.+)$/;

        my $field = lc $1;
        my $text  = $2;

        $field =~ tr/ /_/;
        $value_of{$field} = $text;
    }

    return \%value_of;
}
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# Fetch the text of $file and return it, XML-escaped, by reference.
#
# Arguments:
#   $self - the filter object; run_pdftotext and escapeXML are called on it.
#   $file - file name handed to run_pdftotext, followed by the argument '-'.
#
# Returns a reference to the scalar produced by escapeXML.
sub get_pdf_content_ref {
    my ( $self, $file ) = @_;

    # run_pdftotext is evaluated in list context and its result is forwarded
    # verbatim as the argument list of escapeXML.
    my @extracted = $self->run_pdftotext( $file, '-' );
    my $escaped   = $self->escapeXML(@extracted);

    return \$escaped;
}
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
1; |
|
107
|
|
|
|
|
|
|
__END__ |