line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package PDF::Extract;
|
2
|
1
|
|
|
1
|
|
25184
|
use strict;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
53
|
|
3
|
|
|
|
|
|
|
#use warnings;
|
4
|
1
|
|
|
1
|
|
6
|
use vars qw($VERSION);
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
3982
|
|
5
|
|
|
|
|
|
|
$VERSION = '3.04';
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
PDF::Extract - Extracting sub PDF documents from a multi page PDF document
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use PDF::Extract;
|
14
|
|
|
|
|
|
|
$pdf=new PDF::Extract;
|
15
|
|
|
|
|
|
|
$pdf->servePDFExtract( PDFDoc=>"c:/Docs/my.pdf", PDFPages=>"1-3 31-36" );
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
or
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
use PDF::Extract;
|
20
|
|
|
|
|
|
|
$pdf = new PDF::Extract( PDFDoc=>'C:/my.pdf' );
|
21
|
|
|
|
|
|
|
$pdf->getPDFExtract( PDFPages=>$PDFPages );
|
22
|
|
|
|
|
|
|
print "Content-Type: application/pdf\n\n", $pdf->getVars("PDFExtract");
|
23
|
|
|
|
|
|
|
print $pdf->getVars("PDFError");
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
or
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Extract and save, in the current directory, all the pages in a pdf document
|
28
|
|
|
|
|
|
|
use PDF::Extract;
|
29
|
|
|
|
|
|
|
$pdf=new PDF::Extract( PDFDoc=>"test.pdf");
|
30
|
|
|
|
|
|
|
$i=1;
|
31
|
|
|
|
|
|
|
$i++ while ( $pdf->savePDFExtract( PDFPages=>$i ) );
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
PDF Extract is a group of methods that allow the user to quickly grab pages
|
37
|
|
|
|
|
|
|
as a new PDF document from a pre-existing PDF document.
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
With PDF::Extract a new PDF document can be:-
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=over 4
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=item *
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
assigned to a scalar variable with getPDFExtract.
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=item *
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
saved to disk with savePDFExtract.
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=item *
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
printed to STDOUT as a PDF web document with servePDFExtract.
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item *
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
cached and served for a faster PDF web document service with fastServePDFExtract.
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=back
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
These four main methods can be called with or without arguments. The methods
|
62
|
|
|
|
|
|
|
will not work unless they know the location of the original PDF document.
|
63
|
|
|
|
|
|
|
PDFPages defaults to "1". There are no other default values.
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
There are four other methods that deal with setting and getting the public variables.
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=over 4
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=item *
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
getPDFExtractVariables can return an array of variables.
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=item *
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
getVars is an alias of getPDFExtractVariables
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=item *
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
setPDFExtractVariables can set the public variables.
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item *
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
setVars is an alias of setPDFExtractVariables
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=back
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=cut
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# File-scoped working state. These lexicals are shared by every sub in this
# package, so they are common to all PDF::Extract objects in a process.
my $pages      = 1;    # default PDFPages to 1
my $fileNumber = 1;
my ( $filename, $CatalogPages, $Catalog, $Root, $pdf, $pdfFile, $object, $encryptedPdf, $trailerObject );
my ( @object, @obj, @instnum, @pages );
my ( %vars, %getPages, %pageObject );

$vars{"PDFCache"} = ".";    # defaults to this directory

# Regex fragment: optional PDF white space followed by one end-of-line marker.
my $CRLF = '[ \t\r\n\f\0]' . "*(?:\015|\012|(?:\015\012))";
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# ----------------------------------------------------------- The Public Methods --------------------------------------------------------------
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 METHODS
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head2 new PDF::Extract
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Creates a new Extract object with empty state information ready for processing
|
105
|
|
|
|
|
|
|
data both input and output. New can be called with a hash array argument.
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
new PDF::Extract( PDFDoc=>"c:/Docs/my.pdf", PDFPages=>"1-3 31-36" )
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
This will cause a new PDF document to be generated unless there is an error.
|
110
|
|
|
|
|
|
|
Extract->new() simply calls getPDFExtract() if there is an argument.
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# Constructor. May be called on the class or on an existing object.
# Any arguments are treated as a variable hash and passed straight to
# getPDFExtract(), so an extract is attempted immediately.
# Returns the new (blessed, empty) object.
sub new {
    my ( $this, @args ) = @_;
    my $class = ref($this) || $this;
    my $self  = bless {}, $class;
    $self->getPDFExtract(@args) if @args;
    return $self;
}
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 getPDFExtract
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
This method is the main workhorse of the package. It does all the PDF processing
|
126
|
|
|
|
|
|
|
and sets PDFError if it is unable to create a new PDF document. It requires
|
127
|
|
|
|
|
|
|
PDFDoc and PDFPages to be set, either in this call or beforehand, in order to function.
|
128
|
|
|
|
|
|
|
It outputs a PDF document as a string or undef if there is an error.
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
To create an array of PDF documents, each consisting of a single page,
|
131
|
|
|
|
|
|
|
from a multi page PDF document.
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
$pdf = new PDF::Extract( PDFDoc=>'C:/my.pdf' );
|
134
|
|
|
|
|
|
|
$i=1;
|
135
|
|
|
|
|
|
|
while ( $pdf[$i++]=$pdf->getPDFExtract( PDFPages=>$i ) );
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
The lowest valid page number for PDFPages is 1. A value of undef will produce no
|
138
|
|
|
|
|
|
|
output and raise an error. An error will be raised if the PDFPages values do
|
139
|
|
|
|
|
|
|
not correspond to any pages.
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=cut
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Apply any supplied variables, build the extract, and hand it back.
# Returns the extracted PDF as a string, or undef when nothing was produced.
sub getPDFExtract {
    setEnv(@_);
    &getDoc;    # deliberately the bare &-form: the current @_ stays visible to the callee
    return $vars{"PDFExtract"} || undef;
}
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=head2 savePDFExtract
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
This method saves its output to the directory defined for PDFCache. (see PDFCache)
|
152
|
|
|
|
|
|
|
If PDFSaveAs is unset the new PDF's filename will be an amalgam of the original filename, the
|
153
|
|
|
|
|
|
|
requested page numbers and the .pdf file type suffix. If more than one page is extracted into a new PDF
|
154
|
|
|
|
|
|
|
the page numbers will be separated with an underscore "_" for individual pages, ".." for a range of pages.
|
155
|
|
|
|
|
|
|
eg. my6.pdf for a single page (page 6) and my1_3..6.pdf for a multi page PDF (pages 1, 3, 4, 5, 6)
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
$pdf->savePDFExtract(PDFPages=>"1 3-6", PDFDoc=>'C:/my.pdf', PDFCache=>"C:/myCache" );
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
If there is an error then PDFError will be set and savePDFExtract will return a "0".
|
160
|
|
|
|
|
|
|
Otherwise savePDFExtract will return "1" and the saved PDF location and file name will be "C:/myCache/my1_3..6.pdf".
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=cut
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Apply any supplied variables, build the extract and write it into the
# PDFCache directory. Returns 0 if PDFError is set afterwards, otherwise 1.
sub savePDFExtract {
    setEnv(@_);
    &getDoc;    # deliberately the bare &-form: the current @_ stays visible to the callee
    savePdfDoc();
    return 0 if $vars{"PDFError"};
    return 1;
}
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head2 servePDFExtract
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
This method serves its output to STDOUT with the correct header for a PDF document served on the web.
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
$pdf = PDF::Extract->new(
|
177
|
|
|
|
|
|
|
PDFDoc=>'C:/my.pdf',
|
178
|
|
|
|
|
|
|
PDFErrorPage=>"C:/myErrorPage.html" );
|
179
|
|
|
|
|
|
|
$pdf->servePDFExtract( PDFPages=>1);
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
If there is an error then an error page will be served and servePDFExtract will return "0".
|
182
|
|
|
|
|
|
|
Otherwise servePDFExtract will return "1"
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=cut
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# Apply any supplied variables, build the extract and print it (or the error
# page) to STDOUT. Returns 0 if PDFError is set afterwards, otherwise 1.
sub servePDFExtract {
    setEnv(@_);
    &getDoc;    # deliberately the bare &-form: the current @_ stays visible to the callee
    uploadPDFDoc();
    return 0 if $vars{"PDFError"};
    return 1;
}
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head2 fastServePDFExtract
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
This method serves its output to STDOUT with the correct header for a PDF document served on the web.
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
If PDFSaveAs is unset the new PDF's filename will be an amalgam of the original filename, the
|
198
|
|
|
|
|
|
|
requested page numbers and the .pdf file type suffix. If more than one page is extracted into a new PDF
|
199
|
|
|
|
|
|
|
the page numbers will be separated with an underscore "_" for individual pages, ".." for a range of pages.
|
200
|
|
|
|
|
|
|
eg. my6.pdf for a single page (page 6) and my1_3..6.pdf for a multi page PDF (pages 1, 3, 4, 5, 6).
|
201
|
|
|
|
|
|
|
If there is an error then an error page will be served and fastServePDFExtract will return "0".
|
202
|
|
|
|
|
|
|
fastServePDFExtract will return "1" on success.
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
$pdf->setVars(
|
205
|
|
|
|
|
|
|
PDFDoc=>'C:/my.pdf',
|
206
|
|
|
|
|
|
|
PDFCache=>"C:/myCache",
|
207
|
|
|
|
|
|
|
PDFErrorPage=>"C:/myErrorPage.html",
|
208
|
|
|
|
|
|
|
PDFPages=>1);
|
209
|
|
|
|
|
|
|
unless ($pdf->fastServePDFExtract ) {
|
210
|
|
|
|
|
|
|
# there was an error
|
211
|
|
|
|
|
|
|
$error=$pdf->getVars("PDFError") ;
|
212
|
|
|
|
|
|
|
}
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=cut
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# Serve an extract through the cache.
#
# If the target file already exists in PDFCache the request is redirected to
# it straight away. Otherwise the extract is built and saved, and the request
# is redirected to the freshly written file. redirect() ends the process, so
# the code after it only runs when an error has been recorded; in that case
# uploadPDFDoc() serves the error page and 0 is returned.
#
# Fixes over the previous version:
#   * the cache test used a path with a trailing space, so it never matched;
#   * the second redirect ran even after a failure, which made the error page
#     and the return value unreachable.
sub fastServePDFExtract {
    setEnv(@_);

    my $cached = "$vars{'PDFCache'}/$vars{'PDFFilename'}";
    redirect() if -e $cached;

    &getDoc;    # deliberately the bare &-form: the current @_ stays visible to the callee
    savePdfDoc();
    redirect() unless $vars{"PDFError"};

    uploadPDFDoc();
    return $vars{"PDFError"} ? 0 : 1;
}
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=head2 getPDFExtractVariables
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
Get any of the public variables using a list of the variables to get
|
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
($error,$found)=$pdf->getPDFExtractVariables( "PDFError", "PDFPagesFound");
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
This method returns an array of variables corresponding to the named variables passed in as arguments.
|
233
|
|
|
|
|
|
|
If a variable is undefined then its returned value will be undefined.
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=cut
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# Look up public variables by name.
#
# Arguments: the invocant (ignored) followed by a list of variable names.
# Returns:   in list context, the values in the order the names were given
#            (undef for names that are not set);
#            in scalar context, the value of the first name requested.
#
# The scalar-context case previously yielded the number of names instead of a
# value, so the documented "$x = $pdf->getVars('PDFDoc')" form returned 1.
sub getPDFExtractVariables {
    my ( undef, @names ) = @_;
    my @values = @vars{@names};
    return wantarray ? @values : $values[0];
}
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head2 getVars
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
This method is an alias for getPDFExtractVariables. Get any of the public variables using a list of the variables to get
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
@vars=$pdf->getVars( @varNames );
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
This method returns an array of variables corresponding to the named variables passed in as arguments.
|
254
|
|
|
|
|
|
|
If a variable is undefined then its returned value will be undefined.
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# Short alias: forwards every argument to getPDFExtractVariables() and
# returns whatever it returns, in the caller's context.
sub getVars {
    return getPDFExtractVariables(@_);
}
|
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head2 setPDFExtractVariables
|
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
Set any of the public variables using a hash of the variables and their values.
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
($doc,$pages)=$pdf->setPDFExtractVariables(PDFDoc=>'C:/my.pdf', PDFPages=>1);
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
This method sets the variables specified in the argument hash.
|
269
|
|
|
|
|
|
|
They return an array of the new values set.
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=cut
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
# Set public variables from a name => value list and report what was stored.
#
# Arguments: the invocant followed by name => value pairs.
# Returns:   the resulting values, looked up through getVars(), in the order
#            the names were first given. The previous version iterated over
#            "keys" of a temporary hash, so the order of the returned values
#            was unpredictable and could not be unpacked reliably.
sub setPDFExtractVariables {
    my ( $self, @settings ) = @_;
    setEnv( $self, @settings );

    # Names are the even-indexed elements; keep first occurrences only.
    my %seen;
    my @names = grep { !$seen{$_}++ }
                map  { $settings[$_] }
                grep { $_ % 2 == 0 } 0 .. $#settings;

    return getVars( undef, @names );
}
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head2 setVars
|
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
This method is an alias for setPDFExtractVariables. Set any of the public variables using a hash of the variables and their values.
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
@vars=$pdf->setVars( %vars );
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
This method sets the variables specified in the argument hash.
|
287
|
|
|
|
|
|
|
They return an array of the new values set.
|
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=cut
|
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# Short alias: forwards every argument to setPDFExtractVariables() and
# returns whatever it returns, in the caller's context.
sub setVars {
    return setPDFExtractVariables(@_);
}
|
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head1 VARIABLES
|
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=head2 PDFDoc
|
298
|
|
|
|
|
|
|
(set and get)
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
$file=$pdf->getVars("PDFDoc");
|
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
This variable contains the path to the last original PDF document accessed by
|
303
|
|
|
|
|
|
|
getPDFExtract, savePDFExtract, servePDFExtract and fastServePDFExtract.
|
304
|
|
|
|
|
|
|
PDFDoc will be an empty string if there was an error.
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=head2 PDFPages
|
307
|
|
|
|
|
|
|
(set and get)
|
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
$pages=$pdf->setVars( PDFPages =>"1 18-23");
|
310
|
|
|
|
|
|
|
or
|
311
|
|
|
|
|
|
|
$pages=$pdf->getVars("PDFPages");
|
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
This variable contains a list of pages to extract from the original PDF document accessed by
|
314
|
|
|
|
|
|
|
getPDFExtract, savePDFExtract, servePDFExtract and fastServePDFExtract.
|
315
|
|
|
|
|
|
|
Use the join function to create a list of pages from an array.
|
316
|
|
|
|
|
|
|
Such as an array of pages sent from a multi select box on a web form.
|
317
|
|
|
|
|
|
|
PDFPages will default to "1" if unset or there is an error processing the pages string.
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
PDFPages => join( " ", $cgi->param( "PDFPages" )),
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=head2 PDFCache
|
322
|
|
|
|
|
|
|
(set and get)
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
$cachePath=$pdf->setVars( PDFCache =>"C:/myCache");
|
325
|
|
|
|
|
|
|
or
|
326
|
|
|
|
|
|
|
$cachePath=$pdf->getVars("PDFCache");
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
This variable, if set, should contain the FULL PATH to the PDF document cache.
|
329
|
|
|
|
|
|
|
This value is used by savePDFExtract and fastServePDFExtract method calls.
|
330
|
|
|
|
|
|
|
PDFCache will be an empty string if there was an error in setting the value.
|
331
|
|
|
|
|
|
|
If PDFCache path does not exist an attempt will be made to create it recursively.
|
332
|
|
|
|
|
|
|
Any directories that need to be created will be created with permissions of 0x777.
|
333
|
|
|
|
|
|
|
PDFCache defaults to ".", the current directory.
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=head2 PDFSaveAs
|
336
|
|
|
|
|
|
|
(set and get)
|
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
$filename=$pdf->setVars( PDFSaveAs =>"myFileName");
|
339
|
|
|
|
|
|
|
or
|
340
|
|
|
|
|
|
|
$filename=$pdf->getVars("PDFSaveAs");
|
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
If PDFSaveAs is unset the new PDF's filename will be an amalgam of the original filename, the
|
343
|
|
|
|
|
|
|
requested page numbers and the .pdf file type suffix. If more than one page is extracted into a new PDF
|
344
|
|
|
|
|
|
|
the page numbers will be separated with an underscore "_" for individual pages, ".." for a range of pages.
|
345
|
|
|
|
|
|
|
eg. my6.pdf for a single page (page 6) and my1_3..6.pdf for a multi page PDF (pages 1, 3, 4, 5, 6)
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Setting PDFSaveAs to something other than "" or 0 will cause the output to be named with the content of PDFSaveAs.
|
348
|
|
|
|
|
|
|
The .pdf filename extension and any path information will be stripped from the variable if set.
|
349
|
|
|
|
|
|
|
PDFFilename will contain the actual filename used for the last extracted PDF.
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=head2 PDFErrorPage
|
352
|
|
|
|
|
|
|
(set and get)
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
$errorPagePath=$pdf->setVars("PDFErrorPage"=>"C:/myError.html");
|
355
|
|
|
|
|
|
|
or
|
356
|
|
|
|
|
|
|
$errorPagePath=$pdf->getVars("PDFErrorPage");
|
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
PDFErrorPage is a text file that can be used as a template for the error page.
|
359
|
|
|
|
|
|
|
If the PDFErrorPage contains [PDFError], the word PDFError surrounded by square brackets,
|
360
|
|
|
|
|
|
|
then the error description will replace [PDFError].
|
361
|
|
|
|
|
|
|
Otherwise you can devise a generic error description and describe remedial actions to be taken by the viewer.
|
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
If this variable is not set then a default error page will be used.
|
364
|
|
|
|
|
|
|
The default page has a message in red at the top,
|
365
|
|
|
|
|
|
|
"There is system problem in processing your PDF Pages request.",
|
366
|
|
|
|
|
|
|
and then a description of the actual error follows underneath in black.
|
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head2 PDFExtract
|
369
|
|
|
|
|
|
|
(get only)
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
$out=$pdf->getVars("PDFExtract");
|
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
This variable contains the last PDF document processed by getPDFExtract, savePDFExtract, servePDFExtract and fastServePDFExtract.
|
374
|
|
|
|
|
|
|
PDFExtract will be an empty string if there was an error.
|
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head2 PDFPagesFound
|
377
|
|
|
|
|
|
|
(get only)
|
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
$pagesFound=$pdf->getVars("PDFPagesFound");
|
380
|
|
|
|
|
|
|
or
|
381
|
|
|
|
|
|
|
@pages = split ", ", $pdf->getVars("PDFPagesFound");
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
This variable contains a comma separated list of the page numbers that were selected and found within the original PDF document.
|
384
|
|
|
|
|
|
|
PDFPagesFound will be undefined if there was an error in finding any pages.
|
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 PDFPageCount
|
387
|
|
|
|
|
|
|
(get only)
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
$pageCount=$pdf->getVars("PDFPageCount");
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
This variable contains the number of the pages that were selected and found within the original PDF document.
|
393
|
|
|
|
|
|
|
PDFPageCount will be an empty string if there was an error in finding any pages.
|
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=head2 PDFFilename
|
396
|
|
|
|
|
|
|
(get only)
|
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
$filename=$pdf->getVars("PDFFilename");
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
This variable will contain the actual filename.
|
401
|
|
|
|
|
|
|
If PDFSaveAs is unset the new PDF's filename will be an amalgam of the original filename, the
|
402
|
|
|
|
|
|
|
requested page numbers and the .pdf file type suffix. If more than one page is extracted into a new PDF
|
403
|
|
|
|
|
|
|
the page numbers will be separated with an underscore "_" for individual pages, ".." for a range of pages.
|
404
|
|
|
|
|
|
|
eg. my6.pdf for a single page (page 6) and my1_3..6.pdf for a multi page PDF (pages 1, 3, 4, 5, 6).
|
405
|
|
|
|
|
|
|
If PDFSaveAs is set then PDFSaveAs will be used to construct PDFFilename.
|
406
|
|
|
|
|
|
|
The full path to the extracted pdf file can be obtained by -
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
$fullpath = $pdf->getVars("PDFCache") ."/". $pdf->getVars("PDFFilename");
|
409
|
|
|
|
|
|
|
or
|
410
|
|
|
|
|
|
|
($path,$filename) = $pdf->getVars("PDFCache","PDFFilename");
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=head2 PDFError
|
413
|
|
|
|
|
|
|
(get only)
|
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
$error=$pdf->getVars("PDFError");
|
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
This variable contains a string describing the errors if any in processing the original PDF file.
|
418
|
|
|
|
|
|
|
PDFError is guaranteed to be set if getPDFExtract, savePDFExtract, servePDFExtract or fastServePDFExtract fail and return a "0".
|
419
|
|
|
|
|
|
|
PDFError will be an empty string if there was no error.
|
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=head2 PDFDebug
|
422
|
|
|
|
|
|
|
(set for method call duration only)
|
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
$pdf->setVars(
|
425
|
|
|
|
|
|
|
PDFDoc=>'C:\docs\pdf',
|
426
|
|
|
|
|
|
|
PDFPages=>"2 6-8 ",
|
427
|
|
|
|
|
|
|
PDFDebug=>1);
|
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
This is really a directive and not a true variable. It is used to debug the setting of variables in a PDF::Extract method call.
|
430
|
|
|
|
|
|
|
PDFDebug as used above will print:-
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
These variables are to be set
|
433
|
|
|
|
|
|
|
PDFDoc="C:\docs\pdf/"
|
434
|
|
|
|
|
|
|
PDFPages="2 6-8 "
|
435
|
|
|
|
|
|
|
PDFDebug="1"
|
436
|
|
|
|
|
|
|
These variables have been set
|
437
|
|
|
|
|
|
|
PDFCache="C:/myCache"
|
438
|
|
|
|
|
|
|
PDFFilename="2_6..8_.pdf"
|
439
|
|
|
|
|
|
|
PDFPagesFound=""
|
440
|
|
|
|
|
|
|
PDFDoc=""
|
441
|
|
|
|
|
|
|
PDFPages="2, 6, 7, 8"
|
442
|
|
|
|
|
|
|
PDFPageCount=""
|
443
|
|
|
|
|
|
|
PDFExtract=""
|
444
|
|
|
|
|
|
|
PDFError="PDF document "" not found at C:/Perl/site/lib/PDF/Extract.pm line 467"
|
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=cut
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# ----------------------------------------------------------- The Private Functions --------------------------------------------------------------
|
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# Take the name => value pairs given to a public method and update the shared
# package state accordingly.
#
# Arguments: the invocant (ignored) followed by name => value pairs. The
#            recognised names are PDFDebug, PDFErrorPage, PDFDoc, PDFPages,
#            PDFCache and PDFSaveAs.
# Effects:   PDFError is cleared first; problems are then recorded through
#            error(). Setting PDFDoc or PDFPages discards the results of any
#            previous extraction. PDFFilename is recomputed on every call.
#
# Changes from the previous version:
#   * files are read through lexical handles opened with three-argument open,
#     so the file name can no longer be interpreted as an open mode or command;
#   * the page specification is parsed explicitly instead of being passed to a
#     string eval, so numbers with leading zeros are read as decimal;
#   * the normalised PDFPages list is sorted numerically, not lexically.
sub setEnv {
    my ( undef, %PDF ) = @_;
    my $requestedPages = 0;
    $vars{"PDFError"} = "";

    if ( $PDF{"PDFDebug"} ) {
        print "These variables are to be set\n";
        foreach my $key ( keys %PDF ) {
            print "\t$key=\"$PDF{$key}\"\n";
        }
    }

    if ( $PDF{"PDFErrorPage"} ) {
        $vars{"PDFErrorPage"} = "";
        if ( -f $PDF{"PDFErrorPage"} ) {
            if ( open my $fh, '<', $PDF{"PDFErrorPage"} ) {
                $vars{"PDFErrorFile"} = do { local $/; <$fh> };    # slurp the template
                close $fh;
                $vars{"PDFErrorPage"} = $PDF{"PDFErrorPage"};
            }
            else {
                error( "Can't open PDF Error page template file $PDF{\"PDFErrorPage\"} to read\n", __FILE__, __LINE__ );
            }
        }
        else {
            error( "PDF Error page template file \"$PDF{PDFErrorPage}\" not found", __FILE__, __LINE__ );
        }
    }

    if ( $PDF{"PDFDoc"} ) {
        # A new source document invalidates everything derived from the old one.
        $vars{"PDFDoc"} = "";
        $vars{"PDFPageCount"} = $vars{"PDFPagesFound"} = $vars{"PDFExtract"} = "";
        $pdfFile = $filename = $CatalogPages = $Root = $object = $encryptedPdf = $trailerObject = "";
        @object = @obj = @pages = ();
        %pageObject = ();

        # Base name of the document, without directory or .pdf suffix.
        $filename = $1 if $PDF{"PDFDoc"} =~ /([^\\\/]+)\.pdf$/i;

        if ( -f $PDF{"PDFDoc"} ) {
            if ( open my $fh, '<', $PDF{"PDFDoc"} ) {
                binmode $fh;
                $pdfFile = do { local $/; <$fh> };    # slurp the whole document
                close $fh;
                $vars{"PDFDoc"} = $PDF{"PDFDoc"};
            }
            else {
                error( "Can't open PDF document $PDF{\"PDFDoc\"} to read\n", __FILE__, __LINE__ );
            }
        }
        else {
            error( " PDF document \"$filename\" not found", __FILE__, __LINE__ );
        }
    }

    if ( $PDF{"PDFPages"} ) {
        # A new page selection invalidates the previous extraction results.
        $vars{"PDFPages"} = "";
        $vars{"PDFPageCount"} = $vars{"PDFPagesFound"} = $vars{"PDFExtract"} = "";
        $CatalogPages = $Root = $object = $encryptedPdf = $trailerObject = "";
        @object = @obj = @pages = ();
        %getPages = %pageObject = ();

        # Normalise "1 3-6" / "1 3..6" into "1,3..6".
        $pages = $PDF{"PDFPages"};
        $pages =~ s/\.\./-/g;
        $pages =~ s/\.//g;
        $pages =~ s/\-/../g;
        $pages =~ s/ +/,/g;
        $pages =~ s/[^\d,\.]//g;    # allow only numbers to be processed
        $pages = 1 unless $pages;   # defaults to 1

        $fileNumber = $pages;
        $fileNumber =~ s/,/_/g;

        # Expand each "N" or "N..M" item. Items of any other shape are ignored.
        # Note that a very wide range is still expanded page by page.
        foreach my $item ( split /,/, $pages ) {
            next unless $item =~ /^(\d+)(?:\.\.(\d+))?$/;
            my $first = int $1;
            my $last  = defined $2 ? int $2 : $first;
            foreach my $page ( $first .. $last ) {
                next unless $page;    # page numbers start at 1
                $getPages{$page} = 1;
                $requestedPages++;
            }
        }

        if ($requestedPages) {
            $pages = join ", ", sort { $a <=> $b } keys %getPages;
            $vars{"PDFPages"} = $pages;
        }
        else {
            error( "Can't get PDF Pages. No page numbers were set with '$pages' ", __FILE__, __LINE__ );
        }
    }

    if ( $PDF{"PDFCache"} ) {
        $vars{"PDFCache"} = dir( $PDF{"PDFCache"} );
    }

    if ( defined $PDF{"PDFSaveAs"} ) {    # we also want to be able to set PDFSaveAs to nothing ("")
        $vars{"PDFSaveAs"} = $PDF{"PDFSaveAs"};
        $vars{"PDFSaveAs"} =~ s/\.pdf$//i;    # just want the name, not the path and not the .pdf tag
        $vars{"PDFSaveAs"} =~ s/^.*[\/\\]//;
    }

    # Reported bug 41628 - $fileNumber might not be defined. Suggested fix by Patrick Bourdon to avoid warnings
    $vars{"PDFFilename"} = $vars{"PDFSaveAs"} ? $vars{"PDFSaveAs"} . ".pdf" : $filename . ( $fileNumber || 1 ) . '.pdf';

    if ( $PDF{"PDFDebug"} ) {
        print "These variables have been set\n";
        foreach my $key ( keys %vars ) {
            print "\t$key=\"$vars{$key}\"\n";
        }
    }
}
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# Make sure a cache directory exists, creating missing levels one at a time.
#
# Argument: the directory path; back slashes are accepted as separators.
# Returns:  the path (with back slashes on MSWin32), or "" after recording an
#           error through error() when the directory does not exist at the end.
#
# Changes from the previous version:
#   * the mkdir mode was written as 0x777 (hexadecimal, i.e. octal 03567),
#     which left new directories unwritable for their owner; it is now 0777;
#   * every component was prefixed with "/", so a relative path such as
#     "cache/pdf" was created under the file system root; the first component
#     is now used as given.
sub dir {
    my ($path) = @_;
    $path =~ s/\\/\//g;

    my $dir;
    foreach my $folder ( split "/", $path ) {
        if ( !defined $dir ) {
            $dir = $folder;    # "" for an absolute path, "C:" for a drive, else a relative name
        }
        else {
            $dir .= $folder =~ /:/ ? $folder : "/$folder";
        }
        next if $folder eq '' or $folder =~ /:/;    # nothing to create for a root or a drive
        mkdir $dir, 0777 unless -d $dir;
    }

    $path =~ s/\//\\/g if ( $^O eq "MSWin32" );
    return error( "This Cache path \"$path\" can't be created", __FILE__, __LINE__ )
        unless -d $path;
    return $path;
}
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# Print an HTML content-type header and end the process, using the result of
# the print as the exit status.
# NOTE(review): as it stands nothing follows the header, so no redirect target
# is actually sent; the markup that names the cached file appears to be
# missing here - confirm against the upstream module source.
sub redirect {
    my $printed = print "Content-Type: text/html\n\n";
    exit $printed;
}
|
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# Build the extract from the loaded document unless one is already present.
# Records an error (and returns "") when no document has been loaded or when
# none of the requested pages were found.
sub getDoc {
    return if $vars{"PDFExtract"};

    unless ($pdfFile) {
        return error( "There is no pdf document to extract pages from", __FILE__, __LINE__ );
    }

    getRoot();
    getPages( $CatalogPages, 0 );

    unless ( $vars{"PDFPageCount"} ) {
        # Reported bug 41628 - $pages might not be defined. Suggested fix by Patrick Bourdon to avoid warnings
        my $wanted = defined $pages ? $pages : '?';
        return error( "There are no pages in $filename.pdf that match '" . $wanted . "' ", __FILE__, __LINE__ );
    }

    getObj( $Root, 0 );
    &makePdfDoc;    # deliberately the bare &-form: the current @_ stays visible to the callee
}
|
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
# Write the current extract to "<PDFCache>/<PDFFilename>".
#
# Does nothing (returns "") when an error is already recorded. Returns 1 on
# success, or "" after recording an error when the file cannot be opened or
# cannot be written completely.
#
# The file is now opened with three-argument open on a lexical handle, and the
# result of close is checked because buffered write errors only surface there.
sub savePdfDoc {
    return "" if $vars{"PDFError"};

    my $target = "$vars{'PDFCache'}/$vars{'PDFFilename'}";
    open my $fh, '>', $target
        or return error( "Can't open $target", __FILE__, __LINE__ );
    binmode $fh;
    print {$fh} $vars{"PDFExtract"};
    close $fh
        or return error( "Can't write $target: $!", __FILE__, __LINE__ );
    return 1;
}
|
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
# Print the current extract to STDOUT as an HTTP response, or the error page
# when an error is recorded or the extract is empty.
#
# Changes from the previous version:
#   * STDOUT is switched to binary mode so the PDF bytes and the explicit
#     CRLF header endings are not altered by newline translation;
#   * nothing is printed after the document any more, so the body is exactly
#     Content-Length bytes long.
sub uploadPDFDoc {
    return servError("") if $vars{"PDFError"};

    my $len = length $vars{"PDFExtract"};
    return servError( "PDF output is null, No output", __FILE__, __LINE__ ) unless $len;

    binmode STDOUT;
    print "Content-Disposition: inline; filename=$vars{'PDFFilename'}\r\n",
          "Content-Length: $len\r\n",
          "Content-Type: application/pdf\r\n",
          "\r\n",
          $vars{"PDFExtract"};
}
|
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
#------------------------------------ support Routines --------------------------------------------
|
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
# Print an HTML error page to STDOUT.
#
# Arguments: an optional error message with the file and line it was raised
#            at; when a message is given it is first recorded through error().
# Output:    the PDFErrorPage template with every "[PDFError]" replaced by the
#            recorded error text, or a built-in page when no template is set.
# Returns:   "".
#
# Changes from the previous version:
#   * the error text is HTML-escaped before it is inserted, because it can
#     contain caller-supplied file names and page specifications;
#   * the built-in page had no markup separating the heading from the error
#     text ("...requestERROR: ..."); it now shows the heading in red with the
#     error underneath, as the module documentation describes.
#     NOTE(review): the exact original markup is not known - confirm the
#     wording and styling against the upstream module source.
sub servError {
    my ( $error, $file, $line ) = @_;
    error( $error, $file, $line ) if $error;

    my %entity = ( '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' );
    my $detail = defined $vars{"PDFError"} ? $vars{"PDFError"} : '';
    $detail =~ s/([&<>"])/$entity{$1}/g;

    my $page;
    if ( $vars{"PDFErrorPage"} ) {
        $page = $vars{"PDFErrorFile"};
        $page =~ s/\[PDFError\]/$detail/sg;
    }
    else {
        $page = "<p style=\"color:red\">There is system problem in processing your PDF Pages request</p>\n"
              . "<p>ERROR: $detail </p>";
    }

    print "Content-Type: text/html\n\n$page";
    return "";
}
|
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
# Append a message, with the file and line it was raised at, to PDFError.
# Always returns "" so callers can write "return error(...)".
sub error {
    my ( $error, $file, $line ) = @_;
    $vars{"PDFError"} .= $error . "\nat " . $file . " line " . $line . "\n";
    return "";
}
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
#------------------------------------ PDF Page Routines --------------------------------------------
|
620
|
|
|
|
|
|
|
# Locate the trailer and the document catalog in the loaded PDF.
#
# Works on a fresh copy of the document text in $pdf. Sets $Root (catalog
# object number), $trailerObject (trailer text with /Size turned into a
# placeholder and /Prev removed), $Catalog and $CatalogPages, and pulls in the
# /Info and /Encrypt objects through getObj() when the trailer names them.
# Does nothing when an error is recorded or $Root is already set.
#
# The trailer text used to be declared with "my $val = $1 if ...". A "my"
# with a statement modifier has undefined behaviour when the condition is
# false, so the declaration and the conditional assignment are now separate.
sub getRoot {
    return "" if $vars{"PDFError"};
    return if $Root;

    $pdf = $pdfFile;

    my $val;
    $val = $1 if $pdf =~ /(trailer\s*<<.*?>>\s*)/s;

    $Root = int $1 if $val =~ /\/Root (\d+) 0 R/s;
    $val =~ s/\/Size \d+/\/Size __Size__/s;
    $val =~ s/\/Prev \d+//s;    # delete the Prev reference if it is there (used to delete up to CRLF, but that croaked in 1.5)

    getObj( $1, $2 ) if $val =~ /\/Info (\d+) (\d+) R/s;
    getObj( $encryptedPdf = $1, $2 ) if $val =~ /\/Encrypt (\d+) (\d+) R/s;
    $trailerObject = $val;

    $Catalog      = $1     if $pdf     =~ /\D($Root 0 obj.*?endobj\s*)/s;
    $CatalogPages = int $1 if $Catalog =~ /\/Pages (\d+) 0 R\s*/s;

    # These entries refer to the full document and won't conform to the
    # extracted pages, so they are removed from the catalog (3.01).
    $Catalog =~ s/\/Outlines \d+ \d+ R//;
    $Catalog =~ s/\/PageLabels \d+ \d+ R//;
    $Catalog =~ s/\/Threads \d+ \d+ R//;
    $Catalog =~ s/\/StructTreeRoot \d+ \d+ R//;

    # Put the trimmed catalog back into the working copy.
    $pdf =~ s/(\D)$Root 0 obj.*?endobj\s*/$1$Catalog/s;
}
|
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# Collect an indirect object, and everything it refers to, for the output.
#
# Arguments: object number, generation number and, when called from the
#            substitution below, the character that followed the reference.
# Returns:   "" when an error is recorded; otherwise the text "<obj> 0 R"
#            followed by that trailing character when one was passed.
#
# An object that has not been seen yet is copied out of $pdf into @object and
# marked in @obj; the references inside it are then followed recursively.
sub getObj {
    return "" if $vars{"PDFError"};
    my ( $obj, $instnum, $gd ) = @_;

    if ( !$obj[$obj] ) {
        if ( $pdf =~ /\D($obj $instnum obj.*?endobj\s*)/s ) {
            $object = $1;
            $obj[$obj]++;
            $object[$obj]  = $object;
            $instnum[$obj] = $instnum;

            # Convert page dest to uri if not present
            $object[$obj] =~ s/(\/Dest \[ )(\d+)( \d.*?)/&uri($1,$2,$3)/es;

            # Follow every "N G R" reference.
            # Reported bug 33707 found 0 0 R in 0 0 0 RG generated by BUFFETTI software
            $object[$obj] =~ s/(\d+) (\d+) R([^GD])/&getObj($1, $2, $3)/ges;

            # Delete empty Annots array
            $object[$obj] =~ s/\/Annots \[\s+\]\s+//s;
        }
        else {
            error( "Can't find object $obj $instnum obj ", __FILE__, __LINE__ );
        }
    }

    # Reported bugs 38579 & 41628 - $gd might not be defined. Suggested fix by Patrick Bourdon to avoid warnings
    my $reference = "$obj 0 R";
    $reference .= $gd if defined $gd;
    return $reference;
}
|
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
# Decide what to do with a "/Dest [ N ..." link target.
#
# Arguments: the three captured pieces of the destination (prefix, object
#            number, remainder).
# Returns:   the pieces joined back together when the object is one of the
#            requested pages, otherwise "" so the destination is dropped.
sub uri {
    my ( $dest, $obj, $param ) = @_;
    return $getPages{ $pageObject{$obj} } ? "$dest$obj$param" : "";
}
|
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# getPages( $obj, $instnum )
#
# Walks the page tree rooted at object "$obj $instnum", numbering every leaf
# in document order and pruning the tree (inside $pdf) down to the wanted
# pages.
#
# For a node with a /Kids array each kid is visited recursively, then the
# node's /Kids array and /Count value in $pdf are rewritten to hold only the
# kids that still contain wanted pages.  Any other object is treated as a
# leaf: it is appended to @pages, its page number is stored in
# $pageObject{$obj}, and when %getPages selects that page number it is added
# to $vars{PDFPagesFound} and counted in $vars{PDFPageCount}.
#
# Returns the list ( $found, $count ): $found is the reference text
# ("N G R ") to keep in the parent's /Kids array, or "" when nothing under
# this node is wanted; $count is the number of wanted pages beneath it.
# Returns "" once an error has been recorded in $vars{PDFError}.
sub getPages {
    return "" if $vars{"PDFError"};
    my($obj, $instnum)=@_;

    # Declare first, assign second.  "my $val=$1 if ..." has undefined
    # behaviour in Perl: when the match fails the variable may keep a value
    # from an earlier call, which matters in this recursive routine.
    # by Stefano Capuzzimato. There can be even no space after endobj (* instead of +)
    my $val = "";
    $val = $1 if $pdf=~/\s($obj $instnum obj.*?endobj\s*)/s;

    my $found = "";
    my $count = 0;

    if ( $val=~/\/Kids\s*\[\s*(.*?)\]/s ) {     # by Stefano Capuzzimato. You can find spaces between "Kids" and "["
        my $kids = $1;
        $kids=~s/\s+/ /gs;                      # normalise whitespace so the split below works
        foreach my $kid (split " R ", $kids) {
            # $kid is "N G" (the last one may still carry a trailing "R",
            # which arrives as an ignored third argument).
            my($f,$c)=&getPages(split " ", $kid);
            $found .= $f;
            $count += $c;
        }
        # Rewrite this node in the source text so that it lists only the
        # surviving kids and the matching page count.
        $pdf=~s/(\D$obj $instnum obj.*?\/Kids\s*\[).*?\]/$1$found\]/s;   # by Stefano Capuzzimato. Between "Kids" and "[" there can be even no space
        $pdf=~s/(\D$obj $instnum obj.*?\/Count )\d+/$1$count/s;
        $found = "$obj $instnum R " if $found;  # keep this node only when something below it is wanted
    }
    else {
        # push returns the new length of @pages, i.e. this page's number.
        $pageObject{$obj} = push @pages, $obj;  # create a hash of all pages
        if ( $getPages{ $pageObject{$obj} } ) {
            $found = "$obj $instnum R ";
            $count = 1;
            $vars{"PDFPagesFound"} .= $vars{"PDFPagesFound"} ? ", $pageObject{$obj}" : $pageObject{$obj};
            $vars{"PDFPageCount"}++;
        }
    }
    ($found,$count);
}
|
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# makePdfDoc()
#
# Assembles the extracted document in $vars{PDFExtract}:
#   1. the header line of the source document plus any comment lines that
#      directly follow it,
#   2. every object collected in @object, in ascending object number,
#   3. a cross-reference table,
#   4. the trailer ($trailerObject with __Size__ filled in), startxref and
#      the %%EOF marker.
#
# Objects keep their original numbers, so the numbering normally has gaps.
# The cross-reference table is therefore written as one subsection per
# contiguous run of object numbers ("first count" followed by the entries);
# a single "0 N" subsection would attach the offsets to the wrong object
# numbers.  /Size is set to the highest object number plus one.
#
# Returns "" when an error is already recorded, the result of &error when
# the source has no header line or no objects were collected, and otherwise
# the completed document text.
sub makePdfDoc {
    return "" if $vars{"PDFError"};
    return &error("$vars{PDFDoc} is not a PDF file \n$pdf",__FILE__,__LINE__)
        unless $pdf=~s/^(.*?)($CRLF+)/$2/;
    $vars{"PDFExtract"}=$1.$2;
    $vars{"PDFExtract"}.=$1.$2
        while( $pdf=~s/^\s+(\%.*?)($CRLF+)/$2/);   #include comment lines if any

    # Each section is [ first object number, entry, entry, ... ].  The table
    # always starts with the free entry for object 0.  Every entry is exactly
    # 20 bytes long, including its two character end-of-line.
    my @sections = ( [ 0, "0000000000 65535 f\015\012" ] );
    my $cnt = 0;                                 # number of objects written
    for my $objNum ( 1 .. $#object ) {
        next unless $object[$objNum];

        my $entry = sprintf("%0.10d %0.5d n\015\012",
                            length $vars{"PDFExtract"},   # byte offset at which the object starts
                            $instnum[$objNum] );

        # Extend the current subsection when this object number follows on
        # directly, otherwise start a new subsection.
        my $current = $sections[-1];
        if ( $current->[0] + $#{$current} == $objNum ) {
            push @{$current}, $entry;
        }
        else {
            push @sections, [ $objNum, $entry ];
        }

        $vars{"PDFExtract"}.=$object[$objNum];
        $cnt++;
    }
    return &error("$vars{PDFDoc} does not contain objects",__FILE__,__LINE__)
        if $cnt==0;

    my $xref = "";
    foreach my $section (@sections) {
        my($first,@entries)=@{$section};
        $xref .= "$first ".scalar(@entries)."\n".join("",@entries);
    }

    my $startXref=length $vars{"PDFExtract"};
    $vars{"PDFExtract"}.="xref\n$xref";          # changed \r to \n for unixish systems by Alberto Accomazzi

    # @object is indexed by object number, so its length is the highest
    # object number in use plus one, which is what /Size has to be.
    my $size = scalar @object;
    $trailerObject=~s/__Size__/$size/s;
    $vars{"PDFExtract"}.="$trailerObject\nstartxref\n$startXref\n\%\%EOF\n";   # changed \r to \n for unixish systems by Alberto Accomazzi
}
|
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=head1 NOTES
|
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
This version of PDF::Extract has been designed to produce output to the PDF Standard as defined in the PDF Reference Seventh Edition.
|
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
However some third party PDF applications require a non standard feature of PDF documents.
|
733
|
|
|
|
|
|
|
Namely: The sequential numbering of objects starting at zero.
|
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
PDF::Extract treats a PDF file as a flat file, for speed of processing, and consequently knows nothing of PDF objects.
|
736
|
|
|
|
|
|
|
Objects extracted remain exactly as they were in the original document.
|
737
|
|
|
|
|
|
|
These objects are not renumbered. There will be gaps in the object number sequence. This is allowed in the specification.
|
738
|
|
|
|
|
|
|
Only the catalog and page tree objects are altered.
|
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
See the web site if you need information on how to make PDF documents comply with what your third party PDF application expects.
|
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=head1 BUGS
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
There is a bug that Jon Schaeffer reported that had to do with some font resources not being found in the extracted PDF.
|
745
|
|
|
|
|
|
|
The source of the bug has, as yet, not been found.
|
746
|
|
|
|
|
|
|
If you find such a bug, please email a one-page original PDF from which PDF::Extract produces an extract that shows the problem.
|
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
Please report any bugs you find.
|
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=head1 AUTHOR
|
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
Noel Sharrock E<lt>mailto:nsharrok@lgmedia.com.auE<gt>
|
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
PDF::Extract's home page http://www.lgmedia.com.au/page.aspx?ID=8
|
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
Forum for users and developers has been hacked and database no longer exists. There are some sad folk around.
|
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=head1 SUPPORT
|
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
Much thanks to:-
|
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
Lyman Byrd for his welcome programming suggestions and editorial comments on the POD.
|
763
|
|
|
|
|
|
|
Michael Cox for his suggestion of PDFSaveAs and for the time he spent in testing the module.
|
764
|
|
|
|
|
|
|
Alberto Accomazzi for sharing his time and his knowledge of Unixish PDF voodoo magick.
|
765
|
|
|
|
|
|
|
Stefano Capuzzimato for correcting some stuff in the regexes he found.
|
766
|
|
|
|
|
|
|
Geert Theys for finding a small bug and supplying an excellent solution.
|
767
|
|
|
|
|
|
|
Jon Schaeffer for help with finding a solution to a bug in extracting Adobe 6+ pages.
|
768
|
|
|
|
|
|
|
Dario Santini for reporting a bug at http://rt.cpan.org//Ticket/Display.html?id=33707
|
769
|
|
|
|
|
|
|
Patrick Bourdon suggested several fixes for undefined string concatenation warnings.
|
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
=head1 COPYRIGHT
|
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
Copyright (c) 2005 by Noel Sharrock. All rights reserved.
|
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=head1 LICENSE
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself,
|
778
|
|
|
|
|
|
|
i.e., under the terms of the ``Artistic License'' or the ``GNU General Public License''.
|
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
The C library at the core of this Perl module can additionally be redistributed and/or modified
|
781
|
|
|
|
|
|
|
under the terms of the ``GNU Library General Public License''.
|
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=head1 DISCLAIMER
|
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
This package is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
786
|
|
|
|
|
|
|
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
See the ``GNU General Public License'' for more details.
|
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
PDF::Extract - Extracting sub PDF documents from a multipage PDF document
|
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=cut
|
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
#------------------------------------------ End PDF Page ------------------------------------------
|
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
1;
|