File Coverage

blib/lib/PDF/Extract.pm

Criterion	Covered	Total	%
statement	6	223	2.6
branch	0	120	0.0
condition	0	5	0.0
subroutine	2	24	8.3
pod	9	22	40.9
total	17	394	4.3

line	stmt	bran	sub	pod	time	code
1						package PDF::Extract;
2	1		1		25184	use strict;
	1				2
	1				53
3						#use warnings;
4	1		1		6	use vars qw($VERSION);
	1				3
	1				3982
5						$VERSION = '3.04';
6
7						=head1 NAME
8
9						PDF::Extract - Extracting sub PDF documents from a multi page PDF document
10
11						=head1 SYNOPSIS
12
13						use PDF::Extract;
14						$pdf=new PDF::Extract;
15						$pdf->servePDFExtract( PDFDoc=>"c:/Docs/my.pdf", PDFPages=>"1-3 31-36" );
16
17						or
18
19						use PDF::Extract;
20						$pdf = new PDF::Extract( PDFDoc=>'C:/my.pdf' );
21						$pdf->getPDFExtract( PDFPages=>$PDFPages );
22						print "Content-Type text/plain\n\n";
608						}
609	0					print "Content-Type: text/html\n\n$error";
610	0					"";
611						}
612
613						sub error {
						# Record an error message in the shared %vars hash.
						# Arguments: the message text, then the file name and line number of the
						# caller (call sites in this file pass __FILE__ and __LINE__).
614	0		0	0		my ($error,$file,$line)=@_;
						# Messages are appended, so earlier errors are kept alongside later ones.
615	0					$vars{"PDFError"}.="$error\nat $file line $line\n";
						# Always evaluates to the empty string, which becomes the return value.
616	0					"";
617						}
618
619						#------------------------------------ PDF Page Routines --------------------------------------------
620						sub getRoot {
						# Read the trailer of the loaded document and prepare the catalog.
						# Works on the package variables: copies $pdfFile into $pdf, sets $Root,
						# $trailerObject, $Catalog and $CatalogPages, and pulls the /Info and
						# /Encrypt objects in through getObj.
						# Returns "" when an error is already recorded in $vars{"PDFError"}, and
						# returns early when $Root has already been set.
621	0	0	0	0		return "" if $vars{"PDFError"};
622	0	0				return if $Root;
623	0					$pdf=$pdfFile;
						# Trailer dictionary: optional whitespace after the keyword, then a
						# non-greedy match (with /s, across newlines) up to the closing ">>".
624	0	0				my $val=$1 if $pdf=~/(trailer\s*<<.*?>>\s*)/s;
625	0	0				$Root=int $1 if $val=~/\/Root (\d+) 0 R/s;
						# __Size__ is a placeholder that makePdfDoc replaces with the object count.
626	0					$val=~s/\/Size \d+/\/Size __Size__/s;
627	0					$val=~s/\/Prev \d+//s; # delete Prev reference if it is there; was delete to CRLF but croaked in 1.5
628
629	0	0				&getObj($1, $2 ) if $val=~/\/Info (\d+) (\d+) R/s;
						# The Encrypt object number is also remembered in $encryptedPdf.
630	0	0				&getObj( $encryptedPdf=$1, $2 ) if $val=~/\/Encrypt (\d+) (\d+) R/s;
631	0					$trailerObject=$val;
						# Catalog object: everything from "<Root> 0 obj" to the first "endobj",
						# including any trailing whitespace.
632	0	0				$Catalog=$1 if $pdf=~/\D($Root 0 obj.*?endobj\s*)/s;
633	0	0				$CatalogPages=int $1 if $Catalog=~/\/Pages (\d+) 0 R\s*/s;
634	0					$Catalog=~s/\/Outlines \d+ \d+ R//; # delete outlines as they won't conform to extracted pages 3.01
635	0					$Catalog=~s/\/PageLabels \d+ \d+ R//; # delete PageLabels as they won't conform to extracted pages 3.01
636	0					$Catalog=~s/\/Threads \d+ \d+ R//; # delete Threads as they won't conform to extracted pages 3.01
637	0					$Catalog=~s/\/StructTreeRoot \d+ \d+ R//; # delete StructTreeRoot as it won't conform to extracted pages 3.01
						# Put the trimmed catalog back into the working copy of the document.
638	0					$pdf=~s/(\D)$Root 0 obj.*?endobj\s*/$1$Catalog/s;
639						}
640
641						sub getObj {
						# Copy one indirect object, and everything it references, out of $pdf.
						# Arguments: object number, generation number, and optionally the single
						# character that followed the "R" of the reference being replaced.
						# Returns the reference text "<obj> 0 R" with that character appended when
						# it was supplied; returns "" when an error is already recorded.
642	0	0	0	0		return "" if $vars{"PDFError"};
643	0					my($obj,$instnum,$gd)=@_;
						# @obj marks objects already collected, so each one is processed only once.
644	0	0				unless ($obj[$obj] ) {
						# Non-greedy match (with /s, across newlines) from "<obj> <gen> obj" to the
						# first "endobj", including any trailing whitespace.
645	0	0				if ($pdf=~/\D($obj $instnum obj.*?endobj\s*)/s ) {
646	0					$object = $1;
647						# return "" if $object=~/\/GoToR/; # Don't want these link objects
648	0					$obj[$obj]++;
649	0					$object[$obj]=$object;
650	0					$instnum[$obj]=$instnum;
651
652	0					$object[$obj]=~s/(\/Dest \[ )(\d+)( \d.*?)/&uri($1,$2,$3)/es; # Convert page dest to uri if not present
	0
						# Each "N G R" reference is replaced by the result of a recursive getObj
						# call, which also collects the referenced object.
653	0					$object[$obj]=~s/(\d+) (\d+) R([^GD])/&getObj($1, $2, $3)/ges; # Reported bug 33707 found 0 0 R in 0 0 0 RG generated by BUFFETTI software
	0
654						# $object[$obj]=~s/(\/Dest \[ \d+)==/$1 0/s; # Don't follow this path
655	0					$object[$obj]=~s/\/Annots \[\s+\]\s+//s; # Delete empty Annots array
656						} else {
657	0					&error("Can't find object $obj $instnum obj ",__FILE__,__LINE__);
658						}
659						}
660	0	0				(defined $gd) ? "$obj 0 R$gd" : "$obj 0 R"; # Reported bugs 38579 & 41628 - $gd might not be defined. Suggested fix by Patrick Bourdon to avoid warnings
661						}
662
663						sub uri {
						# Replacement callback for the /Dest substitution in getObj.
						# Arguments: the three captured pieces of the destination ("/Dest [ ",
						# the target object number, and the text that followed it).
664	0		0	0		my($dest,$obj,$param)=@_;
						# %pageObject maps a page object number to its page number and %getPages
						# flags the requested pages: a destination on a requested page is
						# returned unchanged.
665	0	0				return "$dest$obj$param" if $getPages{ $pageObject{$obj} }; # page is in document
666						#return "/A << /S /URI /URI ($web?PDFDoc%26$vars{PDFDoc}&PDFExtract%26$pageObject{$obj})>> \r"
667						# unless $encryptedPdf;
						# Otherwise the matched destination text is replaced by the empty string.
668	0					"";
669						}
670
671						sub getPages {
						# Walk the page tree from object "<obj> <instnum>", keeping requested pages.
						# Arguments: object number and generation number of a page-tree node.
						# Returns a two-element list: the space-separated references that were kept
						# below this node, and how many pages they represent. Returns "" when an
						# error is already recorded in $vars{"PDFError"}.
672	0	0	0	0		return "" if $vars{"PDFError"};
673	0					my($obj, $instnum)=@_;
674	0	0				my $val=$1 if $pdf=~/\s($obj $instnum obj.*?endobj\s*)/s;#by Stefano Capuzzimato. There can be even no space after endobj (* instead of +)
675	0					my $found="";
676	0					my $count=0;
						# A node with a /Kids array is an intermediate node; anything else is
						# treated as a page.
677	0	0				if ($val=~/\/Kids\s*\[\s*(.*?)\]/s ) {#by Stefano Capuzzimato. You can find spaces between "Kids" and "["
678	0					my $kids=$1;
679	0					$kids=~s/\s+/ /gs;
						# Splitting on " R " leaves "N G" pairs, which are split again into the
						# two arguments of the recursive call.
680	0					foreach my $kid (split " R ", $kids) {
681	0					my($f,$c)=&getPages(split " ", $kid);
682	0					$found.=$f;
683	0					$count+=$c;
684						}
						# Rewrite this node inside $pdf so that /Kids and /Count describe only
						# the pages that were kept.
685	0					$pdf=~s/(\D$obj $instnum obj.*?\/Kids\s*\[).*?\]/$1$found\]/s;#by Stefano Capuzzimato. Between "Kids" and "[" there can be even no space
686	0					$pdf=~s/(\D$obj $instnum obj.*?\/Count )\d+/$1$count/s;
						# The parent only needs a reference to this node, and only when
						# something below it was kept.
687	0	0				$found="$obj $instnum R " if $found;
688						} else {
						# push returns the new length of @pages, so pages are numbered from 1 in
						# the order they are visited.
689	0					$pageObject{$obj}=push @pages, $obj; # create a hash of all pages
690	0	0				if ( $getPages{$pageObject{$obj}} ) {
691	0					$found="$obj $instnum R ";
692	0					$count=1;
693	0	0				$vars{"PDFPagesFound"}.= $vars{"PDFPagesFound"} ? ", $pageObject{$obj}" : $pageObject{$obj};
694	0					$vars{"PDFPageCount"}++;
695						}
696						}
697	0					($found,$count);
698						}
699
700						sub makePdfDoc {
						# Assemble the extracted document into $vars{"PDFExtract"}: the header
						# line of $pdf, any leading comment lines, the collected objects, a cross
						# reference table, the trailer, and the startxref/%%EOF footer.
						# Returns "" when an error is already recorded; the two failure paths
						# return the result of error().
701	0	0	0	0		return "" if $vars{"PDFError"};
						# The substitution removes the first line of $pdf and keeps its line
						# ending; when it does not match, the document is reported as not a PDF.
702	0	0				return &error("$vars{PDFDoc} is not a PDF file \n$pdf",__FILE__,__LINE__)
703						unless $pdf=~s/^(.*?)($CRLF+)/$2/;
704	0					$vars{"PDFExtract"}=$1.$2;
705	0					$vars{"PDFExtract"}.=$1.$2
706						while( $pdf=~s/^\s+(\%.*?)($CRLF+)/$2/); #include comment lines if any
						# First table entry, for object 0; the "x" placeholder is replaced by
						# zeros further down.
707	0					my $xref="xxxxxxxxxx 65535 f\015\012";
708	0					my $objCount=1;
709	0					my $cnt=0;
						# @object is indexed by object number; only the slots filled by getObj are
						# written out.
710	0					for( ;$objCount<@object;$objCount++) {
711	0	0				if ($object[$objCount]) {
						# The offset of an object is the length of the output built so far.
712	0					$xref.=sprintf("%0.10d %0.5d n\015\012",
713						length $vars{"PDFExtract"},
714						$instnum[$objCount] );
715	0					$vars{"PDFExtract"}.=$object[$objCount];
716	0					$cnt++;
717						}
718						}
719	0	0				return &error("$vars{PDFDoc} does not contain objects",__FILE__,__LINE__)
720						if $cnt==0;
721	0					$xref=~s/xxxxxxxxxx/0000000000/s;
722	0					my $startXref=length $vars{"PDFExtract"};
						# NOTE(review): the subsection header uses $cnt (objects written), while
						# $xref also holds the entry for object 0 - confirm the count is intended.
723	0					$vars{"PDFExtract"}.="xref\n0 $cnt\n$xref"; # changed \r to \n for unixish systems by Alberto Accomazzi
						# Fills in the __Size__ placeholder that getRoot left in the trailer.
724	0					$trailerObject=~s/__Size__/$cnt/s;
725	0					$vars{"PDFExtract"}.="$trailerObject\nstartxref\n$startXref\n\%\%EOF\n"; # changed \r to \n for unixish systems by Alberto Accomazzi
726						}
727
728						=head1 NOTES
729
730						This version of PDF::Extract has been designed to produce output to the PDF Standard as defined in the PDF Reference Seventh Edition.
731
732						However some third party PDF applications require a non standard feature of PDF documents.
733						Namely: The sequential numbering of objects starting at zero.
734
735						PDF::Extract treats a PDF file as a flat file, for speed of processing, and consequently knows nothing of PDF objects.
736						Objects extracted remain exactly as they were in the original document.
737						These objects are not renumbered. There will be gaps in the object number sequence. This is allowed in the specification.
738						Only the catalog and page tree objects are altered.
739
740						See the web site if you need information how to make PDF documents comply with what your third party PDF application expects.
741
742						=head1 BUGS
743
744						There is a bug that Jon Schaeffer reported that had to do with some font resources not being found in the extracted PDF.
745						The source of the bug has, as yet, not been found.
746						If you find such a bug can you email a one page original pdf that can produce a PDF extract that has this bug.
747
748						Please report any bugs you find.
749
750						=head1 AUTHOR
751
752						Noel Sharrock E<lt>mailto:nsharrok@lgmedia.com.auE<gt>
753
754						PDF::Extract's home page http://www.lgmedia.com.au/page.aspx?ID=8
755
756						Forum for users and developers has been hacked and database no longer exists. There are some sad folk around.
757
758						=head1 SUPPORT
759
760						Much thanks to:-
761
762						Lyman Byrd for his welcome programming suggestions and editorial comments on the POD.
763						Michael Cox for his suggestion of PDFSaveAs and for the time he spent in testing the module.
764						Alberto Accomazzi for sharing his time and his knowledge of Unixish PDF voodoo magick.
765						Stefano Capuzzimato for correcting some stuff in the regexes he found.
766						Geert Theys for finding a small bug and supplying an excellent solution.
767						Jon Schaeffer for help with finding a solution to a bug in extracting Adobe 6+ pages.
768						Dario Santini for reporting a bug at http://rt.cpan.org//Ticket/Display.html?id=33707
769						Patrick Bourdon suggested several fixes for undefined string concatenation warnings.
770
771						=head1 COPYRIGHT
772
773						Copyright (c) 2005 by Noel Sharrock. All rights reserved.
774
775						=head1 LICENSE
776
777						This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself,
778						i.e., under the terms of the ``Artistic License'' or the ``GNU General Public License''.
779
780						The C library at the core of this Perl module can additionally be redistributed and/or modified
781						under the terms of the ``GNU Library General Public License''.
782
783						=head1 DISCLAIMER
784
785						This package is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
786						without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
787
788						See the ``GNU General Public License'' for more details.
789
790						PDF::Extract - Extracting sub PDF documents from a multipage PDF document
791
792						=cut
793
794						#------------------------------------------ End PDF Page ------------------------------------------
795
796						1;