blib/lib/WWW/Patent/Page/USPTO.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 171 | 221 | 77.3 |
branch | 42 | 82 | 51.2 |
condition | 14 | 24 | 58.3 |
subroutine | 16 | 17 | 94.1 |
pod | n/a | ||
total | 243 | 344 | 70.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | |||||||
2 | package WWW::Patent::Page::USPTO; | ||||||
3 | |||||||
4 | # Version 2006-04-04 H. Schier | ||||||
5 | 5 | 5 | 32 | use strict; | |||
5 | 10 | ||||||
5 | 485 | ||||||
6 | 5 | 5 | 35 | use warnings; | |||
5 | 9 | ||||||
5 | 170 | ||||||
7 | 5 | 5 | 28 | use diagnostics; | |||
5 | 10 | ||||||
5 | 48 | ||||||
8 | 5 | 5 | 149 | use Carp; | |||
5 | 21 | ||||||
5 | 352 | ||||||
9 | use subs | ||||||
10 | 5 | 48 | qw( methods USPTO_country_known USPTO_htm USPTO_tif USPTO_pdf USPTO_terms ) | ||||
11 | 5 | 5 | 28 | ; #USPTO_pdf | |||
5 | 12 | ||||||
12 | 5 | 5 | 424 | use LWP::UserAgent 2.003; | |||
5 | 169 | ||||||
5 | 230 | ||||||
13 | require HTTP::Request; | ||||||
14 | 5 | 5 | 3712 | use HTML::HeadParser; | |||
5 | 47735 | ||||||
5 | 159 | ||||||
15 | 5 | 5 | 4361 | use HTML::TokeParser; | |||
5 | 21657 | ||||||
5 | 218 | ||||||
16 | 5 | 5 | 4635 | use PDF::API2 2.00; | |||
5 | 1368654 | ||||||
5 | 201 | ||||||
17 | 5 | 5 | 6405 | use File::Temp 0.17; | |||
5 | 61472 | ||||||
5 | 601 | ||||||
18 | #use Data::Dumper; | ||||||
19 | |||||||
20 | $| = 1 ; | ||||||
21 | |||||||
22 | 5 | 5 | 48 | use vars qw/ $VERSION @ISA/; | |||
5 | 12 | ||||||
5 | 22189 | ||||||
23 | |||||||
24 | $VERSION = "0.30"; | ||||||
25 | |||||||
26 | sub methods { | ||||||
27 | return ( | ||||||
28 | 5 | 5 | 58 | 'USPTO_htm' => \&USPTO_htm, | |||
29 | 'USPTO_tif' => \&USPTO_tif, | ||||||
30 | 'USPTO_pdf' => \&USPTO_pdf, | ||||||
31 | 'USPTO_country_known' => \&USPTO_country_known, | ||||||
32 | |||||||
33 | # 'USPTO_parse_doc_id' => \&USPTO_parse_doc_id, | ||||||
34 | 'USPTO_terms' => \&USPTO_terms, | ||||||
35 | ); | ||||||
36 | |||||||
37 | } | ||||||
38 | |||||||
39 | # sub USPTO_parse_doc_id{ | ||||||
40 | # "All patent numbers must be 7 characters in length" | ||||||
41 | # well, maybe 7 or less... | ||||||
42 | # USPTO will give 692,301 for request of 692301 or 0692301 | ||||||
43 | # Will respond to PN/D339456 (but not PN/D0339456) | ||||||
44 | # PN/D039456 | ||||||
45 | # and PN/D39456 and 1 and 01 | ||||||
46 | # Utility -- 5,146,634 6923014 0000001 | ||||||
47 | # Design -- D339,456 D321987 D000152 | ||||||
48 | # Plant -- PP08,901 PP07514 PP00003 | ||||||
49 | # Reissue -- RE35,312 RE12345 RE00007 | ||||||
50 | # Defensive Publication -- T109,201 T855019 T100001 | ||||||
51 | # Statutory Invention Registration -- H001,523 H001234 H000001 | ||||||
52 | # Re-examination -- RX29,194 RE29183 RE00125 | ||||||
53 | # Additional Improvement -- AI00,002 AI000318 AI00007 | ||||||
54 | # } | ||||||
55 | |||||||
56 | sub USPTO_country_known { | ||||||
57 | 0 | 0 | 0 | my $self = shift @_; | |||
58 | 0 | 0 | my $country = shift; | ||||
59 | 0 | 0 | 0 | if ( 'US' eq uc($country) ) { return ('1790 on'); } | |||
0 | 0 | ||||||
60 | 0 | 0 | else { carp 'US only!'; return undef } | ||||
0 | 0 | ||||||
61 | } | ||||||
62 | |||||||
63 | sub USPTO_htm { | ||||||
64 | 12 | 12 | 16 | my ( $self, $page_response ) = @_; | |||
65 | 12 | 21 | my $request; | ||||
66 | my $request_text; | ||||||
67 | 12 | 100 | 100 | 97 | if ( ( !$self->{'patent'}->{'doc_type'} ) | ||
100 | |||||||
68 | && ( length( $self->{'patent'}{'number'} ) == 11 ) ) | ||||||
69 | { | ||||||
70 | |||||||
71 | # Application (11 digits) | ||||||
72 | 1 | 4 | $request_text = 'http://appft1.uspto.gov/netacgi/nph-Parser?TERM1=' | ||||
73 | . $self->{'patent'}{'number'} | ||||||
74 | . '&Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=0&f=S&l=50'; | ||||||
75 | 1 | 10 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
76 | 1 | 145 | my $intermediate = $self->request($request); | ||||
77 | 1 | 10 | my $p = HTML::TokeParser->new( \$intermediate->content ); | ||||
78 | 1 | 213 | while ( my $token = $p->get_tag("a") ) { | ||||
79 | 19 | 100 | 3868 | my $url = $token->[1]{href} || "-"; | |||
80 | 19 | 52 | my $text = $p->get_trimmed_text("/a"); | ||||
81 | 19 | 100 | 100 | 2132 | if ( ( $url =~ m/$self->{'patent'}{'number'}/ ) | ||
82 | && ( $text =~ m/$self->{'patent'}{'number'}/ ) ) | ||||||
83 | { | ||||||
84 | |||||||
85 | #warn "fully qualified? '$url'\n"; | ||||||
86 | 1 | 4 | $request_text = 'http://appft1.uspto.gov/' . $url; | ||||
87 | 1 | 15 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
88 | } | ||||||
89 | } | ||||||
90 | } | ||||||
91 | |||||||
92 | #http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=1&f=G&l=50&s1=%2220010000044%22.PGNR.&OS=DN/20010000044&RS=DN/20010000044 | ||||||
93 | elsif ( $self->{'patent'}->{'doc_type'} ) { | ||||||
94 | |||||||
95 | # Non-Utility Patent | ||||||
96 | 7 | 31 | $request_text | ||||
97 | = "http://patft.uspto.gov/netacgi/nph-Parser?patentnumber=$self->{'patent'}->{'doc_type'}$self->{'patent'}{'number'}"; | ||||||
98 | 7 | 88 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
99 | } | ||||||
100 | else { | ||||||
101 | |||||||
102 | #Standard Utility Patent | ||||||
103 | 4 | 11 | $request_text | ||||
104 | = "http://patft.uspto.gov/netacgi/nph-Parser?patentnumber=$self->{'patent'}{'number'}"; | ||||||
105 | 4 | 46 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
106 | } | ||||||
107 | |||||||
108 | # print "\nAlmost $self->{'retrieved_identifier'}->{'number'} \n"; | ||||||
109 | 12 | 21066 | my $response = $self->request($request) | ||||
110 | ; # use the agent to make the request and get the response | ||||||
111 | # print "\there\n"; | ||||||
112 | 12 | 50 | 54 | if ( $response->is_success ) { | |||
113 | 12 | 142 | my $html = $response->content; | ||||
114 | |||||||
115 | # print "\n$html\n"; | ||||||
116 | 12 | 477 | my $p = HTML::HeadParser->new; | ||||
117 | 12 | 2553 | $p->parse( $response->content ); | ||||
118 | 12 | 2978 | my $entry; | ||||
119 | 12 | 100 | 53 | if ( $entry = $p->header('Refresh') ) | |||
120 | { # carp "no refresh seen via '$self->{'patent'}{'number'}' in \n'$html' " } | ||||||
121 | 10 | 433 | $entry =~ s/^.*?URL=//; | ||||
122 | 10 | 34 | $entry = 'http://patft.uspto.gov' . $entry; | ||||
123 | |||||||
124 | # print "$entry\n"; | ||||||
125 | 10 | 50 | 173 | $request = new HTTP::Request( 'GET' => "$entry" ) | |||
126 | or carp "bad refresh"; | ||||||
127 | 10 | 3347 | $response = $self->request($request); | ||||
128 | 10 | 274 | $html = $response->content; | ||||
129 | } | ||||||
130 | |||||||
131 | 12 | 100 | 803 | if ( $html =~ m/No patents have matched your query/ ) { | |||
132 | 1 | 9 | $page_response->set_parameter( 'is_success', undef ); | ||||
133 | 1 | 5 | $page_response->set_parameter( 'message', | ||||
134 | 'No patents have matched your query' ) | ||||||
135 | ; # No patents have matched your query | ||||||
136 | 1 | 24 | return $page_response; | ||||
137 | } | ||||||
138 | |||||||
139 | 11 | 50 | 376 | unless ( $html | |||
140 | =~ s/.*?.*?/\n\n |
||||||
141 | ) | ||||||
142 | { | ||||||
143 | 0 | 0 | carp "header weird A \n"; | ||||
144 | } | ||||||
145 | 11 | 50 | 3037 | unless ( $html | |||
146 | =~ s/.*( |
||||||
147 | ) | ||||||
148 | { | ||||||
149 | 0 | 0 | carp "header weird B \n"; | ||||
150 | } | ||||||
151 | 11 | 50 | 280 | unless ( $html =~ s/ |
|||
152 | 0 | 0 | carp "header weird C \n$html\n"; | ||||
153 | } | ||||||
154 | |||||||
155 | #warn " type is $self->{'patent'}->{'doc_type'}'\n"; | ||||||
156 | 11 | 50 | 341 | unless ( $html =~ s/ /is ) { |
|||
157 | 0 | 0 | carp "front weird \n$html\n"; | ||||
158 | } | ||||||
159 | 11 | 50 | 1095 | unless ( $html =~ s/(.*) (.*)body>/$1<\/body>/is ) { |
|||
160 | 0 | 0 | carp "end weird \n$html\n"; | ||||
161 | } | ||||||
162 | $html | ||||||
163 | 11 | 1073 | =~ s|"/netacgi/nph-Parser|"http://patft.uspto.gov/netacgi/nph-Parser|gi; | ||||
164 | 11 | 98 | $page_response->set_parameter( 'content', $html ); | ||||
165 | 11 | 361 | return $page_response; | ||||
166 | } | ||||||
167 | else { | ||||||
168 | 0 | 0 | carp "Unsuccessful response: \n'" | ||||
169 | . $response->status_line | ||||||
170 | . "'\n\nfrom request:\n'$request_text'\n"; | ||||||
171 | 0 | 0 | return undef; | ||||
172 | } | ||||||
173 | } | ||||||
174 | |||||||
175 | sub USPTO_tif { | ||||||
176 | 1 | 1 | 3 | my ( $self, $page_response ) = @_; | |||
177 | 1 | 2 | my ( $request, $base, $zero_fill ); | ||||
178 | |||||||
179 | 1 | 50 | 8 | if ( $self->{'patent'}{'number'} =~ m/(0|1|2|3|4)\d$/ ) { | |||
180 | |||||||
181 | # 0-4 is on one server, 5-9 is on another | ||||||
182 | 0 | 0 | $base = 'patimg1.uspto.gov'; | ||||
183 | } | ||||||
184 | 1 | 3 | else { $base = 'patimg2.uspto.gov'; } | ||||
185 | 1 | 7 | my $zerofill = sprintf '%0.8u', $self->{'patent'}{'number'}; | ||||
186 | |||||||
187 | # print "\nZerofill: $zerofill\n"; | ||||||
188 | 1 | 50 | 6 | if ( $self->{'patent'}->{'doc_type'} ) { | |||
189 | 0 | 0 | $request = HTTP::Request->new( 'GET' => | ||||
190 | "http://$base/.piw?Docid=$self->{'patent'}->{'doc_type'}$zerofill\&idkey=NONE" | ||||||
191 | ); | ||||||
192 | } | ||||||
193 | else { | ||||||
194 | 1 | 16 | $request = HTTP::Request->new( | ||||
195 | 'GET' => "http://$base/.piw?Docid=$zerofill\&idkey=NONE" ); | ||||||
196 | } | ||||||
197 | |||||||
198 | # print "\nAlmost $self->{'retrieved_identifier'}->{'number'} \n"; | ||||||
199 | 1 | 168 | my $response = $self->request($request); | ||||
200 | |||||||
201 | 1 | 7 | my $html = $response->content; | ||||
202 | |||||||
203 | { # page numbers | ||||||
204 | |||||||
205 | 1 | 50 | 14 | if ( $html =~ m/NumPages=(\d+)/ ) { | |||
1 | 0 | 22 | |||||
206 | 1 | 11 | $page_response->set_parameter( 'pages', $1 ); | ||||
207 | |||||||
208 | } | ||||||
209 | elsif ( $html =~ m/(\d+)\s+of\s+(\d+)\s+pages/ ) { | ||||||
210 | 0 | 0 | $page_response->set_parameter( 'pages', $2 ); | ||||
211 | |||||||
212 | # print "Pages: $2\n"; | ||||||
213 | } | ||||||
214 | else { | ||||||
215 | 0 | 0 | carp | ||||
216 | "no maximum page number found in $self->{'patent'}{'country'}$self->{'patent'}{'number'}: \n$html"; | ||||||
217 | } | ||||||
218 | } | ||||||
219 | 1 | 14 | my $p = HTML::TokeParser->new( \$html ); | ||||
220 | 1 | 211 | my $url; | ||||
221 | my $token; | ||||||
222 | 1 | 8 | FINDPAGE: while ( $token = $p->get_tag("a") ) { | ||||
223 | 2 | 50 | 1743 | $url = $token->[1]{href} || "-"; #very strange or construct ??? | |||
224 | 2 | 100 | 39 | if ( $url =~ m/$self->{'patent'}{'number'}/ ) { last FINDPAGE; } | |||
1 | 4 | ||||||
225 | |||||||
226 | # print "$url\n"; | ||||||
227 | } | ||||||
228 | 1 | 3 | undef $p; | ||||
229 | |||||||
230 | 1 | 27 | $url =~ s/PageNum=(\d+)/PageNum=$self->{'patent'}{'page'}/; | ||||
231 | # $url = "http://$base$url"; | ||||||
232 | |||||||
233 | # warn "URL = '$url'\n"; | ||||||
234 | # exit; | ||||||
235 | 1 | 50 | 14 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
236 | or carp "bad numbered page $self->{'patent'}{'page'} fetch $url"; | ||||||
237 | 1 | 182 | $response = $self->request($request); | ||||
238 | 1 | 25 | $html = $response->content; | ||||
239 | |||||||
240 | 1 | 112 | $p = HTML::TokeParser->new( \$html ); | ||||
241 | |||||||
242 | 1 | 324 | FINDPAGE: while ( $token = $p->get_tag("embed") ) { | ||||
243 | 0 | 0 | 0 | $url = $token->[1]->{src} || "-"; | |||
244 | 0 | 0 | 0 | if ( $url =~ m/image\/tiff/ ) { last FINDPAGE; } | |||
0 | 0 | ||||||
245 | } | ||||||
246 | |||||||
247 | # get tiff image | ||||||
248 | # $url = "http://$base$url"; | ||||||
249 | 1 | 50 | 14219 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
250 | or carp "Coudn't retrieve the tiff image fetch $url"; | ||||||
251 | 1 | 345 | $response = $self->request($request); | ||||
252 | |||||||
253 | # print "\nPage response\n$response->content\n\n"; | ||||||
254 | 1 | 23 | $page_response->set_parameter( 'content', $response->content ); | ||||
255 | |||||||
256 | 1 | 34 | return $page_response; | ||||
257 | } | ||||||
258 | |||||||
259 | sub USPTO_pdf { | ||||||
260 | 1 | 1 | 3 | my ( $self, $page_response ) = @_; | |||
261 | 1 | 1 | my ( $request, $base, $zero_fill ); | ||||
262 | |||||||
263 | 1 | 50 | 4 | my $tempdir = $self->{'patent'}->{'tempdir'} | |||
264 | if ( $self->{'patent'}->{'tempdir'} ); | ||||||
265 | 1 | 3 | my $fn_template = $self->{'patent'}->{'doc_id'} . "_XXXX"; | ||||
266 | |||||||
267 | 1 | 9 | my $pdf_file = new File::Temp( | ||||
268 | TEMPLATE => $fn_template, | ||||||
269 | DIR => $tempdir, | ||||||
270 | SUFFIX => '.pdf', | ||||||
271 | UNLINK => 1, | ||||||
272 | ); | ||||||
273 | 1 | 745 | my $pdf_fn = $pdf_file->filename; | ||||
274 | 1 | 16 | my $pdf = new PDF::API2(); | ||||
275 | 1 | 840 | print $pdf_file $pdf->stringify; | ||||
276 | 1 | 14077 | close $pdf_file; | ||||
277 | |||||||
278 | 1 | 60 | my $currenttime = localtime(); | ||||
279 | 1 | 10 | my $short_id | ||||
280 | = $self->{'patent'}{'country'} . $self->{'patent'}->{'number'}; | ||||||
281 | |||||||
282 | # my $pdf = new PDF::API2(-file => "$tempdir/$self->{'patent'}{'doc_id'}".".pdf"); | ||||||
283 | # my $pdf = new PDF::API2(-file => $pdf_fn); | ||||||
284 | 1 | 10 | $pdf = PDF::API2->open($pdf_fn); | ||||
285 | 1 | 11806 | my %h = $pdf->info( | ||||
286 | 'Author' => "Programatically Produced from Public Information", | ||||||
287 | 'CreationDate' => $currenttime, | ||||||
288 | 'ModDate' => $currenttime, | ||||||
289 | 'Creator' => "WWW::Patent::Page::USPTO_pdf", | ||||||
290 | 'Producer' => "US Patent Office and PDF::API2", | ||||||
291 | 'Title' => "$short_id", | ||||||
292 | 'Subject' => "patent", | ||||||
293 | 'Keywords' => "$short_id WWW::Patent::Page" | ||||||
294 | ); | ||||||
295 | 1 | 838 | my $page = $pdf->page(); | ||||
296 | 1 | 560 | $page->mediabox('A4'); | ||||
297 | |||||||
298 | |||||||
299 | 1 | 50 | 211 | if ( $self->{'patent'}{'number'} =~ m/(0|1|2|3|4)\d$/ ) { | |||
300 | |||||||
301 | # 0-4 is on one server, 5-9 is on another | ||||||
302 | 0 | 0 | $base = 'patimg1.uspto.gov'; | ||||
303 | } | ||||||
304 | 1 | 3 | else { $base = 'patimg2.uspto.gov'; } | ||||
305 | 1 | 11 | my $zerofill = sprintf '%0.8u', $self->{'patent'}{'number'}; | ||||
306 | |||||||
307 | 1 | 50 | 5 | if ( $self->{'patent'}->{'doc_type'} ) { | |||
308 | 0 | 0 | $request = HTTP::Request->new( 'GET' => | ||||
309 | "http://$base/.piw?Docid=$self->{'patent'}->{'doc_type'}$zerofill\&idkey=NONE" | ||||||
310 | ); | ||||||
311 | } | ||||||
312 | else { | ||||||
313 | 1 | 16 | $request = HTTP::Request->new( | ||||
314 | 'GET' => "http://$base/.piw?Docid=$zerofill\&idkey=NONE" ); | ||||||
315 | } | ||||||
316 | |||||||
317 | 1 | 17843 | my $response = $self->request($request); | ||||
318 | |||||||
319 | 1 | 10 | my $html = $response->content; | ||||
320 | # warn "html = $html\n"; | ||||||
321 | { # page numbers | ||||||
322 | |||||||
323 | 1 | 50 | 13 | if ( $html =~ m/NumPages=(\d+)/ ) { | |||
1 | 0 | 12 | |||||
324 | 1 | 9 | $page_response->set_parameter( 'pages', $1 ); | ||||
325 | } | ||||||
326 | elsif ( $html =~ m/(\d+)\s+of\s+(\d+)\s+pages/ ) { | ||||||
327 | 0 | 0 | $page_response->set_parameter( 'pages', $2 ); | ||||
328 | |||||||
329 | # print "Pages: $2\n"; | ||||||
330 | } | ||||||
331 | else { | ||||||
332 | 0 | 0 | carp | ||||
333 | "no maximum page number found in $self->{'patent'}{'country'}$self->{'patent'}{'number'}: \n$html"; | ||||||
334 | } | ||||||
335 | } | ||||||
336 | 1 | 13 | my $p = HTML::TokeParser->new( \$html ); | ||||
337 | 1 | 194 | my $url; | ||||
338 | my $token; | ||||||
339 | 1 | 6 | FINDPAGE: while ( $token = $p->get_tag("a") ) { | ||||
340 | 2 | 50 | 1055 | $url = $token->[1]{href} || "-"; #very strange or construct ??? | |||
341 | 2 | 100 | 40 | if ( $url =~ m/$self->{'patent'}{'number'}/ ) { last FINDPAGE; } | |||
1 | 3 | ||||||
342 | } | ||||||
343 | |||||||
344 | 1 | 4 | undef $p; | ||||
345 | |||||||
346 | 1 | 50 | 22 | if ( defined( $self->{'patent'}{'page'} ) ) { | |||
347 | 1 | 4 | $url =~ s/PageNum=(\d+)/PageNum=$self->{'patent'}{'page'}/; | ||||
348 | } | ||||||
349 | |||||||
350 | # print "\n\$self->{'patent'}->{'page'} = '$self->{'patent'}->{'page'}' |$url = '$url' \@ 391 \n"; | ||||||
351 | |||||||
352 | #$url = "http://$base$url"; | ||||||
353 | 1 | 50 | 11 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
354 | or carp "bad numbered page $self->{'patent'}{'page'} fetch $url"; | ||||||
355 | 1 | 50 | 188 | $response = $self->request($request) or carp "bad request" ; | |||
356 | 1 | 39 | $html = $response->content; | ||||
357 | |||||||
358 | # open( my $fh,">", "1.html"); print $fh $html; close $fh; | ||||||
359 | |||||||
360 | 1 | 454 | $p = HTML::TokeParser->new( \$html ); | ||||
361 | # warn "base = $base, URL1 = '$url'\n"; | ||||||
362 | |||||||
363 | 1 | 500 | $url = ""; | ||||
364 | |||||||
365 | 1 | 7 | FINDIMAGE: while($token = $p->get_tag("a") ) { | ||||
366 | 10 | 100 | 3234 | $url = $token->[1]->{href} || "-" ; | |||
367 | # warn "\ntoken 1 href = ", $token->[1]->{href} , "\n" ; | ||||||
368 | 10 | 100 | 56 | if ($url =~ m/View\+first\+page/) {last FINDIMAGE; } | |||
1 | 4 | ||||||
369 | } | ||||||
370 | # warn "URL2 = '$url'\n"; | ||||||
371 | |||||||
372 | |||||||
373 | 1 | 50 | 17 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
374 | or carp "bad numbered page $self->{'patent'}{'page'} fetch $url"; | ||||||
375 | 1 | 50 | 226 | $response = $self->request($request) or carp "bad request" ; | |||
376 | 1 | 20 | $html = $response->content; | ||||
377 | 1 | 10461 | $p = HTML::TokeParser->new( \$html ); | ||||
378 | |||||||
379 | #open( my $fh1,">", "2.html"); print $fh1 $html; close $fh1; | ||||||
380 | |||||||
381 | |||||||
382 | 1 | 256 | FINDTIF: while ( $token = $p->get_tag("embed") ) { | ||||
383 | # print "\ntoken = ", Dumper($token), "\n" ; | ||||||
384 | 1 | 50 | 4618 | $url = $token->[1]->{src} || "-"; | |||
385 | 1 | 50 | 14 | if ( $url =~ m/tif$/ ) { last FINDTIF; } | |||
0 | 0 | ||||||
386 | } | ||||||
387 | |||||||
388 | # get tiff image | ||||||
389 | 1 | 249 | $url = "http://$base$url"; $url =~ s/\n//; | ||||
1 | 4 | ||||||
390 | 1 | 50 | 9 | if ( defined( $self->{'patent'}{'page'} ) ) { | |||
391 | 1 | 5 | $url =~ s/PageNum=(\d+)/PageNum=$self->{'patent'}{'page'}/; | ||||
392 | } | ||||||
393 | |||||||
394 | # warn "URL3 = '$url'\n"; | ||||||
395 | 1 | 3 | my $tif_url = $url; | ||||
396 | 1 | 50 | 15 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
397 | or carp "Coudn't retrieve the tiff image fetch $url"; | ||||||
398 | |||||||
399 | |||||||
400 | # prepare to store tif image | ||||||
401 | 1 | 215 | my $pat_page = 1; | ||||
402 | 1 | 50 | 7 | if ( defined( $self->{'patent'}{'page'} ) ) { | |||
403 | 1 | 4 | $pat_page = $self->{'patent'}{'page'}; | ||||
404 | } | ||||||
405 | |||||||
406 | # print "\n\$self->{'patent'}->{'doc_id'} = '$self->{'patent'}->{'doc_id'}' \@ 413 \n"; | ||||||
407 | |||||||
408 | 1 | 6 | $fn_template = $self->{'patent'}->{'doc_id'} . "_p" . $pat_page . "_XXXX"; | ||||
409 | # warn "TEMPLATE => $fn_template, DIR => $tempdir\n" ; | ||||||
410 | 1 | 16 | my $temp_tif = new File::Temp( | ||||
411 | TEMPLATE => $fn_template, | ||||||
412 | DIR => $tempdir, | ||||||
413 | SUFFIX => '.tif', | ||||||
414 | UNLINK => 1, | ||||||
415 | ); | ||||||
416 | |||||||
417 | 1 | 997 | my $done = 0; | ||||
418 | 1 | 4 | my $trys = 0; | ||||
419 | 1 | 50 | 33 | 13 | if ( !$done and $trys < 5 ) { | ||
420 | 1 | 10 | $response = $self->request($request); | ||||
421 | 0 | 0 | $trys++; | ||||
422 | 0 | 0 | 0 | 0 | if ( $response->is_success and $response->content ) { | ||
423 | 0 | 0 | print $temp_tif $response->content; | ||||
424 | # open( my $th,">", "2.html"); print $th $response->content; close $th; | ||||||
425 | |||||||
426 | 0 | 0 | $done = 1; | ||||
427 | } | ||||||
428 | else { | ||||||
429 | 0 | 0 | carp | ||||
430 | "attempt $trys response failed or content empty, no temporary tiff can be made- possibly network problem or timeout, will try again"; | ||||||
431 | } | ||||||
432 | } | ||||||
433 | 0 | 0 | else { carp "too many attempts, giving up."; return (0); } | ||||
0 | 0 | ||||||
434 | |||||||
435 | # close $temp_tif; | ||||||
436 | |||||||
437 | # convert to pdf | ||||||
438 | 0 | 0 | my $gfx = $page->gfx(); | ||||
439 | # print Dumper($temp_tif); | ||||||
440 | 0 | 0 | $gfx->image( $pdf->image_tiff($temp_tif), 0, 0, 0.23 ); | ||||
441 | |||||||
442 | # one page only | ||||||
443 | 0 | 0 | 0 | if ( $self->{'patent'}{'page'} ) { | |||
444 | 0 | 0 | $page_response->set_parameter( 'content', $pdf->stringify ); | ||||
445 | 0 | 0 | return $page_response; | ||||
446 | } | ||||||
447 | |||||||
448 | # retrieve all pages | ||||||
449 | 0 | 0 | my $maxpage = $page_response->get_parameter('pages'); | ||||
450 | 0 | 0 | for ( my $i = 2; $i <= $maxpage; $i++ ) { | ||||
451 | 0 | 0 | $tif_url =~ s/PageNum=(\d+)/PageNum=$i/; | ||||
452 | 0 | 0 | 0 | $request = new HTTP::Request( 'GET' => "$tif_url" ) | |||
453 | or carp "Couldn't retrieve the tiff image fetch $tif_url"; | ||||||
454 | 0 | 0 | $response = $self->request($request); | ||||
455 | |||||||
456 | # store tif image | ||||||
457 | 0 | 0 | $fn_template = $self->{'patent'}->{'doc_id'} . "_p" . $i . "_XXXX"; | ||||
458 | 0 | 0 | $temp_tif = new File::Temp( | ||||
459 | TEMPLATE => $fn_template, | ||||||
460 | DIR => $tempdir, | ||||||
461 | SUFFIX => '.tif', | ||||||
462 | UNLINK => 1, | ||||||
463 | ); | ||||||
464 | 0 | 0 | print $temp_tif $response->content; | ||||
465 | |||||||
466 | # close $temp_tif; | ||||||
467 | |||||||
468 | # convert to pdf | ||||||
469 | 0 | 0 | $page = $pdf->page(0); | ||||
470 | 0 | 0 | $gfx = $page->gfx(); | ||||
471 | 0 | 0 | $gfx->image( $pdf->image_tiff($temp_tif), 0, 0, 0.23 ); | ||||
472 | } | ||||||
473 | |||||||
474 | # return pdf as string | ||||||
475 | 0 | 0 | $page_response->set_parameter( 'content', $pdf->stringify ); | ||||
476 | |||||||
477 | # $pdf->save; | ||||||
478 | # $pdf->saveas("$tempdir/$self->{'patent'}->{'doc_id'}".".pdf"); | ||||||
479 | |||||||
480 | 0 | 0 | return $page_response; | ||||
481 | } | ||||||
482 | |||||||
483 | sub USPTO_terms { | ||||||
484 | 2 | 2 | 4 | my ($self) = @_; | |||
485 | return ( | ||||||
486 | 2 | 17 | "WWW::Patent::Page utilizes the USPTO web site.\n | ||||
487 | Refer to http://www.USPTO.gov for terms and conditions of use of that site. | ||||||
488 | |||||||
489 | Note that as of September 1, 2004, | ||||||
490 | http://www.uspto.gov/patft/help/notices.htm and the like state in part: | ||||||
491 | |||||||
492 | These databases are intended for use by the general public. | ||||||
493 | Due to limitations of equipment and bandwidth, they are not | ||||||
494 | intended to be a source for bulk downloads of USPTO data. | ||||||
495 | Bulk data may be purchased from USPTO at cost (see the USPTO | ||||||
496 | Products and Services Catalog). Individuals, companies, | ||||||
497 | IP addresses, or blocks of IP addresses who, in effect, | ||||||
498 | deny service to the general public by generating unusually | ||||||
499 | high numbers (1000 or more) of daily database accesses | ||||||
500 | (searches, pages, or hits), whether generated manually or | ||||||
501 | in an automated fashion, may be denied access to these | ||||||
502 | servers without notice. | ||||||
503 | |||||||
504 | Note at http://www.uspto.gov/patft/help/accpat.htm : | ||||||
505 | |||||||
506 | If you can access the main PTO Web site, but cannot access any | ||||||
507 | of the Patent Grant Database Quick Search, Advanced Quick Searching, | ||||||
508 | or Patent Number Searching pages, your workstation or organization | ||||||
509 | may have been denied access to the Web Patent Databases pursuant | ||||||
510 | to the policy stated at the top of this page. To determine if you | ||||||
511 | have been denied access, you can check the Denied List for your | ||||||
512 | computer's IP address. http://www.uspto.gov/patft/help/denied.htm | ||||||
513 | |||||||
514 | (Your IP address is the only means by which you are known to the | ||||||
515 | PTO servers -- server logs do not contain your email address or | ||||||
516 | any other personal identifying information. If you do not know | ||||||
517 | your computer's IP address because you are behind a firewall, do | ||||||
518 | not have a fixed IP address, or for any other reason, you can find | ||||||
519 | your current IP address by using an 'IP reflector,' such as | ||||||
520 | http://www2.simflex.com/ip.shtml or http://www.dslreports.com/ip.) | ||||||
521 | |||||||
522 | If you are an individual whose individual IP address has been | ||||||
523 | denied access: to seek to have your access restored, please send | ||||||
524 | email including your workstation and firewall or gateway IP addresses | ||||||
525 | (consult with your network administrators if necessary), and describing | ||||||
526 | the steps you have taken or will take to insure that future violations | ||||||
527 | of the USPTO access policy will not occur, to the Database Help Desk at | ||||||
528 | www\@uspto.gov. | ||||||
529 | |||||||
530 | If you are a member or employee of an organization which has been | ||||||
531 | denied access: please do not send individual email to PTO. Instead, | ||||||
532 | please have your network administrator or a person holding authority | ||||||
533 | over your organization\'s network operations send email including your | ||||||
534 | firewall, gateway, or workstation IP addresses, and describing the steps | ||||||
535 | you have taken or will take to insure that future violations of the USPTO | ||||||
536 | access policy will not occur, to the Database Help Desk at www\@uspto.gov. | ||||||
537 | |||||||
538 | For all other content-related matters, please send email to the Database | ||||||
539 | Help Desk at www\@uspto.gov | ||||||
540 | |||||||
541 | Note at http://www.uspto.gov/patft/help/images.htm | ||||||
542 | |||||||
543 | Patent images must be retrieved from the database one page at a time. | ||||||
544 | This is necessary since patents can be as long as 5,000 pages, and the | ||||||
545 | resources required to allow downloading such 'jumbo' patents are not | ||||||
546 | available. Users employing third-party software which downloads multiple | ||||||
547 | pages of a patent at once may find this practice subjects them to denial | ||||||
548 | of access to the databases if they exceed PTO's maximum allowable | ||||||
549 | activity levels. | ||||||
550 | |||||||
551 | " | ||||||
552 | ); | ||||||
553 | } | ||||||
554 | 1; | ||||||
555 | |||||||
556 | __END__ |