| blib/lib/WWW/Patent/Page/USPTO.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 171 | 221 | 77.3 |
| branch | 42 | 82 | 51.2 |
| condition | 14 | 24 | 58.3 |
| subroutine | 16 | 17 | 94.1 |
| pod | n/a | ||
| total | 243 | 344 | 70.6 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | |||||||
| 2 | package WWW::Patent::Page::USPTO; | ||||||
| 3 | |||||||
| 4 | # Version 2006-04-04 H. Schier | ||||||
| 5 | 5 | 5 | 32 | use strict; | |||
| 5 | 10 | ||||||
| 5 | 485 | ||||||
| 6 | 5 | 5 | 35 | use warnings; | |||
| 5 | 9 | ||||||
| 5 | 170 | ||||||
| 7 | 5 | 5 | 28 | use diagnostics; | |||
| 5 | 10 | ||||||
| 5 | 48 | ||||||
| 8 | 5 | 5 | 149 | use Carp; | |||
| 5 | 21 | ||||||
| 5 | 352 | ||||||
| 9 | use subs | ||||||
| 10 | 5 | 48 | qw( methods USPTO_country_known USPTO_htm USPTO_tif USPTO_pdf USPTO_terms ) | ||||
| 11 | 5 | 5 | 28 | ; #USPTO_pdf | |||
| 5 | 12 | ||||||
| 12 | 5 | 5 | 424 | use LWP::UserAgent 2.003; | |||
| 5 | 169 | ||||||
| 5 | 230 | ||||||
| 13 | require HTTP::Request; | ||||||
| 14 | 5 | 5 | 3712 | use HTML::HeadParser; | |||
| 5 | 47735 | ||||||
| 5 | 159 | ||||||
| 15 | 5 | 5 | 4361 | use HTML::TokeParser; | |||
| 5 | 21657 | ||||||
| 5 | 218 | ||||||
| 16 | 5 | 5 | 4635 | use PDF::API2 2.00; | |||
| 5 | 1368654 | ||||||
| 5 | 201 | ||||||
| 17 | 5 | 5 | 6405 | use File::Temp 0.17; | |||
| 5 | 61472 | ||||||
| 5 | 601 | ||||||
| 18 | #use Data::Dumper; | ||||||
| 19 | |||||||
| 20 | $| = 1 ; | ||||||
| 21 | |||||||
| 22 | 5 | 5 | 48 | use vars qw/ $VERSION @ISA/; | |||
| 5 | 12 | ||||||
| 5 | 22189 | ||||||
| 23 | |||||||
| 24 | $VERSION = "0.30"; | ||||||
| 25 | |||||||
| 26 | sub methods { | ||||||
| 27 | return ( | ||||||
| 28 | 5 | 5 | 58 | 'USPTO_htm' => \&USPTO_htm, | |||
| 29 | 'USPTO_tif' => \&USPTO_tif, | ||||||
| 30 | 'USPTO_pdf' => \&USPTO_pdf, | ||||||
| 31 | 'USPTO_country_known' => \&USPTO_country_known, | ||||||
| 32 | |||||||
| 33 | # 'USPTO_parse_doc_id' => \&USPTO_parse_doc_id, | ||||||
| 34 | 'USPTO_terms' => \&USPTO_terms, | ||||||
| 35 | ); | ||||||
| 36 | |||||||
| 37 | } | ||||||
| 38 | |||||||
| 39 | # sub USPTO_parse_doc_id{ | ||||||
| 40 | # "All patent numbers must be 7 characters in length" | ||||||
| 41 | # well, maybe 7 or less... | ||||||
| 42 | # USPTO will give 692,301 for request of 692301 or 0692301 | ||||||
| 43 | # Will respond to PN/D339456 (but not PN/D0339456) | ||||||
| 44 | # PN/D039456 | ||||||
| 45 | # and PN/D39456 and 1 and 01 | ||||||
| 46 | # Utility -- 5,146,634 6923014 0000001 | ||||||
| 47 | # Design -- D339,456 D321987 D000152 | ||||||
| 48 | # Plant -- PP08,901 PP07514 PP00003 | ||||||
| 49 | # Reissue -- RE35,312 RE12345 RE00007 | ||||||
| 50 | # Defensive Publication -- T109,201 T855019 T100001 | ||||||
| 51 | # Statutory Invention Registration -- H001,523 H001234 H000001 | ||||||
| 52 | # Re-examination -- RX29,194 RE29183 RE00125 | ||||||
| 53 | # Additional Improvement -- AI00,002 AI000318 AI00007 | ||||||
| 54 | # } | ||||||
| 55 | |||||||
| 56 | sub USPTO_country_known { | ||||||
| 57 | 0 | 0 | 0 | my $self = shift @_; | |||
| 58 | 0 | 0 | my $country = shift; | ||||
| 59 | 0 | 0 | 0 | if ( 'US' eq uc($country) ) { return ('1790 on'); } | |||
| 0 | 0 | ||||||
| 60 | 0 | 0 | else { carp 'US only!'; return undef } | ||||
| 0 | 0 | ||||||
| 61 | } | ||||||
| 62 | |||||||
| 63 | sub USPTO_htm { | ||||||
| 64 | 12 | 12 | 16 | my ( $self, $page_response ) = @_; | |||
| 65 | 12 | 21 | my $request; | ||||
| 66 | my $request_text; | ||||||
| 67 | 12 | 100 | 100 | 97 | if ( ( !$self->{'patent'}->{'doc_type'} ) | ||
| 100 | |||||||
| 68 | && ( length( $self->{'patent'}{'number'} ) == 11 ) ) | ||||||
| 69 | { | ||||||
| 70 | |||||||
| 71 | # Application (11 digits) | ||||||
| 72 | 1 | 4 | $request_text = 'http://appft1.uspto.gov/netacgi/nph-Parser?TERM1=' | ||||
| 73 | . $self->{'patent'}{'number'} | ||||||
| 74 | . '&Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=0&f=S&l=50'; | ||||||
| 75 | 1 | 10 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
| 76 | 1 | 145 | my $intermediate = $self->request($request); | ||||
| 77 | 1 | 10 | my $p = HTML::TokeParser->new( \$intermediate->content ); | ||||
| 78 | 1 | 213 | while ( my $token = $p->get_tag("a") ) { | ||||
| 79 | 19 | 100 | 3868 | my $url = $token->[1]{href} || "-"; | |||
| 80 | 19 | 52 | my $text = $p->get_trimmed_text("/a"); | ||||
| 81 | 19 | 100 | 100 | 2132 | if ( ( $url =~ m/$self->{'patent'}{'number'}/ ) | ||
| 82 | && ( $text =~ m/$self->{'patent'}{'number'}/ ) ) | ||||||
| 83 | { | ||||||
| 84 | |||||||
| 85 | #warn "fully qualified? '$url'\n"; | ||||||
| 86 | 1 | 4 | $request_text = 'http://appft1.uspto.gov/' . $url; | ||||
| 87 | 1 | 15 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
| 88 | } | ||||||
| 89 | } | ||||||
| 90 | } | ||||||
| 91 | |||||||
| 92 | #http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=1&f=G&l=50&s1=%2220010000044%22.PGNR.&OS=DN/20010000044&RS=DN/20010000044 | ||||||
| 93 | elsif ( $self->{'patent'}->{'doc_type'} ) { | ||||||
| 94 | |||||||
| 95 | # Non-Utility Patent | ||||||
| 96 | 7 | 31 | $request_text | ||||
| 97 | = "http://patft.uspto.gov/netacgi/nph-Parser?patentnumber=$self->{'patent'}->{'doc_type'}$self->{'patent'}{'number'}"; | ||||||
| 98 | 7 | 88 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
| 99 | } | ||||||
| 100 | else { | ||||||
| 101 | |||||||
| 102 | #Standard Utility Patent | ||||||
| 103 | 4 | 11 | $request_text | ||||
| 104 | = "http://patft.uspto.gov/netacgi/nph-Parser?patentnumber=$self->{'patent'}{'number'}"; | ||||||
| 105 | 4 | 46 | $request = HTTP::Request->new( 'GET' => $request_text ); | ||||
| 106 | } | ||||||
| 107 | |||||||
| 108 | # print "\nAlmost $self->{'retrieved_identifier'}->{'number'} \n"; | ||||||
| 109 | 12 | 21066 | my $response = $self->request($request) | ||||
| 110 | ; # use the agent to make the request and get the response | ||||||
| 111 | # print "\there\n"; | ||||||
| 112 | 12 | 50 | 54 | if ( $response->is_success ) { | |||
| 113 | 12 | 142 | my $html = $response->content; | ||||
| 114 | |||||||
| 115 | # print "\n$html\n"; | ||||||
| 116 | 12 | 477 | my $p = HTML::HeadParser->new; | ||||
| 117 | 12 | 2553 | $p->parse( $response->content ); | ||||
| 118 | 12 | 2978 | my $entry; | ||||
| 119 | 12 | 100 | 53 | if ( $entry = $p->header('Refresh') ) | |||
| 120 | { # carp "no refresh seen via '$self->{'patent'}{'number'}' in \n'$html' " } | ||||||
| 121 | 10 | 433 | $entry =~ s/^.*?URL=//; | ||||
| 122 | 10 | 34 | $entry = 'http://patft.uspto.gov' . $entry; | ||||
| 123 | |||||||
| 124 | # print "$entry\n"; | ||||||
| 125 | 10 | 50 | 173 | $request = new HTTP::Request( 'GET' => "$entry" ) | |||
| 126 | or carp "bad refresh"; | ||||||
| 127 | 10 | 3347 | $response = $self->request($request); | ||||
| 128 | 10 | 274 | $html = $response->content; | ||||
| 129 | } | ||||||
| 130 | |||||||
| 131 | 12 | 100 | 803 | if ( $html =~ m/No patents have matched your query/ ) { | |||
| 132 | 1 | 9 | $page_response->set_parameter( 'is_success', undef ); | ||||
| 133 | 1 | 5 | $page_response->set_parameter( 'message', | ||||
| 134 | 'No patents have matched your query' ) | ||||||
| 135 | ; # No patents have matched your query | ||||||
| 136 | 1 | 24 | return $page_response; | ||||
| 137 | } | ||||||
| 138 | |||||||
| 139 | 11 | 50 | 376 | unless ( $html | |||
| 140 | =~ s/.*?.*?/\n\n |
||||||
| 141 | ) | ||||||
| 142 | { | ||||||
| 143 | 0 | 0 | carp "header weird A \n"; | ||||
| 144 | } | ||||||
| 145 | 11 | 50 | 3037 | unless ( $html | |||
| 146 | =~ s/.*( |
||||||
| 147 | ) | ||||||
| 148 | { | ||||||
| 149 | 0 | 0 | carp "header weird B \n"; | ||||
| 150 | } | ||||||
| 151 | 11 | 50 | 280 | unless ( $html =~ s/ |
|||
| 152 | 0 | 0 | carp "header weird C \n$html\n"; | ||||
| 153 | } | ||||||
| 154 | |||||||
| 155 | #warn " type is $self->{'patent'}->{'doc_type'}'\n"; | ||||||
| 156 | 11 | 50 | 341 | unless ( $html =~ s/ /is ) { |
|||
| 157 | 0 | 0 | carp "front weird \n$html\n"; | ||||
| 158 | } | ||||||
| 159 | 11 | 50 | 1095 | unless ( $html =~ s/(.*) (.*)body>/$1<\/body>/is ) { |
|||
| 160 | 0 | 0 | carp "end weird \n$html\n"; | ||||
| 161 | } | ||||||
| 162 | $html | ||||||
| 163 | 11 | 1073 | =~ s|"/netacgi/nph-Parser|"http://patft.uspto.gov/netacgi/nph-Parser|gi; | ||||
| 164 | 11 | 98 | $page_response->set_parameter( 'content', $html ); | ||||
| 165 | 11 | 361 | return $page_response; | ||||
| 166 | } | ||||||
| 167 | else { | ||||||
| 168 | 0 | 0 | carp "Unsuccessful response: \n'" | ||||
| 169 | . $response->status_line | ||||||
| 170 | . "'\n\nfrom request:\n'$request_text'\n"; | ||||||
| 171 | 0 | 0 | return undef; | ||||
| 172 | } | ||||||
| 173 | } | ||||||
| 174 | |||||||
| 175 | sub USPTO_tif { | ||||||
| 176 | 1 | 1 | 3 | my ( $self, $page_response ) = @_; | |||
| 177 | 1 | 2 | my ( $request, $base, $zero_fill ); | ||||
| 178 | |||||||
| 179 | 1 | 50 | 8 | if ( $self->{'patent'}{'number'} =~ m/(0|1|2|3|4)\d$/ ) { | |||
| 180 | |||||||
| 181 | # 0-4 is on one server, 5-9 is on another | ||||||
| 182 | 0 | 0 | $base = 'patimg1.uspto.gov'; | ||||
| 183 | } | ||||||
| 184 | 1 | 3 | else { $base = 'patimg2.uspto.gov'; } | ||||
| 185 | 1 | 7 | my $zerofill = sprintf '%0.8u', $self->{'patent'}{'number'}; | ||||
| 186 | |||||||
| 187 | # print "\nZerofill: $zerofill\n"; | ||||||
| 188 | 1 | 50 | 6 | if ( $self->{'patent'}->{'doc_type'} ) { | |||
| 189 | 0 | 0 | $request = HTTP::Request->new( 'GET' => | ||||
| 190 | "http://$base/.piw?Docid=$self->{'patent'}->{'doc_type'}$zerofill\&idkey=NONE" | ||||||
| 191 | ); | ||||||
| 192 | } | ||||||
| 193 | else { | ||||||
| 194 | 1 | 16 | $request = HTTP::Request->new( | ||||
| 195 | 'GET' => "http://$base/.piw?Docid=$zerofill\&idkey=NONE" ); | ||||||
| 196 | } | ||||||
| 197 | |||||||
| 198 | # print "\nAlmost $self->{'retrieved_identifier'}->{'number'} \n"; | ||||||
| 199 | 1 | 168 | my $response = $self->request($request); | ||||
| 200 | |||||||
| 201 | 1 | 7 | my $html = $response->content; | ||||
| 202 | |||||||
| 203 | { # page numbers | ||||||
| 204 | |||||||
| 205 | 1 | 50 | 14 | if ( $html =~ m/NumPages=(\d+)/ ) { | |||
| 1 | 0 | 22 | |||||
| 206 | 1 | 11 | $page_response->set_parameter( 'pages', $1 ); | ||||
| 207 | |||||||
| 208 | } | ||||||
| 209 | elsif ( $html =~ m/(\d+)\s+of\s+(\d+)\s+pages/ ) { | ||||||
| 210 | 0 | 0 | $page_response->set_parameter( 'pages', $2 ); | ||||
| 211 | |||||||
| 212 | # print "Pages: $2\n"; | ||||||
| 213 | } | ||||||
| 214 | else { | ||||||
| 215 | 0 | 0 | carp | ||||
| 216 | "no maximum page number found in $self->{'patent'}{'country'}$self->{'patent'}{'number'}: \n$html"; | ||||||
| 217 | } | ||||||
| 218 | } | ||||||
| 219 | 1 | 14 | my $p = HTML::TokeParser->new( \$html ); | ||||
| 220 | 1 | 211 | my $url; | ||||
| 221 | my $token; | ||||||
| 222 | 1 | 8 | FINDPAGE: while ( $token = $p->get_tag("a") ) { | ||||
| 223 | 2 | 50 | 1743 | $url = $token->[1]{href} || "-"; #very strange or construct ??? | |||
| 224 | 2 | 100 | 39 | if ( $url =~ m/$self->{'patent'}{'number'}/ ) { last FINDPAGE; } | |||
| 1 | 4 | ||||||
| 225 | |||||||
| 226 | # print "$url\n"; | ||||||
| 227 | } | ||||||
| 228 | 1 | 3 | undef $p; | ||||
| 229 | |||||||
| 230 | 1 | 27 | $url =~ s/PageNum=(\d+)/PageNum=$self->{'patent'}{'page'}/; | ||||
| 231 | # $url = "http://$base$url"; | ||||||
| 232 | |||||||
| 233 | # warn "URL = '$url'\n"; | ||||||
| 234 | # exit; | ||||||
| 235 | 1 | 50 | 14 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
| 236 | or carp "bad numbered page $self->{'patent'}{'page'} fetch $url"; | ||||||
| 237 | 1 | 182 | $response = $self->request($request); | ||||
| 238 | 1 | 25 | $html = $response->content; | ||||
| 239 | |||||||
| 240 | 1 | 112 | $p = HTML::TokeParser->new( \$html ); | ||||
| 241 | |||||||
| 242 | 1 | 324 | FINDPAGE: while ( $token = $p->get_tag("embed") ) { | ||||
| 243 | 0 | 0 | 0 | $url = $token->[1]->{src} || "-"; | |||
| 244 | 0 | 0 | 0 | if ( $url =~ m/image\/tiff/ ) { last FINDPAGE; } | |||
| 0 | 0 | ||||||
| 245 | } | ||||||
| 246 | |||||||
| 247 | # get tiff image | ||||||
| 248 | # $url = "http://$base$url"; | ||||||
| 249 | 1 | 50 | 14219 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
| 250 | or carp "Coudn't retrieve the tiff image fetch $url"; | ||||||
| 251 | 1 | 345 | $response = $self->request($request); | ||||
| 252 | |||||||
| 253 | # print "\nPage response\n$response->content\n\n"; | ||||||
| 254 | 1 | 23 | $page_response->set_parameter( 'content', $response->content ); | ||||
| 255 | |||||||
| 256 | 1 | 34 | return $page_response; | ||||
| 257 | } | ||||||
| 258 | |||||||
| 259 | sub USPTO_pdf { | ||||||
| 260 | 1 | 1 | 3 | my ( $self, $page_response ) = @_; | |||
| 261 | 1 | 1 | my ( $request, $base, $zero_fill ); | ||||
| 262 | |||||||
| 263 | 1 | 50 | 4 | my $tempdir = $self->{'patent'}->{'tempdir'} | |||
| 264 | if ( $self->{'patent'}->{'tempdir'} ); | ||||||
| 265 | 1 | 3 | my $fn_template = $self->{'patent'}->{'doc_id'} . "_XXXX"; | ||||
| 266 | |||||||
| 267 | 1 | 9 | my $pdf_file = new File::Temp( | ||||
| 268 | TEMPLATE => $fn_template, | ||||||
| 269 | DIR => $tempdir, | ||||||
| 270 | SUFFIX => '.pdf', | ||||||
| 271 | UNLINK => 1, | ||||||
| 272 | ); | ||||||
| 273 | 1 | 745 | my $pdf_fn = $pdf_file->filename; | ||||
| 274 | 1 | 16 | my $pdf = new PDF::API2(); | ||||
| 275 | 1 | 840 | print $pdf_file $pdf->stringify; | ||||
| 276 | 1 | 14077 | close $pdf_file; | ||||
| 277 | |||||||
| 278 | 1 | 60 | my $currenttime = localtime(); | ||||
| 279 | 1 | 10 | my $short_id | ||||
| 280 | = $self->{'patent'}{'country'} . $self->{'patent'}->{'number'}; | ||||||
| 281 | |||||||
| 282 | # my $pdf = new PDF::API2(-file => "$tempdir/$self->{'patent'}{'doc_id'}".".pdf"); | ||||||
| 283 | # my $pdf = new PDF::API2(-file => $pdf_fn); | ||||||
| 284 | 1 | 10 | $pdf = PDF::API2->open($pdf_fn); | ||||
| 285 | 1 | 11806 | my %h = $pdf->info( | ||||
| 286 | 'Author' => "Programatically Produced from Public Information", | ||||||
| 287 | 'CreationDate' => $currenttime, | ||||||
| 288 | 'ModDate' => $currenttime, | ||||||
| 289 | 'Creator' => "WWW::Patent::Page::USPTO_pdf", | ||||||
| 290 | 'Producer' => "US Patent Office and PDF::API2", | ||||||
| 291 | 'Title' => "$short_id", | ||||||
| 292 | 'Subject' => "patent", | ||||||
| 293 | 'Keywords' => "$short_id WWW::Patent::Page" | ||||||
| 294 | ); | ||||||
| 295 | 1 | 838 | my $page = $pdf->page(); | ||||
| 296 | 1 | 560 | $page->mediabox('A4'); | ||||
| 297 | |||||||
| 298 | |||||||
| 299 | 1 | 50 | 211 | if ( $self->{'patent'}{'number'} =~ m/(0|1|2|3|4)\d$/ ) { | |||
| 300 | |||||||
| 301 | # 0-4 is on one server, 5-9 is on another | ||||||
| 302 | 0 | 0 | $base = 'patimg1.uspto.gov'; | ||||
| 303 | } | ||||||
| 304 | 1 | 3 | else { $base = 'patimg2.uspto.gov'; } | ||||
| 305 | 1 | 11 | my $zerofill = sprintf '%0.8u', $self->{'patent'}{'number'}; | ||||
| 306 | |||||||
| 307 | 1 | 50 | 5 | if ( $self->{'patent'}->{'doc_type'} ) { | |||
| 308 | 0 | 0 | $request = HTTP::Request->new( 'GET' => | ||||
| 309 | "http://$base/.piw?Docid=$self->{'patent'}->{'doc_type'}$zerofill\&idkey=NONE" | ||||||
| 310 | ); | ||||||
| 311 | } | ||||||
| 312 | else { | ||||||
| 313 | 1 | 16 | $request = HTTP::Request->new( | ||||
| 314 | 'GET' => "http://$base/.piw?Docid=$zerofill\&idkey=NONE" ); | ||||||
| 315 | } | ||||||
| 316 | |||||||
| 317 | 1 | 17843 | my $response = $self->request($request); | ||||
| 318 | |||||||
| 319 | 1 | 10 | my $html = $response->content; | ||||
| 320 | # warn "html = $html\n"; | ||||||
| 321 | { # page numbers | ||||||
| 322 | |||||||
| 323 | 1 | 50 | 13 | if ( $html =~ m/NumPages=(\d+)/ ) { | |||
| 1 | 0 | 12 | |||||
| 324 | 1 | 9 | $page_response->set_parameter( 'pages', $1 ); | ||||
| 325 | } | ||||||
| 326 | elsif ( $html =~ m/(\d+)\s+of\s+(\d+)\s+pages/ ) { | ||||||
| 327 | 0 | 0 | $page_response->set_parameter( 'pages', $2 ); | ||||
| 328 | |||||||
| 329 | # print "Pages: $2\n"; | ||||||
| 330 | } | ||||||
| 331 | else { | ||||||
| 332 | 0 | 0 | carp | ||||
| 333 | "no maximum page number found in $self->{'patent'}{'country'}$self->{'patent'}{'number'}: \n$html"; | ||||||
| 334 | } | ||||||
| 335 | } | ||||||
| 336 | 1 | 13 | my $p = HTML::TokeParser->new( \$html ); | ||||
| 337 | 1 | 194 | my $url; | ||||
| 338 | my $token; | ||||||
| 339 | 1 | 6 | FINDPAGE: while ( $token = $p->get_tag("a") ) { | ||||
| 340 | 2 | 50 | 1055 | $url = $token->[1]{href} || "-"; #very strange or construct ??? | |||
| 341 | 2 | 100 | 40 | if ( $url =~ m/$self->{'patent'}{'number'}/ ) { last FINDPAGE; } | |||
| 1 | 3 | ||||||
| 342 | } | ||||||
| 343 | |||||||
| 344 | 1 | 4 | undef $p; | ||||
| 345 | |||||||
| 346 | 1 | 50 | 22 | if ( defined( $self->{'patent'}{'page'} ) ) { | |||
| 347 | 1 | 4 | $url =~ s/PageNum=(\d+)/PageNum=$self->{'patent'}{'page'}/; | ||||
| 348 | } | ||||||
| 349 | |||||||
| 350 | # print "\n\$self->{'patent'}->{'page'} = '$self->{'patent'}->{'page'}' |$url = '$url' \@ 391 \n"; | ||||||
| 351 | |||||||
| 352 | #$url = "http://$base$url"; | ||||||
| 353 | 1 | 50 | 11 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
| 354 | or carp "bad numbered page $self->{'patent'}{'page'} fetch $url"; | ||||||
| 355 | 1 | 50 | 188 | $response = $self->request($request) or carp "bad request" ; | |||
| 356 | 1 | 39 | $html = $response->content; | ||||
| 357 | |||||||
| 358 | # open( my $fh,">", "1.html"); print $fh $html; close $fh; | ||||||
| 359 | |||||||
| 360 | 1 | 454 | $p = HTML::TokeParser->new( \$html ); | ||||
| 361 | # warn "base = $base, URL1 = '$url'\n"; | ||||||
| 362 | |||||||
| 363 | 1 | 500 | $url = ""; | ||||
| 364 | |||||||
| 365 | 1 | 7 | FINDIMAGE: while($token = $p->get_tag("a") ) { | ||||
| 366 | 10 | 100 | 3234 | $url = $token->[1]->{href} || "-" ; | |||
| 367 | # warn "\ntoken 1 href = ", $token->[1]->{href} , "\n" ; | ||||||
| 368 | 10 | 100 | 56 | if ($url =~ m/View\+first\+page/) {last FINDIMAGE; } | |||
| 1 | 4 | ||||||
| 369 | } | ||||||
| 370 | # warn "URL2 = '$url'\n"; | ||||||
| 371 | |||||||
| 372 | |||||||
| 373 | 1 | 50 | 17 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
| 374 | or carp "bad numbered page $self->{'patent'}{'page'} fetch $url"; | ||||||
| 375 | 1 | 50 | 226 | $response = $self->request($request) or carp "bad request" ; | |||
| 376 | 1 | 20 | $html = $response->content; | ||||
| 377 | 1 | 10461 | $p = HTML::TokeParser->new( \$html ); | ||||
| 378 | |||||||
| 379 | #open( my $fh1,">", "2.html"); print $fh1 $html; close $fh1; | ||||||
| 380 | |||||||
| 381 | |||||||
| 382 | 1 | 256 | FINDTIF: while ( $token = $p->get_tag("embed") ) { | ||||
| 383 | # print "\ntoken = ", Dumper($token), "\n" ; | ||||||
| 384 | 1 | 50 | 4618 | $url = $token->[1]->{src} || "-"; | |||
| 385 | 1 | 50 | 14 | if ( $url =~ m/tif$/ ) { last FINDTIF; } | |||
| 0 | 0 | ||||||
| 386 | } | ||||||
| 387 | |||||||
| 388 | # get tiff image | ||||||
| 389 | 1 | 249 | $url = "http://$base$url"; $url =~ s/\n//; | ||||
| 1 | 4 | ||||||
| 390 | 1 | 50 | 9 | if ( defined( $self->{'patent'}{'page'} ) ) { | |||
| 391 | 1 | 5 | $url =~ s/PageNum=(\d+)/PageNum=$self->{'patent'}{'page'}/; | ||||
| 392 | } | ||||||
| 393 | |||||||
| 394 | # warn "URL3 = '$url'\n"; | ||||||
| 395 | 1 | 3 | my $tif_url = $url; | ||||
| 396 | 1 | 50 | 15 | $request = new HTTP::Request( 'GET' => "$url" ) | |||
| 397 | or carp "Coudn't retrieve the tiff image fetch $url"; | ||||||
| 398 | |||||||
| 399 | |||||||
| 400 | # prepare to store tif image | ||||||
| 401 | 1 | 215 | my $pat_page = 1; | ||||
| 402 | 1 | 50 | 7 | if ( defined( $self->{'patent'}{'page'} ) ) { | |||
| 403 | 1 | 4 | $pat_page = $self->{'patent'}{'page'}; | ||||
| 404 | } | ||||||
| 405 | |||||||
| 406 | # print "\n\$self->{'patent'}->{'doc_id'} = '$self->{'patent'}->{'doc_id'}' \@ 413 \n"; | ||||||
| 407 | |||||||
| 408 | 1 | 6 | $fn_template = $self->{'patent'}->{'doc_id'} . "_p" . $pat_page . "_XXXX"; | ||||
| 409 | # warn "TEMPLATE => $fn_template, DIR => $tempdir\n" ; | ||||||
| 410 | 1 | 16 | my $temp_tif = new File::Temp( | ||||
| 411 | TEMPLATE => $fn_template, | ||||||
| 412 | DIR => $tempdir, | ||||||
| 413 | SUFFIX => '.tif', | ||||||
| 414 | UNLINK => 1, | ||||||
| 415 | ); | ||||||
| 416 | |||||||
| 417 | 1 | 997 | my $done = 0; | ||||
| 418 | 1 | 4 | my $trys = 0; | ||||
| 419 | 1 | 50 | 33 | 13 | if ( !$done and $trys < 5 ) { | ||
| 420 | 1 | 10 | $response = $self->request($request); | ||||
| 421 | 0 | 0 | $trys++; | ||||
| 422 | 0 | 0 | 0 | 0 | if ( $response->is_success and $response->content ) { | ||
| 423 | 0 | 0 | print $temp_tif $response->content; | ||||
| 424 | # open( my $th,">", "2.html"); print $th $response->content; close $th; | ||||||
| 425 | |||||||
| 426 | 0 | 0 | $done = 1; | ||||
| 427 | } | ||||||
| 428 | else { | ||||||
| 429 | 0 | 0 | carp | ||||
| 430 | "attempt $trys response failed or content empty, no temporary tiff can be made- possibly network problem or timeout, will try again"; | ||||||
| 431 | } | ||||||
| 432 | } | ||||||
| 433 | 0 | 0 | else { carp "too many attempts, giving up."; return (0); } | ||||
| 0 | 0 | ||||||
| 434 | |||||||
| 435 | # close $temp_tif; | ||||||
| 436 | |||||||
| 437 | # convert to pdf | ||||||
| 438 | 0 | 0 | my $gfx = $page->gfx(); | ||||
| 439 | # print Dumper($temp_tif); | ||||||
| 440 | 0 | 0 | $gfx->image( $pdf->image_tiff($temp_tif), 0, 0, 0.23 ); | ||||
| 441 | |||||||
| 442 | # one page only | ||||||
| 443 | 0 | 0 | 0 | if ( $self->{'patent'}{'page'} ) { | |||
| 444 | 0 | 0 | $page_response->set_parameter( 'content', $pdf->stringify ); | ||||
| 445 | 0 | 0 | return $page_response; | ||||
| 446 | } | ||||||
| 447 | |||||||
| 448 | # retrieve all pages | ||||||
| 449 | 0 | 0 | my $maxpage = $page_response->get_parameter('pages'); | ||||
| 450 | 0 | 0 | for ( my $i = 2; $i <= $maxpage; $i++ ) { | ||||
| 451 | 0 | 0 | $tif_url =~ s/PageNum=(\d+)/PageNum=$i/; | ||||
| 452 | 0 | 0 | 0 | $request = new HTTP::Request( 'GET' => "$tif_url" ) | |||
| 453 | or carp "Couldn't retrieve the tiff image fetch $tif_url"; | ||||||
| 454 | 0 | 0 | $response = $self->request($request); | ||||
| 455 | |||||||
| 456 | # store tif image | ||||||
| 457 | 0 | 0 | $fn_template = $self->{'patent'}->{'doc_id'} . "_p" . $i . "_XXXX"; | ||||
| 458 | 0 | 0 | $temp_tif = new File::Temp( | ||||
| 459 | TEMPLATE => $fn_template, | ||||||
| 460 | DIR => $tempdir, | ||||||
| 461 | SUFFIX => '.tif', | ||||||
| 462 | UNLINK => 1, | ||||||
| 463 | ); | ||||||
| 464 | 0 | 0 | print $temp_tif $response->content; | ||||
| 465 | |||||||
| 466 | # close $temp_tif; | ||||||
| 467 | |||||||
| 468 | # convert to pdf | ||||||
| 469 | 0 | 0 | $page = $pdf->page(0); | ||||
| 470 | 0 | 0 | $gfx = $page->gfx(); | ||||
| 471 | 0 | 0 | $gfx->image( $pdf->image_tiff($temp_tif), 0, 0, 0.23 ); | ||||
| 472 | } | ||||||
| 473 | |||||||
| 474 | # return pdf as string | ||||||
| 475 | 0 | 0 | $page_response->set_parameter( 'content', $pdf->stringify ); | ||||
| 476 | |||||||
| 477 | # $pdf->save; | ||||||
| 478 | # $pdf->saveas("$tempdir/$self->{'patent'}->{'doc_id'}".".pdf"); | ||||||
| 479 | |||||||
| 480 | 0 | 0 | return $page_response; | ||||
| 481 | } | ||||||
| 482 | |||||||
| 483 | sub USPTO_terms { | ||||||
| 484 | 2 | 2 | 4 | my ($self) = @_; | |||
| 485 | return ( | ||||||
| 486 | 2 | 17 | "WWW::Patent::Page utilizes the USPTO web site.\n | ||||
| 487 | Refer to http://www.USPTO.gov for terms and conditions of use of that site. | ||||||
| 488 | |||||||
| 489 | Note that as of September 1, 2004, | ||||||
| 490 | http://www.uspto.gov/patft/help/notices.htm and the like state in part: | ||||||
| 491 | |||||||
| 492 | These databases are intended for use by the general public. | ||||||
| 493 | Due to limitations of equipment and bandwidth, they are not | ||||||
| 494 | intended to be a source for bulk downloads of USPTO data. | ||||||
| 495 | Bulk data may be purchased from USPTO at cost (see the USPTO | ||||||
| 496 | Products and Services Catalog). Individuals, companies, | ||||||
| 497 | IP addresses, or blocks of IP addresses who, in effect, | ||||||
| 498 | deny service to the general public by generating unusually | ||||||
| 499 | high numbers (1000 or more) of daily database accesses | ||||||
| 500 | (searches, pages, or hits), whether generated manually or | ||||||
| 501 | in an automated fashion, may be denied access to these | ||||||
| 502 | servers without notice. | ||||||
| 503 | |||||||
| 504 | Note at http://www.uspto.gov/patft/help/accpat.htm : | ||||||
| 505 | |||||||
| 506 | If you can access the main PTO Web site, but cannot access any | ||||||
| 507 | of the Patent Grant Database Quick Search, Advanced Quick Searching, | ||||||
| 508 | or Patent Number Searching pages, your workstation or organization | ||||||
| 509 | may have been denied access to the Web Patent Databases pursuant | ||||||
| 510 | to the policy stated at the top of this page. To determine if you | ||||||
| 511 | have been denied access, you can check the Denied List for your | ||||||
| 512 | computer's IP address. http://www.uspto.gov/patft/help/denied.htm | ||||||
| 513 | |||||||
| 514 | (Your IP address is the only means by which you are known to the | ||||||
| 515 | PTO servers -- server logs do not contain your email address or | ||||||
| 516 | any other personal identifying information. If you do not know | ||||||
| 517 | your computer's IP address because you are behind a firewall, do | ||||||
| 518 | not have a fixed IP address, or for any other reason, you can find | ||||||
| 519 | your current IP address by using an 'IP reflector,' such as | ||||||
| 520 | http://www2.simflex.com/ip.shtml or http://www.dslreports.com/ip.) | ||||||
| 521 | |||||||
| 522 | If you are an individual whose individual IP address has been | ||||||
| 523 | denied access: to seek to have your access restored, please send | ||||||
| 524 | email including your workstation and firewall or gateway IP addresses | ||||||
| 525 | (consult with your network administrators if necessary), and describing | ||||||
| 526 | the steps you have taken or will take to insure that future violations | ||||||
| 527 | of the USPTO access policy will not occur, to the Database Help Desk at | ||||||
| 528 | www\@uspto.gov. | ||||||
| 529 | |||||||
| 530 | If you are a member or employee of an organization which has been | ||||||
| 531 | denied access: please do not send individual email to PTO. Instead, | ||||||
| 532 | please have your network administrator or a person holding authority | ||||||
| 533 | over your organization\'s network operations send email including your | ||||||
| 534 | firewall, gateway, or workstation IP addresses, and describing the steps | ||||||
| 535 | you have taken or will take to insure that future violations of the USPTO | ||||||
| 536 | access policy will not occur, to the Database Help Desk at www\@uspto.gov. | ||||||
| 537 | |||||||
| 538 | For all other content-related matters, please send email to the Database | ||||||
| 539 | Help Desk at www\@uspto.gov | ||||||
| 540 | |||||||
| 541 | Note at http://www.uspto.gov/patft/help/images.htm | ||||||
| 542 | |||||||
| 543 | Patent images must be retrieved from the database one page at a time. | ||||||
| 544 | This is necessary since patents can be as long as 5,000 pages, and the | ||||||
| 545 | resources required to allow downloading such 'jumbo' patents are not | ||||||
| 546 | available. Users employing third-party software which downloads multiple | ||||||
| 547 | pages of a patent at once may find this practice subjects them to denial | ||||||
| 548 | of access to the databases if they exceed PTO's maximum allowable | ||||||
| 549 | activity levels. | ||||||
| 550 | |||||||
| 551 | " | ||||||
| 552 | ); | ||||||
| 553 | } | ||||||
| 554 | 1; | ||||||
| 555 | |||||||
| 556 | __END__ |