\r\n]+)(\s+<)/$1>\n$2/g;
 
| 183 |  |  |  |  |  |  | # There seems to be a bug in HTML::TreeBuilder that causes | 
| 184 |  |  |  |  |  |  | # abutting tags to be skpped!?! | 
| 185 | 0 |  |  |  |  | 0 | $content =~ s!> | 
| 186 | 0 | 0 |  |  |  | 0 | if (9 < $test) | 
| 187 |  |  |  |  |  |  | { | 
| 188 | 0 |  |  |  |  | 0 | eval "use File::Slurp"; | 
| 189 | 0 |  |  |  |  | 0 | my $sFname = qq'Pages/fetched-$domain.html'; | 
| 190 | 0 |  |  |  |  | 0 | write_file($sFname, $content); | 
| 191 | 0 |  |  |  |  | 0 | warn " DDD wrote HTML to $sFname\n"; | 
| 192 | 0 |  |  |  |  | 0 | exit 88; | 
| 193 |  |  |  |  |  |  | } # if | 
| 194 | 0 |  |  |  |  | 0 | my $iLen = length($content); | 
| 195 |  |  |  |  |  |  | # warn " DDD fetched $iLen bytes.\n"; | 
| 196 |  |  |  |  |  |  |  | 
| 197 | 0 |  |  |  |  | 0 | my $result = _extract($uk, $content, $test); | 
| 198 |  |  |  |  |  |  | # print Dumper($result); | 
| 199 |  |  |  |  |  |  | # exit 88; | 
| 200 | 0 | 0 |  |  |  | 0 | if (! defined $result) | 
| 201 |  |  |  |  |  |  | { | 
| 202 | 0 |  |  |  |  | 0 | DEBUG && warn " WWW _extract() returned nothing\n"; | 
| 203 | 0 |  |  |  |  | 0 | last INFINITE; | 
| 204 |  |  |  |  |  |  | } # if | 
| 205 | 0 | 0 |  |  |  | 0 | if (! ref $result->{items}) | 
| 206 |  |  |  |  |  |  | { | 
| 207 |  |  |  |  |  |  | # Probably an empty wish list | 
| 208 | 0 |  |  |  |  | 0 | DEBUG && warn " WWW _extract() returned no items\n"; | 
| 209 | 0 |  |  |  |  | 0 | last INFINITE; | 
| 210 |  |  |  |  |  |  | } # if | 
| 211 |  |  |  |  |  |  | # Clean up the parsed items and add them to our local @items | 
| 212 |  |  |  |  |  |  | # array: | 
| 213 |  |  |  |  |  |  | ITEM: | 
| 214 | 0 |  |  |  |  | 0 | foreach my $item (@{$result->{items}}) | 
|  | 0 |  |  |  |  | 0 |  | 
| 215 |  |  |  |  |  |  | { | 
| 216 | 0 |  |  |  |  | 0 | $item->{'author'} =~ s!\n!!g; | 
| 217 | 0 |  |  |  |  | 0 | $item->{'author'} =~ s!^\s*by\s+!!g; | 
| 218 | 0 |  |  |  |  | 0 | $item->{'author'} =~ s! \n*!!s;
 | 
| 219 | 0 | 0 |  |  |  | 0 | $item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s*\s*(\d+)!i); | 
| 220 | 0 | 0 |  |  |  | 0 | $item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s*\s*(\d)!i); | 
| 221 | 0 | 0 | 0 |  |  | 0 | if ( | 
|  |  |  | 0 |  |  |  |  | 
| 222 |  |  |  |  |  |  | $uk | 
| 223 |  |  |  |  |  |  | && | 
| 224 |  |  |  |  |  |  | $item->{image} | 
| 225 |  |  |  |  |  |  | && | 
| 226 |  |  |  |  |  |  | ($item->{image} !~ m!^http:!) | 
| 227 |  |  |  |  |  |  | ) | 
| 228 |  |  |  |  |  |  | { | 
| 229 | 0 |  |  |  |  | 0 | $item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image}; | 
| 230 |  |  |  |  |  |  | } # if | 
| 231 | 0 |  |  |  |  | 0 | push @items, $item; | 
| 232 | 0 |  |  |  |  | 0 | DEBUG_HTML && warn " DDD added one item to \@items\n"; | 
| 233 |  |  |  |  |  |  | } # foreach ITEM | 
| 234 |  |  |  |  |  |  | # Assumes an absolute path without hostname: | 
| 235 | 0 | 0 |  |  |  | 0 | if ( ! defined $result->{next}) | 
| 236 |  |  |  |  |  |  | { | 
| 237 | 0 |  |  |  |  | 0 | DEBUG_NEXT && warn " WWW did not find next url\n"; | 
| 238 | 0 |  |  |  |  | 0 | DEBUG_NEXT && write_file(qq'Pages/no-next.html', $content); | 
| 239 | 0 |  |  |  |  | 0 | last INFINITE; | 
| 240 |  |  |  |  |  |  | } # if | 
| 241 | 0 |  |  |  |  | 0 | $url = $sBase . $result->{next}; | 
| 242 | 0 |  |  |  |  | 0 | $iPage++; | 
| 243 |  |  |  |  |  |  | } # while INFINITE | 
| 244 | 0 |  |  |  |  | 0 | return @items; | 
| 245 |  |  |  |  |  |  | } # get_list | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | sub _fetch_page { | 
| 248 | 1 |  |  | 1 |  | 5 | my ($url, $domain) = @_; | 
| 249 | 1 |  |  |  |  | 3 | if (0) | 
| 250 |  |  |  |  |  |  | { | 
| 251 |  |  |  |  |  |  | eval "use File::Slurp"; | 
| 252 |  |  |  |  |  |  | # For debugging UK site: | 
| 253 |  |  |  |  |  |  | return read_file('Pages/uk-2008-12-page1.html'); | 
| 254 |  |  |  |  |  |  | # For debugging USA site: | 
| 255 |  |  |  |  |  |  | return read_file('Pages/2008-12.html'); | 
| 256 |  |  |  |  |  |  | } # if 0 | 
| 257 |  |  |  |  |  |  | # Set up the UA: | 
| 258 | 1 |  |  |  |  | 17 | my $ua = new LWP::UserAgent( | 
| 259 |  |  |  |  |  |  | keep_alive => 1, | 
| 260 |  |  |  |  |  |  | timeout => 30, | 
| 261 |  |  |  |  |  |  | agent => $USER_AGENT, | 
| 262 |  |  |  |  |  |  | ); | 
| 263 |  |  |  |  |  |  | # Setting it in the 'new' seems not to work sometimes | 
| 264 | 1 |  |  |  |  | 6366 | $ua->agent($USER_AGENT); | 
| 265 |  |  |  |  |  |  | # For some reason, this makes stuff work: | 
| 266 |  |  |  |  |  |  | # $ua->max_redirect( 0 ); | 
| 267 |  |  |  |  |  |  | # Make a full set of headers: | 
| 268 | 1 |  |  |  |  | 140 | my $h = new HTTP::Headers( | 
| 269 |  |  |  |  |  |  | 'Host'            => "www.amazon.$domain", | 
| 270 |  |  |  |  |  |  | 'Referer'         => $url, | 
| 271 |  |  |  |  |  |  | 'User-Agent'      => $USER_AGENT, | 
| 272 |  |  |  |  |  |  | 'Accept'          => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1', | 
| 273 |  |  |  |  |  |  | 'Accept-Language' => 'en-us,en;q=0.5', | 
| 274 |  |  |  |  |  |  | 'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', | 
| 275 |  |  |  |  |  |  | #'Accept-Encoding' => 'gzip,deflate', | 
| 276 |  |  |  |  |  |  | 'Keep-Alive'      =>  '300', | 
| 277 |  |  |  |  |  |  | 'Connection'      =>  'keep-alive', | 
| 278 |  |  |  |  |  |  | ); | 
| 279 | 1 |  |  |  |  | 619 | $h->referer("$url"); | 
| 280 | 1 |  |  |  |  | 67 | my $request  =  HTTP::Request->new ( 'GET', $url, $h ); | 
| 281 | 1 |  |  |  |  | 12758 | my $response; | 
| 282 | 1 |  |  |  |  | 5 | my $times = 0; | 
| 283 |  |  |  |  |  |  | # LWP should be able to do this but seemingly fails sometimes | 
| 284 | 1 |  |  |  |  | 8 | while ($times++<3) | 
| 285 |  |  |  |  |  |  | { | 
| 286 | 3 |  |  |  |  | 47 | $response =  $ua->request($request); | 
| 287 | 3 | 50 |  |  |  | 376558 | last if $response->is_success; | 
| 288 | 3 | 50 |  |  |  | 62 | if ($response->is_redirect) | 
| 289 |  |  |  |  |  |  | { | 
| 290 | 0 |  |  |  |  | 0 | $url = $response->header("Location"); | 
| 291 |  |  |  |  |  |  | #$h->header("Referer", $url); | 
| 292 | 0 |  |  |  |  | 0 | $h->referer("$url"); | 
| 293 | 0 |  |  |  |  | 0 | $request  =  HTTP::Request->new ( 'GET', $url, $h ); | 
| 294 |  |  |  |  |  |  | } # if | 
| 295 |  |  |  |  |  |  | } # while | 
| 296 | 1 | 50 |  |  |  | 20 | if (!$response->is_success) | 
| 297 |  |  |  |  |  |  | { | 
| 298 | 1 |  |  |  |  | 859 | croak "Failed to retrieve $url"; | 
| 299 | 0 |  |  |  |  |  | return undef; | 
| 300 |  |  |  |  |  |  | } # if | 
| 301 | 0 |  |  |  |  |  | my $s = $response->content; | 
| 302 |  |  |  |  |  |  | # Clean the CRAP off the page: | 
| 303 | 0 |  |  |  |  |  | $s =~ s!!!gs; | 
| 304 | 0 |  |  |  |  |  | return $s; | 
| 305 |  |  |  |  |  |  | } # _fetch_page | 
| 306 |  |  |  |  |  |  |  | 
| 307 |  |  |  |  |  |  | # This is the HTML parsing version written by Martin Thurn: | 
| 308 |  |  |  |  |  |  |  | 
| 309 |  |  |  |  |  |  | sub _extract { | 
| 310 |  |  |  |  |  |  | # Required arg1 = whether we are parsing the UK site or not (Boolean): | 
| 311 | 0 |  | 0 | 0 |  |  | my $iUK = shift || 0; | 
| 312 |  |  |  |  |  |  | # Required arg2 = the HTML contents of the webpage: | 
| 313 | 0 |  | 0 |  |  |  | my $s = shift || ''; | 
| 314 |  |  |  |  |  |  | # Optional arg = debugging level: | 
| 315 | 0 |  | 0 |  |  |  | my $iDebug = shift || 0; | 
| 316 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD start _extract()\n"; | 
| 317 | 0 |  |  |  |  |  | my $rh = {}; | 
| 318 | 0 |  |  |  |  |  | my $oTree = new HTML::TreeBuilder; | 
| 319 | 0 |  |  |  |  |  | $oTree->parse($s); | 
| 320 | 0 |  |  |  |  |  | $oTree->eof; | 
| 321 | 0 |  |  |  |  |  | my $sTag = q/div/; | 
| 322 | 0 |  |  |  |  |  | my $sClass = q/a-fixed-left-grid a-spacing-none/; | 
| 323 | 0 | 0 |  |  |  |  | $sClass = q/a-text-left a-fixed-left-grid-col a-col-right/ if $iUK; | 
| 324 |  |  |  |  |  |  | # $sClass = q/a-fixed-left-grid   a-spacing-large/ if $iUK; | 
| 325 |  |  |  |  |  |  |  | 
| 326 | 0 |  |  |  |  |  | my @aoSPAN = $oTree->look_down(_tag => $sTag, | 
| 327 |  |  |  |  |  |  | class => $sClass, | 
| 328 |  |  |  |  |  |  | ); | 
| 329 | 0 |  |  |  |  |  | my $iCountSPAN = scalar(@aoSPAN); | 
| 330 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract() found $iCountSPAN $sTag tags of class '$sClass'\n"; | 
| 331 |  |  |  |  |  |  | SPAN_TAG: | 
| 332 | 0 |  |  |  |  |  | foreach my $oSPAN (@aoSPAN) | 
| 333 |  |  |  |  |  |  | { | 
| 334 | 0 | 0 |  |  |  |  | next SPAN_TAG unless ref $oSPAN; | 
| 335 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract() found toplevel item tagset\n"; | 
| 336 | 0 |  |  |  |  |  | if (9 < DEBUG_HTML) | 
| 337 |  |  |  |  |  |  | { | 
| 338 |  |  |  |  |  |  | my $s = $oSPAN->as_HTML; | 
| 339 |  |  |  |  |  |  | warn " DDD ==$s==\n"; | 
| 340 |  |  |  |  |  |  | } # if | 
| 341 | 0 |  |  |  |  |  | my $sASIN = q{}; | 
| 342 | 0 |  |  |  |  |  | my $sName = q{}; | 
| 343 | 0 |  |  |  |  |  | my $sTitle = q{}; | 
| 344 | 0 |  |  |  |  |  | my @aoA = $oSPAN->look_down(_tag => 'a'); | 
| 345 | 0 |  |  |  |  |  | DEBUG_HTML && warn sprintf(" DDD _extract(): oSPAN contains %d  tags\n", scalar(@aoA)); | 
| 346 |  |  |  |  |  |  | A_TAG: | 
| 347 | 0 |  |  |  |  |  | foreach my $oA (@aoA) | 
| 348 |  |  |  |  |  |  | { | 
| 349 | 0 | 0 |  |  |  |  | next A_TAG if ! ref $oA; | 
| 350 | 0 |  |  |  |  |  | my $sA = $oA->as_HTML; | 
| 351 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract(): try A\n"; | 
| 352 | 0 |  |  |  |  |  | if (9 < DEBUG_HTML) | 
| 353 |  |  |  |  |  |  | { | 
| 354 |  |  |  |  |  |  | warn " DDD ==$sA==\n"; | 
| 355 |  |  |  |  |  |  | } # if | 
| 356 | 0 |  | 0 |  |  |  | $sTitle = $oA->attr('title') || $oA->as_text; | 
| 357 |  |  |  |  |  |  | # Strip leading whitespace: | 
| 358 | 0 |  |  |  |  |  | $sTitle =~ s!\A\s+!!; | 
| 359 |  |  |  |  |  |  | # Strip trailing whitespace: | 
| 360 | 0 |  |  |  |  |  | $sTitle =~ s!\s+\Z!!; | 
| 361 |  |  |  |  |  |  | # Ignore empty (image-only) tags: | 
| 362 | 0 | 0 |  |  |  |  | next A_TAG if ($sTitle !~ m/\S/); | 
| 363 |  |  |  |  |  |  | # Strip out zero-width spaces scattered about randomly in item titles | 
| 364 | 0 |  |  |  |  |  | $sTitle =~ s/\x{200b}//g; | 
| 365 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract(): found item named '$sTitle'\n"; | 
| 366 | 0 | 0 |  |  |  |  | next A_TAG if ($sTitle eq 'Universal Wish List Button'); | 
| 367 | 0 | 0 |  |  |  |  | next A_TAG if ($sTitle eq 'Buying this gift elsewhere?'); | 
| 368 | 0 |  |  |  |  |  | my $sURL = $oA->attr('href'); | 
| 369 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract(): URL ==$sURL==\n"; | 
| 370 | 0 | 0 | 0 |  |  |  | if ( | 
|  |  |  | 0 |  |  |  |  | 
| 371 |  |  |  |  |  |  | ($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!) | 
| 372 |  |  |  |  |  |  | || | 
| 373 |  |  |  |  |  |  | ($sURL =~ m!/gp/product/(.+?)/ref!) | 
| 374 |  |  |  |  |  |  | || | 
| 375 |  |  |  |  |  |  | ($sURL =~ m!/dp/(.+?)/(_encoding|ref)!) | 
| 376 |  |  |  |  |  |  | ) | 
| 377 |  |  |  |  |  |  | { | 
| 378 |  |  |  |  |  |  | # It's a match! | 
| 379 | 0 |  |  |  |  |  | $sASIN = $1; | 
| 380 | 0 |  |  |  |  |  | last A_TAG; | 
| 381 |  |  |  |  |  |  | } # if | 
| 382 |  |  |  |  |  |  | else | 
| 383 |  |  |  |  |  |  | { | 
| 384 | 0 |  |  |  |  |  | DEBUG_HTML && warn " EEE   url does not contain asin\n"; | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  | } # foreach A_TAG | 
| 387 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract(): ASIN ==$sASIN==\n"; | 
| 388 | 0 | 0 |  |  |  |  | if ($sASIN eq q{}) | 
| 389 |  |  |  |  |  |  | { | 
| 390 | 0 |  |  |  |  |  | next SPAN_TAG; | 
| 391 |  |  |  |  |  |  | } # if | 
| 392 |  |  |  |  |  |  | # Grab the smallest-containing ancestor of this item: | 
| 393 | 0 | 0 |  |  |  |  | my $oParent = $iUK | 
| 394 |  |  |  |  |  |  | ? $oSPAN->look_up(_tag => 'tbody', | 
| 395 |  |  |  |  |  |  | class => 'itemWrapper', | 
| 396 |  |  |  |  |  |  | ) | 
| 397 |  |  |  |  |  |  | : $oSPAN; | 
| 398 | 0 |  |  |  |  |  | $oParent = $oSPAN; | 
| 399 | 0 | 0 |  |  |  |  | if (! ref $oParent) | 
| 400 |  |  |  |  |  |  | { | 
| 401 | 0 |  |  |  |  |  | DEBUG_HTML && warn " WWW did not find ancestor TBODY\n"; | 
| 402 | 0 |  |  |  |  |  | next SPAN_TAG; | 
| 403 |  |  |  |  |  |  | } # if | 
| 404 | 0 |  |  |  |  |  | my $sParentHTML = $oParent->as_HTML; | 
| 405 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract(): parent HTML ==$sParentHTML==\n"; | 
| 406 | 0 |  |  |  |  |  | my $sParent = $oParent->as_text; | 
| 407 |  |  |  |  |  |  | # Manual text clean-up: | 
| 408 | 0 |  |  |  |  |  | $sParent =~ s/(DESIRED|RECEIVED|PRIORITY)/;  $1: /g; | 
| 409 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract(): parent text ==$sParent==\n"; | 
| 410 | 0 |  |  |  |  |  | my $iDesired = _match_desired($sParent); | 
| 411 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     desired set to =$iDesired=\n"; | 
| 412 | 0 |  |  |  |  |  | my $sPriority = _match_priority($sParent); | 
| 413 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     priority set to =$sPriority=\n"; | 
| 414 | 0 |  |  |  |  |  | my @aoTDtiny = $oParent->look_down(_tag => 'td', | 
| 415 |  |  |  |  |  |  | class => 'tiny', | 
| 416 |  |  |  |  |  |  | ); | 
| 417 |  |  |  |  |  |  | QUANT_TAG: | 
| 418 | 0 |  |  |  |  |  | foreach my $oSPAN (@aoTDtiny) | 
| 419 |  |  |  |  |  |  | { | 
| 420 | 0 | 0 |  |  |  |  | next QUANT_TAG unless ref $oSPAN; | 
| 421 | 0 |  |  |  |  |  | my $sSpan = $oSPAN->as_text; | 
| 422 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():   TDtiny=$sSpan=\n"; | 
| 423 | 0 |  | 0 |  |  |  | $sPriority ||= _match_priority($sSpan); | 
| 424 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     priority set to =$sPriority=\n"; | 
| 425 | 0 |  | 0 |  |  |  | $iDesired ||= _match_desired($sSpan); | 
| 426 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     desired set to =$iDesired=\n"; | 
| 427 |  |  |  |  |  |  | } # foreach QUANT_TAG | 
| 428 | 0 | 0 | 0 |  |  |  | if (! $iDesired || ! $sPriority) | 
| 429 |  |  |  |  |  |  | { | 
| 430 |  |  |  |  |  |  | # See if they are encoded in a FORM: | 
| 431 |  |  |  |  |  |  | # Find the priority: | 
| 432 | 0 | 0 |  |  |  |  | if ($sParentHTML =~ m! | 
| 433 |  |  |  |  |  |  | { | 
| 434 | 0 |  |  |  |  |  | $sPriority = $1; | 
| 435 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     priority set to =$sPriority=\n"; | 
| 436 |  |  |  |  |  |  | } # if | 
| 437 |  |  |  |  |  |  | else | 
| 438 |  |  |  |  |  |  | { | 
| 439 | 0 |  |  |  |  |  | DEBUG_HTML && warn " WWW   did not find | 
| 440 |  |  |  |  |  |  | } | 
| 441 |  |  |  |  |  |  | # Find the quantity desired: | 
| 442 | 0 | 0 |  |  |  |  | if ($sParentHTML =~ m!!) | 
| 443 |  |  |  |  |  |  | { | 
| 444 | 0 |  |  |  |  |  | $iDesired = $1; | 
| 445 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     desired set to =$iDesired=\n"; | 
| 446 |  |  |  |  |  |  | } # if | 
| 447 |  |  |  |  |  |  | else | 
| 448 |  |  |  |  |  |  | { | 
| 449 | 0 |  |  |  |  |  | DEBUG_HTML && warn " WWW   did not find  for desired-quantity\n"; | 
| 450 |  |  |  |  |  |  | } | 
| 451 |  |  |  |  |  |  | } # if | 
| 452 |  |  |  |  |  |  | # Put in default values if we never found them: | 
| 453 | 0 |  | 0 |  |  |  | $sPriority ||= 'medium'; | 
| 454 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():     priority set to =$sPriority=\n"; | 
| 455 | 0 |  | 0 |  |  |  | $iDesired ||= 1; | 
| 456 |  |  |  |  |  |  | # Find the date added: | 
| 457 | 0 |  |  |  |  |  | my $sDate = ''; | 
| 458 | 0 | 0 |  |  |  |  | if ($sParentHTML =~ m!>added\s+(.+?) | 
| 459 |  |  |  |  |  |  | { | 
| 460 | 0 |  |  |  |  |  | $sDate = $1; | 
| 461 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():   date=$sDate=\n"; | 
| 462 |  |  |  |  |  |  | } # if | 
| 463 |  |  |  |  |  |  | else | 
| 464 |  |  |  |  |  |  | { | 
| 465 | 0 |  |  |  |  |  | DEBUG_HTML && warn " WWW   did not find text for date-added\n"; | 
| 466 |  |  |  |  |  |  | } | 
| 467 |  |  |  |  |  |  |  | 
| 468 |  |  |  |  |  |  | # Find the "author" of this item: | 
| 469 | 0 |  |  |  |  |  | my @aoTDauthor; | 
| 470 | 0 | 0 |  |  |  |  | if ($iUK) | 
| 471 |  |  |  |  |  |  | { | 
| 472 | 0 |  |  |  |  |  | @aoTDauthor = $oParent->look_down(_tag => 'td', | 
| 473 |  |  |  |  |  |  | class => 'small', | 
| 474 |  |  |  |  |  |  | ); | 
| 475 |  |  |  |  |  |  | } | 
| 476 |  |  |  |  |  |  | else | 
| 477 |  |  |  |  |  |  | { | 
| 478 |  |  |  |  |  |  | @aoTDauthor = $oParent->look_down(_tag => 'span', | 
| 479 |  |  |  |  |  |  | sub | 
| 480 |  |  |  |  |  |  | { | 
| 481 | 0 |  |  | 0 |  |  | my $sHtml = $_[0]->as_HTML; | 
| 482 |  |  |  |  |  |  | # DEBUG_HTML && warn " DDD _extract():   try oTDauthor span==$sHtml==\n"; | 
| 483 | 0 |  | 0 |  |  |  | my $s = $_[0]->attr('class') || q{}; | 
| 484 | 0 |  |  |  |  |  | $s =~ m'BYLINE'i; | 
| 485 |  |  |  |  |  |  | }, | 
| 486 | 0 |  |  |  |  |  | ); | 
| 487 |  |  |  |  |  |  | } # else | 
| 488 | 0 |  |  |  |  |  | my $sAuthor = ''; | 
| 489 |  |  |  |  |  |  | AUTHOR_TAG: | 
| 490 | 0 |  |  |  |  |  | foreach my $oTD (@aoTDauthor) | 
| 491 |  |  |  |  |  |  | { | 
| 492 | 0 | 0 |  |  |  |  | next AUTHOR_TAG unless ref $oTD; | 
| 493 | 0 |  |  |  |  |  | my $s = $oTD->as_HTML; | 
| 494 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():   try oTDauthor==$s==\n"; | 
| 495 | 0 |  |  |  |  |  | $s = $oTD->as_text; | 
| 496 | 0 | 0 |  |  |  |  | if ($s =~ s!\A\s*(by|~)\s+!!) | 
| 497 |  |  |  |  |  |  | { | 
| 498 | 0 |  |  |  |  |  | $sAuthor = $s; | 
| 499 | 0 |  |  |  |  |  | last AUTHOR_TAG; | 
| 500 |  |  |  |  |  |  | } # if | 
| 501 |  |  |  |  |  |  | } # foreach AUTHOR_TAG | 
| 502 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():   author=$sAuthor=\n"; | 
| 503 |  |  |  |  |  |  | # Find the price of this item: | 
| 504 | 0 |  |  |  |  |  | my $sPrice = ''; | 
| 505 |  |  |  |  |  |  | my $oTDprice = $oParent->look_down(_tag => 'span', | 
| 506 |  |  |  |  |  |  | sub | 
| 507 |  |  |  |  |  |  | { | 
| 508 | 0 |  | 0 | 0 |  |  | my $s = $_[0]->attr('class') || q{}; | 
| 509 | 0 |  |  |  |  |  | $s =~ m'PRICE'i; | 
| 510 |  |  |  |  |  |  | }, | 
| 511 | 0 |  |  |  |  |  | ); | 
| 512 | 0 | 0 |  |  |  |  | if (! ref $oTDprice) | 
| 513 |  |  |  |  |  |  | { | 
| 514 | 0 |  |  |  |  |  | DEBUG_HTML && warn " WWW did not find TD for price\n"; | 
| 515 |  |  |  |  |  |  | # warn $oParent->as_HTML; | 
| 516 |  |  |  |  |  |  | # exit 88; | 
| 517 |  |  |  |  |  |  | # next SPAN_TAG; | 
| 518 |  |  |  |  |  |  | } # if | 
| 519 |  |  |  |  |  |  | else | 
| 520 |  |  |  |  |  |  | { | 
| 521 | 0 |  |  |  |  |  | $sPrice = $oTDprice->as_text; | 
| 522 | 0 | 0 |  |  |  |  | if ($sPrice =~ m!Price:\s+(.+)\Z!) | 
| 523 |  |  |  |  |  |  | { | 
| 524 | 0 |  |  |  |  |  | $sPrice = $1; | 
| 525 |  |  |  |  |  |  | } # if | 
| 526 | 0 |  |  |  |  |  | $sPrice =~ s!\A\s+!!; | 
| 527 | 0 |  |  |  |  |  | $sPrice =~ s!\s+\Z!!; | 
| 528 | 0 |  |  |  |  |  | DEBUG_HTML && warn " DDD _extract():   price=$sPrice=\n"; | 
| 529 |  |  |  |  |  |  | } # else | 
| 530 |  |  |  |  |  |  | # Add this item to the result set: | 
| 531 | 0 |  |  |  |  |  | my %hsItem = ( | 
| 532 |  |  |  |  |  |  | asin => $sASIN, | 
| 533 |  |  |  |  |  |  | author => $sAuthor, | 
| 534 |  |  |  |  |  |  | # image => $sImageURL, | 
| 535 |  |  |  |  |  |  | price => $sPrice, | 
| 536 |  |  |  |  |  |  | priority => $sPriority, | 
| 537 |  |  |  |  |  |  | quantity => $iDesired, | 
| 538 |  |  |  |  |  |  | title => $sTitle, | 
| 539 |  |  |  |  |  |  | # type => $sType, | 
| 540 |  |  |  |  |  |  | ); | 
| 541 | 0 |  |  |  |  |  | DEBUG_HTML && warn Dumper(\%hsItem); | 
| 542 |  |  |  |  |  |  | # warn " DDD   _extract() added one item to \$rh->{items}\n"; | 
| 543 | 0 |  |  |  |  |  | push @{$rh->{items}}, \%hsItem; | 
|  | 0 |  |  |  |  |  |  | 
| 544 |  |  |  |  |  |  | # All done with this item: | 
| 545 | 0 |  |  |  |  |  | $oParent->detach; | 
| 546 | 0 |  |  |  |  |  | $oParent->delete; | 
| 547 |  |  |  |  |  |  | } # foreach SPAN_TAG | 
| 548 |  |  |  |  |  |  | # Look for the next-page link: | 
| 549 |  |  |  |  |  |  | my @aoA = $oTree->look_down(_tag => 'a', | 
| 550 |  |  |  |  |  |  | role => 'link', | 
| 551 |  |  |  |  |  |  | sub { | 
| 552 | 0 | 0 |  | 0 |  |  | return 0 if (length($_[0]->attr('href')) < 55); | 
| 553 |  |  |  |  |  |  | # my $s = $_[0]->as_text || q{}; | 
| 554 |  |  |  |  |  |  | # DEBUG_NEXT && warn " DDD _extract():   try next  ==$s==\n"; | 
| 555 |  |  |  |  |  |  | # $s =~ m/\A\s*(NEXT|SEE\s+MORE)\s*\z/i; | 
| 556 | 0 |  |  |  |  |  | my $s = $_[0]->attr('class'); | 
| 557 | 0 |  |  |  |  |  | DEBUG_NEXT && warn " DDD _extract():   try next  ==$s==\n"; | 
| 558 | 0 |  |  |  |  |  | $s =~ m/wl-see-more/ | 
| 559 |  |  |  |  |  |  | }, | 
| 560 | 0 |  |  |  |  |  | ); | 
| 561 | 0 |  |  |  |  |  | my $iCountA = scalar(@aoA); | 
| 562 | 0 |  |  |  |  |  | DEBUG_NEXT && warn " DDD _extract():   found $iCountA  tags that match 'next'\n"; | 
| 563 | 0 |  |  |  |  |  | my $oA = shift @aoA; | 
| 564 | 0 | 0 |  |  |  |  | if (ref $oA) | 
| 565 |  |  |  |  |  |  | { | 
| 566 | 0 |  |  |  |  |  | $rh->{next} = $oA->attr('href'); | 
| 567 | 0 |  |  |  |  |  | DEBUG_NEXT && warn " DDD _extract(): raw next URL is ==$rh->{next}==\n"; | 
| 568 |  |  |  |  |  |  | } # if | 
| 569 |  |  |  |  |  |  | else | 
| 570 |  |  |  |  |  |  | { | 
| 571 | 0 |  |  |  |  |  | DEBUG_NEXT && warn " DDD _extract(): did not find next URL\n"; | 
| 572 |  |  |  |  |  |  | } | 
| 573 | 0 |  |  |  |  |  | return $rh; | 
| 574 |  |  |  |  |  |  | } # _extract | 
| 575 |  |  |  |  |  |  |  | 
| 576 |  |  |  |  |  |  | sub _match_priority { | 
| 577 | 0 |  | 0 | 0 |  |  | my $s = shift || return; | 
| 578 | 0 | 0 |  |  |  |  | if ($s =~ m'PRIORITY:?\s*(\w+?)(\s|\z)'i) | 
| 579 |  |  |  |  |  |  | { | 
| 580 | 0 |  |  |  |  |  | return lc $1; | 
| 581 |  |  |  |  |  |  | } # if | 
| 582 | 0 |  |  |  |  |  | return; | 
| 583 |  |  |  |  |  |  | } # _match_priority | 
| 584 |  |  |  |  |  |  |  | 
| 585 |  |  |  |  |  |  | sub _match_desired { | 
| 586 | 0 |  | 0 | 0 |  |  | my $s = shift || return; | 
| 587 | 0 | 0 |  |  |  |  | if ($s =~ m'(?:DESIRED|WANTS):?\s*(\d+)'i) | 
| 588 |  |  |  |  |  |  | { | 
| 589 | 0 |  |  |  |  |  | return lc $1; | 
| 590 |  |  |  |  |  |  | } # if | 
| 591 | 0 |  |  |  |  |  | return; | 
| 592 |  |  |  |  |  |  | } # _match_desired | 
| 593 |  |  |  |  |  |  |  | 
| 594 |  |  |  |  |  |  | 1; | 
| 595 |  |  |  |  |  |  |  | 
| 596 |  |  |  |  |  |  | __END__ |