\r\n]+)(\s+<)/$1>\n$2/g;
179
|
|
|
|
|
|
|
# There seems to be a bug in HTML::TreeBuilder that causes
|
180
|
|
|
|
|
|
|
# abutting tags to be skpped!?!
|
181
|
0
|
|
|
|
|
0
|
$content =~ s!>
|
182
|
0
|
0
|
|
|
|
0
|
if (9 < $test)
|
183
|
|
|
|
|
|
|
{
|
184
|
0
|
|
|
|
|
0
|
eval "use File::Slurp";
|
185
|
0
|
|
|
|
|
0
|
write_file(qq'Pages/fetched-$domain.html', $content);
|
186
|
0
|
|
|
|
|
0
|
exit 88;
|
187
|
|
|
|
|
|
|
} # if
|
188
|
0
|
|
|
|
|
0
|
my $iLen = length($content);
|
189
|
|
|
|
|
|
|
# warn " DDD fetched $iLen bytes.\n";
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# UPDATED 2014-11. Both USA and UK sites use the same page
|
192
|
|
|
|
|
|
|
# format, therefore we always pass COM to the _extract() method:
|
193
|
0
|
|
|
|
|
0
|
my $result = _extract(COM, $content, $test);
|
194
|
|
|
|
|
|
|
# print Dumper($result);
|
195
|
|
|
|
|
|
|
# exit 88;
|
196
|
0
|
0
|
|
|
|
0
|
if (! defined $result)
|
197
|
|
|
|
|
|
|
{
|
198
|
0
|
|
|
|
|
0
|
DEBUG && warn " WWW _extract() returned nothing\n";
|
199
|
0
|
|
|
|
|
0
|
last INFINITE;
|
200
|
|
|
|
|
|
|
} # if
|
201
|
0
|
0
|
|
|
|
0
|
if (! ref $result->{items})
|
202
|
|
|
|
|
|
|
{
|
203
|
|
|
|
|
|
|
# Probably an empty wish list
|
204
|
0
|
|
|
|
|
0
|
DEBUG && warn " WWW _extract() returned no items\n";
|
205
|
0
|
|
|
|
|
0
|
last INFINITE;
|
206
|
|
|
|
|
|
|
} # if
|
207
|
|
|
|
|
|
|
# Clean up the parsed items and add them to our local @items
|
208
|
|
|
|
|
|
|
# array:
|
209
|
|
|
|
|
|
|
ITEM:
|
210
|
0
|
|
|
|
|
0
|
foreach my $item (@{$result->{items}})
|
|
0
|
|
|
|
|
0
|
|
211
|
|
|
|
|
|
|
{
|
212
|
0
|
|
|
|
|
0
|
$item->{'author'} =~ s!\n!!g;
|
213
|
0
|
|
|
|
|
0
|
$item->{'author'} =~ s!^\s*by\s+!!g;
|
214
|
0
|
|
|
|
|
0
|
$item->{'author'} =~ s! \n*!!s;
|
215
|
0
|
0
|
|
|
|
0
|
$item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s*\s*(\d+)!i);
|
216
|
0
|
0
|
|
|
|
0
|
$item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s*\s*(\d)!i);
|
217
|
0
|
0
|
0
|
|
|
0
|
if (
|
|
|
|
0
|
|
|
|
|
218
|
|
|
|
|
|
|
$uk
|
219
|
|
|
|
|
|
|
&&
|
220
|
|
|
|
|
|
|
$item->{image}
|
221
|
|
|
|
|
|
|
&&
|
222
|
|
|
|
|
|
|
($item->{image} !~ m!^http:!)
|
223
|
|
|
|
|
|
|
)
|
224
|
|
|
|
|
|
|
{
|
225
|
0
|
|
|
|
|
0
|
$item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
|
226
|
|
|
|
|
|
|
} # if
|
227
|
0
|
|
|
|
|
0
|
push @items, $item;
|
228
|
0
|
|
|
|
|
0
|
DEBUG_HTML && warn " DDD added one item to \@items\n";
|
229
|
|
|
|
|
|
|
} # foreach ITEM
|
230
|
0
|
|
|
|
|
0
|
my $sURLNext = $result->{next};
|
231
|
0
|
|
|
|
|
0
|
my $iNext = 0;
|
232
|
0
|
0
|
|
|
|
0
|
if (! defined $sURLNext)
|
233
|
|
|
|
|
|
|
{
|
234
|
|
|
|
|
|
|
# DEBUG_NEXT && warn " DDD no 'next' URL, content===$content===\n";
|
235
|
|
|
|
|
|
|
# exit 88;
|
236
|
|
|
|
|
|
|
# Use brute force to find it:
|
237
|
0
|
0
|
|
|
|
0
|
if ($content =~ m!([;&]page=\d+)">\s*(<[^>]+>)?\s*Next!)
|
238
|
|
|
|
|
|
|
{
|
239
|
0
|
|
|
|
|
0
|
DEBUG_NEXT && warn " DDD found next URL with brute force\n";
|
240
|
0
|
|
|
|
|
0
|
$sURLNext = $1;
|
241
|
|
|
|
|
|
|
} # if
|
242
|
|
|
|
|
|
|
} # if
|
243
|
|
|
|
|
|
|
# Paranoia:
|
244
|
0
|
0
|
|
|
|
0
|
if ( ! defined $sURLNext)
|
245
|
|
|
|
|
|
|
{
|
246
|
0
|
|
|
|
|
0
|
DEBUG_NEXT && warn " WWW did not find next url\n";
|
247
|
0
|
|
|
|
|
0
|
DEBUG_NEXT && write_file(qq'Pages/no-next.html', $content);
|
248
|
0
|
|
|
|
|
0
|
last INFINITE;
|
249
|
|
|
|
|
|
|
} # if
|
250
|
0
|
0
|
|
|
|
0
|
if ($sURLNext !~ m/[;&]page=(\d+)/)
|
251
|
|
|
|
|
|
|
{
|
252
|
0
|
|
|
|
|
0
|
DEBUG_NEXT && warn " WWW next url =$sURLNext= does not contain page#\n";
|
253
|
0
|
|
|
|
|
0
|
last INFINITE;
|
254
|
|
|
|
|
|
|
} # if
|
255
|
0
|
|
|
|
|
0
|
$iNext = $1;
|
256
|
|
|
|
|
|
|
# More paranoia:
|
257
|
0
|
0
|
|
|
|
0
|
if ($iNext <= $iPage)
|
258
|
|
|
|
|
|
|
{
|
259
|
0
|
|
|
|
|
0
|
DEBUG_NEXT && warn " WWW next url page=$iNext is not greater than current page=$iPage\n";
|
260
|
0
|
|
|
|
|
0
|
last INFINITE;
|
261
|
|
|
|
|
|
|
} # if
|
262
|
|
|
|
|
|
|
# ...and update:
|
263
|
0
|
|
|
|
|
0
|
$iPage = $iNext;
|
264
|
|
|
|
|
|
|
} # while INFINITE
|
265
|
0
|
|
|
|
|
0
|
return @items;
|
266
|
|
|
|
|
|
|
} # get_list
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub _fetch_page {
|
269
|
1
|
|
|
1
|
|
3
|
my ($url, $domain) = @_;
|
270
|
1
|
|
|
|
|
3
|
if (0)
|
271
|
|
|
|
|
|
|
{
|
272
|
|
|
|
|
|
|
eval "use File::Slurp";
|
273
|
|
|
|
|
|
|
# For debugging UK site:
|
274
|
|
|
|
|
|
|
return read_file('Pages/uk-2008-12-page1.html');
|
275
|
|
|
|
|
|
|
# For debugging USA site:
|
276
|
|
|
|
|
|
|
return read_file('Pages/2008-12.html');
|
277
|
|
|
|
|
|
|
} # if 0
|
278
|
|
|
|
|
|
|
# Set up the UA:
|
279
|
1
|
|
|
|
|
13
|
my $ua = new LWP::UserAgent(
|
280
|
|
|
|
|
|
|
keep_alive => 1,
|
281
|
|
|
|
|
|
|
timeout => 30,
|
282
|
|
|
|
|
|
|
agent => $USER_AGENT,
|
283
|
|
|
|
|
|
|
);
|
284
|
|
|
|
|
|
|
# Setting it in the 'new' seems not to work sometimes
|
285
|
1
|
|
|
|
|
3593
|
$ua->agent($USER_AGENT);
|
286
|
|
|
|
|
|
|
# For some reason, this makes stuff work:
|
287
|
|
|
|
|
|
|
# $ua->max_redirect( 0 );
|
288
|
|
|
|
|
|
|
# Make a full set of headers:
|
289
|
1
|
|
|
|
|
80
|
my $h = new HTTP::Headers(
|
290
|
|
|
|
|
|
|
'Host' => "www.amazon.$domain",
|
291
|
|
|
|
|
|
|
'Referer' => $url,
|
292
|
|
|
|
|
|
|
'User-Agent' => $USER_AGENT,
|
293
|
|
|
|
|
|
|
'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1',
|
294
|
|
|
|
|
|
|
'Accept-Language' => 'en-us,en;q=0.5',
|
295
|
|
|
|
|
|
|
'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
296
|
|
|
|
|
|
|
#'Accept-Encoding' => 'gzip,deflate',
|
297
|
|
|
|
|
|
|
'Keep-Alive' => '300',
|
298
|
|
|
|
|
|
|
'Connection' => 'keep-alive',
|
299
|
|
|
|
|
|
|
);
|
300
|
1
|
|
|
|
|
266
|
$h->referer("$url");
|
301
|
1
|
|
|
|
|
41
|
my $request = HTTP::Request->new ( 'GET', $url, $h );
|
302
|
1
|
|
|
|
|
6255
|
my $response;
|
303
|
1
|
|
|
|
|
2
|
my $times = 0;
|
304
|
|
|
|
|
|
|
# LWP should be able to do this but seemingly fails sometimes
|
305
|
1
|
|
|
|
|
6
|
while ($times++<3)
|
306
|
|
|
|
|
|
|
{
|
307
|
3
|
|
|
|
|
47
|
$response = $ua->request($request);
|
308
|
3
|
50
|
|
|
|
3822323
|
last if $response->is_success;
|
309
|
3
|
50
|
|
|
|
55
|
if ($response->is_redirect)
|
310
|
|
|
|
|
|
|
{
|
311
|
0
|
|
|
|
|
0
|
$url = $response->header("Location");
|
312
|
|
|
|
|
|
|
#$h->header("Referer", $url);
|
313
|
0
|
|
|
|
|
0
|
$h->referer("$url");
|
314
|
0
|
|
|
|
|
0
|
$request = HTTP::Request->new ( 'GET', $url, $h );
|
315
|
|
|
|
|
|
|
} # if
|
316
|
|
|
|
|
|
|
} # while
|
317
|
1
|
50
|
|
|
|
16
|
if (!$response->is_success)
|
318
|
|
|
|
|
|
|
{
|
319
|
1
|
|
|
|
|
527
|
croak "Failed to retrieve $url";
|
320
|
0
|
|
|
|
|
|
return undef;
|
321
|
|
|
|
|
|
|
} # if
|
322
|
0
|
|
|
|
|
|
my $s = $response->content;
|
323
|
|
|
|
|
|
|
# Clean the CRAP off the page:
|
324
|
0
|
|
|
|
|
|
$s =~ s!!!gs;
|
325
|
0
|
|
|
|
|
|
return $s;
|
326
|
|
|
|
|
|
|
} # _fetch_page
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# This is the HTML parsing version written by Martin Thurn:
|
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
sub _extract {
|
331
|
|
|
|
|
|
|
# Required arg1 = whether we are parsing the UK site or not (Boolean):
|
332
|
0
|
|
0
|
0
|
|
|
my $iUK = shift || 0;
|
333
|
|
|
|
|
|
|
# Required arg2 = the HTML contents of the webpage:
|
334
|
0
|
|
0
|
|
|
|
my $s = shift || '';
|
335
|
|
|
|
|
|
|
# Optional arg = debugging level:
|
336
|
0
|
|
0
|
|
|
|
my $iDebug = shift || 0;
|
337
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD start _extract()\n";
|
338
|
0
|
|
|
|
|
|
my $rh = {};
|
339
|
0
|
|
|
|
|
|
my $oTree = new HTML::TreeBuilder;
|
340
|
0
|
|
|
|
|
|
$oTree->parse($s);
|
341
|
0
|
|
|
|
|
|
$oTree->eof;
|
342
|
0
|
|
|
|
|
|
my $sTag = q/div/;
|
343
|
0
|
|
|
|
|
|
my $sClass = q/a-text-left a-fixed-left-grid-col a-col-right/;
|
344
|
0
|
0
|
|
|
|
|
my @aoSPAN = $iUK ? $oTree->look_down(_tag => $sTag,
|
345
|
|
|
|
|
|
|
class => 'a-text-left a-fixed-left-grid-col a-col-right',
|
346
|
|
|
|
|
|
|
# class => 'lineItemGroup',
|
347
|
|
|
|
|
|
|
)
|
348
|
|
|
|
|
|
|
: $oTree->look_down(_tag => $sTag,
|
349
|
|
|
|
|
|
|
class => $sClass,
|
350
|
|
|
|
|
|
|
);
|
351
|
0
|
|
|
|
|
|
my $iCountSPAN = scalar(@aoSPAN);
|
352
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract() found $iCountSPAN $sTag tags of class '$sClass'\n";
|
353
|
|
|
|
|
|
|
SPAN_TAG:
|
354
|
0
|
|
|
|
|
|
foreach my $oSPAN (@aoSPAN)
|
355
|
|
|
|
|
|
|
{
|
356
|
0
|
0
|
|
|
|
|
next SPAN_TAG unless ref $oSPAN;
|
357
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract() found toplevel item tagset\n";
|
358
|
0
|
|
|
|
|
|
if (9 < DEBUG_HTML)
|
359
|
|
|
|
|
|
|
{
|
360
|
|
|
|
|
|
|
my $s = $oSPAN->as_HTML;
|
361
|
|
|
|
|
|
|
warn " DDD ==$s==\n";
|
362
|
|
|
|
|
|
|
} # if
|
363
|
0
|
|
|
|
|
|
my $sASIN = q{};
|
364
|
0
|
|
|
|
|
|
my $sName = q{};
|
365
|
0
|
|
|
|
|
|
my $sTitle = q{};
|
366
|
0
|
|
|
|
|
|
my @aoA = $oSPAN->look_down(_tag => 'a');
|
367
|
0
|
|
|
|
|
|
DEBUG_HTML && warn sprintf(" DDD _extract(): oSPAN contains %d tags\n", scalar(@aoA));
|
368
|
|
|
|
|
|
|
A_TAG:
|
369
|
0
|
|
|
|
|
|
foreach my $oA (@aoA)
|
370
|
|
|
|
|
|
|
{
|
371
|
0
|
0
|
|
|
|
|
next A_TAG if ! ref $oA;
|
372
|
0
|
|
|
|
|
|
my $sA = $oA->as_HTML;
|
373
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): try A\n";
|
374
|
0
|
|
|
|
|
|
if (9 < DEBUG_HTML)
|
375
|
|
|
|
|
|
|
{
|
376
|
|
|
|
|
|
|
warn " DDD ==$sA==\n";
|
377
|
|
|
|
|
|
|
} # if
|
378
|
0
|
|
0
|
|
|
|
$sTitle = $oA->attr('title') || $oA->as_text;
|
379
|
|
|
|
|
|
|
# Strip leading whitespace:
|
380
|
0
|
|
|
|
|
|
$sTitle =~ s!\A\s+!!;
|
381
|
|
|
|
|
|
|
# Strip trailing whitespace:
|
382
|
0
|
|
|
|
|
|
$sTitle =~ s!\s+\Z!!;
|
383
|
|
|
|
|
|
|
# Ignore empty (image-only) tags:
|
384
|
0
|
0
|
|
|
|
|
next A_TAG if ($sTitle !~ m/\S/);
|
385
|
|
|
|
|
|
|
# Strip out zero-width spaces scattered about randomly in item titles
|
386
|
0
|
|
|
|
|
|
$sTitle =~ s/\x{200b}//g;
|
387
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): found item named '$sTitle'\n";
|
388
|
0
|
0
|
|
|
|
|
next A_TAG if ($sTitle eq 'Universal Wish List Button');
|
389
|
0
|
0
|
|
|
|
|
next A_TAG if ($sTitle eq 'Buying this gift elsewhere?');
|
390
|
0
|
|
|
|
|
|
my $sURL = $oA->attr('href');
|
391
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): URL ==$sURL==\n";
|
392
|
0
|
0
|
0
|
|
|
|
if (
|
|
|
|
0
|
|
|
|
|
393
|
|
|
|
|
|
|
($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
|
394
|
|
|
|
|
|
|
||
|
395
|
|
|
|
|
|
|
($sURL =~ m!/gp/product/(.+?)/ref!)
|
396
|
|
|
|
|
|
|
||
|
397
|
|
|
|
|
|
|
($sURL =~ m!/dp/(.+?)/ref!)
|
398
|
|
|
|
|
|
|
)
|
399
|
|
|
|
|
|
|
{
|
400
|
|
|
|
|
|
|
# It's a match!
|
401
|
0
|
|
|
|
|
|
$sASIN = $1;
|
402
|
0
|
|
|
|
|
|
last A_TAG;
|
403
|
|
|
|
|
|
|
} # if
|
404
|
|
|
|
|
|
|
else
|
405
|
|
|
|
|
|
|
{
|
406
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " EEE url does not contain asin\n";
|
407
|
|
|
|
|
|
|
}
|
408
|
|
|
|
|
|
|
} # foreach A_TAG
|
409
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): ASIN ==$sASIN==\n";
|
410
|
0
|
0
|
|
|
|
|
if ($sASIN eq q{})
|
411
|
|
|
|
|
|
|
{
|
412
|
0
|
|
|
|
|
|
next SPAN_TAG;
|
413
|
|
|
|
|
|
|
} # if
|
414
|
|
|
|
|
|
|
# Grab the smallest-containing ancestor of this item:
|
415
|
0
|
0
|
|
|
|
|
my $oParent = $iUK
|
416
|
|
|
|
|
|
|
? $oSPAN->look_up(_tag => 'tbody',
|
417
|
|
|
|
|
|
|
class => 'itemWrapper',
|
418
|
|
|
|
|
|
|
)
|
419
|
|
|
|
|
|
|
: $oSPAN;
|
420
|
0
|
0
|
|
|
|
|
if (! ref $oParent)
|
421
|
|
|
|
|
|
|
{
|
422
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " WWW did not find ancestor TBODY\n";
|
423
|
0
|
|
|
|
|
|
next SPAN_TAG;
|
424
|
|
|
|
|
|
|
} # if
|
425
|
0
|
|
|
|
|
|
my $sParentHTML = $oParent->as_HTML;
|
426
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): parent HTML ==$sParentHTML==\n";
|
427
|
0
|
|
|
|
|
|
my $sParent = $oParent->as_text;
|
428
|
|
|
|
|
|
|
# Manual text clean-up:
|
429
|
0
|
|
|
|
|
|
$sParent =~ s/(DESIRED|RECEIVED|PRIORITY)/; $1: /g;
|
430
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): parent text ==$sParent==\n";
|
431
|
0
|
|
|
|
|
|
my $iDesired = _match_desired($sParent);
|
432
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
|
433
|
0
|
|
|
|
|
|
my $sPriority = _match_priority($sParent);
|
434
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
|
435
|
0
|
|
|
|
|
|
my @aoTDtiny = $oParent->look_down(_tag => 'td',
|
436
|
|
|
|
|
|
|
class => 'tiny',
|
437
|
|
|
|
|
|
|
);
|
438
|
|
|
|
|
|
|
QUANT_TAG:
|
439
|
0
|
|
|
|
|
|
foreach my $oSPAN (@aoTDtiny)
|
440
|
|
|
|
|
|
|
{
|
441
|
0
|
0
|
|
|
|
|
next QUANT_TAG unless ref $oSPAN;
|
442
|
0
|
|
|
|
|
|
my $sSpan = $oSPAN->as_text;
|
443
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): TDtiny=$sSpan=\n";
|
444
|
0
|
|
0
|
|
|
|
$sPriority ||= _match_priority($sSpan);
|
445
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
|
446
|
0
|
|
0
|
|
|
|
$iDesired ||= _match_desired($sSpan);
|
447
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
|
448
|
|
|
|
|
|
|
} # foreach QUANT_TAG
|
449
|
0
|
0
|
0
|
|
|
|
if (! $iDesired || ! $sPriority)
|
450
|
|
|
|
|
|
|
{
|
451
|
|
|
|
|
|
|
# See if they are encoded in a FORM:
|
452
|
|
|
|
|
|
|
# Find the priority:
|
453
|
0
|
0
|
|
|
|
|
if ($sParentHTML =~ m! |
454
|
|
|
|
|
|
|
{
|
455
|
0
|
|
|
|
|
|
$sPriority = $1;
|
456
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
|
457
|
|
|
|
|
|
|
} # if
|
458
|
|
|
|
|
|
|
else
|
459
|
|
|
|
|
|
|
{
|
460
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " WWW did not find |
461
|
|
|
|
|
|
|
}
|
462
|
|
|
|
|
|
|
# Find the quantity desired:
|
463
|
0
|
0
|
|
|
|
|
if ($sParentHTML =~ m!!)
|
464
|
|
|
|
|
|
|
{
|
465
|
0
|
|
|
|
|
|
$iDesired = $1;
|
466
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
|
467
|
|
|
|
|
|
|
} # if
|
468
|
|
|
|
|
|
|
else
|
469
|
|
|
|
|
|
|
{
|
470
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " WWW did not find for desired-quantity\n";
|
471
|
|
|
|
|
|
|
}
|
472
|
|
|
|
|
|
|
} # if
|
473
|
|
|
|
|
|
|
# Put in default values if we never found them:
|
474
|
0
|
|
0
|
|
|
|
$sPriority ||= 'medium';
|
475
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
|
476
|
0
|
|
0
|
|
|
|
$iDesired ||= 1;
|
477
|
|
|
|
|
|
|
# Find the date added:
|
478
|
0
|
|
|
|
|
|
my $sDate = '';
|
479
|
0
|
0
|
|
|
|
|
if ($sParentHTML =~ m!>added\s+(.+?)
|
480
|
|
|
|
|
|
|
{
|
481
|
0
|
|
|
|
|
|
$sDate = $1;
|
482
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): date=$sDate=\n";
|
483
|
|
|
|
|
|
|
} # if
|
484
|
|
|
|
|
|
|
else
|
485
|
|
|
|
|
|
|
{
|
486
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " WWW did not find text for date-added\n";
|
487
|
|
|
|
|
|
|
}
|
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
# Find the "author" of this item:
|
490
|
0
|
|
|
|
|
|
my @aoTDauthor;
|
491
|
0
|
0
|
|
|
|
|
if ($iUK)
|
492
|
|
|
|
|
|
|
{
|
493
|
0
|
|
|
|
|
|
@aoTDauthor = $oParent->look_down(_tag => 'td',
|
494
|
|
|
|
|
|
|
class => 'small',
|
495
|
|
|
|
|
|
|
);
|
496
|
|
|
|
|
|
|
}
|
497
|
|
|
|
|
|
|
else
|
498
|
|
|
|
|
|
|
{
|
499
|
|
|
|
|
|
|
@aoTDauthor = $oParent->look_down(_tag => 'span',
|
500
|
|
|
|
|
|
|
sub
|
501
|
|
|
|
|
|
|
{
|
502
|
0
|
|
|
0
|
|
|
my $sHtml = $_[0]->as_HTML;
|
503
|
|
|
|
|
|
|
# DEBUG_HTML && warn " DDD _extract(): try oTDauthor span==$sHtml==\n";
|
504
|
0
|
|
0
|
|
|
|
my $s = $_[0]->attr('class') || q{};
|
505
|
0
|
|
|
|
|
|
$s =~ m'BYLINE'i;
|
506
|
|
|
|
|
|
|
},
|
507
|
0
|
|
|
|
|
|
);
|
508
|
|
|
|
|
|
|
} # else
|
509
|
0
|
|
|
|
|
|
my $sAuthor = '';
|
510
|
|
|
|
|
|
|
AUTHOR_TAG:
|
511
|
0
|
|
|
|
|
|
foreach my $oTD (@aoTDauthor)
|
512
|
|
|
|
|
|
|
{
|
513
|
0
|
0
|
|
|
|
|
next AUTHOR_TAG unless ref $oTD;
|
514
|
0
|
|
|
|
|
|
my $s = $oTD->as_HTML;
|
515
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): try oTDauthor==$s==\n";
|
516
|
0
|
|
|
|
|
|
$s = $oTD->as_text;
|
517
|
0
|
0
|
|
|
|
|
if ($s =~ s!\A\s*(by|~)\s+!!)
|
518
|
|
|
|
|
|
|
{
|
519
|
0
|
|
|
|
|
|
$sAuthor = $s;
|
520
|
0
|
|
|
|
|
|
last AUTHOR_TAG;
|
521
|
|
|
|
|
|
|
} # if
|
522
|
|
|
|
|
|
|
} # foreach AUTHOR_TAG
|
523
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): author=$sAuthor=\n";
|
524
|
|
|
|
|
|
|
# Find the price of this item:
|
525
|
0
|
|
|
|
|
|
my $sPrice = '';
|
526
|
|
|
|
|
|
|
my $oTDprice = $oParent->look_down(_tag => 'span',
|
527
|
|
|
|
|
|
|
sub
|
528
|
|
|
|
|
|
|
{
|
529
|
0
|
|
0
|
0
|
|
|
my $s = $_[0]->attr('class') || q{};
|
530
|
0
|
|
|
|
|
|
$s =~ m'PRICE'i;
|
531
|
|
|
|
|
|
|
},
|
532
|
0
|
|
|
|
|
|
);
|
533
|
0
|
0
|
|
|
|
|
if (! ref $oTDprice)
|
534
|
|
|
|
|
|
|
{
|
535
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " WWW did not find TD for price\n";
|
536
|
|
|
|
|
|
|
# warn $oParent->as_HTML;
|
537
|
|
|
|
|
|
|
# exit 88;
|
538
|
|
|
|
|
|
|
# next SPAN_TAG;
|
539
|
|
|
|
|
|
|
} # if
|
540
|
|
|
|
|
|
|
else
|
541
|
|
|
|
|
|
|
{
|
542
|
0
|
|
|
|
|
|
$sPrice = $oTDprice->as_text;
|
543
|
0
|
0
|
|
|
|
|
if ($sPrice =~ m!Price:\s+(.+)\Z!)
|
544
|
|
|
|
|
|
|
{
|
545
|
0
|
|
|
|
|
|
$sPrice = $1;
|
546
|
|
|
|
|
|
|
} # if
|
547
|
0
|
|
|
|
|
|
$sPrice =~ s!\A\s+!!;
|
548
|
0
|
|
|
|
|
|
$sPrice =~ s!\s+\Z!!;
|
549
|
0
|
|
|
|
|
|
DEBUG_HTML && warn " DDD _extract(): price=$sPrice=\n";
|
550
|
|
|
|
|
|
|
} # else
|
551
|
|
|
|
|
|
|
# Add this item to the result set:
|
552
|
0
|
|
|
|
|
|
my %hsItem = (
|
553
|
|
|
|
|
|
|
asin => $sASIN,
|
554
|
|
|
|
|
|
|
author => $sAuthor,
|
555
|
|
|
|
|
|
|
# image => $sImageURL,
|
556
|
|
|
|
|
|
|
price => $sPrice,
|
557
|
|
|
|
|
|
|
priority => $sPriority,
|
558
|
|
|
|
|
|
|
quantity => $iDesired,
|
559
|
|
|
|
|
|
|
title => $sTitle,
|
560
|
|
|
|
|
|
|
# type => $sType,
|
561
|
|
|
|
|
|
|
);
|
562
|
0
|
|
|
|
|
|
DEBUG_HTML && warn Dumper(\%hsItem);
|
563
|
|
|
|
|
|
|
# warn " DDD _extract() added one item to \$rh->{items}\n";
|
564
|
0
|
|
|
|
|
|
push @{$rh->{items}}, \%hsItem;
|
|
0
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# All done with this item:
|
566
|
0
|
|
|
|
|
|
$oParent->detach;
|
567
|
0
|
|
|
|
|
|
$oParent->delete;
|
568
|
|
|
|
|
|
|
} # foreach SPAN_TAG
|
569
|
|
|
|
|
|
|
# Look for the next-page link:
|
570
|
|
|
|
|
|
|
my @aoA = $oTree->look_down(_tag => 'a',
|
571
|
|
|
|
|
|
|
sub
|
572
|
|
|
|
|
|
|
{
|
573
|
0
|
0
|
|
0
|
|
|
return 0 if (length($_[0]->attr('href')) < 5);
|
574
|
0
|
|
0
|
|
|
|
my $s = $_[0]->as_text || q{};
|
575
|
|
|
|
|
|
|
# DEBUG_NEXT && warn " DDD _extract(): try next ==$s==\n";
|
576
|
0
|
|
|
|
|
|
$s =~ m/\A\s*NEXT/i;
|
577
|
|
|
|
|
|
|
},
|
578
|
0
|
|
|
|
|
|
);
|
579
|
0
|
|
|
|
|
|
my $iCountA = scalar(@aoA);
|
580
|
0
|
|
|
|
|
|
DEBUG_NEXT && warn " DDD _extract(): found $iCountA tags that match 'next'\n";
|
581
|
0
|
|
|
|
|
|
my $oA = shift @aoA;
|
582
|
0
|
0
|
|
|
|
|
if (ref $oA)
|
583
|
|
|
|
|
|
|
{
|
584
|
0
|
|
|
|
|
|
$rh->{next} = $oA->attr('href');
|
585
|
0
|
|
|
|
|
|
DEBUG_NEXT && warn " DDD _extract(): raw next URL is ==$rh->{next}==\n";
|
586
|
|
|
|
|
|
|
} # if
|
587
|
|
|
|
|
|
|
else
|
588
|
|
|
|
|
|
|
{
|
589
|
0
|
|
|
|
|
|
DEBUG_NEXT && warn " DDD _extract(): did not find next URL\n";
|
590
|
|
|
|
|
|
|
}
|
591
|
0
|
|
|
|
|
|
return $rh;
|
592
|
|
|
|
|
|
|
} # _extract
|
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
sub _match_priority {
|
595
|
0
|
|
0
|
0
|
|
|
my $s = shift || return;
|
596
|
0
|
0
|
|
|
|
|
if ($s =~ m'.+PRIORITY:?\s*(\w+?)(\s|\z)'i)
|
597
|
|
|
|
|
|
|
{
|
598
|
0
|
|
|
|
|
|
return lc $1;
|
599
|
|
|
|
|
|
|
} # if
|
600
|
0
|
|
|
|
|
|
return;
|
601
|
|
|
|
|
|
|
} # _match_priority
|
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
sub _match_desired {
|
604
|
0
|
|
0
|
0
|
|
|
my $s = shift || return;
|
605
|
0
|
0
|
|
|
|
|
if ($s =~ m'(?:DESIRED|WANTS):?\s*(\d+)'i)
|
606
|
|
|
|
|
|
|
{
|
607
|
0
|
|
|
|
|
|
return lc $1;
|
608
|
|
|
|
|
|
|
} # if
|
609
|
0
|
|
|
|
|
|
return;
|
610
|
|
|
|
|
|
|
} # _match_desired
|
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
1;
|
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
__END__
|