File Coverage

blib/lib/WWW/Amazon/Wishlist.pm
Criterion Covered Total %
statement 63 246 25.6
branch 6 74 8.1
condition 4 52 7.6
subroutine 14 20 70.0
pod 0 1 0.0
total 87 393 22.1


line stmt bran cond sub pod time code
1            
2             package WWW::Amazon::Wishlist;
3            
4 1     1   90706 use warnings;
  1         4  
  1         49  
5 1     1   8 use strict;
  1         4  
  1         68  
6            
7             our
8             $VERSION = 2.018;
9            
10 1     1   9 use vars qw( @ISA @EXPORT @EXPORT_OK );
  1         15  
  1         84  
11            
12 1     1   8 use Carp;
  1         3  
  1         84  
13 1     1   9 use Data::Dumper;
  1         3  
  1         61  
14 1     1   1044 use HTML::TreeBuilder;
  1         45177  
  1         17  
15 1     1   1077 use LWP::UserAgent;
  1         67390  
  1         53  
16            
17 1     1   16 use constant COM => 0;
  1         3  
  1         98  
18 1     1   8 use constant UK => 1;
  1         3  
  1         55  
19            
20 1     1   7 use constant DEBUG => 0;
  1         4  
  1         53  
21 1     1   8 use constant DEBUG_HTML => 0;
  1         3  
  1         49  
22 1     1   7 use constant DEBUG_NEXT => 0;
  1         3  
  1         3404  
23            
24             require Exporter;
25            
26             @ISA = qw(Exporter);
27             @EXPORT = qw(
28             );
29             @EXPORT_OK = qw(
30             get_list
31             UK
32             COM
33             );
34            
35             =pod
36            
37             =head1 NAME
38            
39             WWW::Amazon::Wishlist - grab all the details from your Amazon wishlist
40            
41             =head1 SYNOPSIS
42            
43             use WWW::Amazon::Wishlist qw(get_list COM UK);
44            
45             my @wishlist;
46            
47             @wishlist = get_list($my_amazon_com_id); # gets it from amazon.com
48             @wishlist = get_list($my_amazon_com_id, COM); # same, explicitly
49             @wishlist = get_list($my_amazon_couk_id, UK); # gets it from amazon.co.uk
50            
51             # Or, if you didn't import the COM and UK constants:
52             @wishlist = get_list ($my_amazon_couk_id, WWW::Amazon::Wishlist::UK);
53            
54             # The elements of @wishlist are hashrefs that contain the following elements:
55             foreach my $book (@wishlist)
56             {
57             print $book->{title}, # the, err, title
58             $book->{author}, # and the author(s)
59             $book->{asin}, # the asin number, its unique id on Amazon
60             $book->{price}, # how much it will set you back
61             $book->{quantity}, # how many you said you want
62             $book->{priority}, # how urgently you said you want it (1-5)
63             $book->{type}; # Hardcover/Paperback/CD/DVD etc (not available in the US)
64             } # foreach
65            
66             =head1 DESCRIPTION
67            
68             Goes to amazon.(com|co.uk), scrapes your wishlist, and returns it
69             in a array of hashrefs so that you can fiddle with it to your heart's
70             content.
71            
72             =head1 GETTING YOUR AMAZON ID
73            
74             The best way to do this is to search for your own wishlist in the search
75             tools.
76            
77             Searching for mine (simon@twoshortplanks.com) on amazon.com takes me to
78             the URL something like
79            
80             http://www.amazon.com/exec/obidos/wishlist/2EAJG83WS7YZM/...
81            
82             there's some more cruft after that last string of numbers and letters
83             but it's the
84            
85             2EAJG83WS7YZM
86            
87             bit that's important.
88            
89             Doing the same for amazon.co.uk is just as easy.
90            
91             Apparently, some people have had problems getting to their wishlist right
92             after it gets set up. You may have to wait a while for it to become
93             browseable.
94            
95             =head1 SHOWING YOUR APPRECIATION
96            
97             There was a thread on london.pm mailing list about working in a vacuum -
98             that it was a bit depressing to keep writing modules but never get any
99             feedback. So, if you use and like this module then please send me an
100             email and make my day.
101            
102             All it takes is a few little bytes.
103            
104            
105             =head1 BUGS
106            
107             B
108            
109             C is a screen scraper and is there for
110             is vulnerable to any changes that Amazon make to their HTML.
111            
112             If it starts returning no items then this is very likely the reason
113             and I will get around to fixing it as soon as possible.
114            
115             You might want to look at the C module instead.
116            
117             It doesn't cope with anything apart from the UK and USA versions of Amazon.
118            
119             I don't think it likes unavailable items - trying to work around this
120             breaks UK compatability.
121            
122             The code has accumulated lots of cruft.
123            
124             Lack of testing. It works for the pages I've tried it for but that's
125             no guarantee.
126            
127             =head1 LICENSE
128            
129             Copyright (c) 2003 Simon Wistow
130            
131             Distributed under the same terms as Perl itself.
132            
133             This software is under no warranty and will probably destroy your wish
134             list, kill your friends, burn your house and bring about the apocalypse
135            
136             =head1 AUTHOR
137            
138             Simon Wistow
139             Currently maintained by Martin Thurn
140            
141             =head1 SEE ALSO
142            
143             L, L, L
144            
145             =cut
146            
147             my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
148            
149             sub get_list {
150             # Required arg = wishlist ID:
151 1   33 1 0 114 my $id = shift || croak "No ID given to get_list() function\n";
152             # Optional arg = whether we're accessing the UK site. Default is "no":
153 1   50     8 my $uk = shift || 0;
154             # Optional arg = turn on debugging:
155 1   50     5 my $test = shift || DEBUG;
156             # Note to self ... should we UC the id? Nahhhh. Not yet.
157             # fairly self explanatory
158 1 50       5 my $domain = ($uk) ? "co.uk" : "com";
159 1         4 my $sBase = qq'http://www.amazon.$domain';
160             # set up some variables
161 1         4 my $iPage = 1;
162 1         3 my @items;
163             my $url;
164             # and awaaaaaaaaaaaaay we go ....
165             INFINITE:
166 1         4 while (1)
167             {
168 1 50 33     14 $url ||= $uk ? "$sBase/gp/registry/wishlist/ref=cm_wl_search_1?page=$iPage&cid=$id" :
169             "$sBase/gp/registry/wishlist/$id";
170             # This is a typical complete .com URL as of 2008-12:
171             # http://www.amazon.com/gp/registry/wishlist/2O4B95NPM1W3L
172 1         2 DEBUG_HTML && warn " DDD fetching wishlist for $id, page $iPage...\n";
173             # Don't overwhelm the server:
174 1 50       5 sleep(3) if (1 < $iPage);
175 1         6 my $content = _fetch_page($url, $domain);
176 0         0 if (DEBUG_HTML == 88)
177             {
178             warn $content;
179             exit 88;
180             } # if
181             # As of 2009-08, Amazon returns HTML with MISSING BRACKETS:
182 0         0 $content =~ s/(\r\n]+)(\s+<)/$1>\n$2/g;
183             # There seems to be a bug in HTML::TreeBuilder that causes
184             # abutting tags to be skpped!?!
185 0         0 $content =~ s!>
186 0 0       0 if (9 < $test)
187             {
188 0         0 eval "use File::Slurp";
189 0         0 my $sFname = qq'Pages/fetched-$domain.html';
190 0         0 write_file($sFname, $content);
191 0         0 warn " DDD wrote HTML to $sFname\n";
192 0         0 exit 88;
193             } # if
194 0         0 my $iLen = length($content);
195             # warn " DDD fetched $iLen bytes.\n";
196            
197 0         0 my $result = _extract($uk, $content, $test);
198             # print Dumper($result);
199             # exit 88;
200 0 0       0 if (! defined $result)
201             {
202 0         0 DEBUG && warn " WWW _extract() returned nothing\n";
203 0         0 last INFINITE;
204             } # if
205 0 0       0 if (! ref $result->{items})
206             {
207             # Probably an empty wish list
208 0         0 DEBUG && warn " WWW _extract() returned no items\n";
209 0         0 last INFINITE;
210             } # if
211             # Clean up the parsed items and add them to our local @items
212             # array:
213             ITEM:
214 0         0 foreach my $item (@{$result->{items}})
  0         0  
215             {
216 0         0 $item->{'author'} =~ s!\n!!g;
217 0         0 $item->{'author'} =~ s!^\s*by\s+!!g;
218 0         0 $item->{'author'} =~ s!
\n*!!s;
219 0 0       0 $item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s*\s*(\d+)!i);
220 0 0       0 $item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s*\s*(\d)!i);
221 0 0 0     0 if (
      0        
222             $uk
223             &&
224             $item->{image}
225             &&
226             ($item->{image} !~ m!^http:!)
227             )
228             {
229 0         0 $item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
230             } # if
231 0         0 push @items, $item;
232 0         0 DEBUG_HTML && warn " DDD added one item to \@items\n";
233             } # foreach ITEM
234             # Assumes an absolute path without hostname:
235 0 0       0 if ( ! defined $result->{next})
236             {
237 0         0 DEBUG_NEXT && warn " WWW did not find next url\n";
238 0         0 DEBUG_NEXT && write_file(qq'Pages/no-next.html', $content);
239 0         0 last INFINITE;
240             } # if
241 0         0 $url = $sBase . $result->{next};
242 0         0 $iPage++;
243             } # while INFINITE
244 0         0 return @items;
245             } # get_list
246            
247             sub _fetch_page {
248 1     1   5 my ($url, $domain) = @_;
249 1         3 if (0)
250             {
251             eval "use File::Slurp";
252             # For debugging UK site:
253             return read_file('Pages/uk-2008-12-page1.html');
254             # For debugging USA site:
255             return read_file('Pages/2008-12.html');
256             } # if 0
257             # Set up the UA:
258 1         17 my $ua = new LWP::UserAgent(
259             keep_alive => 1,
260             timeout => 30,
261             agent => $USER_AGENT,
262             );
263             # Setting it in the 'new' seems not to work sometimes
264 1         6366 $ua->agent($USER_AGENT);
265             # For some reason, this makes stuff work:
266             # $ua->max_redirect( 0 );
267             # Make a full set of headers:
268 1         140 my $h = new HTTP::Headers(
269             'Host' => "www.amazon.$domain",
270             'Referer' => $url,
271             'User-Agent' => $USER_AGENT,
272             'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1',
273             'Accept-Language' => 'en-us,en;q=0.5',
274             'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
275             #'Accept-Encoding' => 'gzip,deflate',
276             'Keep-Alive' => '300',
277             'Connection' => 'keep-alive',
278             );
279 1         619 $h->referer("$url");
280 1         67 my $request = HTTP::Request->new ( 'GET', $url, $h );
281 1         12758 my $response;
282 1         5 my $times = 0;
283             # LWP should be able to do this but seemingly fails sometimes
284 1         8 while ($times++<3)
285             {
286 3         47 $response = $ua->request($request);
287 3 50       376558 last if $response->is_success;
288 3 50       62 if ($response->is_redirect)
289             {
290 0         0 $url = $response->header("Location");
291             #$h->header("Referer", $url);
292 0         0 $h->referer("$url");
293 0         0 $request = HTTP::Request->new ( 'GET', $url, $h );
294             } # if
295             } # while
296 1 50       20 if (!$response->is_success)
297             {
298 1         859 croak "Failed to retrieve $url";
299 0           return undef;
300             } # if
301 0           my $s = $response->content;
302             # Clean the CRAP off the page:
303 0           $s =~ s!!!gs;
304 0           return $s;
305             } # _fetch_page
306            
307             # This is the HTML parsing version written by Martin Thurn:
308            
309             sub _extract {
310             # Required arg1 = whether we are parsing the UK site or not (Boolean):
311 0   0 0     my $iUK = shift || 0;
312             # Required arg2 = the HTML contents of the webpage:
313 0   0       my $s = shift || '';
314             # Optional arg = debugging level:
315 0   0       my $iDebug = shift || 0;
316 0           DEBUG_HTML && warn " DDD start _extract()\n";
317 0           my $rh = {};
318 0           my $oTree = new HTML::TreeBuilder;
319 0           $oTree->parse($s);
320 0           $oTree->eof;
321 0           my $sTag = q/div/;
322 0           my $sClass = q/a-fixed-left-grid a-spacing-none/;
323 0 0         $sClass = q/a-text-left a-fixed-left-grid-col a-col-right/ if $iUK;
324             # $sClass = q/a-fixed-left-grid a-spacing-large/ if $iUK;
325            
326 0           my @aoSPAN = $oTree->look_down(_tag => $sTag,
327             class => $sClass,
328             );
329 0           my $iCountSPAN = scalar(@aoSPAN);
330 0           DEBUG_HTML && warn " DDD _extract() found $iCountSPAN $sTag tags of class '$sClass'\n";
331             SPAN_TAG:
332 0           foreach my $oSPAN (@aoSPAN)
333             {
334 0 0         next SPAN_TAG unless ref $oSPAN;
335 0           DEBUG_HTML && warn " DDD _extract() found toplevel item tagset\n";
336 0           if (9 < DEBUG_HTML)
337             {
338             my $s = $oSPAN->as_HTML;
339             warn " DDD ==$s==\n";
340             } # if
341 0           my $sASIN = q{};
342 0           my $sName = q{};
343 0           my $sTitle = q{};
344 0           my @aoA = $oSPAN->look_down(_tag => 'a');
345 0           DEBUG_HTML && warn sprintf(" DDD _extract(): oSPAN contains %d tags\n", scalar(@aoA));
346             A_TAG:
347 0           foreach my $oA (@aoA)
348             {
349 0 0         next A_TAG if ! ref $oA;
350 0           my $sA = $oA->as_HTML;
351 0           DEBUG_HTML && warn " DDD _extract(): try A\n";
352 0           if (9 < DEBUG_HTML)
353             {
354             warn " DDD ==$sA==\n";
355             } # if
356 0   0       $sTitle = $oA->attr('title') || $oA->as_text;
357             # Strip leading whitespace:
358 0           $sTitle =~ s!\A\s+!!;
359             # Strip trailing whitespace:
360 0           $sTitle =~ s!\s+\Z!!;
361             # Ignore empty (image-only) tags:
362 0 0         next A_TAG if ($sTitle !~ m/\S/);
363             # Strip out zero-width spaces scattered about randomly in item titles
364 0           $sTitle =~ s/\x{200b}//g;
365 0           DEBUG_HTML && warn " DDD _extract(): found item named '$sTitle'\n";
366 0 0         next A_TAG if ($sTitle eq 'Universal Wish List Button');
367 0 0         next A_TAG if ($sTitle eq 'Buying this gift elsewhere?');
368 0           my $sURL = $oA->attr('href');
369 0           DEBUG_HTML && warn " DDD _extract(): URL ==$sURL==\n";
370 0 0 0       if (
      0        
371             ($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
372             ||
373             ($sURL =~ m!/gp/product/(.+?)/ref!)
374             ||
375             ($sURL =~ m!/dp/(.+?)/(_encoding|ref)!)
376             )
377             {
378             # It's a match!
379 0           $sASIN = $1;
380 0           last A_TAG;
381             } # if
382             else
383             {
384 0           DEBUG_HTML && warn " EEE url does not contain asin\n";
385             }
386             } # foreach A_TAG
387 0           DEBUG_HTML && warn " DDD _extract(): ASIN ==$sASIN==\n";
388 0 0         if ($sASIN eq q{})
389             {
390 0           next SPAN_TAG;
391             } # if
392             # Grab the smallest-containing ancestor of this item:
393 0 0         my $oParent = $iUK
394             ? $oSPAN->look_up(_tag => 'tbody',
395             class => 'itemWrapper',
396             )
397             : $oSPAN;
398 0           $oParent = $oSPAN;
399 0 0         if (! ref $oParent)
400             {
401 0           DEBUG_HTML && warn " WWW did not find ancestor TBODY\n";
402 0           next SPAN_TAG;
403             } # if
404 0           my $sParentHTML = $oParent->as_HTML;
405 0           DEBUG_HTML && warn " DDD _extract(): parent HTML ==$sParentHTML==\n";
406 0           my $sParent = $oParent->as_text;
407             # Manual text clean-up:
408 0           $sParent =~ s/(DESIRED|RECEIVED|PRIORITY)/; $1: /g;
409 0           DEBUG_HTML && warn " DDD _extract(): parent text ==$sParent==\n";
410 0           my $iDesired = _match_desired($sParent);
411 0           DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
412 0           my $sPriority = _match_priority($sParent);
413 0           DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
414 0           my @aoTDtiny = $oParent->look_down(_tag => 'td',
415             class => 'tiny',
416             );
417             QUANT_TAG:
418 0           foreach my $oSPAN (@aoTDtiny)
419             {
420 0 0         next QUANT_TAG unless ref $oSPAN;
421 0           my $sSpan = $oSPAN->as_text;
422 0           DEBUG_HTML && warn " DDD _extract(): TDtiny=$sSpan=\n";
423 0   0       $sPriority ||= _match_priority($sSpan);
424 0           DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
425 0   0       $iDesired ||= _match_desired($sSpan);
426 0           DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
427             } # foreach QUANT_TAG
428 0 0 0       if (! $iDesired || ! $sPriority)
429             {
430             # See if they are encoded in a FORM:
431             # Find the priority:
432 0 0         if ($sParentHTML =~ m!
433             {
434 0           $sPriority = $1;
435 0           DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
436             } # if
437             else
438             {
439 0           DEBUG_HTML && warn " WWW did not find
440             }
441             # Find the quantity desired:
442 0 0         if ($sParentHTML =~ m!!)
443             {
444 0           $iDesired = $1;
445 0           DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
446             } # if
447             else
448             {
449 0           DEBUG_HTML && warn " WWW did not find for desired-quantity\n";
450             }
451             } # if
452             # Put in default values if we never found them:
453 0   0       $sPriority ||= 'medium';
454 0           DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
455 0   0       $iDesired ||= 1;
456             # Find the date added:
457 0           my $sDate = '';
458 0 0         if ($sParentHTML =~ m!>added\s+(.+?)
459             {
460 0           $sDate = $1;
461 0           DEBUG_HTML && warn " DDD _extract(): date=$sDate=\n";
462             } # if
463             else
464             {
465 0           DEBUG_HTML && warn " WWW did not find text for date-added\n";
466             }
467            
468             # Find the "author" of this item:
469 0           my @aoTDauthor;
470 0 0         if ($iUK)
471             {
472 0           @aoTDauthor = $oParent->look_down(_tag => 'td',
473             class => 'small',
474             );
475             }
476             else
477             {
478             @aoTDauthor = $oParent->look_down(_tag => 'span',
479             sub
480             {
481 0     0     my $sHtml = $_[0]->as_HTML;
482             # DEBUG_HTML && warn " DDD _extract(): try oTDauthor span==$sHtml==\n";
483 0   0       my $s = $_[0]->attr('class') || q{};
484 0           $s =~ m'BYLINE'i;
485             },
486 0           );
487             } # else
488 0           my $sAuthor = '';
489             AUTHOR_TAG:
490 0           foreach my $oTD (@aoTDauthor)
491             {
492 0 0         next AUTHOR_TAG unless ref $oTD;
493 0           my $s = $oTD->as_HTML;
494 0           DEBUG_HTML && warn " DDD _extract(): try oTDauthor==$s==\n";
495 0           $s = $oTD->as_text;
496 0 0         if ($s =~ s!\A\s*(by|~)\s+!!)
497             {
498 0           $sAuthor = $s;
499 0           last AUTHOR_TAG;
500             } # if
501             } # foreach AUTHOR_TAG
502 0           DEBUG_HTML && warn " DDD _extract(): author=$sAuthor=\n";
503             # Find the price of this item:
504 0           my $sPrice = '';
505             my $oTDprice = $oParent->look_down(_tag => 'span',
506             sub
507             {
508 0   0 0     my $s = $_[0]->attr('class') || q{};
509 0           $s =~ m'PRICE'i;
510             },
511 0           );
512 0 0         if (! ref $oTDprice)
513             {
514 0           DEBUG_HTML && warn " WWW did not find TD for price\n";
515             # warn $oParent->as_HTML;
516             # exit 88;
517             # next SPAN_TAG;
518             } # if
519             else
520             {
521 0           $sPrice = $oTDprice->as_text;
522 0 0         if ($sPrice =~ m!Price:\s+(.+)\Z!)
523             {
524 0           $sPrice = $1;
525             } # if
526 0           $sPrice =~ s!\A\s+!!;
527 0           $sPrice =~ s!\s+\Z!!;
528 0           DEBUG_HTML && warn " DDD _extract(): price=$sPrice=\n";
529             } # else
530             # Add this item to the result set:
531 0           my %hsItem = (
532             asin => $sASIN,
533             author => $sAuthor,
534             # image => $sImageURL,
535             price => $sPrice,
536             priority => $sPriority,
537             quantity => $iDesired,
538             title => $sTitle,
539             # type => $sType,
540             );
541 0           DEBUG_HTML && warn Dumper(\%hsItem);
542             # warn " DDD _extract() added one item to \$rh->{items}\n";
543 0           push @{$rh->{items}}, \%hsItem;
  0            
544             # All done with this item:
545 0           $oParent->detach;
546 0           $oParent->delete;
547             } # foreach SPAN_TAG
548             # Look for the next-page link:
549             my @aoA = $oTree->look_down(_tag => 'a',
550             role => 'link',
551             sub {
552 0 0   0     return 0 if (length($_[0]->attr('href')) < 55);
553             # my $s = $_[0]->as_text || q{};
554             # DEBUG_NEXT && warn " DDD _extract(): try next ==$s==\n";
555             # $s =~ m/\A\s*(NEXT|SEE\s+MORE)\s*\z/i;
556 0           my $s = $_[0]->attr('class');
557 0           DEBUG_NEXT && warn " DDD _extract(): try next ==$s==\n";
558 0           $s =~ m/wl-see-more/
559             },
560 0           );
561 0           my $iCountA = scalar(@aoA);
562 0           DEBUG_NEXT && warn " DDD _extract(): found $iCountA tags that match 'next'\n";
563 0           my $oA = shift @aoA;
564 0 0         if (ref $oA)
565             {
566 0           $rh->{next} = $oA->attr('href');
567 0           DEBUG_NEXT && warn " DDD _extract(): raw next URL is ==$rh->{next}==\n";
568             } # if
569             else
570             {
571 0           DEBUG_NEXT && warn " DDD _extract(): did not find next URL\n";
572             }
573 0           return $rh;
574             } # _extract
575            
576             sub _match_priority {
577 0   0 0     my $s = shift || return;
578 0 0         if ($s =~ m'PRIORITY:?\s*(\w+?)(\s|\z)'i)
579             {
580 0           return lc $1;
581             } # if
582 0           return;
583             } # _match_priority
584            
585             sub _match_desired {
586 0   0 0     my $s = shift || return;
587 0 0         if ($s =~ m'(?:DESIRED|WANTS):?\s*(\d+)'i)
588             {
589 0           return lc $1;
590             } # if
591 0           return;
592             } # _match_desired
593            
594             1;
595            
596             __END__