File Coverage

blib/lib/WWW/Amazon/Wishlist.pm

Criterion	Covered	Total	%
statement	63	246	25.6
branch	6	74	8.1
condition	4	52	7.6
subroutine	14	20	70.0
pod	0	1	0.0
total	87	393	22.1

line	stmt	bran	cond	sub	pod	time	code
1
2							package WWW::Amazon::Wishlist;
3
4	1			1		90706	use warnings;
	1					4
	1					49
5	1			1		8	use strict;
	1					4
	1					68
6
7							our
8							$VERSION = 2.018;
9
10	1			1		9	use vars qw( @ISA @EXPORT @EXPORT_OK );
	1					15
	1					84
11
12	1			1		8	use Carp;
	1					3
	1					84
13	1			1		9	use Data::Dumper;
	1					3
	1					61
14	1			1		1044	use HTML::TreeBuilder;
	1					45177
	1					17
15	1			1		1077	use LWP::UserAgent;
	1					67390
	1					53
16
17	1			1		16	use constant COM => 0;
	1					3
	1					98
18	1			1		8	use constant UK => 1;
	1					3
	1					55
19
20	1			1		7	use constant DEBUG => 0;
	1					4
	1					53
21	1			1		8	use constant DEBUG_HTML => 0;
	1					3
	1					49
22	1			1		7	use constant DEBUG_NEXT => 0;
	1					3
	1					3404
23
24							require Exporter;
25
26							@ISA = qw(Exporter);
27							@EXPORT = qw(
28							);
29							@EXPORT_OK = qw(
30							get_list
31							UK
32							COM
33							);
34
35							=pod
36
37							=head1 NAME
38
39							WWW::Amazon::Wishlist - grab all the details from your Amazon wishlist
40
41							=head1 SYNOPSIS
42
43							use WWW::Amazon::Wishlist qw(get_list COM UK);
44
45							my @wishlist;
46
47							@wishlist = get_list($my_amazon_com_id); # gets it from amazon.com
48							@wishlist = get_list($my_amazon_com_id, COM); # same, explicitly
49							@wishlist = get_list($my_amazon_couk_id, UK); # gets it from amazon.co.uk
50
51							# Or, if you didn't import the COM and UK constants:
52							@wishlist = get_list ($my_amazon_couk_id, WWW::Amazon::Wishlist::UK);
53
54							# The elements of @wishlist are hashrefs that contain the following elements:
55							foreach my $book (@wishlist)
56							{
57							print $book->{title}, # the, err, title
58							$book->{author}, # and the author(s)
59							$book->{asin}, # the asin number, its unique id on Amazon
60							$book->{price}, # how much it will set you back
61							$book->{quantity}, # how many you said you want
62							$book->{priority}, # how urgently you said you want it (1-5)
63							$book->{type}; # Hardcover/Paperback/CD/DVD etc (not available in the US)
64							} # foreach
65
66							=head1 DESCRIPTION
67
68							Goes to amazon.(com\|co.uk), scrapes your wishlist, and returns it
69							in a array of hashrefs so that you can fiddle with it to your heart's
70							content.
71
72							=head1 GETTING YOUR AMAZON ID
73
74							The best way to do this is to search for your own wishlist in the search
75							tools.
76
77							Searching for mine (simon@twoshortplanks.com) on amazon.com takes me to
78							the URL something like
79
80							http://www.amazon.com/exec/obidos/wishlist/2EAJG83WS7YZM/...
81
82							there's some more cruft after that last string of numbers and letters
83							but it's the
84
85							2EAJG83WS7YZM
86
87							bit that's important.
88
89							Doing the same for amazon.co.uk is just as easy.
90
91							Apparently, some people have had problems getting to their wishlist right
92							after it gets set up. You may have to wait a while for it to become
93							browseable.
94
95							=head1 SHOWING YOUR APPRECIATION
96
97							There was a thread on london.pm mailing list about working in a vacuum -
98							that it was a bit depressing to keep writing modules but never get any
99							feedback. So, if you use and like this module then please send me an
100							email and make my day.
101
102							All it takes is a few little bytes.
103
104
105							=head1 BUGS
106
107							B
108
109							C is a screen scraper and is there for
110							is vulnerable to any changes that Amazon make to their HTML.
111
112							If it starts returning no items then this is very likely the reason
113							and I will get around to fixing it as soon as possible.
114
115							You might want to look at the C module instead.
116
117							It doesn't cope with anything apart from the UK and USA versions of Amazon.
118
119							I don't think it likes unavailable items - trying to work around this
120							breaks UK compatability.
121
122							The code has accumulated lots of cruft.
123
124							Lack of testing. It works for the pages I've tried it for but that's
125							no guarantee.
126
127							=head1 LICENSE
128
129							Copyright (c) 2003 Simon Wistow
130
131							Distributed under the same terms as Perl itself.
132
133							This software is under no warranty and will probably destroy your wish
134							list, kill your friends, burn your house and bring about the apocalypse
135
136							=head1 AUTHOR
137
138							Simon Wistow
139							Currently maintained by Martin Thurn
140
141							=head1 SEE ALSO
142
143							L, L, L
144
145							=cut
146
147							my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
148
149							sub get_list {
150							# Required arg = wishlist ID:
151	1		33	1	0	114	my $id = shift \|\| croak "No ID given to get_list() function\n";
152							# Optional arg = whether we're accessing the UK site. Default is "no":
153	1		50			8	my $uk = shift \|\| 0;
154							# Optional arg = turn on debugging:
155	1		50			5	my $test = shift \|\| DEBUG;
156							# Note to self ... should we UC the id? Nahhhh. Not yet.
157							# fairly self explanatory
158	1	50				5	my $domain = ($uk) ? "co.uk" : "com";
159	1					4	my $sBase = qq'http://www.amazon.$domain';
160							# set up some variables
161	1					4	my $iPage = 1;
162	1					3	my @items;
163							my $url;
164							# and awaaaaaaaaaaaaay we go ....
165							INFINITE:
166	1					4	while (1)
167							{
168	1	50	33			14	$url \|\|= $uk ? "$sBase/gp/registry/wishlist/ref=cm_wl_search_1?page=$iPage&cid=$id" :
169							"$sBase/gp/registry/wishlist/$id";
170							# This is a typical complete .com URL as of 2008-12:
171							# http://www.amazon.com/gp/registry/wishlist/2O4B95NPM1W3L
172	1					2	DEBUG_HTML && warn " DDD fetching wishlist for $id, page $iPage...\n";
173							# Don't overwhelm the server:
174	1	50				5	sleep(3) if (1 < $iPage);
175	1					6	my $content = _fetch_page($url, $domain);
176	0					0	if (DEBUG_HTML == 88)
177							{
178							warn $content;
179							exit 88;
180							} # if
181							# As of 2009-08, Amazon returns HTML with MISSING BRACKETS:
182	0					0	$content =~ s/(\r\n]+)(\s+<)/$1>\n$2/g;
183							# There seems to be a bug in HTML::TreeBuilder that causes
184							# abutting tags to be skpped!?!
185	0					0	$content =~ s!>
186	0	0				0	if (9 < $test)
187							{
188	0					0	eval "use File::Slurp";
189	0					0	my $sFname = qq'Pages/fetched-$domain.html';
190	0					0	write_file($sFname, $content);
191	0					0	warn " DDD wrote HTML to $sFname\n";
192	0					0	exit 88;
193							} # if
194	0					0	my $iLen = length($content);
195							# warn " DDD fetched $iLen bytes.\n";
196
197	0					0	my $result = _extract($uk, $content, $test);
198							# print Dumper($result);
199							# exit 88;
200	0	0				0	if (! defined $result)
201							{
202	0					0	DEBUG && warn " WWW _extract() returned nothing\n";
203	0					0	last INFINITE;
204							} # if
205	0	0				0	if (! ref $result->{items})
206							{
207							# Probably an empty wish list
208	0					0	DEBUG && warn " WWW _extract() returned no items\n";
209	0					0	last INFINITE;
210							} # if
211							# Clean up the parsed items and add them to our local @items
212							# array:
213							ITEM:
214	0					0	foreach my $item (@{$result->{items}})
	0					0
215							{
216	0					0	$item->{'author'} =~ s!\n!!g;
217	0					0	$item->{'author'} =~ s!^\s*by\s+!!g;
218	0					0	$item->{'author'} =~ s! \n*!!s;
219	0	0				0	$item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s\s(\d+)!i);
220	0	0				0	$item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s\s(\d)!i);
221	0	0	0			0	if (
			0
222							$uk
223							&&
224							$item->{image}
225							&&
226							($item->{image} !~ m!^http:!)
227							)
228							{
229	0					0	$item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
230							} # if
231	0					0	push @items, $item;
232	0					0	DEBUG_HTML && warn " DDD added one item to \@items\n";
233							} # foreach ITEM
234							# Assumes an absolute path without hostname:
235	0	0				0	if ( ! defined $result->{next})
236							{
237	0					0	DEBUG_NEXT && warn " WWW did not find next url\n";
238	0					0	DEBUG_NEXT && write_file(qq'Pages/no-next.html', $content);
239	0					0	last INFINITE;
240							} # if
241	0					0	$url = $sBase . $result->{next};
242	0					0	$iPage++;
243							} # while INFINITE
244	0					0	return @items;
245							} # get_list
246
247							sub _fetch_page {
248	1			1		5	my ($url, $domain) = @_;
249	1					3	if (0)
250							{
251							eval "use File::Slurp";
252							# For debugging UK site:
253							return read_file('Pages/uk-2008-12-page1.html');
254							# For debugging USA site:
255							return read_file('Pages/2008-12.html');
256							} # if 0
257							# Set up the UA:
258	1					17	my $ua = new LWP::UserAgent(
259							keep_alive => 1,
260							timeout => 30,
261							agent => $USER_AGENT,
262							);
263							# Setting it in the 'new' seems not to work sometimes
264	1					6366	$ua->agent($USER_AGENT);
265							# For some reason, this makes stuff work:
266							# $ua->max_redirect( 0 );
267							# Make a full set of headers:
268	1					140	my $h = new HTTP::Headers(
269							'Host' => "www.amazon.$domain",
270							'Referer' => $url,
271							'User-Agent' => $USER_AGENT,
272							'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,/;q=0.1',
273							'Accept-Language' => 'en-us,en;q=0.5',
274							'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
275							#'Accept-Encoding' => 'gzip,deflate',
276							'Keep-Alive' => '300',
277							'Connection' => 'keep-alive',
278							);
279	1					619	$h->referer("$url");
280	1					67	my $request = HTTP::Request->new ( 'GET', $url, $h );
281	1					12758	my $response;
282	1					5	my $times = 0;
283							# LWP should be able to do this but seemingly fails sometimes
284	1					8	while ($times++<3)
285							{
286	3					47	$response = $ua->request($request);
287	3	50				376558	last if $response->is_success;
288	3	50				62	if ($response->is_redirect)
289							{
290	0					0	$url = $response->header("Location");
291							#$h->header("Referer", $url);
292	0					0	$h->referer("$url");
293	0					0	$request = HTTP::Request->new ( 'GET', $url, $h );
294							} # if
295							} # while
296	1	50				20	if (!$response->is_success)
297							{
298	1					859	croak "Failed to retrieve $url";
299	0						return undef;
300							} # if
301	0						my $s = $response->content;
302							# Clean the CRAP off the page:
303	0						$s =~ s!!!gs;
304	0						return $s;
305							} # _fetch_page
306
307							# This is the HTML parsing version written by Martin Thurn:
308
309							sub _extract {
310							# Required arg1 = whether we are parsing the UK site or not (Boolean):
311	0		0	0			my $iUK = shift \|\| 0;
312							# Required arg2 = the HTML contents of the webpage:
313	0		0				my $s = shift \|\| '';
314							# Optional arg = debugging level:
315	0		0				my $iDebug = shift \|\| 0;
316	0						DEBUG_HTML && warn " DDD start _extract()\n";
317	0						my $rh = {};
318	0						my $oTree = new HTML::TreeBuilder;
319	0						$oTree->parse($s);
320	0						$oTree->eof;
321	0						my $sTag = q/div/;
322	0						my $sClass = q/a-fixed-left-grid a-spacing-none/;
323	0	0					$sClass = q/a-text-left a-fixed-left-grid-col a-col-right/ if $iUK;
324							# $sClass = q/a-fixed-left-grid a-spacing-large/ if $iUK;
325
326	0						my @aoSPAN = $oTree->look_down(_tag => $sTag,
327							class => $sClass,
328							);
329	0						my $iCountSPAN = scalar(@aoSPAN);
330	0						DEBUG_HTML && warn " DDD _extract() found $iCountSPAN $sTag tags of class '$sClass'\n";
331							SPAN_TAG:
332	0						foreach my $oSPAN (@aoSPAN)
333							{
334	0	0					next SPAN_TAG unless ref $oSPAN;
335	0						DEBUG_HTML && warn " DDD _extract() found toplevel item tagset\n";
336	0						if (9 < DEBUG_HTML)
337							{
338							my $s = $oSPAN->as_HTML;
339							warn " DDD ==$s==\n";
340							} # if
341	0						my $sASIN = q{};
342	0						my $sName = q{};
343	0						my $sTitle = q{};
344	0						my @aoA = $oSPAN->look_down(_tag => 'a');
345	0						DEBUG_HTML && warn sprintf(" DDD _extract(): oSPAN contains %d tags\n", scalar(@aoA));
346							A_TAG:
347	0						foreach my $oA (@aoA)
348							{
349	0	0					next A_TAG if ! ref $oA;
350	0						my $sA = $oA->as_HTML;
351	0						DEBUG_HTML && warn " DDD _extract(): try A\n";
352	0						if (9 < DEBUG_HTML)
353							{
354							warn " DDD ==$sA==\n";
355							} # if
356	0		0				$sTitle = $oA->attr('title') \|\| $oA->as_text;
357							# Strip leading whitespace:
358	0						$sTitle =~ s!\A\s+!!;
359							# Strip trailing whitespace:
360	0						$sTitle =~ s!\s+\Z!!;
361							# Ignore empty (image-only) tags:
362	0	0					next A_TAG if ($sTitle !~ m/\S/);
363							# Strip out zero-width spaces scattered about randomly in item titles
364	0						$sTitle =~ s/\x{200b}//g;
365	0						DEBUG_HTML && warn " DDD _extract(): found item named '$sTitle'\n";
366	0	0					next A_TAG if ($sTitle eq 'Universal Wish List Button');
367	0	0					next A_TAG if ($sTitle eq 'Buying this gift elsewhere?');
368	0						my $sURL = $oA->attr('href');
369	0						DEBUG_HTML && warn " DDD _extract(): URL ==$sURL==\n";
370	0	0	0				if (
			0
371							($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
372							\|\|
373							($sURL =~ m!/gp/product/(.+?)/ref!)
374							\|\|
375							($sURL =~ m!/dp/(.+?)/(_encoding\|ref)!)
376							)
377							{
378							# It's a match!
379	0						$sASIN = $1;
380	0						last A_TAG;
381							} # if
382							else
383							{
384	0						DEBUG_HTML && warn " EEE url does not contain asin\n";
385							}
386							} # foreach A_TAG
387	0						DEBUG_HTML && warn " DDD _extract(): ASIN ==$sASIN==\n";
388	0	0					if ($sASIN eq q{})
389							{
390	0						next SPAN_TAG;
391							} # if
392							# Grab the smallest-containing ancestor of this item:
393	0	0					my $oParent = $iUK
394							? $oSPAN->look_up(_tag => 'tbody',
395							class => 'itemWrapper',
396							)
397							: $oSPAN;
398	0						$oParent = $oSPAN;
399	0	0					if (! ref $oParent)
400							{
401	0						DEBUG_HTML && warn " WWW did not find ancestor TBODY\n";
402	0						next SPAN_TAG;
403							} # if
404	0						my $sParentHTML = $oParent->as_HTML;
405	0						DEBUG_HTML && warn " DDD _extract(): parent HTML ==$sParentHTML==\n";
406	0						my $sParent = $oParent->as_text;
407							# Manual text clean-up:
408	0						$sParent =~ s/(DESIRED\|RECEIVED\|PRIORITY)/; $1: /g;
409	0						DEBUG_HTML && warn " DDD _extract(): parent text ==$sParent==\n";
410	0						my $iDesired = _match_desired($sParent);
411	0						DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
412	0						my $sPriority = _match_priority($sParent);
413	0						DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
414	0						my @aoTDtiny = $oParent->look_down(_tag => 'td',
415							class => 'tiny',
416							);
417							QUANT_TAG:
418	0						foreach my $oSPAN (@aoTDtiny)
419							{
420	0	0					next QUANT_TAG unless ref $oSPAN;
421	0						my $sSpan = $oSPAN->as_text;
422	0						DEBUG_HTML && warn " DDD _extract(): TDtiny=$sSpan=\n";
423	0		0				$sPriority \|\|= _match_priority($sSpan);
424	0						DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
425	0		0				$iDesired \|\|= _match_desired($sSpan);
426	0						DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
427							} # foreach QUANT_TAG
428	0	0	0				if (! $iDesired \|\| ! $sPriority)
429							{
430							# See if they are encoded in a FORM:
431							# Find the priority:
432	0	0					if ($sParentHTML =~ m!
433							{
434	0						$sPriority = $1;
435	0						DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
436							} # if
437							else
438							{
439	0						DEBUG_HTML && warn " WWW did not find
440							}
441							# Find the quantity desired:
442	0	0					if ($sParentHTML =~ m!!)
443							{
444	0						$iDesired = $1;
445	0						DEBUG_HTML && warn " DDD _extract(): desired set to =$iDesired=\n";
446							} # if
447							else
448							{
449	0						DEBUG_HTML && warn " WWW did not find for desired-quantity\n";
450							}
451							} # if
452							# Put in default values if we never found them:
453	0		0				$sPriority \|\|= 'medium';
454	0						DEBUG_HTML && warn " DDD _extract(): priority set to =$sPriority=\n";
455	0		0				$iDesired \|\|= 1;
456							# Find the date added:
457	0						my $sDate = '';
458	0	0					if ($sParentHTML =~ m!>added\s+(.+?)
459							{
460	0						$sDate = $1;
461	0						DEBUG_HTML && warn " DDD _extract(): date=$sDate=\n";
462							} # if
463							else
464							{
465	0						DEBUG_HTML && warn " WWW did not find text for date-added\n";
466							}
467
468							# Find the "author" of this item:
469	0						my @aoTDauthor;
470	0	0					if ($iUK)
471							{
472	0						@aoTDauthor = $oParent->look_down(_tag => 'td',
473							class => 'small',
474							);
475							}
476							else
477							{
478							@aoTDauthor = $oParent->look_down(_tag => 'span',
479							sub
480							{
481	0			0			my $sHtml = $_[0]->as_HTML;
482							# DEBUG_HTML && warn " DDD _extract(): try oTDauthor span==$sHtml==\n";
483	0		0				my $s = $_[0]->attr('class') \|\| q{};
484	0						$s =~ m'BYLINE'i;
485							},
486	0						);
487							} # else
488	0						my $sAuthor = '';
489							AUTHOR_TAG:
490	0						foreach my $oTD (@aoTDauthor)
491							{
492	0	0					next AUTHOR_TAG unless ref $oTD;
493	0						my $s = $oTD->as_HTML;
494	0						DEBUG_HTML && warn " DDD _extract(): try oTDauthor==$s==\n";
495	0						$s = $oTD->as_text;
496	0	0					if ($s =~ s!\A\s*(by\|~)\s+!!)
497							{
498	0						$sAuthor = $s;
499	0						last AUTHOR_TAG;
500							} # if
501							} # foreach AUTHOR_TAG
502	0						DEBUG_HTML && warn " DDD _extract(): author=$sAuthor=\n";
503							# Find the price of this item:
504	0						my $sPrice = '';
505							my $oTDprice = $oParent->look_down(_tag => 'span',
506							sub
507							{
508	0		0	0			my $s = $_[0]->attr('class') \|\| q{};
509	0						$s =~ m'PRICE'i;
510							},
511	0						);
512	0	0					if (! ref $oTDprice)
513							{
514	0						DEBUG_HTML && warn " WWW did not find TD for price\n";
515							# warn $oParent->as_HTML;
516							# exit 88;
517							# next SPAN_TAG;
518							} # if
519							else
520							{
521	0						$sPrice = $oTDprice->as_text;
522	0	0					if ($sPrice =~ m!Price:\s+(.+)\Z!)
523							{
524	0						$sPrice = $1;
525							} # if
526	0						$sPrice =~ s!\A\s+!!;
527	0						$sPrice =~ s!\s+\Z!!;
528	0						DEBUG_HTML && warn " DDD _extract(): price=$sPrice=\n";
529							} # else
530							# Add this item to the result set:
531	0						my %hsItem = (
532							asin => $sASIN,
533							author => $sAuthor,
534							# image => $sImageURL,
535							price => $sPrice,
536							priority => $sPriority,
537							quantity => $iDesired,
538							title => $sTitle,
539							# type => $sType,
540							);
541	0						DEBUG_HTML && warn Dumper(\%hsItem);
542							# warn " DDD _extract() added one item to \$rh->{items}\n";
543	0						push @{$rh->{items}}, \%hsItem;
	0
544							# All done with this item:
545	0						$oParent->detach;
546	0						$oParent->delete;
547							} # foreach SPAN_TAG
548							# Look for the next-page link:
549							my @aoA = $oTree->look_down(_tag => 'a',
550							role => 'link',
551							sub {
552	0	0		0			return 0 if (length($_[0]->attr('href')) < 55);
553							# my $s = $_[0]->as_text \|\| q{};
554							# DEBUG_NEXT && warn " DDD _extract(): try next ==$s==\n";
555							# $s =~ m/\A\s(NEXT\|SEE\s+MORE)\s\z/i;
556	0						my $s = $_[0]->attr('class');
557	0						DEBUG_NEXT && warn " DDD _extract(): try next ==$s==\n";
558	0						$s =~ m/wl-see-more/
559							},
560	0						);
561	0						my $iCountA = scalar(@aoA);
562	0						DEBUG_NEXT && warn " DDD _extract(): found $iCountA tags that match 'next'\n";
563	0						my $oA = shift @aoA;
564	0	0					if (ref $oA)
565							{
566	0						$rh->{next} = $oA->attr('href');
567	0						DEBUG_NEXT && warn " DDD _extract(): raw next URL is ==$rh->{next}==\n";
568							} # if
569							else
570							{
571	0						DEBUG_NEXT && warn " DDD _extract(): did not find next URL\n";
572							}
573	0						return $rh;
574							} # _extract
575
576							sub _match_priority {
577	0		0	0			my $s = shift \|\| return;
578	0	0					if ($s =~ m'PRIORITY:?\s*(\w+?)(\s\|\z)'i)
579							{
580	0						return lc $1;
581							} # if
582	0						return;
583							} # _match_priority
584
585							sub _match_desired {
586	0		0	0			my $s = shift \|\| return;
587	0	0					if ($s =~ m'(?:DESIRED\|WANTS):?\s*(\d+)'i)
588							{
589	0						return lc $1;
590							} # if
591	0						return;
592							} # _match_desired
593
594							1;
595
596							__END__