| blib/lib/WWW/Scraper/ISBN/AmazonUK_Driver.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % | 
| statement | 88 | 95 | 92.6 | 
| branch | 35 | 48 | 72.9 | 
| condition | 10 | 24 | 41.6 | 
| subroutine | 9 | 9 | 100.0 | 
| pod | 1 | 1 | 100.0 | 
| total | 143 | 177 | 80.7 | 
| line | stmt | bran | cond | sub | pod | time | code | 
|---|---|---|---|---|---|---|---|
| 1 | package WWW::Scraper::ISBN::AmazonUK_Driver; | ||||||
| 2 | |||||||
| 3 | 6 | 6 | 68557 | use strict; | |||
| 6 | 14 | ||||||
| 6 | 223 | ||||||
| 4 | 6 | 6 | 21 | use warnings; | |||
| 6 | 15 | ||||||
| 6 | 251 | ||||||
| 5 | |||||||
| 6 | 6 | 6 | 25 | use vars qw($VERSION); | |||
| 6 | 13 | ||||||
| 6 | 351 | ||||||
| 7 | $VERSION = '0.41'; | ||||||
| 8 | |||||||
| 9 | #-------------------------------------------------------------------------- | ||||||
| 10 | |||||||
| 11 | =head1 NAME | ||||||
| 12 | |||||||
| 13 | WWW::Scraper::ISBN::AmazonUK_Driver - Search driver for Amazon.co.uk | ||||||
| 14 | |||||||
| 15 | =head1 SYNOPSIS | ||||||
| 16 | |||||||
| 17 | See parent class documentation (L<WWW::Scraper::ISBN::Driver>) | ||||||
| 18 | |||||||
| 19 | =head1 DESCRIPTION | ||||||
| 20 | |||||||
| 21 | Searches for book information from the (UK) Amazon online catalog. | ||||||
| 22 | |||||||
| 23 | =cut | ||||||
| 24 | |||||||
| 25 | #-------------------------------------------------------------------------- | ||||||
| 26 | |||||||
| 27 | ########################################################################### | ||||||
| 28 | # Inheritance | ||||||
| 29 | |||||||
| 30 | 6 | 6 | 26 | use base qw(WWW::Scraper::ISBN::Driver); | |||
| 6 | 10 | ||||||
| 6 | 960 | ||||||
| 31 | |||||||
| 32 | ########################################################################### | ||||||
| 33 | # Modules | ||||||
| 34 | |||||||
| 35 | 6 | 6 | 2277 | use WWW::Mechanize; | |||
| 6 | 233680 | ||||||
| 6 | 150 | ||||||
| 36 | 6 | 6 | 769 | use JSON; | |||
| 6 | 9568 | ||||||
| 6 | 41 | ||||||
| 37 | |||||||
| 38 | ########################################################################### | ||||||
| 39 | # Variables | ||||||
| 40 | |||||||
| 41 | my $AMA_SEARCH = 'http://www.amazon.co.uk/s/ref=nb_sb_noss?url=search-alias%3Daps&x=18&y=16&field-keywords='; | ||||||
| 42 | my $AMA_URL = 'http://www.amazon.co.uk/[^/]+/dp/[\dX]+/ref=sr_1_1/'; | ||||||
| 43 | my $IN2MM = 0.0393700787; # number of inches in a millimetre (mm) | ||||||
| 44 | my $LB2G = 0.00220462; # number of pounds (lbs) in a gram | ||||||
| 45 | my $OZ2G = 0.035274; # number of ounces (oz) in a gram | ||||||
| 46 | |||||||
| 47 | #-------------------------------------------------------------------------- | ||||||
| 48 | |||||||
| 49 | ########################################################################### | ||||||
| 50 | # Public Interface | ||||||
| 51 | |||||||
| 52 | =head1 METHODS | ||||||
| 53 | |||||||
| 54 | =over 4 | ||||||
| 55 | |||||||
| 56 | =item C<search()> | ||||||
| 57 | |||||||
| 58 | Creates a query string, then passes the appropriate form fields to the | ||||||
| 59 | Amazon (UK) server. | ||||||
| 60 | |||||||
| 61 | The returned page should be the correct catalog page for that ISBN. If not, the | ||||||
| 62 | function returns zero and allows the next driver in the chain to have a go. If | ||||||
| 63 | a valid page is returned, the following fields are returned via the book hash: | ||||||
| 64 | |||||||
| 65 | isbn (now returns isbn13) | ||||||
| 66 | isbn10 | ||||||
| 67 | isbn13 | ||||||
| 68 | ean13 (industry name) | ||||||
| 69 | author | ||||||
| 70 | title | ||||||
| 71 | book_link | ||||||
| 72 | thumb_link | ||||||
| 73 | image_link | ||||||
| 74 | pubdate | ||||||
| 75 | publisher | ||||||
| 76 | binding (if known) | ||||||
| 77 | pages (if known) | ||||||
| 78 | weight (if known) (in grams) | ||||||
| 79 | width (if known) (in millimetres) | ||||||
| 80 | height (if known) (in millimetres) | ||||||
| 81 | depth (if known) (in millimetres) | ||||||
| 82 | |||||||
| 83 | The book_link, thumb_link and image_link refer back to the Amazon (UK) website. | ||||||
| 84 | |||||||
| 85 | =back | ||||||
| 86 | |||||||
| 87 | =cut | ||||||
| 88 | |||||||
| 89 | sub search { | ||||||
| 90 | 3 | 3 | 1 | 12607 | my $self = shift; | ||
| 91 | 3 | 5 | my $isbn = shift; | ||||
| 92 | 3 | 11 | $self->found(0); | ||||
| 93 | 3 | 33 | $self->book(undef); | ||||
| 94 | |||||||
| 95 | # validate and convert into EAN13 format | ||||||
| 96 | 3 | 23 | my $ean = $self->convert_to_ean13($isbn); | ||||
| 97 | 3 | 50 | 66 | 118 | return $self->handler("Invalid ISBN specified [$isbn]") | ||
| 33 | |||||||
| 66 | |||||||
| 33 | |||||||
| 98 | if(!$ean || (length $isbn == 13 && $isbn ne $ean) | ||||||
| 99 | || (length $isbn == 10 && $isbn ne $self->convert_to_isbn10($ean))); | ||||||
| 100 | |||||||
| 101 | 3 | 45 | my $mech = WWW::Mechanize->new(); | ||||
| 102 | 3 | 11926 | $mech->agent_alias( 'Linux Mozilla' ); | ||||
| 103 | |||||||
| 104 | 3 | 150 | my $search = $AMA_SEARCH . $ean; | ||||
| 105 | |||||||
| 106 | 3 | 5 | eval { $mech->get( $search ) }; | ||||
| 3 | 9 | ||||||
| 107 | 3 | 50 | 33 | 1164244 | return $self->handler("Amazon UK website appears to be unavailable.") | ||
| 33 | |||||||
| 108 | if($@ || !$mech->success() || !$mech->content()); | ||||||
| 109 | |||||||
| 110 | 3 | 159 | my $content = $mech->content(); | ||||
| 111 | #print STDERR "\n# content=[$content]\n"; | ||||||
| 112 | 3 | 288 | my ($link) = $content =~ m!($AMA_URL)!s; | ||||
| 113 | 3 | 50 | 11 | return $self->handler("Failed to find that book on Amazon UK website.") | |||
| 114 | unless($link); | ||||||
| 115 | |||||||
| 116 | 3 | 6 | eval { $mech->get( $link ) }; | ||||
| 3 | 13 | ||||||
| 117 | 3 | 50 | 33 | 3927947 | return $self->handler("Amazon UK website appears to be unavailable.") | ||
| 33 | |||||||
| 118 | if($@ || !$mech->success() || !$mech->content()); | ||||||
| 119 | |||||||
| 120 | 3 | 204 | return $self->_parse($mech); | ||||
| 121 | } | ||||||
| 122 | |||||||
| 123 | sub _parse { | ||||||
| 124 | 5 | 5 | 1296 | my $self = shift; | |||
| 125 | 5 | 7 | my $mech = shift; | ||||
| 126 | |||||||
| 127 | # The Book page | ||||||
| 128 | 5 | 15 | my $html = $mech->content; | ||||
| 129 | 5 | 398 | my $data = {}; | ||||
| 130 | |||||||
| 131 | #print STDERR "\n# html=[$html]\n"; | ||||||
| 132 | |||||||
| 133 | 5 | 1 | 82130 | my @size                            = $html =~ m! | |||
| 1 | 12 | ||||||
| 1 | 1 | ||||||
| 1 | 14 | ||||||
| 134 | 5 | 100 | 20438 | @size                               = $html =~ m! | |||
| 135 | 5 | 100 | 16 | if(@size) { | |||
| 136 | 4 | 11 | my $type = pop @size; | ||||
| 137 | 4 | 43 | ($data->{depth},$data->{width},$data->{height}) = sort @size; | ||||
| 138 | 4 | 50 | 17 | if($type eq 'cm') { | |||
| 0 | |||||||
| 139 | 4 | 48 | $data->{$_} = int($data->{$_} * 10) for(qw( height width depth )); | ||||
| 140 | } elsif($type eq 'inches') { | ||||||
| 141 | 0 | 0 | $data->{$_} = int($data->{$_} / $IN2MM) for(qw( height width depth )); | ||||
| 142 | } | ||||||
| 143 | } | ||||||
| 144 | |||||||
| 145 | 5 | 19803 | ($data->{binding},$data->{pages})   = $html =~ m! | ||||
| 146 | 5 | 26701 | ($data->{weight})                   = $html =~ m! | ||||
| 147 | 5 | 19386 | ($data->{published})                = $html =~ m! | ||||
| 148 | 5 | 19532 | ($data->{isbn10})                   = $html =~ m! | ||||
| 149 | 5 | 19464 | ($data->{isbn13})                   = $html =~ m! | ||||
| 150 | 5 | 4225 | ($data->{content}) = $html =~ m! | ||||
| 151 | 5 | 35838 | ($data->{description})              = $html =~ m!From the Back Cover\s* \s* (.*?) | ||||
| 152 | 5 | 100 | 20569 | ($data->{description})              = $html =~ m! ]*>.*?{description});  | |||
| 153 | |||||||
| 154 | 5 | 100 | 27 | $data->{weight} = int($data->{weight} / $OZ2G) if($data->{weight}); | |||
| 155 | |||||||
| 156 | 5 | 100 | 16 | if($data->{description}) { | |||
| 157 | 4 | 107 | $data->{description} =~ s!<[^>]+>!!g; | ||||
| 158 | 4 | 300 | $data->{description} =~ s! +! !g; | ||||
| 159 | } | ||||||
| 160 | |||||||
| 161 | # The images | ||||||
| 162 | 5 | 32849 | my ($json) = $html =~ /var colorImages = ([^;]+);/si; | ||||
| 163 | 5 | 50 | 27 | if($json) { | |||
| 164 | 0 | 0 | my $code = decode_json($json); | ||||
| 165 | 0 | 0 | my @order = grep {$_} $code->{initial}[0]{thumb}, $code->{initial}[0]{landing}, @{$code->{initial}[0]{main}}, $code->{initial}[0]{large}; | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 166 | 0 | 0 | 0 | $data->{thumb_link} = $order[0] if(@order); | |||
| 167 | 0 | 0 | 0 | $data->{image_link} = $order[-1] if(@order); | |||
| 168 | |||||||
| 169 | #use Data::Dumper; | ||||||
| 170 | #print STDERR "\n# code=[".Dumper($code)."]\n"; | ||||||
| 171 | } else { | ||||||
| 172 | 5 | 15358 | my ($code) = $html =~ /'imageGalleryData'\s*:\s*([^;]+);/si; | ||||
| 173 | 5 | 100 | 27 | if($code) { | |||
| 174 | 4 | 48 | ($data->{thumb_link}) = $code =~ /"thumbUrl":\s*"([^+]+)"/; | ||||
| 175 | 4 | 56 | ($data->{image_link}) = $code =~ /"mainUrl":\s*"([^+]+)"/; | ||||
| 176 | } | ||||||
| 177 | #use Data::Dumper; | ||||||
| 178 | #print STDERR "\n# code=[".Dumper($code)."]\n"; | ||||||
| 179 | } | ||||||
| 180 | |||||||
| 181 | |||||||
| 182 | # {\"initial\":[{\"large\":\"http://ecx.images-amazon.com/images/I/31cLTIXHKgL.jpg\",\"landing\":[\"http://ecx.images-amazon.com/images/I/31cLTIXHKgL._SY300_.jpg\"],\"thumb\":\"http://ecx.images-amazon.com/images/I/31cLTIXHKgL._SS40_.jpg\",\"main\":[\"http://ecx.images-amazon.com/images/I/31cLTIXHKgL._SX342_.jpg\",\"http://ecx.images-amazon.com/images/I/31cLTIXHKgL._SX385_.jpg\"]}]}; | ||||||
| 183 | |||||||
| 184 | 5 | 100 | 26 | if($data->{content}) { | |||
| 185 | 4 | 29 | $data->{content} =~ s/Amazon\.co\.uk.*?://i; | ||||
| 186 | 4 | 28 | $data->{content} =~ s/: Books.*//i; | ||||
| 187 | 4 | 37 | ($data->{title},$data->{author}) = split(/\s+by\s+/,$data->{content}); | ||||
| 188 | 4 | 50 | 30 | $data->{title} =~ s/^Buy\s+// if($data->{title}); | |||
| 189 | 4 | 100 | 38 | $data->{author} =~ s/\s*\(.*// if($data->{author}); | |||
| 190 | } | ||||||
| 191 | |||||||
| 192 | 5 | 100 | 64 | ($data->{publisher},$data->{pubdate}) = ($data->{published} =~ /\s*(.*?)(?:;.*?)?\s+\((.*?)\)/) if($data->{published}); | |||
| 193 | 5 | 100 | 20 | $data->{isbn10} =~ s/[^\dX]+//g if($data->{isbn10}); | |||
| 194 | 5 | 100 | 26 | $data->{isbn13} =~ s/\D+//g if($data->{isbn13}); | |||
| 195 | 5 | 100 | 21 | $data->{pubdate} =~ s/^.*?\(// if($data->{pubdate}); | |||
| 196 | |||||||
| 197 | 5 | 100 | 24 | return $self->handler("Could not extract data from Amazon UK result page.") | |||
| 198 | unless(defined $data->{isbn13}); | ||||||
| 199 | |||||||
| 200 | # trim top and tail | ||||||
| 201 | 4 | 100 | 28 | foreach (keys %$data) { next unless(defined $data->{$_});$data->{$_} =~ s/^\s+//;$data->{$_} =~ s/\s+$//; } | |||
| 68 | 88 | ||||||
| 64 | 99 | ||||||
| 64 | 220 | ||||||
| 202 | |||||||
| 203 | #use Data::Dumper; | ||||||
| 204 | #print STDERR "\n# data=[".Dumper($data)."]\n"; | ||||||
| 205 | |||||||
| 206 | 4 | 48 | my $bk = { | ||||
| 207 | 'ean13' => $data->{isbn13}, | ||||||
| 208 | 'isbn13' => $data->{isbn13}, | ||||||
| 209 | 'isbn10' => $data->{isbn10}, | ||||||
| 210 | 'isbn' => $data->{isbn13}, | ||||||
| 211 | 'author' => $data->{author}, | ||||||
| 212 | 'title' => $data->{title}, | ||||||
| 213 | 'image_link' => $data->{image_link}, | ||||||
| 214 | 'thumb_link' => $data->{thumb_link}, | ||||||
| 215 | 'publisher' => $data->{publisher}, | ||||||
| 216 | 'pubdate' => $data->{pubdate}, | ||||||
| 217 | 'book_link' => $mech->uri(), | ||||||
| 218 | 'content' => $data->{content}, | ||||||
| 219 | 'binding' => $data->{binding}, | ||||||
| 220 | 'pages' => $data->{pages}, | ||||||
| 221 | 'weight' => $data->{weight}, | ||||||
| 222 | 'width' => $data->{width}, | ||||||
| 223 | 'height' => $data->{height}, | ||||||
| 224 | 'depth' => $data->{depth}, | ||||||
| 225 | 'description' => $data->{description}, | ||||||
| 226 | 'html' => $html | ||||||
| 227 | }; | ||||||
| 228 | 4 | 192 | $self->book($bk); | ||||
| 229 | 4 | 58 | $self->found(1); | ||||
| 230 | 4 | 28 | return $self->book; | ||||
| 231 | } | ||||||
| 232 | |||||||
| 233 | q{currently reading: 'Torn Apart: The Life of Ian Curtis' by Mick Middles and Lindsay Reade}; | ||||||
| 234 | |||||||
| 235 | __END__ |