| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::Scraper::ISBN::AmazonDE_Driver; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 4 |  |  | 4 |  | 113581 | use warnings; | 
|  | 4 |  |  |  |  | 9 |  | 
|  | 4 |  |  |  |  | 140 |  | 
| 4 | 4 |  |  | 4 |  | 23 | use strict; | 
|  | 4 |  |  |  |  | 10 |  | 
|  | 4 |  |  |  |  | 123 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 4 |  |  | 4 |  | 1908 | use WWW::Scraper::ISBN::Driver; | 
|  | 4 |  |  |  |  | 2173 |  | 
|  | 4 |  |  |  |  | 120 |  | 
| 7 | 4 |  |  | 4 |  | 25 | use base qw(WWW::Scraper::ISBN::Driver); | 
|  | 4 |  |  |  |  | 7 |  | 
|  | 4 |  |  |  |  | 393 |  | 
| 8 | 4 |  |  | 4 |  | 4983 | use WWW::Mechanize; | 
|  | 4 |  |  |  |  | 760249 |  | 
|  | 4 |  |  |  |  | 188 |  | 
| 9 | 4 |  |  | 4 |  | 4213 | use Web::Scraper; | 
|  | 4 |  |  |  |  | 311801 |  | 
|  | 4 |  |  |  |  | 63 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 4 |  |  | 4 |  | 376 | use constant    AMAZON => 'http://www.amazon.de/'; | 
|  | 4 |  |  |  |  | 11 |  | 
|  | 4 |  |  |  |  | 705 |  | 
| 12 | 4 |  |  | 4 |  | 26 | use constant    SEARCH => 'http://www.amazon.de/'; | 
|  | 4 |  |  |  |  | 7 |  | 
|  | 4 |  |  |  |  | 181 |  | 
| 13 | 4 |  |  | 4 |  | 52 | use constant    DIRECT => 'http://www.amazon.de/gp/product/'; | 
|  | 4 |  |  |  |  | 9 |  | 
|  | 4 |  |  |  |  | 3427 |  | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | our $DEBUG = $ENV{ISBN_DRIVER_DEBUG}; | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | # ABSTRACT: Search driver for the (DE) Amazon online catalog. | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | our $VERSION = '0.25'; | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | sub search { | 
| 23 | 1 |  |  | 1 | 1 | 250 | my ($self,$isbn) = @_; | 
| 24 |  |  |  |  |  |  |  | 
| 25 | 1 |  |  |  |  | 8 | $self->found(0); | 
| 26 | 1 |  |  |  |  | 21 | $self->book(undef); | 
| 27 |  |  |  |  |  |  |  | 
| 28 | 1 |  |  |  |  | 14 | my $mechanize = WWW::Mechanize->new(); | 
| 29 | 1 |  |  |  |  | 29532 | $mechanize->agent_alias( 'Linux Mozilla' ); | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | #    $mechanize->get( SEARCH ); | 
| 32 |  |  |  |  |  |  | #    return    $self->handler('Error loading amazon.de form web page (unreachable?)') | 
| 33 |  |  |  |  |  |  | #        unless($mechanize->success()); | 
| 34 |  |  |  |  |  |  | # | 
| 35 | 1 |  |  |  |  | 109 | my ($index,$input) = (0,0); | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | #    $mechanize->form_name('site-search') | 
| 38 |  |  |  |  |  |  | #        or return $self->handler('Error parsing amazon.de form'); | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | #    my $keyword ='search-alias=stripbooks'; | 
| 41 |  |  |  |  |  |  | #    $mechanize->set_fields( | 
| 42 |  |  |  |  |  |  | #        'field-keywords' => $isbn, | 
| 43 |  |  |  |  |  |  | #        'url'            => $keyword | 
| 44 |  |  |  |  |  |  | #    ); | 
| 45 |  |  |  |  |  |  | #    $mechanize->submit(); | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | #    return    $self->handler('Error about form submission (form changed?)') | 
| 48 |  |  |  |  |  |  | #        unless($mechanize->success()); | 
| 49 |  |  |  |  |  |  |  | 
| 50 | 1 |  |  |  |  | 5 | (my $norm_isbn = $isbn) =~ s/[^0-9]//g; | 
| 51 | 1 |  |  |  |  | 4 | my $url = DIRECT . $norm_isbn; | 
| 52 | 1 |  |  |  |  | 7 | $mechanize->get( $url ); | 
| 53 |  |  |  |  |  |  |  | 
| 54 | 1 | 50 |  |  |  | 3122680 | return $self->handler( "No success when trying to get $url" ) | 
| 55 |  |  |  |  |  |  | unless $mechanize->success; | 
| 56 |  |  |  |  |  |  |  | 
| 57 | 1 |  |  |  |  | 42 | my $content = $mechanize->content(); | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | #$DEBUG and warn $content; | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | my $scraper = scraper { | 
| 62 | 1 |  |  | 1 |  | 983742 | process "title"                    , title       => 'TEXT'; | 
| 63 | 1 |  |  |  |  | 148126 | process "meta[name=\"description\"]" , content     => '@content'; | 
| 64 |  |  |  |  |  |  | process 'script'                   , 'scripts[]' => sub { | 
| 65 | 54 |  |  |  |  | 160632 | my $script = join '', @{$_->content_array_ref}; | 
|  | 54 |  |  |  |  | 134 |  | 
| 66 | 54 | 50 |  |  |  | 552 | $script =~ /registerImage\("original_image"/ ? $script : (); | 
| 67 | 1 |  |  |  |  | 402513 | }; | 
| 68 | 1 |  |  |  |  | 54 | }; | 
| 69 |  |  |  |  |  |  |  | 
| 70 | 1 |  |  |  |  | 18 | my $sresult = $scraper->scrape( $content ); | 
| 71 |  |  |  |  |  |  |  | 
| 72 | 1 |  |  |  |  | 31243 | my ($thumb,$image) = $sresult->{scripts}->[0] =~ /original_image","([^"]+)"\s*,\s*" | 
| 73 | 1 |  |  |  |  | 135 | my ($pub) = $content =~ m{Verlag:\s*(.*?)}msx; | 
| 74 |  |  |  |  |  |  |  | 
| 75 | 1 |  |  |  |  | 10 | my $data = { | 
| 76 |  |  |  |  |  |  | content    => $sresult->{content}, | 
| 77 |  |  |  |  |  |  | thumb_link => $thumb, | 
| 78 |  |  |  |  |  |  | image_link => $image, | 
| 79 |  |  |  |  |  |  | published  => $pub, | 
| 80 |  |  |  |  |  |  | title      => $sresult->{title}, | 
| 81 |  |  |  |  |  |  | }; | 
| 82 |  |  |  |  |  |  |  | 
| 83 | 1 | 50 |  |  |  | 8 | return $self->handler("Could not extract data from amazon.de result page.") | 
| 84 |  |  |  |  |  |  | unless(defined $data); | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | # trim top and tail | 
| 87 | 1 |  |  |  |  | 7 | foreach (keys %$data) { | 
| 88 | 5 | 100 |  |  |  | 18 | next unless defined $data->{$_}; | 
| 89 | 3 |  |  |  |  | 10 | $data->{$_} =~ s/^\s+//; | 
| 90 | 3 |  |  |  |  | 22 | $data->{$_} =~ s/\s+$//; | 
| 91 |  |  |  |  |  |  | } | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | #    ($data->{title},$data->{author}) = | 
| 94 |  |  |  |  |  |  | #        ($data->{content} =~ | 
| 95 |  |  |  |  |  |  | #                  / | 
| 96 |  |  |  |  |  |  | #                  Amazon.de\s*:\s* | 
| 97 |  |  |  |  |  |  | #                  (.+?) | 
| 98 |  |  |  |  |  |  | #                  \s*:\s*([^:]+)\s*: | 
| 99 |  |  |  |  |  |  | #                  /x); | 
| 100 |  |  |  |  |  |  | #                  #\s*(?:(?:English\sBooks?)|Bücher|Bücher|Bücher).* | 
| 101 |  |  |  |  |  |  | #    #$data->{title} =~ s!\(.*?\)$!!; | 
| 102 |  |  |  |  |  |  |  | 
| 103 | 1 |  |  |  |  | 11 | my @tmp_info = map{ s{\A\s*}{}; $_ }split /:/, $data->{content}; | 
|  | 1 |  |  |  |  | 6 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 104 | 1 |  |  |  |  | 4 | @{ $data }{ qw/title author/ } = @tmp_info[0,-2]; | 
|  | 1 |  |  |  |  | 4 |  | 
| 105 |  |  |  |  |  |  |  | 
| 106 | 1 | 50 |  |  |  | 74 | if ( $data->{author} =~ /\A\d+/ ) { | 
| 107 | 0 |  |  |  |  | 0 | my ($index) = grep{ $tmp_info[$_] eq $data->{author} } reverse ( 0 .. $#tmp_info ); | 
|  | 0 |  |  |  |  | 0 |  | 
| 108 | 0 |  |  |  |  | 0 | $data->{author} = $tmp_info[$index-1]; | 
| 109 |  |  |  |  |  |  | } | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | #my @tmp_info = split /:/, $data->{content}; | 
| 112 |  |  |  |  |  |  | #@{ $data }{ qw/title author/ } = map{ s/^\s*//; $_ }@tmp_info[0,-3]; | 
| 113 |  |  |  |  |  |  |  | 
| 114 | 1 |  |  |  |  | 22 | ($data->{publisher},$data->{pubdate}) = | 
| 115 |  |  |  |  |  |  | ($data->{published} =~ /\s*(.*?)(?:;.*?)?\s+\(([^)]*)/); | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 1 |  |  |  |  | 12 | my $bk = { | 
| 118 |  |  |  |  |  |  | 'isbn'        => $isbn, | 
| 119 |  |  |  |  |  |  | 'author'      => $data->{author}, | 
| 120 |  |  |  |  |  |  | 'title'       => $data->{title}, | 
| 121 |  |  |  |  |  |  | 'image_link'  => $data->{image_link}, | 
| 122 |  |  |  |  |  |  | 'thumb_link'  => $data->{thumb_link}, | 
| 123 |  |  |  |  |  |  | 'publisher'   => $data->{publisher}, | 
| 124 |  |  |  |  |  |  | 'pubdate'     => $data->{pubdate}, | 
| 125 |  |  |  |  |  |  | 'book_link'   => $mechanize->uri() | 
| 126 |  |  |  |  |  |  | }; | 
| 127 |  |  |  |  |  |  |  | 
| 128 | 1 |  |  |  |  | 64 | $self->book($bk); | 
| 129 | 1 |  |  |  |  | 18 | $self->found(1); | 
| 130 | 1 |  |  |  |  | 12 | return $self->book; | 
| 131 |  |  |  |  |  |  | } | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | 1; # End of WWW::Scraper::ISBN::AmazonDE_Driver | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | __END__ |