| blib/lib/WWW/Yandex/Catalog/LookupSite.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 14 | 78 | 17.9 |
| branch | 0 | 20 | 0.0 |
| condition | 0 | 2 | 0.0 |
| subroutine | 6 | 17 | 35.2 |
| pod | 10 | 10 | 100.0 |
| total | 30 | 127 | 23.6 |
| line | stmt | bran | cond | sub | pod | time | code | |
|---|---|---|---|---|---|---|---|---|
| 1 | package WWW::Yandex::Catalog::LookupSite; | |||||||
| 2 | ||||||||
| 3 | # Last updated July 27, 2014 | |||||||
| 4 | # | |||||||
| 5 | # Author: Irakliy Sunguryan ( www.sochi-travel.info ) | |||||||
| 6 | # Date Created: January 30, 2010 | |||||||
| 7 | ||||||||
| 8 | 1 | 1 | 69256 | use strict; | ||||
| 1 | 2 | |||||||
| 1 | 40 | |||||||
| 9 | 1 | 1 | 4 | use warnings; | ||||
| 1 | 2 | |||||||
| 1 | 35 | |||||||
| 10 | ||||||||
| 11 | 1 | 1 | 5 | use vars qw($VERSION); | ||||
| 1 | 313 | |||||||
| 1 | 59 | |||||||
| 12 | $VERSION = '0.11'; | |||||||
| 13 | ||||||||
| 14 | 1 | 1 | 1413 | use LWP::UserAgent; | ||||
| 1 | 195720 | |||||||
| 1 | 52 | |||||||
| 15 | ||||||||
| 16 | my $HAS_PUNYMOD; | |||||||
| 17 | 1 | 1 | 105 | BEGIN { $HAS_PUNYMOD = eval 'use URI::UTF8::Punycode; 1;'; } | ||||
| 1 | 1 | 1591 | ||||||
| 0 | ||||||||
| 0 | ||||||||
| 18 | ||||||||
| 19 | ||||||||
| 20 | sub new { | |||||||
| 21 | 0 | 0 | 1 | my $class = shift; | ||||
| 22 | 0 | my %options = @_; | ||||||
| 23 | ||||||||
| 24 | 0 | my $self = { | ||||||
| 25 | _tic => undef, | |||||||
| 26 | # undef - if there was an error getting or parsing data | |||||||
| 27 | # 0 - (a) when site is not present in catalog and tIC is < 10 | |||||||
| 28 | # - (b) when site is present in catalog, but the catalog | |||||||
| 29 | # reports it as zero (payed submission) | |||||||
| 30 | _shortDescr => undef, | |||||||
| 31 | # defined only when site is present in catalog; undef otherwise | |||||||
| 32 | _longDescr => undef, | |||||||
| 33 | # can be undef when site is present in catalog! | |||||||
| 34 | # not all sites in the catalog have long description | |||||||
| 35 | _categories => [], | |||||||
| 36 | # empty when site is not present in catalog | |||||||
| 37 | # at least one entry when present in catalog | |||||||
| 38 | _orderNum => undef, | |||||||
| 39 | # order number in the sub-category of catalog; "main" subcategory, | |||||||
| 40 | # when there are more than one. | |||||||
| 41 | # defined only when site is present in the catalog; undef otherwise | |||||||
| 42 | _uri => undef, | |||||||
| 43 | # URI as it is recorded in catalog. for example with/without 'www' prefix | |||||||
| 44 | # or it can be recorded with totally different address (narod.ru -> narod.yandex.ru) | |||||||
| 45 | # defined only when site is present in catalog; undef otherwise | |||||||
| 46 | }; | |||||||
| 47 | ||||||||
| 48 | 0 | $self->{ua} = LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION ); | ||||||
| 49 | ||||||||
| 50 | # Pass options on to LWP::UserAgent | |||||||
| 51 | 0 | foreach my $option ( keys %options ) { | ||||||
| 52 | 0 | $self->{ua}->$option( $options{$option} ); | ||||||
| 53 | } | |||||||
| 54 | ||||||||
| 55 | 0 | bless $self, $class; | ||||||
| 56 | 0 | return $self; | ||||||
| 57 | } | |||||||
| 58 | ||||||||
| 59 | ||||||||
| 60 | # Returns [ tIC, short description, long description, [list of catalogs], URI as returned by Yaca, order number in the main category ] | |||||||
| 61 | # "yaca" - Yandex Catalog | |||||||
| 62 | sub yaca_lookup { | |||||||
| 63 | 0 | 0 | 1 | my $self = shift; | ||||
| 64 | ||||||||
| 65 | 0 | 0 | my $address = shift || return; | |||||
| 66 | ||||||||
| 67 | # an $address is nomally a domain name (whatever level), but can include path too. | |||||||
| 68 | # scheme, authentication, port, and query strings are stripped -- | |||||||
| 69 | # assuming Yandex won't accept URIs that contain all this | |||||||
| 70 | ||||||||
| 71 | 0 | $self->{_tic} = $self->{_shortDescr} = $self->{_longDescr} = $self->{_orderNum} = $self->{_uri} = undef; | ||||||
| 72 | 0 | $self->{_categories} = []; | ||||||
| 73 | ||||||||
| 74 | 0 | $address =~ s|.*?://||; # loose scheme | ||||||
| 75 | 0 | $address =~ s|.*?(:.*?)?@||; # loose authentication | ||||||
| 76 | 0 | $address =~ s|(\w):\d+|$1|; # loose port | ||||||
| 77 | 0 | $address =~ s|\?.*||; # loose query | ||||||
| 78 | 0 | $address =~ s|/$||; # loose trailing slash | ||||||
| 79 | ||||||||
| 80 | 0 | my $resp = $self->{ua}->get( 'http://yaca.yandex.ru/yca/cy/ch/' . $address . '/' ); | ||||||
| 81 | 0 | 0 | return unless $resp->is_success; | |||||
| 82 | ||||||||
| 83 | 0 | my $contents = $resp->decoded_content; | ||||||
| 84 | ||||||||
| 85 | 0 | 0 | if( $contents =~ / / ) { |
|||||
| 86 | # "ресурс не описан в Яндекс.Каталоге" | |||||||
| 87 | # It's not in the catalog, but tIC is always displayed. | |||||||
| 88 | # Ex.: Индекс цитирования (тИЦ) ресурса — 10 | |||||||
| 89 | 0 | ( $self->{_tic} ) = $contents =~ / .*?\s(\d+)/s; |
||||||
| 90 | 0 | 0 | $self->{_tic} = 0 unless defined $self->{_tic}; | |||||
| 91 | } | |||||||
| 92 | else { | |||||||
| 93 | 0 | my( $entry ) = $contents =~ qr{( | ||||||
| 94 | ||||||||
| 95 | 0 | ( $self->{_orderNum}, $self->{_uri}, $self->{_shortDescr}, undef, $self->{_longDescr}, $self->{_tic} ) = | ||||||
| 96 | # $1 $2 $3 $4 $5 | |||||||
| 97 | $entry =~ qr{ | (\d+)\.\s* | .*(.*)(||||||
| 98 | ||||||||
| 99 | # main catalog | |||||||
| 100 | 0 | my( $path, $rubric ) = $contents =~ qr{ (.*?) \s* |
||||||
| 101 | 0 | 0 | if( $path ) { | |||||
| 102 | 0 | $path =~ s{?a.*?>|?h1>|\n}{}gs; # remove A, H1 tags and newline | ||||||
| 103 | 0 | $path =~ s|\x{0420}\x{0443}\x{0431}\x{0440}\x{0438}\x{043A}\x{0438} / ||; | ||||||
| 104 | # removed "Рубрики" - it always starts with this root word | |||||||
| 105 | # http://www.rishida.net/tools/conversion/ | |||||||
| 106 | 0 | 0 | push( @{$self->{_categories}}, $path.' / '.$rubric ) if $entry; | |||||
| 0 | ||||||||
| 107 | } | |||||||
| 108 | ||||||||
| 109 | # additional catalogs | |||||||
| 110 | 0 | ( $entry ) = $contents =~ qr{ (.*?) }s; |
||||||
| 111 | 0 | 0 | if( $entry ) { | |||||
| 112 | 0 | while( $entry =~ s{ | ||||||
| 113 | 0 | my $catPath = $1; | ||||||
| 114 | 0 | $catPath =~ s|\x{041A}\x{0430}\x{0442}\x{0430}\x{043B}\x{043E}\x{0433} / ||; | ||||||
| 115 | # removed "Каталог" - we know it's in the catalog | |||||||
| 116 | 0 | 0 | push( @{$self->{_categories}}, $catPath ) if $catPath; | |||||
| 0 | ||||||||
| 117 | } | |||||||
| 118 | } | |||||||
| 119 | } | |||||||
| 120 | ||||||||
| 121 | 0 | return [ $self->{_tic}, $self->{_shortDescr}, $self->{_longDescr}, $self->{_categories}, $self->{_uri}, $self->{_orderNum} ]; | ||||||
| 122 | } | |||||||
| 123 | ||||||||
| 124 | # Converts punycode in a IDN URL to utf8. | |||||||
| 125 | # Returns converted URL. | |||||||
| 126 | sub _punycode_utf8 { | |||||||
| 127 | 0 | 0 | my $uri = shift; | |||||
| 128 | ||||||||
| 129 | 0 | s/^\s+//, s/\s+$// for $uri; # trim $uri just in case | ||||||
| 130 | 0 | my( $schema, $domain, $path ) = $uri =~ m{(http.*?//)(.*?)(($|/|:).*)}; | ||||||
| 131 | # Ex.: http://www.domain.com:80/path?query#anchor -> 'http://' , 'www.domain.com' , ':80/path?query#anchor' | |||||||
| 132 | # I hope there are no urls with username/password links in YaCa | |||||||
| 133 | # I hope there are no non-http(s) links in YaCa | |||||||
| 134 | # I hope all links include schema part | |||||||
| 135 | # Anyway, from what I've seen in YaCa so far we should be Ok | |||||||
| 136 | ||||||||
| 137 | 0 | 0 | $domain = join( '.', map { /^xn--/ ? puny_dec($_) : $_ } split(/\./, $domain) ); | |||||
| 0 | ||||||||
| 138 | # split by dot -> convert only punycode parts -> glue 'em back together | |||||||
| 139 | ||||||||
| 140 | 0 | return $schema.$domain.$path; | ||||||
| 141 | } | |||||||
| 142 | ||||||||
| 143 | # == Convenience functions ================================= | |||||||
| 144 | ||||||||
| 145 | sub is_in_catalog { | |||||||
| 146 | 0 | 0 | 1 | my $self = shift; | ||||
| 147 | 0 | 0 | return scalar @{$self->{_categories}} > 0 ? 1 : 0; | |||||
| 0 | ||||||||
| 148 | } | |||||||
| 149 | ||||||||
| 150 | sub tic { | |||||||
| 151 | 0 | 0 | 1 | my $self = shift; | ||||
| 152 | 0 | return $self->{_tic}; | ||||||
| 153 | } | |||||||
| 154 | ||||||||
| 155 | sub short_description { | |||||||
| 156 | 0 | 0 | 1 | my $self = shift; | ||||
| 157 | 0 | return $self->{_shortDescr}; | ||||||
| 158 | } | |||||||
| 159 | ||||||||
| 160 | sub long_description { | |||||||
| 161 | 0 | 0 | 1 | my $self = shift; | ||||
| 162 | 0 | return $self->{_longDescr}; | ||||||
| 163 | } | |||||||
| 164 | ||||||||
| 165 | sub categories { | |||||||
| 166 | 0 | 0 | 1 | my $self = shift; | ||||
| 167 | 0 | return $self->{_categories}; | ||||||
| 168 | } | |||||||
| 169 | ||||||||
| 170 | sub order_number { | |||||||
| 171 | 0 | 0 | 1 | my $self = shift; | ||||
| 172 | 0 | return $self->{_orderNum}; | ||||||
| 173 | } | |||||||
| 174 | ||||||||
| 175 | sub uri { | |||||||
| 176 | 0 | 0 | 1 | my $self = shift; | ||||
| 177 | 0 | return $self->{_uri}; | ||||||
| 178 | } | |||||||
| 179 | ||||||||
| 180 | sub uri_utf8 { | |||||||
| 181 | 0 | 0 | 1 | my $self = shift; | ||||
| 182 | 0 | 0 | return $HAS_PUNYMOD ? _punycode_utf8( $self->{_uri} ) : $self->{_uri}; | |||||
| 183 | } | |||||||
| 184 | ||||||||
| 185 | 1; | |||||||
| 186 | ||||||||
| 187 | __END__ |