| blib/lib/WWW/Yandex/Catalog/LookupSite.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 14 | 78 | 17.9 |
| branch | 0 | 20 | 0.0 |
| condition | 0 | 2 | 0.0 |
| subroutine | 6 | 17 | 35.2 |
| pod | 10 | 10 | 100.0 |
| total | 30 | 127 | 23.6 |
| line | stmt | bran | cond | sub | pod | time | code | |
|---|---|---|---|---|---|---|---|---|
| 1 | package WWW::Yandex::Catalog::LookupSite; | |||||||
| 2 | ||||||||
| 3 | # ABSTRACT: DEPRECATED | |||||||
| 4 | ||||||||
| 5 | # Last updated July 27, 2014 | |||||||
| 6 | # | |||||||
| 7 | # Author: Irakliy Sunguryan ( www.sochi-travel.info ) | |||||||
| 8 | # Date Created: January 30, 2010 | |||||||
| 9 | ||||||||
| 10 | 1 | 1 | 76093 | use strict; | ||||
| 1 | 2 | |||||||
| 1 | 29 | |||||||
| 11 | 1 | 1 | 10 | use warnings; | ||||
| 1 | 2 | |||||||
| 1 | 27 | |||||||
| 12 | ||||||||
| 13 | 1 | 1 | 5 | use vars qw($VERSION); | ||||
| 1 | 3 | |||||||
| 1 | 65 | |||||||
| 14 | $VERSION = '0.12'; | |||||||
| 15 | ||||||||
| 16 | 1 | 1 | 787 | use LWP::UserAgent; | ||||
| 1 | 57776 | |||||||
| 1 | 39 | |||||||
| 17 | ||||||||
| 18 | my $HAS_PUNYMOD; | |||||||
| 19 | 1 | 1 | 65 | BEGIN { $HAS_PUNYMOD = eval 'use URI::UTF8::Punycode; 1;'; } | ||||
| 1 | 1 | 1380 | ||||||
| 0 | ||||||||
| 0 | ||||||||
| 20 | ||||||||
| 21 | ||||||||
| 22 | sub new { | |||||||
| 23 | 0 | 0 | 1 | my $class = shift; | ||||
| 24 | 0 | my %options = @_; | ||||||
| 25 | ||||||||
| 26 | 0 | my $self = { | ||||||
| 27 | _tic => undef, | |||||||
| 28 | # undef - if there was an error getting or parsing data | |||||||
| 29 | # 0 - (a) when site is not present in catalog and tIC is < 10 | |||||||
| 30 | # - (b) when site is present in catalog, but the catalog | |||||||
| 31 | # reports it as zero (payed submission) | |||||||
| 32 | _shortDescr => undef, | |||||||
| 33 | # defined only when site is present in catalog; undef otherwise | |||||||
| 34 | _longDescr => undef, | |||||||
| 35 | # can be undef when site is present in catalog! | |||||||
| 36 | # not all sites in the catalog have long description | |||||||
| 37 | _categories => [], | |||||||
| 38 | # empty when site is not present in catalog | |||||||
| 39 | # at least one entry when present in catalog | |||||||
| 40 | _orderNum => undef, | |||||||
| 41 | # order number in the sub-category of catalog; "main" subcategory, | |||||||
| 42 | # when there are more than one. | |||||||
| 43 | # defined only when site is present in the catalog; undef otherwise | |||||||
| 44 | _uri => undef, | |||||||
| 45 | # URI as it is recorded in catalog. for example with/without 'www' prefix | |||||||
| 46 | # or it can be recorded with totally different address (narod.ru -> narod.yandex.ru) | |||||||
| 47 | # defined only when site is present in catalog; undef otherwise | |||||||
| 48 | }; | |||||||
| 49 | ||||||||
| 50 | 0 | $self->{ua} = LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION ); | ||||||
| 51 | ||||||||
| 52 | # Pass options on to LWP::UserAgent | |||||||
| 53 | 0 | foreach my $option ( keys %options ) { | ||||||
| 54 | 0 | $self->{ua}->$option( $options{$option} ); | ||||||
| 55 | } | |||||||
| 56 | ||||||||
| 57 | 0 | bless $self, $class; | ||||||
| 58 | 0 | return $self; | ||||||
| 59 | } | |||||||
| 60 | ||||||||
| 61 | ||||||||
| 62 | # Returns [ tIC, short description, long description, [list of catalogs], URI as returned by Yaca, order number in the main category ] | |||||||
| 63 | # "yaca" - Yandex Catalog | |||||||
| 64 | sub yaca_lookup { | |||||||
| 65 | 0 | 0 | 1 | my $self = shift; | ||||
| 66 | ||||||||
| 67 | 0 | 0 | my $address = shift || return; | |||||
| 68 | ||||||||
| 69 | # an $address is nomally a domain name (whatever level), but can include path too. | |||||||
| 70 | # scheme, authentication, port, and query strings are stripped -- | |||||||
| 71 | # assuming Yandex won't accept URIs that contain all this | |||||||
| 72 | ||||||||
| 73 | 0 | $self->{_tic} = $self->{_shortDescr} = $self->{_longDescr} = $self->{_orderNum} = $self->{_uri} = undef; | ||||||
| 74 | 0 | $self->{_categories} = []; | ||||||
| 75 | ||||||||
| 76 | 0 | $address =~ s|.*?://||; # loose scheme | ||||||
| 77 | 0 | $address =~ s|.*?(:.*?)?@||; # loose authentication | ||||||
| 78 | 0 | $address =~ s|(\w):\d+|$1|; # loose port | ||||||
| 79 | 0 | $address =~ s|\?.*||; # loose query | ||||||
| 80 | 0 | $address =~ s|/$||; # loose trailing slash | ||||||
| 81 | ||||||||
| 82 | 0 | my $resp = $self->{ua}->get( 'http://yaca.yandex.ru/yca/cy/ch/' . $address . '/' ); | ||||||
| 83 | 0 | 0 | return unless $resp->is_success; | |||||
| 84 | ||||||||
| 85 | 0 | my $contents = $resp->decoded_content; | ||||||
| 86 | ||||||||
| 87 | 0 | 0 | if( $contents =~ / / ) { |
|||||
| 88 | # "ресурс не описан в Яндекс.Каталоге" | |||||||
| 89 | # It's not in the catalog, but tIC is always displayed. | |||||||
| 90 | # Ex.: Индекс цитирования (тИЦ) ресурса — 10 | |||||||
| 91 | 0 | ( $self->{_tic} ) = $contents =~ / .*?\s(\d+)/s; |
||||||
| 92 | 0 | 0 | $self->{_tic} = 0 unless defined $self->{_tic}; | |||||
| 93 | } | |||||||
| 94 | else { | |||||||
| 95 | 0 | my( $entry ) = $contents =~ qr{( | ||||||
| 96 | ||||||||
| 97 | 0 | ( $self->{_orderNum}, $self->{_uri}, $self->{_shortDescr}, undef, $self->{_longDescr}, $self->{_tic} ) = | ||||||
| 98 | # $1 $2 $3 $4 $5 | |||||||
| 99 | $entry =~ qr{ | (\d+)\.\s* | .*(.*)(||||||
| 100 | ||||||||
| 101 | # main catalog | |||||||
| 102 | 0 | my( $path, $rubric ) = $contents =~ qr{ (.*?) \s* |
||||||
| 103 | 0 | 0 | if( $path ) { | |||||
| 104 | 0 | $path =~ s{?a.*?>|?h1>|\n}{}gs; # remove A, H1 tags and newline | ||||||
| 105 | 0 | $path =~ s|\x{0420}\x{0443}\x{0431}\x{0440}\x{0438}\x{043A}\x{0438} / ||; | ||||||
| 106 | # removed "Рубрики" - it always starts with this root word | |||||||
| 107 | # http://www.rishida.net/tools/conversion/ | |||||||
| 108 | 0 | 0 | push( @{$self->{_categories}}, $path.' / '.$rubric ) if $entry; | |||||
| 0 | ||||||||
| 109 | } | |||||||
| 110 | ||||||||
| 111 | # additional catalogs | |||||||
| 112 | 0 | ( $entry ) = $contents =~ qr{ (.*?) }s; |
||||||
| 113 | 0 | 0 | if( $entry ) { | |||||
| 114 | 0 | while( $entry =~ s{ | ||||||
| 115 | 0 | my $catPath = $1; | ||||||
| 116 | 0 | $catPath =~ s|\x{041A}\x{0430}\x{0442}\x{0430}\x{043B}\x{043E}\x{0433} / ||; | ||||||
| 117 | # removed "Каталог" - we know it's in the catalog | |||||||
| 118 | 0 | 0 | push( @{$self->{_categories}}, $catPath ) if $catPath; | |||||
| 0 | ||||||||
| 119 | } | |||||||
| 120 | } | |||||||
| 121 | } | |||||||
| 122 | ||||||||
| 123 | 0 | return [ $self->{_tic}, $self->{_shortDescr}, $self->{_longDescr}, $self->{_categories}, $self->{_uri}, $self->{_orderNum} ]; | ||||||
| 124 | } | |||||||
| 125 | ||||||||
| 126 | # Converts punycode in a IDN URL to utf8. | |||||||
| 127 | # Returns converted URL. | |||||||
| 128 | sub _punycode_utf8 { | |||||||
| 129 | 0 | 0 | my $uri = shift; | |||||
| 130 | ||||||||
| 131 | 0 | s/^\s+//, s/\s+$// for $uri; # trim $uri just in case | ||||||
| 132 | 0 | my( $schema, $domain, $path ) = $uri =~ m{(http.*?//)(.*?)(($|/|:).*)}; | ||||||
| 133 | # Ex.: http://www.domain.com:80/path?query#anchor -> 'http://' , 'www.domain.com' , ':80/path?query#anchor' | |||||||
| 134 | # I hope there are no urls with username/password links in YaCa | |||||||
| 135 | # I hope there are no non-http(s) links in YaCa | |||||||
| 136 | # I hope all links include schema part | |||||||
| 137 | # Anyway, from what I've seen in YaCa so far we should be Ok | |||||||
| 138 | ||||||||
| 139 | 0 | 0 | $domain = join( '.', map { /^xn--/ ? puny_dec($_) : $_ } split(/\./, $domain) ); | |||||
| 0 | ||||||||
| 140 | # split by dot -> convert only punycode parts -> glue 'em back together | |||||||
| 141 | ||||||||
| 142 | 0 | return $schema.$domain.$path; | ||||||
| 143 | } | |||||||
| 144 | ||||||||
| 145 | # == Convenience functions ================================= | |||||||
| 146 | ||||||||
| 147 | sub is_in_catalog { | |||||||
| 148 | 0 | 0 | 1 | my $self = shift; | ||||
| 149 | 0 | 0 | return scalar @{$self->{_categories}} > 0 ? 1 : 0; | |||||
| 0 | ||||||||
| 150 | } | |||||||
| 151 | ||||||||
| 152 | sub tic { | |||||||
| 153 | 0 | 0 | 1 | my $self = shift; | ||||
| 154 | 0 | return $self->{_tic}; | ||||||
| 155 | } | |||||||
| 156 | ||||||||
| 157 | sub short_description { | |||||||
| 158 | 0 | 0 | 1 | my $self = shift; | ||||
| 159 | 0 | return $self->{_shortDescr}; | ||||||
| 160 | } | |||||||
| 161 | ||||||||
| 162 | sub long_description { | |||||||
| 163 | 0 | 0 | 1 | my $self = shift; | ||||
| 164 | 0 | return $self->{_longDescr}; | ||||||
| 165 | } | |||||||
| 166 | ||||||||
| 167 | sub categories { | |||||||
| 168 | 0 | 0 | 1 | my $self = shift; | ||||
| 169 | 0 | return $self->{_categories}; | ||||||
| 170 | } | |||||||
| 171 | ||||||||
| 172 | sub order_number { | |||||||
| 173 | 0 | 0 | 1 | my $self = shift; | ||||
| 174 | 0 | return $self->{_orderNum}; | ||||||
| 175 | } | |||||||
| 176 | ||||||||
| 177 | sub uri { | |||||||
| 178 | 0 | 0 | 1 | my $self = shift; | ||||
| 179 | 0 | return $self->{_uri}; | ||||||
| 180 | } | |||||||
| 181 | ||||||||
| 182 | sub uri_utf8 { | |||||||
| 183 | 0 | 0 | 1 | my $self = shift; | ||||
| 184 | 0 | 0 | return $HAS_PUNYMOD ? _punycode_utf8( $self->{_uri} ) : $self->{_uri}; | |||||
| 185 | } | |||||||
| 186 | ||||||||
| 187 | 1; | |||||||
| 188 | ||||||||
| 189 | __END__ |