blib/lib/WWW/Yandex/Catalog/LookupSite.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 14 | 78 | 17.9 |
branch | 0 | 20 | 0.0 |
condition | 0 | 2 | 0.0 |
subroutine | 6 | 17 | 35.2 |
pod | 10 | 10 | 100.0 |
total | 30 | 127 | 23.6 |
line | stmt | bran | cond | sub | pod | time | code | |
---|---|---|---|---|---|---|---|---|
1 | package WWW::Yandex::Catalog::LookupSite; | |||||||
2 | ||||||||
3 | # Last updated July 27, 2014 | |||||||
4 | # | |||||||
5 | # Author: Irakliy Sunguryan ( www.sochi-travel.info ) | |||||||
6 | # Date Created: January 30, 2010 | |||||||
7 | ||||||||
8 | 1 | 1 | 69256 | use strict; | ||||
1 | 2 | |||||||
1 | 40 | |||||||
9 | 1 | 1 | 4 | use warnings; | ||||
1 | 2 | |||||||
1 | 35 | |||||||
10 | ||||||||
11 | 1 | 1 | 5 | use vars qw($VERSION); | ||||
1 | 313 | |||||||
1 | 59 | |||||||
12 | $VERSION = '0.11'; | |||||||
13 | ||||||||
14 | 1 | 1 | 1413 | use LWP::UserAgent; | ||||
1 | 195720 | |||||||
1 | 52 | |||||||
15 | ||||||||
16 | my $HAS_PUNYMOD; | |||||||
17 | 1 | 1 | 105 | BEGIN { $HAS_PUNYMOD = eval 'use URI::UTF8::Punycode; 1;'; } | ||||
1 | 1 | 1591 | ||||||
0 | ||||||||
0 | ||||||||
18 | ||||||||
19 | ||||||||
20 | sub new { | |||||||
21 | 0 | 0 | 1 | my $class = shift; | ||||
22 | 0 | my %options = @_; | ||||||
23 | ||||||||
24 | 0 | my $self = { | ||||||
25 | _tic => undef, | |||||||
26 | # undef - if there was an error getting or parsing data | |||||||
27 | # 0 - (a) when site is not present in catalog and tIC is < 10 | |||||||
28 | # - (b) when site is present in catalog, but the catalog | |||||||
29 | # reports it as zero (payed submission) | |||||||
30 | _shortDescr => undef, | |||||||
31 | # defined only when site is present in catalog; undef otherwise | |||||||
32 | _longDescr => undef, | |||||||
33 | # can be undef when site is present in catalog! | |||||||
34 | # not all sites in the catalog have long description | |||||||
35 | _categories => [], | |||||||
36 | # empty when site is not present in catalog | |||||||
37 | # at least one entry when present in catalog | |||||||
38 | _orderNum => undef, | |||||||
39 | # order number in the sub-category of catalog; "main" subcategory, | |||||||
40 | # when there are more than one. | |||||||
41 | # defined only when site is present in the catalog; undef otherwise | |||||||
42 | _uri => undef, | |||||||
43 | # URI as it is recorded in catalog. for example with/without 'www' prefix | |||||||
44 | # or it can be recorded with totally different address (narod.ru -> narod.yandex.ru) | |||||||
45 | # defined only when site is present in catalog; undef otherwise | |||||||
46 | }; | |||||||
47 | ||||||||
48 | 0 | $self->{ua} = LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION ); | ||||||
49 | ||||||||
50 | # Pass options on to LWP::UserAgent | |||||||
51 | 0 | foreach my $option ( keys %options ) { | ||||||
52 | 0 | $self->{ua}->$option( $options{$option} ); | ||||||
53 | } | |||||||
54 | ||||||||
55 | 0 | bless $self, $class; | ||||||
56 | 0 | return $self; | ||||||
57 | } | |||||||
58 | ||||||||
59 | ||||||||
60 | # Returns [ tIC, short description, long description, [list of catalogs], URI as returned by Yaca, order number in the main category ] | |||||||
61 | # "yaca" - Yandex Catalog | |||||||
62 | sub yaca_lookup { | |||||||
63 | 0 | 0 | 1 | my $self = shift; | ||||
64 | ||||||||
65 | 0 | 0 | my $address = shift || return; | |||||
66 | ||||||||
67 | # an $address is nomally a domain name (whatever level), but can include path too. | |||||||
68 | # scheme, authentication, port, and query strings are stripped -- | |||||||
69 | # assuming Yandex won't accept URIs that contain all this | |||||||
70 | ||||||||
71 | 0 | $self->{_tic} = $self->{_shortDescr} = $self->{_longDescr} = $self->{_orderNum} = $self->{_uri} = undef; | ||||||
72 | 0 | $self->{_categories} = []; | ||||||
73 | ||||||||
74 | 0 | $address =~ s|.*?://||; # loose scheme | ||||||
75 | 0 | $address =~ s|.*?(:.*?)?@||; # loose authentication | ||||||
76 | 0 | $address =~ s|(\w):\d+|$1|; # loose port | ||||||
77 | 0 | $address =~ s|\?.*||; # loose query | ||||||
78 | 0 | $address =~ s|/$||; # loose trailing slash | ||||||
79 | ||||||||
80 | 0 | my $resp = $self->{ua}->get( 'http://yaca.yandex.ru/yca/cy/ch/' . $address . '/' ); | ||||||
81 | 0 | 0 | return unless $resp->is_success; | |||||
82 | ||||||||
83 | 0 | my $contents = $resp->decoded_content; | ||||||
84 | ||||||||
85 | 0 | 0 | if( $contents =~ / / ) { |
|||||
86 | # "ресурс не описан в Яндекс.Каталоге" | |||||||
87 | # It's not in the catalog, but tIC is always displayed. | |||||||
88 | # Ex.: Индекс цитирования (тИЦ) ресурса — 10 | |||||||
89 | 0 | ( $self->{_tic} ) = $contents =~ / .*?\s(\d+)/s; |
||||||
90 | 0 | 0 | $self->{_tic} = 0 unless defined $self->{_tic}; | |||||
91 | } | |||||||
92 | else { | |||||||
93 | 0 | my( $entry ) = $contents =~ qr{( | ||||||
94 | ||||||||
95 | 0 | ( $self->{_orderNum}, $self->{_uri}, $self->{_shortDescr}, undef, $self->{_longDescr}, $self->{_tic} ) = | ||||||
96 | # $1 $2 $3 $4 $5 | |||||||
97 | $entry =~ qr{ | (\d+)\.\s* | .*(.*)(||||||
98 | ||||||||
99 | # main catalog | |||||||
100 | 0 | my( $path, $rubric ) = $contents =~ qr{ (.*?) \s* |
||||||
101 | 0 | 0 | if( $path ) { | |||||
102 | 0 | $path =~ s{?a.*?>|?h1>|\n}{}gs; # remove A, H1 tags and newline | ||||||
103 | 0 | $path =~ s|\x{0420}\x{0443}\x{0431}\x{0440}\x{0438}\x{043A}\x{0438} / ||; | ||||||
104 | # removed "Рубрики" - it always starts with this root word | |||||||
105 | # http://www.rishida.net/tools/conversion/ | |||||||
106 | 0 | 0 | push( @{$self->{_categories}}, $path.' / '.$rubric ) if $entry; | |||||
0 | ||||||||
107 | } | |||||||
108 | ||||||||
109 | # additional catalogs | |||||||
110 | 0 | ( $entry ) = $contents =~ qr{ (.*?) }s; |
||||||
111 | 0 | 0 | if( $entry ) { | |||||
112 | 0 | while( $entry =~ s{ | ||||||
113 | 0 | my $catPath = $1; | ||||||
114 | 0 | $catPath =~ s|\x{041A}\x{0430}\x{0442}\x{0430}\x{043B}\x{043E}\x{0433} / ||; | ||||||
115 | # removed "Каталог" - we know it's in the catalog | |||||||
116 | 0 | 0 | push( @{$self->{_categories}}, $catPath ) if $catPath; | |||||
0 | ||||||||
117 | } | |||||||
118 | } | |||||||
119 | } | |||||||
120 | ||||||||
121 | 0 | return [ $self->{_tic}, $self->{_shortDescr}, $self->{_longDescr}, $self->{_categories}, $self->{_uri}, $self->{_orderNum} ]; | ||||||
122 | } | |||||||
123 | ||||||||
124 | # Converts punycode in a IDN URL to utf8. | |||||||
125 | # Returns converted URL. | |||||||
126 | sub _punycode_utf8 { | |||||||
127 | 0 | 0 | my $uri = shift; | |||||
128 | ||||||||
129 | 0 | s/^\s+//, s/\s+$// for $uri; # trim $uri just in case | ||||||
130 | 0 | my( $schema, $domain, $path ) = $uri =~ m{(http.*?//)(.*?)(($|/|:).*)}; | ||||||
131 | # Ex.: http://www.domain.com:80/path?query#anchor -> 'http://' , 'www.domain.com' , ':80/path?query#anchor' | |||||||
132 | # I hope there are no urls with username/password links in YaCa | |||||||
133 | # I hope there are no non-http(s) links in YaCa | |||||||
134 | # I hope all links include schema part | |||||||
135 | # Anyway, from what I've seen in YaCa so far we should be Ok | |||||||
136 | ||||||||
137 | 0 | 0 | $domain = join( '.', map { /^xn--/ ? puny_dec($_) : $_ } split(/\./, $domain) ); | |||||
0 | ||||||||
138 | # split by dot -> convert only punycode parts -> glue 'em back together | |||||||
139 | ||||||||
140 | 0 | return $schema.$domain.$path; | ||||||
141 | } | |||||||
142 | ||||||||
143 | # == Convenience functions ================================= | |||||||
144 | ||||||||
145 | sub is_in_catalog { | |||||||
146 | 0 | 0 | 1 | my $self = shift; | ||||
147 | 0 | 0 | return scalar @{$self->{_categories}} > 0 ? 1 : 0; | |||||
0 | ||||||||
148 | } | |||||||
149 | ||||||||
150 | sub tic { | |||||||
151 | 0 | 0 | 1 | my $self = shift; | ||||
152 | 0 | return $self->{_tic}; | ||||||
153 | } | |||||||
154 | ||||||||
155 | sub short_description { | |||||||
156 | 0 | 0 | 1 | my $self = shift; | ||||
157 | 0 | return $self->{_shortDescr}; | ||||||
158 | } | |||||||
159 | ||||||||
160 | sub long_description { | |||||||
161 | 0 | 0 | 1 | my $self = shift; | ||||
162 | 0 | return $self->{_longDescr}; | ||||||
163 | } | |||||||
164 | ||||||||
165 | sub categories { | |||||||
166 | 0 | 0 | 1 | my $self = shift; | ||||
167 | 0 | return $self->{_categories}; | ||||||
168 | } | |||||||
169 | ||||||||
170 | sub order_number { | |||||||
171 | 0 | 0 | 1 | my $self = shift; | ||||
172 | 0 | return $self->{_orderNum}; | ||||||
173 | } | |||||||
174 | ||||||||
175 | sub uri { | |||||||
176 | 0 | 0 | 1 | my $self = shift; | ||||
177 | 0 | return $self->{_uri}; | ||||||
178 | } | |||||||
179 | ||||||||
180 | sub uri_utf8 { | |||||||
181 | 0 | 0 | 1 | my $self = shift; | ||||
182 | 0 | 0 | return $HAS_PUNYMOD ? _punycode_utf8( $self->{_uri} ) : $self->{_uri}; | |||||
183 | } | |||||||
184 | ||||||||
185 | 1; | |||||||
186 | ||||||||
187 | __END__ |