| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::Crawler::Mojo::ScraperUtil; | 
| 2 | 10 |  |  | 10 |  | 60 | use strict; | 
|  | 10 |  |  |  |  | 18 |  | 
|  | 10 |  |  |  |  | 256 |  | 
| 3 | 10 |  |  | 10 |  | 44 | use warnings; | 
|  | 10 |  |  |  |  | 16 |  | 
|  | 10 |  |  |  |  | 271 |  | 
| 4 | 10 |  |  | 10 |  | 68 | use Mojo::Base -base; | 
|  | 10 |  |  |  |  | 16 |  | 
|  | 10 |  |  |  |  | 56 |  | 
| 5 | 10 |  |  | 10 |  | 1380 | use Encode qw(find_encoding); | 
|  | 10 |  |  |  |  | 17 |  | 
|  | 10 |  |  |  |  | 554 |  | 
| 6 | 10 |  |  | 10 |  | 67 | use Exporter 'import'; | 
|  | 10 |  |  |  |  | 24 |  | 
|  | 10 |  |  |  |  | 18130 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | our @EXPORT_OK = qw(collect_urls_css html_handler_presets reduce_html_handlers | 
| 9 |  |  |  |  |  |  | guess_encoding encoder decoded_body resolve_href); | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | my $charset_re = qr{\bcharset\s*=\s*['"]?([a-zA-Z0-9_\-]+)['"]?}i; | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | sub collect_urls_css { | 
| 14 | 6 | 100 | 50 | 6 | 1 | 18090 | map { s/^(['"])// && s/$1$//; $_ } (shift || '') =~ m{url\((.+?)\)}ig; | 
|  | 12 |  |  |  |  | 76 |  | 
|  | 12 |  |  |  |  | 39 |  | 
| 15 |  |  |  |  |  |  | } | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | sub decoded_body { | 
| 18 | 24 |  |  | 24 | 1 | 59 | my $res = shift; | 
| 19 | 24 |  |  |  |  | 75 | return encoder(guess_encoding($res))->decode($res->body); | 
| 20 |  |  |  |  |  |  | } | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | sub encoder { | 
| 23 | 29 |  | 100 | 29 | 1 | 595 | for (shift || 'utf-8', 'utf-8') { | 
| 24 | 29 | 50 |  |  |  | 122 | if (my $enc = find_encoding($_)) { | 
| 25 | 29 |  |  |  |  | 12836 | return $enc; | 
| 26 |  |  |  |  |  |  | } | 
| 27 |  |  |  |  |  |  | } | 
| 28 |  |  |  |  |  |  | } | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | sub guess_encoding { | 
| 31 | 28 |  |  | 28 | 1 | 1924 | my $res  = shift; | 
| 32 | 28 |  |  |  |  | 86 | my $type = $res->headers->content_type; | 
| 33 | 28 | 50 |  |  |  | 442 | return unless ($type); | 
| 34 | 28 |  |  |  |  | 166 | my $charset = ($type =~ $charset_re)[0]; | 
| 35 | 28 | 100 |  |  |  | 89 | return $charset                         if ($charset); | 
| 36 | 19 | 100 |  |  |  | 177 | return _guess_encoding_html($res->body) if ($type =~ qr{text/(html|xml)}); | 
| 37 | 1 | 50 |  |  |  | 12 | return _guess_encoding_css($res->body)  if ($type =~ qr{text/css}); | 
| 38 |  |  |  |  |  |  | } | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | sub html_handler_presets { | 
| 41 |  |  |  |  |  |  | return { | 
| 42 | 12 |  |  | 12 |  | 30 | 'script[src]'  => sub { $_[0]->{src} }, | 
| 43 | 14 |  |  | 14 |  | 36 | 'link[href]'   => sub { $_[0]->{href} }, | 
| 44 | 38 |  |  | 38 |  | 84 | 'a[href]'      => sub { $_[0]->{href} }, | 
| 45 | 1 |  |  | 1 |  | 3 | 'img[src]'     => sub { $_[0]->{src} }, | 
| 46 | 4 |  |  | 4 |  | 13 | 'area'         => sub { $_[0]->{href}, $_[0]->{ping} }, | 
| 47 | 0 |  |  | 0 |  | 0 | 'embed[src]'   => sub { $_[0]->{src} }, | 
| 48 | 0 |  |  | 0 |  | 0 | 'frame[src]'   => sub { $_[0]->{src} }, | 
| 49 | 0 |  |  | 0 |  | 0 | 'iframe[src]'  => sub { $_[0]->{src} }, | 
| 50 | 0 |  |  | 0 |  | 0 | 'input[src]'   => sub { $_[0]->{src} }, | 
| 51 | 0 |  |  | 0 |  | 0 | 'object[data]' => sub { $_[0]->{data} }, | 
| 52 |  |  |  |  |  |  | 'form' => sub { | 
| 53 | 31 |  |  | 31 |  | 75603 | my $dom = shift; | 
| 54 | 31 |  |  |  |  | 48 | my (%seed, $submit); | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | $dom->find("[name],[type='submit'],[type='image']")->each( | 
| 57 |  |  |  |  |  |  | sub { | 
| 58 | 81 |  |  |  |  | 14627 | my $e = shift; | 
| 59 | 81 | 100 | 100 |  |  | 155 | $seed{my $name = $e->{name}} ||= [] if $e->{name}; | 
| 60 |  |  |  |  |  |  |  | 
| 61 | 81 | 100 | 100 |  |  | 1782 | if ($e->tag eq 'select' && $name) { | 
|  |  | 100 |  |  |  |  |  | 
| 62 | 5 |  |  |  |  | 58 | my $found = 0; | 
| 63 | 5 | 100 |  |  |  | 7 | if (exists $e->{multiple}) { | 
|  |  | 100 |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | $e->find('option[selected]')->each( | 
| 65 |  |  |  |  |  |  | sub { | 
| 66 | 2 |  |  |  |  | 314 | push(@{$seed{$name}}, shift->{value}); | 
|  | 2 |  |  |  |  | 7 |  | 
| 67 | 2 |  |  |  |  | 25 | $found++; | 
| 68 |  |  |  |  |  |  | } | 
| 69 | 1 |  |  |  |  | 13 | ); | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  | elsif (my $opts = $e->at('option[selected]')) { | 
| 72 | 2 |  |  |  |  | 456 | push(@{$seed{$name}}, $opts->{value}); | 
|  | 2 |  |  |  |  | 5 |  | 
| 73 | 2 |  |  |  |  | 25 | $found++; | 
| 74 |  |  |  |  |  |  | } | 
| 75 | 5 | 100 |  |  |  | 544 | if (!$found) { | 
| 76 |  |  |  |  |  |  | $e->find('option:nth-child(1)')->each( | 
| 77 |  |  |  |  |  |  | sub { | 
| 78 | 2 |  |  |  |  | 941 | push(@{$seed{$name}}, shift->{value}); | 
|  | 2 |  |  |  |  | 7 |  | 
| 79 |  |  |  |  |  |  | } | 
| 80 | 2 |  |  |  |  | 35 | ); | 
| 81 |  |  |  |  |  |  | } | 
| 82 |  |  |  |  |  |  | } | 
| 83 |  |  |  |  |  |  | elsif ($e->tag eq 'textarea') { | 
| 84 | 5 |  |  |  |  | 95 | push(@{$seed{$name}}, $e->text); | 
|  | 5 |  |  |  |  | 13 |  | 
| 85 |  |  |  |  |  |  | } | 
| 86 |  |  |  |  |  |  |  | 
| 87 | 81 | 100 |  |  |  | 1567 | return unless (my $type = $e->{type}); | 
| 88 |  |  |  |  |  |  |  | 
| 89 | 69 | 100 | 100 |  |  | 820 | if (!$submit && grep { $_ eq $type } qw{submit image}) { | 
|  | 122 |  |  |  |  | 261 |  | 
| 90 | 27 |  |  |  |  | 39 | $submit = 1; | 
| 91 | 27 | 100 |  |  |  | 43 | push(@{$seed{$name}}, $e->{value}) if $name; | 
|  | 6 |  |  |  |  | 19 |  | 
| 92 |  |  |  |  |  |  | } | 
| 93 | 69 | 100 |  |  |  | 183 | if ($name) { | 
| 94 | 47 | 100 |  |  |  | 57 | if (grep { $_ eq $type } qw{text hidden number password date}) { | 
|  | 235 | 50 |  |  |  | 313 |  | 
|  |  | 100 |  |  |  |  |  | 
| 95 | 23 |  |  |  |  | 28 | push(@{$seed{$name}}, $e->{value}); | 
|  | 23 |  |  |  |  | 50 |  | 
| 96 |  |  |  |  |  |  | } | 
| 97 | 24 |  |  |  |  | 43 | elsif (grep { $_ eq $type } qw{checkbox}) { | 
| 98 | 0 | 0 |  |  |  | 0 | push(@{$seed{$name}}, $e->{value}) if (exists $e->{checked}); | 
|  | 0 |  |  |  |  | 0 |  | 
| 99 |  |  |  |  |  |  | } | 
| 100 | 24 |  |  |  |  | 56 | elsif (grep { $_ eq $type } qw{radio}) { | 
| 101 | 9 | 100 |  |  |  | 13 | push(@{$seed{$name}}, $e->{value}) if (exists $e->{checked}); | 
|  | 3 |  |  |  |  | 37 |  | 
| 102 |  |  |  |  |  |  | } | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  | } | 
| 105 | 31 |  |  |  |  | 70 | ); | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | return [ | 
| 108 |  |  |  |  |  |  | $dom->{action} || '', | 
| 109 | 31 |  | 100 |  |  | 694 | uc($dom->{method} || 'GET'), | 
|  |  |  | 100 |  |  |  |  | 
| 110 |  |  |  |  |  |  | Mojo::Parameters->new(%seed) | 
| 111 |  |  |  |  |  |  | ]; | 
| 112 |  |  |  |  |  |  | }, | 
| 113 |  |  |  |  |  |  | 'meta[content]' => sub { | 
| 114 |  |  |  |  |  |  | return $1 | 
| 115 |  |  |  |  |  |  | if ($_[0] =~ qr{http\-equiv="?Refresh"?}i | 
| 116 | 2 | 100 | 50 | 2 |  | 11 | && (($_[0]->{content} || '') =~ qr{URL=(.+)}i)[0]); | 
|  |  |  | 66 |  |  |  |  | 
| 117 | 1 |  |  |  |  | 45 | return; | 
| 118 |  |  |  |  |  |  | }, | 
| 119 |  |  |  |  |  |  | 'style' => sub { | 
| 120 | 1 |  |  | 1 |  | 4 | collect_urls_css(shift->content); | 
| 121 |  |  |  |  |  |  | }, | 
| 122 |  |  |  |  |  |  | '[style]' => sub { | 
| 123 | 3 |  |  | 3 |  | 11 | collect_urls_css(shift->{style}); | 
| 124 |  |  |  |  |  |  | }, | 
| 125 |  |  |  |  |  |  | 'urlset[xmlns^=http://www.sitemaps.org/schemas/sitemap/]' => sub { | 
| 126 | 1 |  |  | 1 |  | 2 | @{$_->find('url loc')->map(sub { $_->content })->to_array}; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 493 |  | 
| 127 |  |  |  |  |  |  | } | 
| 128 | 17 |  |  | 17 | 1 | 732 | }; | 
| 129 |  |  |  |  |  |  | } | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | sub reduce_html_handlers { | 
| 132 | 23 |  |  | 23 | 1 | 80 | my $handlers = $_[0]; | 
| 133 | 23 | 100 |  |  |  | 85 | my $contexts = ref $_[1] ? $_[1] : [$_[1]]; | 
| 134 | 23 |  |  |  |  | 35 | my $ret; | 
| 135 | 23 |  |  |  |  | 127 | for my $sel (keys %$handlers) { | 
| 136 | 345 |  |  |  |  | 459 | my $cb = $handlers->{$sel}; | 
| 137 | 345 |  |  |  |  | 382 | for my $cont (@$contexts) { | 
| 138 |  |  |  |  |  |  | $ret->{($cont ? $cont . ' ' : '') . $sel} = sub { | 
| 139 | 83 | 100 | 100 | 83 |  | 178 | return if ($_[0]->xml && _wrong_dom_detection($_[0])); | 
| 140 | 82 |  |  |  |  | 618 | return $cb->($_[0]); | 
| 141 |  |  |  |  |  |  | } | 
| 142 | 360 | 100 |  |  |  | 1150 | } | 
| 143 |  |  |  |  |  |  | } | 
| 144 | 23 |  |  |  |  | 75 | return $ret; | 
| 145 |  |  |  |  |  |  | } | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | sub resolve_href { | 
| 148 | 174 |  |  | 174 | 1 | 68471 | my ($base, $href) = @_; | 
| 149 | 174 |  |  |  |  | 787 | $href =~ s{^\s|\s$|\n}{}g; | 
| 150 | 174 | 50 |  |  |  | 603 | $href = ref $href ? $href : Mojo::URL->new($href); | 
| 151 | 174 | 100 |  |  |  | 22123 | $base = ref $base ? $base : Mojo::URL->new($base); | 
| 152 | 174 |  |  |  |  | 690 | my $abs        = $href->fragment(undef)->to_abs($base); | 
| 153 | 174 |  |  |  |  | 56130 | my $path_parts = $abs->path->parts; | 
| 154 | 174 |  | 100 |  |  | 3412 | shift @{$path_parts} while (@$path_parts && $path_parts->[0] eq '..'); | 
|  | 6 |  |  |  |  | 33 |  | 
| 155 | 174 |  |  |  |  | 808 | return $abs; | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | sub _guess_encoding_css { | 
| 159 | 1 |  |  | 1 |  | 26 | return (shift =~ qr{^\s*\@charset ['"](.+?)['"];}is)[0]; | 
| 160 |  |  |  |  |  |  | } | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  | sub _guess_encoding_html { | 
| 163 | 18 | 100 |  | 18 |  | 458 | my $head = (shift =~ qr{(.+)}is)[0] or return; | 
| 164 | 13 |  |  |  |  | 29 | my $charset; | 
| 165 |  |  |  |  |  |  | Mojo::DOM->new($head)->find('meta[http\-equiv=Content-Type]')->each( | 
| 166 |  |  |  |  |  |  | sub { | 
| 167 | 1 |  |  | 1 |  | 927 | $charset = (shift->{content} =~ $charset_re)[0]; | 
| 168 |  |  |  |  |  |  | } | 
| 169 | 13 |  |  |  |  | 49 | ); | 
| 170 | 13 |  |  |  |  | 9007 | return $charset; | 
| 171 |  |  |  |  |  |  | } | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | sub _wrong_dom_detection { | 
| 174 | 2 |  |  | 2 |  | 21 | my $dom = shift; | 
| 175 | 2 |  |  |  |  | 9 | while ($dom = $dom->parent) { | 
| 176 | 2 | 100 | 66 |  |  | 97 | return 1 if ($dom->tag && $dom->tag eq 'script'); | 
| 177 |  |  |  |  |  |  | } | 
| 178 | 1 |  |  |  |  | 36 | return; | 
| 179 |  |  |  |  |  |  | } | 
| 180 |  |  |  |  |  |  |  | 
| 181 | 10 |  |  | 10 |  | 163 | use 5.010; | 
|  | 10 |  |  |  |  | 29 |  | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | 1; | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | =head1 NAME | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | WWW::Crawler::Mojo::ScraperUtil - Scraper utitlities | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | This class inherits L and override start method for storing | 
| 194 |  |  |  |  |  |  | user info | 
| 195 |  |  |  |  |  |  |  | 
| 196 |  |  |  |  |  |  | =head1 ATTRIBUTES | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  | WWW::Crawler::Mojo::ScraperUtil implements following attributes. | 
| 199 |  |  |  |  |  |  |  | 
| 200 |  |  |  |  |  |  | =head1 METHODS | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | WWW::Crawler::Mojo::ScraperUtil implements following methods. | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | =head2 collect_urls_css | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | Collects URLs out of CSS. | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | @urls = collect_urls_css($dom); | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | =head2 decoded_body | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | Returns decoded response body for given L using | 
| 213 |  |  |  |  |  |  | guess_encoding and encoder. | 
| 214 |  |  |  |  |  |  |  | 
| 215 |  |  |  |  |  |  | =head2 encoder | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | Generates L instance for given name. Defaults to L. | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | =head2 html_handler_presets | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | Returns common html handler in hash reference. | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | my $handlers = html_handlers(); | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | =head2 reduce_html_handlers | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  | Narrows html handler selectors by prefixing container CSS snippets. | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | my $handlers = html_handlers($handlers, ['#header', '#footer li']); | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | $handlers->{img} = sub { | 
| 232 |  |  |  |  |  |  | my $dom = shift; | 
| 233 |  |  |  |  |  |  | return $dom->{src}; | 
| 234 |  |  |  |  |  |  | }; | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | my @urls; | 
| 237 |  |  |  |  |  |  | for my $selector (sort keys %{$handlers}) { | 
| 238 |  |  |  |  |  |  | $dom->find($selector)->each(sub { | 
| 239 |  |  |  |  |  |  | push(@urls, $handlers->{$selector}->(shift)); | 
| 240 |  |  |  |  |  |  | })->to_array; | 
| 241 |  |  |  |  |  |  | } | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | =head2 resolve_href | 
| 244 |  |  |  |  |  |  |  | 
| 245 |  |  |  |  |  |  | Resolves URLs with a base URL. | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | WWW::Crawler::Mojo::resolve_href($base, $uri); | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | =head2 guess_encoding | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | Guesses encoding of HTML or CSS with given L instance. | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | $encode = WWW::Crawler::Mojo::guess_encoding($res) || 'utf-8' | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | =head1 AUTHOR | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  | Keita Sugama, Esugama@jamadam.comE | 
| 258 |  |  |  |  |  |  |  | 
| 259 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  | Copyright (C) Keita Sugama. | 
| 262 |  |  |  |  |  |  |  | 
| 263 |  |  |  |  |  |  | This program is free software; you can redistribute it and/or | 
| 264 |  |  |  |  |  |  | modify it under the same terms as Perl itself. | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  | =cut |