| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::Crawler::Mojo; | 
| 2 | 10 |  |  | 10 |  | 2137372 | use strict; | 
|  | 10 |  |  |  |  | 127 |  | 
|  | 10 |  |  |  |  | 270 |  | 
| 3 | 10 |  |  | 10 |  | 45 | use warnings; | 
|  | 10 |  |  |  |  | 17 |  | 
|  | 10 |  |  |  |  | 283 |  | 
| 4 | 10 |  |  | 10 |  | 1648 | use Mojo::Base 'Mojo::EventEmitter'; | 
|  | 10 |  |  |  |  | 625522 |  | 
|  | 10 |  |  |  |  | 63 |  | 
| 5 | 10 |  |  | 10 |  | 15599 | use WWW::Crawler::Mojo::Job; | 
|  | 10 |  |  |  |  | 25 |  | 
|  | 10 |  |  |  |  | 69 |  | 
| 6 | 10 |  |  | 10 |  | 4265 | use WWW::Crawler::Mojo::Queue::Memory; | 
|  | 10 |  |  |  |  | 24 |  | 
|  | 10 |  |  |  |  | 61 |  | 
| 7 | 10 |  |  | 10 |  | 3566 | use WWW::Crawler::Mojo::UserAgent; | 
|  | 10 |  |  |  |  | 29 |  | 
|  | 10 |  |  |  |  | 82 |  | 
| 8 | 10 |  |  |  |  | 778 | use WWW::Crawler::Mojo::ScraperUtil qw{ | 
| 9 | 10 |  |  | 10 |  | 4327 | collect_urls_css html_handler_presets reduce_html_handlers resolve_href decoded_body}; | 
|  | 10 |  |  |  |  | 22 |  | 
| 10 | 10 |  |  | 10 |  | 66 | use Mojo::Message::Request; | 
|  | 10 |  |  |  |  | 17 |  | 
|  | 10 |  |  |  |  | 73 |  | 
| 11 |  |  |  |  |  |  | our $VERSION = '0.25'; | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | has clock_speed       => 0.25; | 
| 14 |  |  |  |  |  |  | has html_handlers     => sub { html_handler_presets() }; | 
| 15 |  |  |  |  |  |  | has max_conn          => 1; | 
| 16 |  |  |  |  |  |  | has max_conn_per_host => 1; | 
| 17 |  |  |  |  |  |  | has queue             => sub { WWW::Crawler::Mojo::Queue::Memory->new }; | 
| 18 |  |  |  |  |  |  | has 'shuffle'; | 
| 19 |  |  |  |  |  |  | has ua => sub { WWW::Crawler::Mojo::UserAgent->new }; | 
| 20 |  |  |  |  |  |  | has ua_name => | 
| 21 |  |  |  |  |  |  | "www-crawler-mojo/$VERSION (+https://github.com/jamadam/www-crawler-mojo)"; | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | sub crawl { | 
| 24 | 0 |  |  | 0 | 1 | 0 | my ($self) = @_; | 
| 25 |  |  |  |  |  |  |  | 
| 26 | 0 |  |  |  |  | 0 | $self->init; | 
| 27 |  |  |  |  |  |  |  | 
| 28 | 0 | 0 |  |  |  | 0 | die 'No job is given' unless ($self->queue->length); | 
| 29 |  |  |  |  |  |  |  | 
| 30 | 0 |  |  |  |  | 0 | $self->emit('start'); | 
| 31 |  |  |  |  |  |  |  | 
| 32 | 0 | 0 |  |  |  | 0 | Mojo::IOLoop->start unless Mojo::IOLoop->is_running; | 
| 33 |  |  |  |  |  |  | } | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | sub init { | 
| 36 | 12 |  |  | 12 | 1 | 7583 | my ($self) = @_; | 
| 37 |  |  |  |  |  |  |  | 
| 38 | 2 |  |  | 2 |  | 209 | $self->on('empty', sub { say "Queue is drained out."; $self->stop }) | 
|  | 2 |  |  |  |  | 17 |  | 
| 39 | 12 | 50 |  |  |  | 61 | unless $self->has_subscribers('empty'); | 
| 40 |  |  |  |  |  |  | $self->on( | 
| 41 |  |  |  |  |  |  | 'error', | 
| 42 |  |  |  |  |  |  | sub { | 
| 43 | 0 |  |  | 0 |  | 0 | say | 
| 44 |  |  |  |  |  |  | sprintf("An error occured during crawling %s: %s", $_[2]->url, $_[1]); | 
| 45 |  |  |  |  |  |  | } | 
| 46 | 12 | 50 |  |  |  | 196 | ) unless $self->has_subscribers('error'); | 
| 47 | 12 | 100 |  | 0 |  | 130 | $self->on('res', sub { $_[1]->() }) unless $self->has_subscribers('res'); | 
|  | 0 |  |  |  |  | 0 |  | 
| 48 |  |  |  |  |  |  |  | 
| 49 | 12 |  |  |  |  | 151 | $self->ua->transactor->name($self->ua_name); | 
| 50 | 12 |  |  |  |  | 372 | $self->ua->max_redirects(5); | 
| 51 |  |  |  |  |  |  |  | 
| 52 | 12 |  |  |  |  | 121 | $self->{_loop_ids} = []; | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | my $id = Mojo::IOLoop->recurring( | 
| 55 |  |  |  |  |  |  | $self->clock_speed => sub { | 
| 56 | 15 |  |  | 15 |  | 3408617 | my $queue = $self->queue; | 
| 57 |  |  |  |  |  |  |  | 
| 58 | 15 | 100 |  |  |  | 204 | if (!$queue->length) { | 
| 59 | 2 | 50 |  |  |  | 22 | $self->emit('empty') unless ($self->ua->active_conn); | 
| 60 | 2 |  |  |  |  | 44 | return; | 
| 61 |  |  |  |  |  |  | } | 
| 62 | 13 | 50 |  |  |  | 131 | return if $self->ua->active_conn >= $self->max_conn; | 
| 63 |  |  |  |  |  |  | return | 
| 64 | 13 | 50 |  |  |  | 239 | if $self->ua->active_host($queue->next->url) | 
| 65 |  |  |  |  |  |  | >= $self->max_conn_per_host; | 
| 66 | 13 |  |  |  |  | 138 | $self->process_job($queue->dequeue); | 
| 67 |  |  |  |  |  |  | } | 
| 68 | 12 |  |  |  |  | 40 | ); | 
| 69 |  |  |  |  |  |  |  | 
| 70 | 12 |  |  |  |  | 1115 | push(@{$self->{_loop_ids}}, $id); | 
|  | 12 |  |  |  |  | 42 |  | 
| 71 |  |  |  |  |  |  |  | 
| 72 | 12 | 50 |  |  |  | 38 | if ($self->shuffle) { | 
| 73 |  |  |  |  |  |  | my $id = Mojo::IOLoop->recurring( | 
| 74 |  |  |  |  |  |  | $self->shuffle => sub { | 
| 75 | 0 |  |  | 0 |  | 0 | $self->queue->shuffle; | 
| 76 |  |  |  |  |  |  | } | 
| 77 | 0 |  |  |  |  | 0 | ); | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 0 |  |  |  |  | 0 | push(@{$self->{_loop_ids}}, $id); | 
|  | 0 |  |  |  |  | 0 |  | 
| 80 |  |  |  |  |  |  | } | 
| 81 |  |  |  |  |  |  | } | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | sub process_job { | 
| 84 | 13 |  |  | 13 | 1 | 95 | my ($self, $job) = @_; | 
| 85 |  |  |  |  |  |  |  | 
| 86 | 13 |  | 100 |  |  | 47 | my @args = ($job->method || 'get', $job->url); | 
| 87 | 13 | 100 |  |  |  | 2219 | push(@args, form => $job->tx_params->to_hash) if $job->tx_params; | 
| 88 | 13 |  |  |  |  | 132 | my $tx = $self->ua->build_tx(@args); | 
| 89 |  |  |  |  |  |  |  | 
| 90 | 13 |  |  |  |  | 2734 | $self->emit('req', $job, $tx->req); | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | $self->ua->start( | 
| 93 |  |  |  |  |  |  | $tx => sub { | 
| 94 | 13 |  |  | 13 |  | 4599 | my ($ua, $tx) = @_; | 
| 95 |  |  |  |  |  |  |  | 
| 96 | 13 |  |  |  |  | 104 | $job->redirect(_urls_redirect($tx)); | 
| 97 |  |  |  |  |  |  |  | 
| 98 | 13 |  |  |  |  | 107 | my $res = $tx->res; | 
| 99 |  |  |  |  |  |  |  | 
| 100 | 13 | 50 |  |  |  | 81 | unless ($res->code) { | 
| 101 | 0 | 0 |  |  |  | 0 | my $msg = $res->error ? $res->error->{message} : 'Unknown error'; | 
| 102 | 0 |  |  |  |  | 0 | $self->emit('error', $msg, $job); | 
| 103 | 0 |  |  |  |  | 0 | return; | 
| 104 |  |  |  |  |  |  | } | 
| 105 |  |  |  |  |  |  |  | 
| 106 | 13 |  |  |  |  | 149 | $self->emit('res', sub { $self->scrape($res, $job, $_[0]) }, $job, $res); | 
|  | 10 |  |  |  |  | 1075 |  | 
| 107 |  |  |  |  |  |  |  | 
| 108 | 13 |  |  |  |  | 2904 | $job->close; | 
| 109 |  |  |  |  |  |  | } | 
| 110 | 13 |  |  |  |  | 222 | ); | 
| 111 |  |  |  |  |  |  | } | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | sub say_start { | 
| 114 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 115 |  |  |  |  |  |  |  | 
| 116 | 0 |  |  |  |  | 0 | print <<"EOF"; | 
| 117 |  |  |  |  |  |  | ---------------------------------------- | 
| 118 | 0 |  |  |  |  | 0 | Crawling is starting with @{[ $self->queue->next->url ]} | 
| 119 | 0 |  |  |  |  | 0 | Max Connection  : @{[ $self->max_conn ]} | 
| 120 | 0 |  |  |  |  | 0 | User Agent      : @{[ $self->ua_name ]} | 
| 121 |  |  |  |  |  |  | ---------------------------------------- | 
| 122 |  |  |  |  |  |  | EOF | 
| 123 |  |  |  |  |  |  | } | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | sub scrape { | 
| 126 | 26 |  |  | 26 | 1 | 669 | my ($self, $res, $job, $contexts) = @_; | 
| 127 | 26 |  |  |  |  | 42 | my @ret; | 
| 128 |  |  |  |  |  |  |  | 
| 129 | 26 | 100 | 66 |  |  | 94 | return unless $res->headers->content_length && $res->body; | 
| 130 |  |  |  |  |  |  |  | 
| 131 | 25 |  |  |  |  | 1003 | my $base = $job->url; | 
| 132 | 25 |  |  |  |  | 2746 | my $type = $res->headers->content_type; | 
| 133 |  |  |  |  |  |  |  | 
| 134 | 25 | 100 | 66 |  |  | 669 | if ($type && $type =~ qr{^(text|application)/(html|xml|xhtml)}) { | 
| 135 | 23 | 100 |  |  |  | 110 | if ((my $base_tag = $res->dom->at('base[href]'))) { | 
| 136 | 4 |  |  |  |  | 2280 | $base = resolve_href($base, $base_tag->attr('href')); | 
| 137 |  |  |  |  |  |  | } | 
| 138 | 23 |  |  |  |  | 21580 | my $dom      = Mojo::DOM->new(decoded_body($res)); | 
| 139 | 23 |  |  |  |  | 23047 | my $handlers = reduce_html_handlers($self->html_handlers, $contexts); | 
| 140 | 23 |  |  |  |  | 36 | for my $selector (sort keys %{$handlers}) { | 
|  | 23 |  |  |  |  | 179 |  | 
| 141 |  |  |  |  |  |  | $dom->find($selector)->each( | 
| 142 |  |  |  |  |  |  | sub { | 
| 143 | 83 |  |  | 83 |  | 32197 | my $dom = shift; | 
| 144 | 83 |  |  |  |  | 193 | for ($handlers->{$selector}->($dom)) { | 
| 145 | 86 | 100 |  |  |  | 1625 | push(@ret, $self->_make_child($_, $dom, $job, $base)) if $_; | 
| 146 |  |  |  |  |  |  | } | 
| 147 |  |  |  |  |  |  | } | 
| 148 | 360 |  |  |  |  | 121537 | ); | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  | } | 
| 151 |  |  |  |  |  |  |  | 
| 152 | 25 | 100 | 66 |  |  | 8671 | if ($type && $type =~ qr{text/css}) { | 
| 153 | 1 |  |  |  |  | 8 | for (collect_urls_css(decoded_body($res))) { | 
| 154 | 1 |  |  |  |  | 6 | push(@ret, $self->_make_child($_, $job->url, $job, $base)); | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  |  | 
| 158 | 25 |  |  |  |  | 165 | return @ret; | 
| 159 |  |  |  |  |  |  | } | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | sub stop { | 
| 162 | 2 |  |  | 2 | 1 | 7 | my $self = shift; | 
| 163 | 2 |  |  |  |  | 6 | while (my $id = shift @{$self->{_loop_ids}}) { | 
|  | 4 |  |  |  |  | 108 |  | 
| 164 | 2 |  |  |  |  | 25 | Mojo::IOLoop->remove($id); | 
| 165 |  |  |  |  |  |  | } | 
| 166 | 2 |  |  |  |  | 10 | Mojo::IOLoop->stop; | 
| 167 |  |  |  |  |  |  | } | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | sub _make_child { | 
| 170 | 84 |  |  | 84 |  | 286 | my ($self, $url, $context, $job, $base) = @_; | 
| 171 |  |  |  |  |  |  |  | 
| 172 | 84 | 50 |  |  |  | 786 | return unless $url; | 
| 173 | 84 | 100 |  |  |  | 155 | ($url, my $method, my $params) = @$url if (ref $url); | 
| 174 |  |  |  |  |  |  |  | 
| 175 | 84 |  |  |  |  | 169 | my $resolved = resolve_href($base, $url); | 
| 176 |  |  |  |  |  |  |  | 
| 177 | 84 | 100 |  |  |  | 171 | return unless ($resolved->scheme =~ qr{^(http|https|ftp|ws|wss)$}); | 
| 178 |  |  |  |  |  |  |  | 
| 179 | 73 | 100 | 100 |  |  | 787 | $resolved->query->append($params) if ($params && $method eq 'GET'); | 
| 180 |  |  |  |  |  |  |  | 
| 181 | 73 |  |  |  |  | 445 | my $child = $job->child(_url => $resolved, literal_uri => $url, | 
| 182 |  |  |  |  |  |  | _context => $context); | 
| 183 |  |  |  |  |  |  |  | 
| 184 | 73 | 100 |  |  |  | 877 | $child->method($method) if $method; | 
| 185 | 73 | 100 | 100 |  |  | 179 | $child->tx_params($params) if ($params && $method eq 'POST'); | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 73 |  |  |  |  | 464 | return $child; | 
| 188 |  |  |  |  |  |  | } | 
| 189 |  |  |  |  |  |  |  | 
| 190 |  |  |  |  |  |  | sub enqueue { | 
| 191 | 59 |  |  | 59 | 1 | 2494 | my ($self, @jobs) = @_; | 
| 192 |  |  |  |  |  |  | return | 
| 193 | 59 |  |  |  |  | 92 | map { $self->queue->enqueue(WWW::Crawler::Mojo::Job->upgrade($_)) } @jobs; | 
|  | 59 |  |  |  |  | 117 |  | 
| 194 |  |  |  |  |  |  | } | 
| 195 |  |  |  |  |  |  |  | 
| 196 |  |  |  |  |  |  | sub requeue { | 
| 197 | 1 |  |  | 1 | 1 | 527 | my ($self, @jobs) = @_; | 
| 198 |  |  |  |  |  |  | return | 
| 199 | 1 |  |  |  |  | 3 | map { $self->queue->requeue(WWW::Crawler::Mojo::Job->upgrade($_)) } @jobs; | 
|  | 1 |  |  |  |  | 3 |  | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | sub _urls_redirect { | 
| 203 | 13 |  |  | 13 |  | 27 | my $tx = shift; | 
| 204 | 13 |  |  |  |  | 19 | my @urls; | 
| 205 | 13 | 50 |  |  |  | 43 | @urls = _urls_redirect($tx->previous) if ($tx->previous); | 
| 206 | 13 |  |  |  |  | 82 | unshift(@urls, $tx->req->url->userinfo(undef)); | 
| 207 | 13 |  |  |  |  | 176 | return @urls; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | 1; | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | =head1 NAME | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | WWW::Crawler::Mojo - A web crawling framework for Perl | 
| 215 |  |  |  |  |  |  |  | 
| 216 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 217 |  |  |  |  |  |  |  | 
| 218 |  |  |  |  |  |  | use strict; | 
| 219 |  |  |  |  |  |  | use warnings; | 
| 220 |  |  |  |  |  |  | use WWW::Crawler::Mojo; | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | my $bot = WWW::Crawler::Mojo->new; | 
| 223 |  |  |  |  |  |  |  | 
| 224 |  |  |  |  |  |  | $bot->on(res => sub { | 
| 225 |  |  |  |  |  |  | my ($bot, $scrape, $job, $res) = @_; | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  | $bot->enqueue($_) for $scrape->('#context'); | 
| 228 |  |  |  |  |  |  | }); | 
| 229 |  |  |  |  |  |  |  | 
| 230 |  |  |  |  |  |  | $bot->enqueue('http://example.com/'); | 
| 231 |  |  |  |  |  |  | $bot->crawl; | 
| 232 |  |  |  |  |  |  |  | 
| 233 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | L is a web crawling framework for those who are familiar with | 
| 236 |  |  |  |  |  |  | L::* APIs. | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | Althogh the module is only well tested for "focused crawl" at this point, | 
| 239 |  |  |  |  |  |  | you can also use it for endless crawling by taking special care of memory usage. | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | =head1 ATTRIBUTES | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | L inherits all attributes from L and | 
| 244 |  |  |  |  |  |  | implements the following new ones. | 
| 245 |  |  |  |  |  |  |  | 
| 246 |  |  |  |  |  |  | =head2 clock_speed | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | A number of main event loop interval in milliseconds. Defaults to 0.25. | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | $bot->clock_speed(2); | 
| 251 |  |  |  |  |  |  | my $clock = $bot->clock_speed; # 2 | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | =head2 html_handlers | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | Sets HTML handlers of scrapper. Defaults to | 
| 256 |  |  |  |  |  |  | WWW::Crawler::Mojo::ScraperUtil::html_handler_presets. | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | $bot->html_handlers( { | 
| 259 |  |  |  |  |  |  | 'a[href]'   => sub { return $_[0]->{href} }, | 
| 260 |  |  |  |  |  |  | 'img[src]'  => sub { return $_[0]->{src} }, | 
| 261 |  |  |  |  |  |  | } ); | 
| 262 |  |  |  |  |  |  |  | 
| 263 |  |  |  |  |  |  | =head2 max_conn | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | An amount of max connections. | 
| 266 |  |  |  |  |  |  |  | 
| 267 |  |  |  |  |  |  | $bot->max_conn(5); | 
| 268 |  |  |  |  |  |  | say $bot->max_conn; # 5 | 
| 269 |  |  |  |  |  |  |  | 
| 270 |  |  |  |  |  |  | =head2 max_conn_per_host | 
| 271 |  |  |  |  |  |  |  | 
| 272 |  |  |  |  |  |  | An amount of max connections per host. | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | $bot->max_conn_per_host(5); | 
| 275 |  |  |  |  |  |  | say $bot->max_conn_per_host; # 5 | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | =head2 queue | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | L object for default. | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | $bot->queue(WWW::Crawler::Mojo::Queue::Memory->new); | 
| 282 |  |  |  |  |  |  | $bot->queue->enqueue($job); | 
| 283 |  |  |  |  |  |  |  | 
| 284 |  |  |  |  |  |  | =head2 shuffle | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  | An interval in seconds to shuffle the job queue. It also evalutated as boolean | 
| 287 |  |  |  |  |  |  | for disabling/enabling the feature. Defaults to undef, meaning disable. | 
| 288 |  |  |  |  |  |  |  | 
| 289 |  |  |  |  |  |  | $bot->shuffle(5); | 
| 290 |  |  |  |  |  |  | say $bot->shuffle; # 5 | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | =head2 ua | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | A L instance. | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | my $ua = $bot->ua; | 
| 297 |  |  |  |  |  |  | $bot->ua(WWW::Crawler::Mojo::UserAgent->new); | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  | =head2 ua_name | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | Name of crawler for User-Agent header. | 
| 302 |  |  |  |  |  |  |  | 
| 303 |  |  |  |  |  |  | $bot->ua_name('my-bot/0.01 (+https://example.com/)'); | 
| 304 |  |  |  |  |  |  | say $bot->ua_name; # 'my-bot/0.01 (+https://example.com/)' | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | =head1 EVENTS | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | L inherits all events from L and | 
| 309 |  |  |  |  |  |  | implements the following new ones. | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | =head2 req | 
| 312 |  |  |  |  |  |  |  | 
| 313 |  |  |  |  |  |  | Emitted right before crawler perform request to servers. The callback takes 3 | 
| 314 |  |  |  |  |  |  | arguments. | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | $bot->on(req => sub { | 
| 317 |  |  |  |  |  |  | my ($bot, $job, $req) = @_; | 
| 318 |  |  |  |  |  |  |  | 
| 319 |  |  |  |  |  |  | # DO NOTHING | 
| 320 |  |  |  |  |  |  | }); | 
| 321 |  |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  | =head2 res | 
| 323 |  |  |  |  |  |  |  | 
| 324 |  |  |  |  |  |  | Emitted when crawler got response from server. The callback takes 4 arguments. | 
| 325 |  |  |  |  |  |  |  | 
| 326 |  |  |  |  |  |  | $bot->on(res => sub { | 
| 327 |  |  |  |  |  |  | my ($bot, $scrape, $job, $res) = @_; | 
| 328 |  |  |  |  |  |  | if (...) { | 
| 329 |  |  |  |  |  |  | $bot->enqueue($_) for $scrape->(); | 
| 330 |  |  |  |  |  |  | } else { | 
| 331 |  |  |  |  |  |  | # DO NOTHING | 
| 332 |  |  |  |  |  |  | } | 
| 333 |  |  |  |  |  |  | }); | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | =head3 $bot | 
| 336 |  |  |  |  |  |  |  | 
| 337 |  |  |  |  |  |  | L instance. | 
| 338 |  |  |  |  |  |  |  | 
| 339 |  |  |  |  |  |  | =head3 $scrape | 
| 340 |  |  |  |  |  |  |  | 
| 341 |  |  |  |  |  |  | Scraper code reference for current document. The code takes optional argument | 
| 342 |  |  |  |  |  |  | CSS selector for context and returns new jobs. | 
| 343 |  |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  | for my $job ($scrape->($context)) { | 
| 345 |  |  |  |  |  |  | $bot->enqueue($job) | 
| 346 |  |  |  |  |  |  | } | 
| 347 |  |  |  |  |  |  |  | 
| 348 |  |  |  |  |  |  | Optionally you can specify a scraping target container in CSS selector. | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | @jobs = $scrape->('#container'); | 
| 351 |  |  |  |  |  |  | @jobs = $scrape->(['#container1', '#container2']); | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | =head3 $job | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | L instance. | 
| 356 |  |  |  |  |  |  |  | 
| 357 |  |  |  |  |  |  | =head3 $res | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  | L instance. | 
| 360 |  |  |  |  |  |  |  | 
| 361 |  |  |  |  |  |  | =head2 empty | 
| 362 |  |  |  |  |  |  |  | 
| 363 |  |  |  |  |  |  | Emitted when queue length gets zero. | 
| 364 |  |  |  |  |  |  |  | 
| 365 |  |  |  |  |  |  | $bot->on(empty => sub { | 
| 366 |  |  |  |  |  |  | my ($bot) = @_; | 
| 367 |  |  |  |  |  |  | say "Queue is drained out."; | 
| 368 |  |  |  |  |  |  | }); | 
| 369 |  |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | =head2 error | 
| 371 |  |  |  |  |  |  |  | 
| 372 |  |  |  |  |  |  | Emitted when user agent returns no status code for request. Possibly caused by | 
| 373 |  |  |  |  |  |  | network errors or un-responsible servers. | 
| 374 |  |  |  |  |  |  |  | 
| 375 |  |  |  |  |  |  | $bot->on(error => sub { | 
| 376 |  |  |  |  |  |  | my ($bot, $error, $job) = @_; | 
| 377 |  |  |  |  |  |  | say "error: $error"; | 
| 378 |  |  |  |  |  |  | if (...) { # until failur occures 3 times | 
| 379 |  |  |  |  |  |  | $bot->requeue($job); | 
| 380 |  |  |  |  |  |  | } | 
| 381 |  |  |  |  |  |  | }); | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  | Note that server errors such as 404 or 500 cannot be catched with the event. | 
| 384 |  |  |  |  |  |  | Consider res event for the use case instead of this. | 
| 385 |  |  |  |  |  |  |  | 
| 386 |  |  |  |  |  |  | =head2 start | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | Emitted right before crawl is started. | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | $bot->on(start => sub { | 
| 391 |  |  |  |  |  |  | my $self = shift; | 
| 392 |  |  |  |  |  |  | ... | 
| 393 |  |  |  |  |  |  | }); | 
| 394 |  |  |  |  |  |  |  | 
| 395 |  |  |  |  |  |  | =head1 METHODS | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | L inherits all methods from L and | 
| 398 |  |  |  |  |  |  | implements the following new ones. | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | =head2 crawl | 
| 401 |  |  |  |  |  |  |  | 
| 402 |  |  |  |  |  |  | Starts crawling loop. | 
| 403 |  |  |  |  |  |  |  | 
| 404 |  |  |  |  |  |  | $bot->crawl; | 
| 405 |  |  |  |  |  |  |  | 
| 406 |  |  |  |  |  |  | =head2 init | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | Initializes crawler settings. | 
| 409 |  |  |  |  |  |  |  | 
| 410 |  |  |  |  |  |  | $bot->init; | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  | =head2 process_job | 
| 413 |  |  |  |  |  |  |  | 
| 414 |  |  |  |  |  |  | Processes a job. | 
| 415 |  |  |  |  |  |  |  | 
| 416 |  |  |  |  |  |  | $bot->process_job; | 
| 417 |  |  |  |  |  |  |  | 
| 418 |  |  |  |  |  |  | =head2 say_start | 
| 419 |  |  |  |  |  |  |  | 
| 420 |  |  |  |  |  |  | Displays starting messages to STDOUT | 
| 421 |  |  |  |  |  |  |  | 
| 422 |  |  |  |  |  |  | $bot->say_start; | 
| 423 |  |  |  |  |  |  |  | 
| 424 |  |  |  |  |  |  | =head2 scrape | 
| 425 |  |  |  |  |  |  |  | 
| 426 |  |  |  |  |  |  | Parses and discovers links in a web page and CSS. This performs scraping. | 
| 427 |  |  |  |  |  |  | With the optional 4th argument, you can specify a CSS selector to container | 
| 428 |  |  |  |  |  |  | you would collect URLs within. | 
| 429 |  |  |  |  |  |  |  | 
| 430 |  |  |  |  |  |  | $bot->scrape($res, $job, ); | 
| 431 |  |  |  |  |  |  | $bot->scrape($res, $job, $selector); | 
| 432 |  |  |  |  |  |  | $bot->scrape($res, $job, [$selector1, $selector2]); | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | =head2 stop | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  | Stop crawling. | 
| 437 |  |  |  |  |  |  |  | 
| 438 |  |  |  |  |  |  | $bot->stop; | 
| 439 |  |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | =head2 enqueue | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  | Appends one or more URLs or L objects. Returns the jobs | 
| 443 |  |  |  |  |  |  | actually added. | 
| 444 |  |  |  |  |  |  |  | 
| 445 |  |  |  |  |  |  | my @jobs = $bot->enqueue('http://example.com/index1.html'); | 
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  | OR | 
| 448 |  |  |  |  |  |  |  | 
| 449 |  |  |  |  |  |  | my @jobs = $bot->enqueue($job1, $job2); | 
| 450 |  |  |  |  |  |  |  | 
| 451 |  |  |  |  |  |  | OR | 
| 452 |  |  |  |  |  |  |  | 
| 453 |  |  |  |  |  |  | my @jobs = $bot->enqueue( | 
| 454 |  |  |  |  |  |  | 'http://example.com/index1.html', | 
| 455 |  |  |  |  |  |  | 'http://example.com/index2.html', | 
| 456 |  |  |  |  |  |  | 'http://example.com/index3.html', | 
| 457 |  |  |  |  |  |  | ); | 
| 458 |  |  |  |  |  |  |  | 
| 459 |  |  |  |  |  |  | =head2 requeue | 
| 460 |  |  |  |  |  |  |  | 
| 461 |  |  |  |  |  |  | Appends one or more URLs or jobs for re-try. This accepts same arguments as | 
| 462 |  |  |  |  |  |  | enqueue method. Returns the jobs actually added. | 
| 463 |  |  |  |  |  |  |  | 
| 464 |  |  |  |  |  |  | $self->on(error => sub { | 
| 465 |  |  |  |  |  |  | my ($self, $msg, $job) = @_; | 
| 466 |  |  |  |  |  |  | if (...) { # until failur occures 3 times | 
| 467 |  |  |  |  |  |  | $bot->requeue($job); | 
| 468 |  |  |  |  |  |  | } | 
| 469 |  |  |  |  |  |  | }); | 
| 470 |  |  |  |  |  |  |  | 
| 471 |  |  |  |  |  |  | =head1 EXAMPLE | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  | L | 
| 474 |  |  |  |  |  |  |  | 
| 475 |  |  |  |  |  |  | =head1 AUTHOR | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  | Keita Sugama, Esugama@jamadam.comE | 
| 478 |  |  |  |  |  |  |  | 
| 479 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 480 |  |  |  |  |  |  |  | 
| 481 |  |  |  |  |  |  | Copyright (C) jamadam | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  | This program is free software; you can redistribute it and/or | 
| 484 |  |  |  |  |  |  | modify it under the same terms as Perl itself. | 
| 485 |  |  |  |  |  |  |  | 
| 486 |  |  |  |  |  |  | =cut |