| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # /=====================================================================\ # | 
| 2 |  |  |  |  |  |  | # |  NNexus Autolinker                                                  | # | 
| 3 |  |  |  |  |  |  | # | Template for Indexing Plug-ins, PULL API                            | # | 
| 4 |  |  |  |  |  |  | # |=====================================================================| # | 
| 5 |  |  |  |  |  |  | # | Part of the Planetary project: http://trac.mathweb.org/planetary    | # | 
| 6 |  |  |  |  |  |  | # |  Research software, produced as part of work done by:               | # | 
| 7 |  |  |  |  |  |  | # |  the KWARC group at Jacobs University                               | # | 
| 8 |  |  |  |  |  |  | # | Copyright (c) 2012                                                  | # | 
| 9 |  |  |  |  |  |  | # | Released under the MIT License (MIT)                                | # | 
| 10 |  |  |  |  |  |  | # |---------------------------------------------------------------------| # | 
| 11 |  |  |  |  |  |  | # | Adapted from the original NNexus code by                            | # | 
| 12 |  |  |  |  |  |  | # |                                  James Gardner and Aaron Krowne     | # | 
| 13 |  |  |  |  |  |  | # |---------------------------------------------------------------------| # | 
| 14 |  |  |  |  |  |  | # | Deyan Ginev                   #_#     | # | 
| 15 |  |  |  |  |  |  | # | http://kwarc.info/people/dginev                            (o o)    | # | 
| 16 |  |  |  |  |  |  | # \=========================================================ooo==U==ooo=/ # | 
| 17 |  |  |  |  |  |  | package NNexus::Index::Template; | 
| 18 | 1 |  |  | 1 |  | 6 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 42 |  | 
| 19 | 1 |  |  | 1 |  | 7 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 20 |  |  |  |  |  |  |  | 
| 21 | 1 |  |  | 1 |  | 756 | use Mojo::DOM; | 
|  | 1 |  |  |  |  | 68474 |  | 
|  | 1 |  |  |  |  | 32 |  | 
| 22 | 1 |  |  | 1 |  | 611 | use Mojo::UserAgent; | 
|  | 1 |  |  |  |  | 184663 |  | 
|  | 1 |  |  |  |  | 13 |  | 
| 23 | 1 |  |  | 1 |  | 35 | use Mojo::UserAgent::CookieJar; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 7 |  | 
| 24 | 1 |  |  | 1 |  | 22 | use Time::HiRes qw(sleep); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 8 |  | 
| 25 | 1 |  |  | 1 |  | 578 | use NNexus::Morphology qw(canonicalize_url); | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 597 |  | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | ### EXTERNAL API | 
| 28 |  |  |  |  |  |  | sub new { | 
| 29 | 0 |  |  | 0 | 1 |  | my ($class,%options) = @_; | 
| 30 | 0 |  |  |  |  |  | my $ua = Mojo::UserAgent->new; | 
| 31 | 0 |  |  |  |  |  | $ua->max_redirects(2)->connect_timeout(10)->request_timeout(20); | 
| 32 | 0 |  |  |  |  |  | $ua->cookie_jar(Mojo::UserAgent::CookieJar->new); | 
| 33 | 0 |  | 0 |  |  |  | my $visited = $options{visited}||{}; | 
| 34 | 0 |  | 0 |  |  |  | my $queue = $options{queue}||[]; | 
| 35 |  |  |  |  |  |  |  | 
| 36 | 0 |  |  |  |  |  | my $self = bless {ua=>$ua,visited=>$visited,queue=>$queue}, $class; | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | # Set current if we're starting up. | 
| 39 | 0 |  |  |  |  |  | my $first_url; | 
| 40 | 0 | 0 |  |  |  |  | if (defined $options{start}) { | 
| 41 | 0 | 0 |  |  |  |  | if ($options{start} eq 'default') { | 
| 42 | 0 |  |  |  |  |  | $first_url = $self->domain_root; | 
| 43 |  |  |  |  |  |  | } else { | 
| 44 | 0 |  |  |  |  |  | $first_url = $options{start}; | 
| 45 |  |  |  |  |  |  | }} | 
| 46 |  |  |  |  |  |  | else { | 
| 47 | 0 |  |  |  |  |  | $first_url = $self->domain_root; } | 
| 48 |  |  |  |  |  |  |  | 
| 49 | 0 | 0 |  |  |  |  | push (@{$self->{queue}}, { | 
|  | 0 |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | url=>canonicalize_url($first_url), | 
| 51 |  |  |  |  |  |  | ($options{dom} ? (dom=>$options{dom}) : ()), | 
| 52 |  |  |  |  |  |  | depth=>0}); | 
| 53 | 0 |  |  |  |  |  | return $self; | 
| 54 |  |  |  |  |  |  | } | 
| 55 | 0 |  |  | 0 | 0 |  | sub ua {$_[0]->{ua};} | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | # index_step: Traverse a page, obtain candidate concepts and candidate further links | 
| 58 |  |  |  |  |  |  | sub index_step { | 
| 59 | 0 |  |  | 0 | 1 |  | my ($self,%options) = @_; | 
| 60 | 0 |  |  |  |  |  | my $visited = $self->{visited}; | 
| 61 | 0 |  |  |  |  |  | my $depth; | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | # Grab the next job from the queue | 
| 64 | 0 |  |  |  |  |  | my $next_step = $self->next_step; | 
| 65 | 0 | 0 |  |  |  |  | if (ref $next_step) { | 
| 66 | 0 |  |  |  |  |  | $self->current_url($next_step->{url}); | 
| 67 | 0 |  |  |  |  |  | $self->current_categories($next_step->{categories}); | 
| 68 | 0 |  | 0 |  |  |  | $depth = $next_step->{depth} || 0; | 
| 69 |  |  |  |  |  |  | } else { | 
| 70 |  |  |  |  |  |  | # We're out of urls, last step. | 
| 71 | 0 |  |  |  |  |  | delete $self->{current_url}; | 
| 72 |  |  |  |  |  |  | } | 
| 73 |  |  |  |  |  |  | # If we've visited, or we're out of urls, terminate. | 
| 74 | 0 |  |  |  |  |  | my $current_url = $self->current_url; | 
| 75 | 0 | 0 |  |  |  |  | return unless $current_url; # Empty return for last job | 
| 76 | 0 |  |  |  |  |  | $visited->{$current_url} = 1; # Mark visited | 
| 77 |  |  |  |  |  |  | # Also skip if we're over the depth limit. | 
| 78 | 0 | 0 |  |  |  |  | return $self->index_step if $depth > $self->depth_limit; | 
| 79 | 0 | 0 |  |  |  |  | return [] if $options{skip}; # We are skipping over this URL, return | 
| 80 |  |  |  |  |  |  | # 2.1. Prepare (or just accept) a Mojo::DOM to be analyzed | 
| 81 | 0 | 0 |  |  |  |  | if ($next_step->{dom}) { | 
| 82 | 0 |  |  |  |  |  | $self->current_dom($next_step->{dom}); | 
| 83 | 0 |  |  |  |  |  | delete $next_step->{dom}; | 
| 84 |  |  |  |  |  |  | } else { | 
| 85 | 0 |  |  |  |  |  | sleep($self->request_interval()); # Don't overload the server | 
| 86 | 0 |  |  |  |  |  | $self->current_dom($self->ua->get($current_url)->res->dom); | 
| 87 |  |  |  |  |  |  | } | 
| 88 |  |  |  |  |  |  | # Obtain the indexer payload | 
| 89 | 0 |  |  |  |  |  | my $payload = $self->index_page; | 
| 90 |  |  |  |  |  |  | # What are the candidate categories for follow-up jobs? | 
| 91 | 0 |  |  |  |  |  | my $categories = $self->candidate_categories; | 
| 92 |  |  |  |  |  |  | # Push all following candidate jobs to queue | 
| 93 | 0 | 0 |  |  |  |  | if ($depth <= $self->depth_limit) { # Don't add pointless nodes | 
| 94 | 0 |  |  |  |  |  | my $candidate_links = $self->candidate_links; | 
| 95 | 0 |  |  |  |  |  | foreach (@$candidate_links) { | 
| 96 |  |  |  |  |  |  | # push and shift give us breadth-first search. | 
| 97 | 0 |  |  |  |  |  | push (@{$self->{queue}}, { | 
|  | 0 |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | url=>canonicalize_url($_), | 
| 99 |  |  |  |  |  |  | categories=>$categories, | 
| 100 |  |  |  |  |  |  | depth=>$depth+1}); | 
| 101 |  |  |  |  |  |  | } | 
| 102 |  |  |  |  |  |  | } | 
| 103 |  |  |  |  |  |  | # Return final list of concepts for this page | 
| 104 | 0 |  |  |  |  |  | return $payload; | 
| 105 |  |  |  |  |  |  | } | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | sub next_step { | 
| 108 | 0 |  |  | 0 | 0 |  | my ($self) = @_; | 
| 109 | 0 |  |  |  |  |  | my $visited = $self->{visited}; | 
| 110 |  |  |  |  |  |  | # Grab the next job from the queue, skipping any URLs that were already visited | 
| 111 | 0 |  |  |  |  |  | my $next_step = shift @{$self->{queue}}; | 
|  | 0 |  |  |  |  |  |  | 
| 112 | 0 |  | 0 |  |  |  | while ((ref $next_step) && ($visited->{$next_step->{url}})) { | 
| 113 | 0 |  |  |  |  |  | $next_step = shift @{$self->{queue}}; | 
|  | 0 |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | } | 
| 115 | 0 |  |  |  |  |  | return $next_step; | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  | ### PULL API | 
| 119 |  |  |  |  |  |  | # To be overridden by concrete classes | 
| 120 | 0 |  |  | 0 | 1 |  | sub depth_limit {4;} | 
| 121 | 0 |  |  | 0 | 1 |  | sub domain_root {q{};} # To be overridden in the concrete classes | 
| 122 |  |  |  |  |  |  | # TODO: Rename index_page to candidate_concepts ? Or index_links / index_categories instead? | 
| 123 | 0 |  |  | 0 | 1 |  | sub index_page {[];} # To be overridden in the concrete classes | 
| 124 |  |  |  |  |  |  | sub candidate_links { | 
| 125 | 0 |  |  | 0 | 1 |  | []; | 
| 126 |  |  |  |  |  |  | # TODO: Generic implementation should simply retrieve ALL <a href> elements as candidate links. | 
| 127 |  |  |  |  |  |  | } | 
| 128 | 0 |  |  | 0 | 1 |  | sub candidate_categories {} | 
| 129 | 0 |  |  | 0 | 1 |  | sub request_interval { 2; } | 
| 130 |  |  |  |  |  |  | # Tests if the page is a leaf, in which case we want to skip it when should_update is 0 | 
| 131 | 0 |  |  | 0 | 0 |  | sub leaf_test {0;} | 
| 132 |  |  |  |  |  |  | ### SHARED METHODS | 
| 133 |  |  |  |  |  |  | # To be directly inherited and used by concrete classes | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | # Getter or Setter for the current URL/DOM/Categories | 
| 136 | 0 | 0 |  | 0 | 1 |  | sub current_url { $_[1] ? $_[0]->{current_url} = $_[1] : $_[0]->{current_url}; } | 
| 137 | 0 | 0 |  | 0 | 1 |  | sub current_dom { $_[1] ? $_[0]->{current_dom} = $_[1] : $_[0]->{current_dom}; } | 
| 138 | 0 | 0 |  | 0 | 1 |  | sub current_categories {$_[1] ? $_[0]->{current_categories} = $_[1] : $_[0]->{current_categories};} | 
| 139 |  |  |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | 1; | 
| 141 |  |  |  |  |  |  | __END__ |