line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
3
|
|
|
|
|
|
|
# | Template for Indexing Plug-ins, PULL API | # |
4
|
|
|
|
|
|
|
# |=====================================================================| # |
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
17
|
|
|
|
|
|
|
package NNexus::Index::Template; |
18
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
42
|
|
19
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
20
|
|
|
|
|
|
|
|
21
|
1
|
|
|
1
|
|
756
|
use Mojo::DOM; |
|
1
|
|
|
|
|
68474
|
|
|
1
|
|
|
|
|
32
|
|
22
|
1
|
|
|
1
|
|
611
|
use Mojo::UserAgent; |
|
1
|
|
|
|
|
184663
|
|
|
1
|
|
|
|
|
13
|
|
23
|
1
|
|
|
1
|
|
35
|
use Mojo::UserAgent::CookieJar; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
7
|
|
24
|
1
|
|
|
1
|
|
22
|
use Time::HiRes qw(sleep); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
8
|
|
25
|
1
|
|
|
1
|
|
578
|
use NNexus::Morphology qw(canonicalize_url); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
597
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
### EXTERNAL API |
28
|
|
|
|
|
|
|
# Constructor for an indexing plug-in.
#
# Options (all optional):
#   visited => hashref of already-visited URLs (a fresh hash when absent/false)
#   queue   => arrayref of pending jobs        (a fresh array when absent/false)
#   start   => URL of the first job; when absent or the string 'default',
#              $self->domain_root is used instead
#   dom     => when true, stored under the 'dom' key of the first job so that
#              index_step uses it instead of fetching the URL
#
# Returns the blessed object with the first job (depth 0) appended to the queue.
sub new {
  my ($class, %options) = @_;

  # HTTP client: at most 2 redirects, 10s connect / 20s request timeouts,
  # with its own cookie jar.
  my $ua = Mojo::UserAgent->new;
  $ua->max_redirects(2)->connect_timeout(10)->request_timeout(20);
  $ua->cookie_jar(Mojo::UserAgent::CookieJar->new);

  my $self = bless {
    ua      => $ua,
    visited => $options{visited} || {},
    queue   => $options{queue}   || [],
  }, $class;

  # Pick the starting URL: an explicit one wins, otherwise the domain root.
  my $start     = $options{start};
  my $first_url = (defined $start && $start ne 'default')
    ? $start
    : $self->domain_root;

  push @{ $self->{queue} }, {
    url => canonicalize_url($first_url),
    ($options{dom} ? (dom => $options{dom}) : ()),
    depth => 0 };

  return $self;
}
55
|
0
|
|
|
0
|
0
|
|
# Read-only accessor for the user agent stored under the 'ua' key.
sub ua {
  my ($self) = @_;
  return $self->{ua};
}
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# index_step: take the next job off the queue, analyse its page, and enqueue
# the follow-up jobs it yields.
#
# Options:
#   skip => when true, the first dequeued job is marked visited but not
#           analysed, and an empty array reference is returned.
#
# Returns:
#   - nothing (bare return) when no job is left,
#   - [] when the job was skipped on request,
#   - otherwise whatever $self->index_page returns for the page.
#
# Jobs deeper than $self->depth_limit are marked visited and dropped. This is
# done in a loop rather than by calling index_step again, so a long run of
# over-depth jobs cannot build up a deep call stack.
sub index_step {
  my ($self, %options) = @_;
  my $visited = $self->{visited};
  my $skip    = $options{skip};
  my ($next_step, $current_url, $depth);

  while (1) {
    # Grab the next job from the queue
    $next_step = $self->next_step;
    if (ref $next_step) {
      $self->current_url($next_step->{url});
      $self->current_categories($next_step->{categories});
      $depth = $next_step->{depth} || 0;
    } else {
      # We're out of urls, last step.
      delete $self->{current_url};
    }
    # If we're out of urls, terminate.
    $current_url = $self->current_url;
    return unless $current_url;       # Empty return for last job
    $visited->{$current_url} = 1;     # Mark visited

    last if $depth <= $self->depth_limit;
    # Over the depth limit: drop this job and try the next one. The skip
    # request is not carried over to later jobs (the previous recursive
    # call passed no options either).
    $skip = 0;
  }
  return [] if $skip;                 # We are skipping over this URL, return

  # 2.1. Prepare (or just accept) a Mojo::DOM to be analyzed
  if ($next_step->{dom}) {
    $self->current_dom($next_step->{dom});
    delete $next_step->{dom};
  } else {
    sleep($self->request_interval()); # Don't overload the server
    $self->current_dom($self->ua->get($current_url)->res->dom);
  }

  # Obtain the indexer payload
  my $payload = $self->index_page;
  # What are the candidate categories for follow-up jobs?
  my $categories = $self->candidate_categories;

  # Push all following candidate jobs to queue. The depth test repeats the
  # one above and is kept in case depth_limit is not constant.
  if ($depth <= $self->depth_limit) {
    my $candidate_links = $self->candidate_links;
    for my $link (@$candidate_links) {
      # push and shift give us breadth-first search.
      push @{ $self->{queue} }, {
        url        => canonicalize_url($link),
        categories => $categories,
        depth      => $depth + 1 };
    }
  }

  # Return final list of concepts for this page
  return $payload;
}
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
# Remove and return the first queued job whose URL has not been visited yet.
# Already-visited jobs in front of it are discarded. Returns the first
# non-reference value shifted off the queue (undef once the queue is empty)
# when no such job is found.
sub next_step {
  my ($self) = @_;
  my $seen = $self->{visited};
  while (1) {
    my $job = shift @{ $self->{queue} };
    return $job unless (ref $job) && $seen->{ $job->{url} };
  }
}
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
### PULL API |
119
|
|
|
|
|
|
|
# To be overridden by concrete classes |
120
|
0
|
|
|
0
|
1
|
|
# Maximum job depth that index_step will analyse; deeper jobs are dropped.
sub depth_limit {
  return 4;
}
121
|
0
|
|
|
0
|
1
|
|
# Starting URL used by new() when no explicit start is given.
# Empty here; to be overridden in the concrete classes.
sub domain_root {
  return q{};
}
122
|
|
|
|
|
|
|
# TODO: Rename index_page to candidate_concepts ? Or index_links / index_categories instead? |
123
|
0
|
|
|
0
|
1
|
|
# Payload for the current page, returned to the caller of index_step.
# Empty array reference here; to be overridden in the concrete classes.
sub index_page {
  return [];
}
124
|
|
|
|
|
|
|
# Links found on the current page that index_step should enqueue as
# follow-up jobs. Returns an array reference; empty in this template.
# TODO: Generic implementation should simply retrieve ALL link elements
#       (presumably every <a>) as candidate links.
sub candidate_links {
  return [];
}
128
|
0
|
|
|
0
|
1
|
|
# Categories that index_step attaches to follow-up jobs.
# Returns nothing here (undef in scalar context, empty list in list context).
sub candidate_categories {
  return;
}
129
|
0
|
|
|
0
|
1
|
|
# Pause, in seconds, that index_step sleeps before fetching a page.
sub request_interval {
  return 2;
}
130
|
|
|
|
|
|
|
# Tests if the page is a leaf, in which case we want to skip it when should_update is 0 |
131
|
0
|
|
|
0
|
0
|
|
# Leaf check for a page; this template always answers 0 (not a leaf).
sub leaf_test {
  return 0;
}
132
|
|
|
|
|
|
|
### SHARED METHODS |
133
|
|
|
|
|
|
|
# To be directly inherited and used by concrete classes |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Getter or Setter for the current URL/DOM/Categories |
136
|
0
|
0
|
|
0
|
1
|
|
# Getter/setter for the URL of the job being processed.
#   $self->current_url          returns the stored URL
#   $self->current_url($value)  stores $value and returns it
# The setter is selected by argument count, not by the truthiness of the
# value, so a false value (undef, '', 0) replaces the stored one.
sub current_url {
  my $self = shift;
  $self->{current_url} = shift if @_;
  return $self->{current_url};
}
137
|
0
|
0
|
|
0
|
1
|
|
# Getter/setter for the DOM of the page being processed.
#   $self->current_dom          returns the stored DOM
#   $self->current_dom($value)  stores $value and returns it
# The setter is selected by argument count, not by the truthiness of the
# value, so a false value replaces the stored one.
sub current_dom {
  my $self = shift;
  $self->{current_dom} = shift if @_;
  return $self->{current_dom};
}
138
|
0
|
0
|
|
0
|
1
|
|
# Getter/setter for the categories attached to the job being processed.
#   $self->current_categories          returns the stored categories
#   $self->current_categories($value)  stores $value and returns it
# The setter is selected by argument count, not by the truthiness of the
# value, so passing undef (a job without categories) clears what an earlier
# job stored.
sub current_categories {
  my $self = shift;
  $self->{current_categories} = shift if @_;
  return $self->{current_categories};
}
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
1; |
141
|
|
|
|
|
|
|
__END__ |