| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
|
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
|
3
|
|
|
|
|
|
|
# | Template for Indexing Plug-ins, PULL API | # |
|
4
|
|
|
|
|
|
|
# |=====================================================================| # |
|
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
|
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
|
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
|
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
|
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
|
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
|
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
|
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
|
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
|
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
|
17
|
|
|
|
|
|
|
package NNexus::Index::Template; |
|
18
|
6
|
|
|
6
|
|
671
|
use warnings; |
|
|
6
|
|
|
|
|
9
|
|
|
|
6
|
|
|
|
|
175
|
|
|
19
|
6
|
|
|
6
|
|
21
|
use strict; |
|
|
6
|
|
|
|
|
8
|
|
|
|
6
|
|
|
|
|
154
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
6
|
|
|
6
|
|
431
|
use Mojo::DOM; |
|
|
6
|
|
|
|
|
64290
|
|
|
|
6
|
|
|
|
|
122
|
|
|
22
|
6
|
|
|
6
|
|
2338
|
use Mojo::UserAgent; |
|
|
6
|
|
|
|
|
573890
|
|
|
|
6
|
|
|
|
|
66
|
|
|
23
|
6
|
|
|
6
|
|
263
|
use Mojo::UserAgent::CookieJar; |
|
|
6
|
|
|
|
|
11
|
|
|
|
6
|
|
|
|
|
44
|
|
|
24
|
6
|
|
|
6
|
|
175
|
use Time::HiRes qw(sleep); |
|
|
6
|
|
|
|
|
10
|
|
|
|
6
|
|
|
|
|
52
|
|
|
25
|
6
|
|
|
6
|
|
1207
|
use NNexus::Morphology qw(canonicalize_url); |
|
|
6
|
|
|
|
|
11
|
|
|
|
6
|
|
|
|
|
3599
|
|
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
### EXTERNAL API |
|
28
|
|
|
|
|
|
|
sub new { |
|
29
|
12
|
|
|
12
|
1
|
382737
|
my ($class,%options) = @_; |
|
30
|
12
|
|
|
|
|
95
|
my $ua = Mojo::UserAgent->new; |
|
31
|
12
|
|
|
|
|
111
|
$ua->max_redirects(2)->connect_timeout(10)->request_timeout(20); |
|
32
|
12
|
|
|
|
|
224
|
$ua->cookie_jar(Mojo::UserAgent::CookieJar->new); |
|
33
|
12
|
|
50
|
|
|
155
|
my $visited = $options{visited}||{}; |
|
34
|
12
|
|
50
|
|
|
58
|
my $queue = $options{queue}||[]; |
|
35
|
|
|
|
|
|
|
|
|
36
|
12
|
|
|
|
|
57
|
my $self = bless {ua=>$ua,visited=>$visited,queue=>$queue}, $class; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Set current if we're starting up. |
|
39
|
12
|
|
|
|
|
13
|
my $first_url; |
|
40
|
12
|
50
|
|
|
|
38
|
if (defined $options{start}) { |
|
41
|
12
|
50
|
|
|
|
30
|
if ($options{start} eq 'default') { |
|
42
|
0
|
|
|
|
|
0
|
$first_url = $self->domain_root; |
|
43
|
|
|
|
|
|
|
} else { |
|
44
|
12
|
|
|
|
|
26
|
$first_url = $options{start}; |
|
45
|
|
|
|
|
|
|
}} |
|
46
|
|
|
|
|
|
|
else { |
|
47
|
0
|
|
|
|
|
0
|
$first_url = $self->domain_root; } |
|
48
|
|
|
|
|
|
|
|
|
49
|
12
|
50
|
|
|
|
15
|
push (@{$self->{queue}}, { |
|
|
12
|
|
|
|
|
100
|
|
|
50
|
|
|
|
|
|
|
url=>canonicalize_url($first_url), |
|
51
|
|
|
|
|
|
|
($options{dom} ? (dom=>$options{dom}) : ()), |
|
52
|
|
|
|
|
|
|
depth=>0}); |
|
53
|
12
|
|
|
|
|
126
|
return $self; |
|
54
|
|
|
|
|
|
|
} |
|
55
|
0
|
|
|
0
|
0
|
0
|
sub ua {$_[0]->{ua};} |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# index: Traverse a page, obtain candidate concepts and candidate further links |
|
58
|
|
|
|
|
|
|
sub index_step { |
|
59
|
12
|
|
|
12
|
1
|
4415
|
my ($self,%options) = @_; |
|
60
|
12
|
|
|
|
|
24
|
my $visited = $self->{visited}; |
|
61
|
12
|
|
|
|
|
19
|
my $depth; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Grab the next job from the queue |
|
64
|
12
|
|
|
|
|
36
|
my $next_step = $self->next_step; |
|
65
|
12
|
50
|
|
|
|
101
|
if (ref $next_step) { |
|
66
|
12
|
|
|
|
|
73
|
$self->current_url($next_step->{url}); |
|
67
|
12
|
|
|
|
|
72
|
$self->current_categories($next_step->{categories}); |
|
68
|
12
|
|
50
|
|
|
58
|
$depth = $next_step->{depth} || 0; |
|
69
|
|
|
|
|
|
|
} else { |
|
70
|
|
|
|
|
|
|
# We're out of urls, last step. |
|
71
|
0
|
|
|
|
|
0
|
delete $self->{current_url}; |
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
# If we've visited, or we're out of urls, terminate. |
|
74
|
12
|
|
|
|
|
26
|
my $current_url = $self->current_url; |
|
75
|
12
|
50
|
|
|
|
30
|
return unless $current_url; # Empty return for last job |
|
76
|
12
|
|
|
|
|
25
|
$visited->{$current_url} = 1; # Mark visited |
|
77
|
|
|
|
|
|
|
# Also skip if we're over the depth limit. |
|
78
|
12
|
50
|
|
|
|
45
|
return $self->index_step if $depth > $self->depth_limit; |
|
79
|
12
|
50
|
|
|
|
30
|
return [] if $options{skip}; # We are skipping over this URL, return |
|
80
|
|
|
|
|
|
|
# 2.1. Prepare (or just accept) a Mojo::DOM to be analyzed |
|
81
|
12
|
50
|
|
|
|
76
|
if ($next_step->{dom}) { |
|
82
|
12
|
|
|
|
|
102
|
$self->current_dom($next_step->{dom}); |
|
83
|
12
|
|
|
|
|
76
|
delete $next_step->{dom}; |
|
84
|
|
|
|
|
|
|
} else { |
|
85
|
0
|
|
|
|
|
0
|
sleep($self->request_interval()); # Don't overload the server |
|
86
|
0
|
|
|
|
|
0
|
$self->current_dom($self->ua->get($current_url)->res->dom); |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
# Obtain the indexer payload |
|
89
|
12
|
|
|
|
|
37
|
my $payload = $self->index_page; |
|
90
|
|
|
|
|
|
|
# What are the candidate categories for follow-up jobs? |
|
91
|
12
|
|
|
|
|
98
|
my $categories = $self->candidate_categories; |
|
92
|
|
|
|
|
|
|
# Push all following candidate jobs to queue |
|
93
|
12
|
50
|
|
|
|
41
|
if ($depth <= $self->depth_limit) { # Don't add pointless nodes |
|
94
|
12
|
|
|
|
|
41
|
my $candidate_links = $self->candidate_links; |
|
95
|
12
|
|
|
|
|
40
|
foreach (@$candidate_links) { |
|
96
|
|
|
|
|
|
|
# push and shift give us breadth-first search. |
|
97
|
28
|
|
|
|
|
15
|
push (@{$self->{queue}}, { |
|
|
28
|
|
|
|
|
45
|
|
|
98
|
|
|
|
|
|
|
url=>canonicalize_url($_), |
|
99
|
|
|
|
|
|
|
categories=>$categories, |
|
100
|
|
|
|
|
|
|
depth=>$depth+1}); |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
} |
|
103
|
|
|
|
|
|
|
# Return final list of concepts for this page |
|
104
|
12
|
|
|
|
|
58
|
return $payload; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub next_step { |
|
108
|
25
|
|
|
25
|
0
|
34
|
my ($self) = @_; |
|
109
|
25
|
|
|
|
|
36
|
my $visited = $self->{visited}; |
|
110
|
|
|
|
|
|
|
# Grab the next job from the queue, skipping any whose URL was already visited |
|
111
|
25
|
|
|
|
|
27
|
my $next_step = shift @{$self->{queue}}; |
|
|
25
|
|
|
|
|
61
|
|
|
112
|
25
|
|
66
|
|
|
131
|
while ((ref $next_step) && ($visited->{$next_step->{url}})) { |
|
113
|
0
|
|
|
|
|
0
|
$next_step = shift @{$self->{queue}}; |
|
|
0
|
|
|
|
|
0
|
|
|
114
|
|
|
|
|
|
|
} |
|
115
|
25
|
|
|
|
|
57
|
return $next_step; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
### PULL API |
|
119
|
|
|
|
|
|
|
# To be overloaded by concrete classes |
|
120
|
0
|
|
|
0
|
1
|
0
|
sub depth_limit {4;} |
|
121
|
0
|
|
|
0
|
1
|
0
|
sub domain_root {q{};} # To be overridden in the concrete classes |
|
122
|
|
|
|
|
|
|
# TODO: Rename index_page to candidate_concepts ? Or index_links / index_categories instead? |
|
123
|
0
|
|
|
0
|
1
|
0
|
sub index_page {[];} # To be overridden in the concrete classes |
|
124
|
|
|
|
|
|
|
sub candidate_links { |
|
125
|
0
|
|
|
0
|
1
|
0
|
[]; |
|
126
|
|
|
|
|
|
|
# TODO: Generic implementation should simply retrieve ALL <a> elements as candidate links. |
|
127
|
|
|
|
|
|
|
} |
|
128
|
10
|
|
|
10
|
1
|
19
|
sub candidate_categories {} |
|
129
|
0
|
|
|
0
|
1
|
0
|
sub request_interval { 2; } |
|
130
|
|
|
|
|
|
|
# Tests if the page is a leaf, in which case we want to skip it when should_update is 0 |
|
131
|
0
|
|
|
0
|
0
|
0
|
sub leaf_test {0;} |
|
132
|
|
|
|
|
|
|
### SHARED METHODS |
|
133
|
|
|
|
|
|
|
# To be directly inherited and used by concrete classes |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Getter or Setter for the current URL/DOM/Categories |
|
136
|
50
|
100
|
|
50
|
1
|
179
|
sub current_url { $_[1] ? $_[0]->{current_url} = $_[1] : $_[0]->{current_url}; } |
|
137
|
27
|
100
|
|
27
|
1
|
107
|
sub current_dom { $_[1] ? $_[0]->{current_dom} = $_[1] : $_[0]->{current_dom}; } |
|
138
|
16
|
50
|
|
16
|
1
|
72
|
sub current_categories {$_[1] ? $_[0]->{current_categories} = $_[1] : $_[0]->{current_categories};} |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
1; |
|
141
|
|
|
|
|
|
|
__END__ |