line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
3
|
|
|
|
|
|
|
# | Indexing Driver, | # |
4
|
|
|
|
|
|
|
# | Dispatcher for Crawl, Store, Invalidation tasks | # |
5
|
|
|
|
|
|
|
# |=====================================================================| # |
6
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
7
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
8
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
9
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
10
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
11
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
12
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
13
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
14
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
15
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
16
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
17
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
18
|
|
|
|
|
|
|
package NNexus::Index::Dispatcher; |
19
|
1
|
|
|
1
|
|
428
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
27
|
|
20
|
1
|
|
|
1
|
|
3
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
25
|
|
21
|
1
|
|
|
1
|
|
534
|
use Data::Dumper; |
|
1
|
|
|
|
|
7031
|
|
|
1
|
|
|
|
|
56
|
|
22
|
1
|
|
|
1
|
|
354
|
use NNexus::Concepts qw(flatten_concept_harvest diff_concept_harvests); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use NNexus::Morphology qw(admissible_name); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Dispatch to the right NNexus::Index::Domain class |
26
|
|
|
|
|
|
|
sub new { |
27
|
|
|
|
|
|
|
my ($class,%options) = @_; |
28
|
|
|
|
|
|
|
my $domain = $options{domain}; |
29
|
|
|
|
|
|
|
my $db = $options{db}; |
30
|
|
|
|
|
|
|
$domain = $domain ? ucfirst(lc($domain)) : ''; |
31
|
|
|
|
|
|
|
die ("Bad domain name: $domain; Must contain only alphanumeric characters!") if $domain =~ /\W/; |
32
|
|
|
|
|
|
|
my $index_template; |
33
|
|
|
|
|
|
|
my $should_update = $options{should_update} // 1; |
34
|
|
|
|
|
|
|
my $eval_return = eval {require "NNexus/Index/$domain.pm"; 1; }; |
35
|
|
|
|
|
|
|
if ($eval_return && (!$@)) { |
36
|
|
|
|
|
|
|
$index_template = eval { |
37
|
|
|
|
|
|
|
"NNexus::Index::$domain"->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update); |
38
|
|
|
|
|
|
|
}; |
39
|
|
|
|
|
|
|
} else { |
40
|
|
|
|
|
|
|
print STDERR "NNexus::Index::$domain not available, fallback to generic indexer.\n"; |
41
|
|
|
|
|
|
|
print STDERR "Reason: $@\n" if $@; |
42
|
|
|
|
|
|
|
require NNexus::Index::Template; |
43
|
|
|
|
|
|
|
# The generic template will always fail... |
44
|
|
|
|
|
|
|
# TODO: Should we fallback to Planetmath instead? |
45
|
|
|
|
|
|
|
$index_template = NNexus::Index::Template->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update); |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
bless {index_template=>$index_template,domain=>$domain,db=>$db, |
49
|
|
|
|
|
|
|
verbosity=>$options{verbosity}||0,should_update=>$should_update}, $class; |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub index_step { |
53
|
|
|
|
|
|
|
my ($self,%options) = @_; |
54
|
|
|
|
|
|
|
my $template = $self->{index_template}; |
55
|
|
|
|
|
|
|
my $db = $self->{db}; |
56
|
|
|
|
|
|
|
my $domain = $self->{domain}; |
57
|
|
|
|
|
|
|
my $verbosity = $options{verbosity} ? $options{verbosity} : $self->{verbosity}; |
58
|
|
|
|
|
|
|
# 1. Check if object has already been indexed: |
59
|
|
|
|
|
|
|
my $next_step = $template->next_step; |
60
|
|
|
|
|
|
|
return unless ref $next_step; # Done if nothing left. |
61
|
|
|
|
|
|
|
unshift @{$template->{queue}}, $next_step; # Just peaking, keep it in the queue |
62
|
|
|
|
|
|
|
my $url = $next_step->{url}; # Grab the next canonical URL |
63
|
|
|
|
|
|
|
my $object = $db->select_object_by(url=>$url); |
64
|
|
|
|
|
|
|
my $objectid = $object->{objectid}; |
65
|
|
|
|
|
|
|
my $old_concepts = []; |
66
|
|
|
|
|
|
|
if (! $objectid) { |
67
|
|
|
|
|
|
|
# 1.1. If not present, add it: |
68
|
|
|
|
|
|
|
$objectid = $db->add_object_by(url=>$url,domain=>$domain); |
69
|
|
|
|
|
|
|
} else { |
70
|
|
|
|
|
|
|
# 1.2. If already indexed, grab all concepts defined by the object. |
71
|
|
|
|
|
|
|
$old_concepts = $db->select_concepts_by(objectid=>$objectid); |
72
|
|
|
|
|
|
|
# 1.3. Skip if we don't want to update and the URL is a leaf with some already known concepts |
73
|
|
|
|
|
|
|
if ((!$self->{should_update}) && $template->leaf_test($url) && scalar(@$old_concepts)) { |
74
|
|
|
|
|
|
|
# Skip leaves, when we don't want to update! |
75
|
|
|
|
|
|
|
print STDERR "Skipping over $url\n"; |
76
|
|
|
|
|
|
|
my $indexed_concepts = $template->index_step(skip=>1); |
77
|
|
|
|
|
|
|
return []; } } |
78
|
|
|
|
|
|
|
# 2. Relay the indexing request to the template, gather concepts |
79
|
|
|
|
|
|
|
my $indexed_concepts = $template->index_step(%options); |
80
|
|
|
|
|
|
|
return unless defined $indexed_concepts; # Last step. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Idea: If a page can no longer be accessed, we will never delete it from the object table, |
83
|
|
|
|
|
|
|
# we will only empty its payload (= no concepts defined by it) from the concept table. |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# 3.0.1 Flatten out incoming synonyms and categories to individual concepts: |
86
|
|
|
|
|
|
|
my $new_concepts = flatten_concept_harvest($indexed_concepts); |
87
|
|
|
|
|
|
|
# 3.0.2 Make sure they're admissible names; |
88
|
|
|
|
|
|
|
@$new_concepts = grep {admissible_name($_->{concept})} @$new_concepts; |
89
|
|
|
|
|
|
|
if ($verbosity > 0) { |
90
|
|
|
|
|
|
|
print STDERR "FlatConcepts: ".scalar(@$new_concepts)."|URL: $url\n"; |
91
|
|
|
|
|
|
|
print STDERR Dumper($new_concepts); |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
# 3.1 Compute diff between previous and new concepts |
94
|
|
|
|
|
|
|
my ($delete_concepts,$add_concepts) = diff_concept_harvests($old_concepts,$new_concepts); |
95
|
|
|
|
|
|
|
# 4. Delete no longer present concepts |
96
|
|
|
|
|
|
|
my $invalidated_URLs = []; |
97
|
|
|
|
|
|
|
foreach my $delc(@$delete_concepts) { |
98
|
|
|
|
|
|
|
my $concepts = $db->select_concepts_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid); |
99
|
|
|
|
|
|
|
my $delc_id = $concepts->[0]->{conceptid}; |
100
|
|
|
|
|
|
|
$db->delete_concept_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid); |
101
|
|
|
|
|
|
|
push @$invalidated_URLs, |
102
|
|
|
|
|
|
|
$db->invalidate_by(conceptid=>$delc_id); |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
# 5. Add newly introduced concepts |
105
|
|
|
|
|
|
|
foreach my $addc(@$add_concepts) { |
106
|
|
|
|
|
|
|
my $addc_id = |
107
|
|
|
|
|
|
|
$db->add_concept_by(concept=>$addc->{concept},category=>$addc->{category},objectid=>$objectid, |
108
|
|
|
|
|
|
|
domain=>$domain,link=>($addc->{url}||$url),scheme=>$addc->{scheme}); |
109
|
|
|
|
|
|
|
push @$invalidated_URLs, |
110
|
|
|
|
|
|
|
$db->invalidate_by(conceptid=>$addc_id); |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
# Add the http:// prefix before returning: |
113
|
|
|
|
|
|
|
@$invalidated_URLs = map {'http://'.$_} @$invalidated_URLs; |
114
|
|
|
|
|
|
|
# 6. Return URLs to be invalidated as effect: |
115
|
|
|
|
|
|
|
return $invalidated_URLs; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
1; |
119
|
|
|
|
|
|
|
__END__ |