line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
3
|
|
|
|
|
|
|
# | Indexing Plug-in, MathWorld.wolfram.com domain | # |
4
|
|
|
|
|
|
|
# |=====================================================================| # |
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
17
|
|
|
|
|
|
|
package NNexus::Index::Mathworld; |
18
|
3
|
|
|
3
|
|
1319
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
132
|
|
19
|
3
|
|
|
3
|
|
20
|
use strict; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
133
|
|
20
|
3
|
|
|
3
|
|
17
|
use base qw(NNexus::Index::Template); |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
1727
|
|
21
|
|
|
|
|
|
|
|
22
|
0
|
|
|
0
|
1
|
0
|
# URL of MathWorld's alphabetical "letters" index, reported as this
# plug-in's domain root.
sub domain_root {
    return "http://mathworld.wolfram.com/letters/";
}
23
|
0
|
|
|
0
|
0
|
0
|
# Scheme and host of the MathWorld site (no trailing slash); candidate_links
# prepends this to every href it collects.
sub domain_base {
    return "http://mathworld.wolfram.com";
}
24
|
|
|
|
|
|
|
# Collect the follow-up URLs found on the page currently being processed.
#
# The links are taken from the element with id "directory" or, when that is
# missing, from the element with id "directorysix" (top-level page?).  Pages
# with neither element yield no candidates, so only the alphabetical index
# pages are followed.
#
# Every <a> inside that element that carries an href contributes
# domain_base . href; anchors without an href are skipped.
#
# Returns an array reference of URL strings (possibly empty).
sub candidate_links {
    my ($self) = @_;
    my $dom = $self->current_dom;

    my $directory = $dom->find('#directory')->[0]
        || $dom->find('#directorysix')->[0];    # Top level?
    return [] unless $directory;    # Only index the alphabetical indices

    # The hrefs are concatenated verbatim onto the site base, so they are
    # presumably root-relative ("/letters/A.html") -- verify against the site.
    my $base      = $self->domain_base;
    my @next_jobs =
        map  { $base . $_ }
        grep { defined }
        map  { $_->{href} } $directory->find('a')->each;

    return \@next_jobs;
}
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Extract the concept defined by the page currently being processed.
#
# Only leaf pages (as decided by leaf_test on the current URL) are indexed.
# The concept name is the content of the <meta name="DC.Title"> tag in the
# document head; the categories are the contents of all
# <meta scheme="MSC_2000"> tags, falling back to the placeholder 'XX-XX'
# when the page declares none.
#
# Returns an array reference holding a single hash
#   { url => ..., concept => ..., categories => [...] }
# or an empty array reference when the page is not a leaf or has no usable
# title.
sub index_page {
    my ($self) = @_;
    my $url = $self->current_url;
    return [] unless $self->leaf_test($url);
    my $dom = $self->current_dom;

    # Every MSC_2000 meta tag contributes one category.  The array is
    # declared separately from the conditional assignment because
    # "my ... if COND" has undefined behaviour in Perl; tags lacking a
    # content attribute are skipped instead of yielding undef categories.
    my $msc = $dom->find(':root > head > meta[scheme="MSC_2000"]');
    my @categories;
    if ($msc) {
        @categories = grep { defined && length }
            $msc->map(sub { $_->attr('content') })->each;
    }
    @categories = ('XX-XX') unless @categories;

    my $title = $dom->find(':root > head > meta[name="DC.Title"]')->[0];
    my $name  = $title && $title->attr('content');
    return [] unless $name;

    return [
        {
            url        => $url,
            concept    => $name,
            categories => \@categories,
        }
    ];
}
54
|
|
|
|
|
|
|
|
55
|
4
|
|
|
4
|
1
|
15
|
# Depth limit reported by this plug-in.
sub depth_limit {
    return 10;
}
56
|
0
|
|
|
0
|
1
|
0
|
# Request interval reported by this plug-in (presumably seconds -- confirm
# against the base class).
# We'll sleep manually extra for the GET requests on the letters index.
sub request_interval {
    return 12;
}
57
|
2
|
|
|
2
|
0
|
10
|
# A URL counts as a leaf page when it does not contain the string "letters",
# i.e. when it is not one of the alphabetical index pages.
# Returns a true value for leaf URLs and a false value otherwise.
sub leaf_test {
    my ($self, $url) = @_;
    return !($url =~ /letters/);
}
58
|
|
|
|
|
|
|
1; |
59
|
|
|
|
|
|
|
__END__ |