line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
3
|
|
|
|
|
|
|
# | Indexing Plug-in, PlanetMath.org domain | # |
4
|
|
|
|
|
|
|
# |=====================================================================| # |
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
17
|
|
|
|
|
|
|
package NNexus::Index::Planetmath; |
18
|
6
|
|
|
6
|
|
1464
|
use warnings; |
|
6
|
|
|
|
|
9
|
|
|
6
|
|
|
|
|
220
|
|
19
|
6
|
|
|
6
|
|
29
|
use strict; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
321
|
|
20
|
6
|
|
|
6
|
|
28
|
use base qw(NNexus::Index::Template); |
|
6
|
|
|
|
|
71
|
|
|
6
|
|
|
|
|
3899
|
|
21
|
|
|
|
|
|
|
|
22
|
0
|
|
|
0
|
1
|
0
|
# Entry point of the crawl for this domain: the PlanetMath article listing.
sub domain_root {
  return "http://planetmath.org/articles";
}
23
|
|
|
|
|
|
|
# Site base URL; candidate_links prepends it to the site-relative hrefs it collects.
our $pm_base = "http://planetmath.org";
24
|
|
|
|
|
|
|
# Collect the URLs to crawl next from the current page.
#
# Returns an array reference of absolute URLs ($pm_base prepended to each
# site-relative href). Pages for which leaf_test is true yield an empty list.
# Pagination links come first in the result, followed by the entry links.
# NOTE(review): current_dom presumably returns a Mojo::DOM-like object
# (find/each, hash access to attributes) - confirm against the Template class.
sub candidate_links {
  my ($self) = @_;
  my $page_url = $self->current_url;
  # Leaf pages are indexed, not crawled further.
  if ($self->leaf_test($page_url)) {
    return [];
  }
  my $dom = $self->current_dom;

  # Encyclopedia entries are root links "/entry"
  my @entry_paths;
  my $listing = $dom->find('div[class="view-content"]')->[0];
  if ($listing) {
    @entry_paths = $listing->find('a')->each;
  }
  @entry_paths = grep { defined($_) && /^\/(\w+)$/ }
                 map  { $_->{href} } @entry_paths;

  # Further links can be found in: "/articles?section=All&page=NUMBER"
  # The pager is taken from the second "item-list" div on the page.
  my @pager_paths;
  my $pager = $dom->find('div[class="item-list"]')->[1];
  if ($pager) {
    @pager_paths = $pager->find('a')->each;
  }
  @pager_paths = grep { defined($_) && /^\/articles\?section=All/ }
                 map  { $_->{href} } @pager_paths;

  # Turn the relative paths into absolute URLs.
  return [ map { $pm_base . $_ } @pager_paths, @entry_paths ];
}
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Harvest the concepts defined on the current page.
#
# Returns an array reference of hash references. Each concept announced via a
# "pm:defines" div yields { url, concept, categories }; the page title yields a
# final record { url, concept, categories, synonyms }.
# An empty array reference is returned when:
#   * the page is not a leaf (leaf_test is false for its URL),
#   * there is no "dct:title" div, or its "content" attribute is missing/empty,
#   * there is no section with class "ltx_document".
# Categories come from the "dct:subject" divs with the "msc:" prefix stripped;
# when none are present the placeholder 'XX-XX' is used.
# NOTE(review): current_dom presumably returns a Mojo::DOM-like object
# (xml/find/attr/each) - confirm against the Template class.
sub index_page {
  my ($self) = @_;
  my $url = $self->current_url;
  return [] unless $self->leaf_test($url);
  my $dom = $self->current_dom->xml(1);

  # Only concepts have titles, so return an empty harvest if undefined:
  my $title_div = $dom->find('div[property="dct:title"]')->[0];
  return [] unless $title_div;
  my $title = $title_div->attr('content');
  # A title div without usable content cannot name a concept either.
  return [] unless defined($title) && length($title);

  # Also record defined concepts
  my $content_div = $dom->find('section[class="ltx_document"]')->[0];
  return [] unless $content_div;
  my @defined_concepts = $content_div->find('div[property="pm:defines"]')->each;

  # Subject divs lacking a "resource" attribute are skipped rather than
  # fed (as undef) into the substitution below.
  my @categories;
  foreach my $subject ($content_div->find('div[class="ltx_rdf"][property="dct:subject"]')->each) {
    my $resource = $subject->attr('resource');
    next unless defined $resource;
    $resource =~ s/^msc\://;
    push @categories, $resource if length($resource) > 0;
  }
  @categories = ('XX-XX') unless @categories;

  # Synonym divs without a "content" attribute carry no information.
  my @synonyms = grep { defined($_) && length($_) }
                 map  { $_->attr('content') }
                 $content_div->find('div[class="ltx_rdf"][property="pm:synonym"]')->each;

  my @harvest;
  foreach my $defined (@defined_concepts) {
    my $name = $defined->attr('content');
    next unless defined $name;
    $name =~ s/^pmconcept\://;
    next unless length $name;
    # TODO: No special chars
    # Wild chars in synonyms - people use TeX math syntax, e.g. ^, $, + ... should we LaTeXML-convert?
    # Right now we just skip over...
    push @harvest, {
      url        => $url,
      concept    => $name,
      # Each record gets its own copy so a consumer editing one record's
      # categories does not silently change the others.
      categories => [@categories],
    };
  }
  # Title with synonyms:
  push @harvest, {
    url        => $url,
    concept    => $title,
    categories => [@categories],
    synonyms   => \@synonyms,
  };
  return \@harvest;
}
78
|
|
|
|
|
|
|
|
79
|
14
|
|
|
14
|
1
|
48
|
# Maximum crawl depth for this domain.
# We're just traversing down the list of pages, nothing dangerous here
sub depth_limit {
  return 10000;
}
80
|
0
|
|
|
0
|
1
|
0
|
# Request pacing value for this domain.
# NOTE(review): presumably seconds between consecutive requests - confirm in the Template class.
sub request_interval {
  return 0.5;
}
81
|
|
|
|
|
|
|
# Only concepts have titles, so consider next links IF undefined: |
82
|
14
|
|
|
14
|
0
|
69
|
# Decide whether a URL is a leaf (an entry page to index) rather than a
# listing page to crawl: true exactly when the URL does not contain "/articles".
sub leaf_test {
  my ($self, $url) = @_;
  return $url !~ /\/articles/;
}
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
1; |
85
|
|
|
|
|
|
|
__END__ |