|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # /=====================================================================\ #  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | NNexus Autolinker                                                   | #  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Indexing Plug-in, Wikipedia.org domain                              | #  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |=====================================================================| #  | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Part of the Planetary project: http://trac.mathweb.org/planetary    | #  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |  Research software, produced as part of work done by:               | #  | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |  the KWARC group at Jacobs University                               | #  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Copyright (c) 2012                                                  | #  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Released under the MIT License (MIT)                                | #  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |---------------------------------------------------------------------| #  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Adapted from the original NNexus code by                            | #  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |                                  James Gardner and Aaron Krowne     | #  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |---------------------------------------------------------------------| #  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Deyan Ginev                   #_#     | #  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | http://kwarc.info/people/dginev                            (o o)    | #  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # \=========================================================ooo==U==ooo=/ #  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package NNexus::Index::Wikipedia;  | 
| 
18
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
978
 | 
 use warnings;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
98
 | 
    | 
| 
19
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
14
 | 
 use strict;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
98
 | 
    | 
| 
20
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
12
 | 
 use base qw(NNexus::Index::Template);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
495
 | 
    | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Special Blacklist for Wikipedia categories:  | 
| 
22
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
2384
 | 
 use NNexus::Index::Wikipedia::Lists;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1468
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
24
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
33
 | 
 use feature 'say';  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
283
 | 
    | 
| 
25
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
606
 | 
 use List::MoreUtils qw(uniq);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8355
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # EN.Wikipedia.org indexing template  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 1. We want to start from the top-level math category  | 
| 
30
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 sub domain_root { "http://en.wikipedia.org/wiki/Category:Mathematics"; }  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $category_test = qr/\/wiki\/Category:(.+)$/;  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $english_category_test = qr/^\/wiki\/Category:/;  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $english_concept_test = qr/^\/wiki\/[^\/\:]+$/;  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $wiki_base = 'http://en.wikipedia.org';  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 2. Candidate links to subcategories and concept pages  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub candidate_links {  | 
| 
37
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
3
 | 
   my ($self)=@_;  | 
| 
38
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
   my $url = $self->current_url;  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Add links from subcategory pages  | 
| 
40
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
9
 | 
   if ($url =~ /$category_test/ ) {  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
    | 
| 
41
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $category_name = $1;  | 
| 
42
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return [] if $wiki_category_blacklist->{$category_name};  | 
| 
43
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $dom = $self->current_dom;  | 
| 
44
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $subcategories = $dom->find('#mw-subcategories')->[0];  | 
| 
45
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my @category_links = ();  | 
| 
46
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     if( defined $subcategories ) {  | 
| 
47
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       @category_links = $subcategories->find('a')->each;  | 
| 
48
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       @category_links = grep {defined && /$english_category_test/} map {$_->{href}} @category_links; }  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Also add terminal links:  | 
| 
50
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $concepts = $dom->find('#mw-pages')->[0];  | 
| 
51
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my @concept_links = $concepts->find('a')->each if defined $concepts;  | 
| 
52
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     @concept_links = grep {defined && /$english_concept_test/} map {$_->{href}} @concept_links;  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
54
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $candidates = [ map {$wiki_base . $_ } (@category_links, @concept_links) ];  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
55
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return $candidates;  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {return [];} # skip leaves  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Index a concept page, ignore category pages  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub index_page {   | 
| 
61
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
2
 | 
   my ($self) = @_;  | 
| 
62
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
   my $url = $self->current_url;  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Nothing to do in category pages  | 
| 
64
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
6
 | 
   return [] unless $self->leaf_test($url);  | 
| 
65
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
   my $dom = $self->current_dom;  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # We might want to index a leaf page when descending from different categories, so keep them marked as "not visited"  | 
| 
67
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
   delete $self->{visited}->{$url};  | 
| 
68
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
   my ($concept) = map {/([^\(]+)/; lc(rtrim($1));} $dom->find('span[dir="auto"]')->map('all_text')->each;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
85122
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
69
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
71
 | 
   my @synonyms;  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Bold entries in the first paragraph are typically synonyms.  | 
| 
71
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
   my $first_p = $dom->find('p')->[0];    | 
| 
72
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
83997
 | 
   @synonyms = (grep {(length($_)>4) && ($_ ne $concept)} map {lc $_} $first_p->children('b')->map('all_text')->each) if $first_p;  | 
| 
 
 | 
4
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
30
 | 
    | 
| 
 
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2394
 | 
    | 
| 
73
 | 
2
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
64
 | 
   my $categories = $self->current_categories || ['XX-XX'];  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
24
 | 
   return [{ url => $url,  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	 concept => $concept,  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    scheme => 'wiki',  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	 categories => $categories,  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	 @synonyms ? (synonyms => \@synonyms) : ()  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    }];  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub candidate_categories {  | 
| 
84
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
4
 | 
 	my ($self) = @_;  | 
| 
85
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
8
 | 
 	if ($self->current_url =~ /$category_test/ ) {  | 
| 
86
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		return [$1];  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	} else {  | 
| 
88
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
 		return $self->current_categories;  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # The subcategories trail into unrelated topics after the 4th level...  | 
| 
93
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
  
1
  
 | 
16
 | 
 sub depth_limit {20;} # But let's bite the bullet and manually strip away the ones that are pointless  | 
| 
94
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
16
 | 
 sub leaf_test { $_[1] !~ /$category_test/ }  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Utility:  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Right trim function to remove trailing whitespace  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub rtrim {  | 
| 
98
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
7
 | 
 	my $string = shift;  | 
| 
99
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
 	$string =~ s/\s+$//;  | 
| 
100
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
 	return $string; }  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |