| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # /=====================================================================\ # | 
| 2 |  |  |  |  |  |  | # | NNexus Autolinker                                                   | # | 
| 3 |  |  |  |  |  |  | # | Indexing Plug-in, Wikipedia.org domain                              | # | 
| 4 |  |  |  |  |  |  | # |=====================================================================| # | 
| 5 |  |  |  |  |  |  | # | Part of the Planetary project: http://trac.mathweb.org/planetary    | # | 
| 6 |  |  |  |  |  |  | # |  Research software, produced as part of work done by:               | # | 
| 7 |  |  |  |  |  |  | # |  the KWARC group at Jacobs University                               | # | 
| 8 |  |  |  |  |  |  | # | Copyright (c) 2012                                                  | # | 
| 9 |  |  |  |  |  |  | # | Released under the MIT License (MIT)                                | # | 
| 10 |  |  |  |  |  |  | # |---------------------------------------------------------------------| # | 
| 11 |  |  |  |  |  |  | # | Adapted from the original NNexus code by                            | # | 
| 12 |  |  |  |  |  |  | # |                                  James Gardner and Aaron Krowne     | # | 
| 13 |  |  |  |  |  |  | # |---------------------------------------------------------------------| # | 
| 14 |  |  |  |  |  |  | # | Deyan Ginev                   #_#     | # | 
| 15 |  |  |  |  |  |  | # | http://kwarc.info/people/dginev                            (o o)    | # | 
| 16 |  |  |  |  |  |  | # \=========================================================ooo==U==ooo=/ # | 
| 17 |  |  |  |  |  |  | package NNexus::Index::Wikipedia; | 
| 18 | 1 |  |  | 1 |  | 427 | use warnings; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 32 |  | 
| 19 | 1 |  |  | 1 |  | 5 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 26 |  | 
| 20 | 1 |  |  | 1 |  | 3 | use base qw(NNexus::Index::Template); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 64 |  | 
| 21 |  |  |  |  |  |  | # Special Blacklist for Wikipedia categories: | 
| 22 | 1 |  |  | 1 |  | 581 | use NNexus::Index::Wikipedia::Lists; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 229 |  | 
| 23 |  |  |  |  |  |  |  | 
| 24 | 1 |  |  | 1 |  | 8 | use feature 'say'; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 102 |  | 
| 25 | 1 |  |  | 1 |  | 1267 | use List::MoreUtils qw(uniq); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | # EN.Wikipedia.org indexing template | 
| 29 |  |  |  |  |  |  | # 1. We want to start from the top-level math category | 
| 30 |  |  |  |  |  |  | sub domain_root { "http://en.wikipedia.org/wiki/Category:Mathematics"; } | 
| 31 |  |  |  |  |  |  | our $category_test = qr/\/wiki\/Category:(.+)$/; | 
| 32 |  |  |  |  |  |  | our $english_category_test = qr/^\/wiki\/Category:/; | 
| 33 |  |  |  |  |  |  | our $english_concept_test = qr/^\/wiki\/[^\/\:]+$/; | 
| 34 |  |  |  |  |  |  | our $wiki_base = 'http://en.wikipedia.org'; | 
| 35 |  |  |  |  |  |  | # 2. Candidate links to subcategories and concept pages | 
| 36 |  |  |  |  |  |  | sub candidate_links { | 
| 37 |  |  |  |  |  |  | my ($self)=@_; | 
| 38 |  |  |  |  |  |  | my $url = $self->current_url; | 
| 39 |  |  |  |  |  |  | # Add links from subcategory pages | 
| 40 |  |  |  |  |  |  | if ($url =~ /$category_test/ ) { | 
| 41 |  |  |  |  |  |  | my $category_name = $1; | 
| 42 |  |  |  |  |  |  | return [] if $wiki_category_blacklist->{$category_name}; | 
| 43 |  |  |  |  |  |  | my $dom = $self->current_dom; | 
| 44 |  |  |  |  |  |  | my $subcategories = $dom->find('#mw-subcategories')->[0]; | 
| 45 |  |  |  |  |  |  | my @category_links = (); | 
| 46 |  |  |  |  |  |  | if( defined $subcategories ) { | 
| 47 |  |  |  |  |  |  | @category_links = $subcategories->find('a')->each; | 
| 48 |  |  |  |  |  |  | @category_links = grep {defined && /$english_category_test/} map {$_->{href}} @category_links; } | 
| 49 |  |  |  |  |  |  | # Also add terminal links: | 
| 50 |  |  |  |  |  |  | my $concepts = $dom->find('#mw-pages')->[0]; | 
| 51 |  |  |  |  |  |  | my @concept_links = $concepts->find('a')->each if defined $concepts; | 
| 52 |  |  |  |  |  |  | @concept_links = grep {defined && /$english_concept_test/} map {$_->{href}} @concept_links; | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | my $candidates = [ map {$wiki_base . $_ } (@category_links, @concept_links) ]; | 
| 55 |  |  |  |  |  |  | return $candidates; | 
| 56 |  |  |  |  |  |  | } else {return [];} # skip leaves | 
| 57 |  |  |  |  |  |  | } | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | # Index a concept page, ignore category pages | 
| 60 |  |  |  |  |  |  | sub index_page { | 
| 61 |  |  |  |  |  |  | my ($self) = @_; | 
| 62 |  |  |  |  |  |  | my $url = $self->current_url; | 
| 63 |  |  |  |  |  |  | # Nothing to do in category pages | 
| 64 |  |  |  |  |  |  | return [] unless $self->leaf_test($url); | 
| 65 |  |  |  |  |  |  | my $dom = $self->current_dom; | 
| 66 |  |  |  |  |  |  | # We might want to index a leaf page when descending from different categories, so keep them marked as "not visited" | 
| 67 |  |  |  |  |  |  | delete $self->{visited}->{$url}; | 
| 68 |  |  |  |  |  |  | my ($concept) = map {/([^\(]+)/; lc(rtrim($1));} $dom->find('span[dir="auto"]')->pluck('all_text')->each; | 
| 69 |  |  |  |  |  |  | my @synonyms; | 
| 70 |  |  |  |  |  |  | # Bold entries in the first paragraph are typically synonyms. | 
| 71 |  |  |  |  |  |  | my $first_p = $dom->find('p')->[0]; | 
| 72 |  |  |  |  |  |  | @synonyms = (grep {(length($_)>4) && ($_ ne $concept)} map {lc $_} $first_p->children('b')->pluck('all_text')->each) if $first_p; | 
| 73 |  |  |  |  |  |  | my $categories = $self->current_categories || ['XX-XX']; | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | return [{ url => $url, | 
| 76 |  |  |  |  |  |  | concept => $concept, | 
| 77 |  |  |  |  |  |  | scheme => 'wiki', | 
| 78 |  |  |  |  |  |  | categories => $categories, | 
| 79 |  |  |  |  |  |  | @synonyms ? (synonyms => \@synonyms) : () | 
| 80 |  |  |  |  |  |  | }]; | 
| 81 |  |  |  |  |  |  | } | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | sub candidate_categories { | 
| 84 |  |  |  |  |  |  | my ($self) = @_; | 
| 85 |  |  |  |  |  |  | if ($self->current_url =~ /$category_test/ ) { | 
| 86 |  |  |  |  |  |  | return [$1]; | 
| 87 |  |  |  |  |  |  | } else { | 
| 88 |  |  |  |  |  |  | return $self->current_categories; | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  | } | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | # The subcategories trail into unrelated topics after the 4th level... | 
| 93 |  |  |  |  |  |  | sub depth_limit {20;} # But let's bite the bullet and manually strip away the ones that are pointless | 
| 94 |  |  |  |  |  |  | sub leaf_test { $_[1] !~ /$category_test/ } | 
| 95 |  |  |  |  |  |  | # Utility: | 
| 96 |  |  |  |  |  |  | # Right trim function to remove trailing whitespace | 
| 97 |  |  |  |  |  |  | sub rtrim { | 
| 98 |  |  |  |  |  |  | my $string = shift; | 
| 99 |  |  |  |  |  |  | $string =~ s/\s+$//; | 
| 100 |  |  |  |  |  |  | return $string; } | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | 1; | 
| 103 |  |  |  |  |  |  | __END__ |