| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # /=====================================================================\ # | 
| 2 |  |  |  |  |  |  | # | NNexus Autolinker                                                   | # | 
| 3 |  |  |  |  |  |  | # | Indexing Plug-in, Wikipedia.org domain                              | # | 
| 4 |  |  |  |  |  |  | # |=====================================================================| # | 
| 5 |  |  |  |  |  |  | # | Part of the Planetary project: http://trac.mathweb.org/planetary    | # | 
| 6 |  |  |  |  |  |  | # |  Research software, produced as part of work done by:               | # | 
| 7 |  |  |  |  |  |  | # |  the KWARC group at Jacobs University                               | # | 
| 8 |  |  |  |  |  |  | # | Copyright (c) 2012                                                  | # | 
| 9 |  |  |  |  |  |  | # | Released under the MIT License (MIT)                                | # | 
| 10 |  |  |  |  |  |  | # |---------------------------------------------------------------------| # | 
| 11 |  |  |  |  |  |  | # | Adapted from the original NNexus code by                            | # | 
| 12 |  |  |  |  |  |  | # |                                  James Gardner and Aaron Krowne     | # | 
| 13 |  |  |  |  |  |  | # |---------------------------------------------------------------------| # | 
| 14 |  |  |  |  |  |  | # | Deyan Ginev                   #_#     | # | 
| 15 |  |  |  |  |  |  | # | http://kwarc.info/people/dginev                            (o o)    | # | 
| 16 |  |  |  |  |  |  | # \=========================================================ooo==U==ooo=/ # | 
| 17 |  |  |  |  |  |  | package NNexus::Index::Wikipedia; | 
| 18 | 3 |  |  | 3 |  | 978 | use warnings; | 
|  | 3 |  |  |  |  | 4 |  | 
|  | 3 |  |  |  |  | 98 |  | 
| 19 | 3 |  |  | 3 |  | 14 | use strict; | 
|  | 3 |  |  |  |  | 3 |  | 
|  | 3 |  |  |  |  | 98 |  | 
| 20 | 3 |  |  | 3 |  | 12 | use base qw(NNexus::Index::Template); | 
|  | 3 |  |  |  |  | 4 |  | 
|  | 3 |  |  |  |  | 495 |  | 
| 21 |  |  |  |  |  |  | # Special Blacklist for Wikipedia categories: | 
| 22 | 3 |  |  | 3 |  | 2384 | use NNexus::Index::Wikipedia::Lists; | 
|  | 3 |  |  |  |  | 10 |  | 
|  | 3 |  |  |  |  | 1468 |  | 
| 23 |  |  |  |  |  |  |  | 
| 24 | 3 |  |  | 3 |  | 33 | use feature 'say'; | 
|  | 3 |  |  |  |  | 3 |  | 
|  | 3 |  |  |  |  | 283 |  | 
| 25 | 3 |  |  | 3 |  | 606 | use List::MoreUtils qw(uniq); | 
|  | 3 |  |  |  |  | 8355 |  | 
|  | 3 |  |  |  |  | 36 |  | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | # EN.Wikipedia.org indexing template | 
| 29 |  |  |  |  |  |  | # 1. We want to start from the top-level math category | 
| 30 | 0 |  |  | 0 | 1 | 0 | sub domain_root { "http://en.wikipedia.org/wiki/Category:Mathematics"; } | 
| 31 |  |  |  |  |  |  | our $category_test = qr/\/wiki\/Category:(.+)$/; | 
| 32 |  |  |  |  |  |  | our $english_category_test = qr/^\/wiki\/Category:/; | 
| 33 |  |  |  |  |  |  | our $english_concept_test = qr/^\/wiki\/[^\/\:]+$/; | 
| 34 |  |  |  |  |  |  | our $wiki_base = 'http://en.wikipedia.org'; | 
| 35 |  |  |  |  |  |  | # 2. Candidate links to subcategories and concept pages | 
| 36 |  |  |  |  |  |  | sub candidate_links { | 
| 37 | 2 |  |  | 2 | 1 | 3 | my ($self)=@_; | 
| 38 | 2 |  |  |  |  | 5 | my $url = $self->current_url; | 
| 39 |  |  |  |  |  |  | # Add links from subcategory pages | 
| 40 | 2 | 50 |  |  |  | 9 | if ($url =~ /$category_test/ ) { | 
|  | 2 |  |  |  |  | 5 |  | 
| 41 | 0 |  |  |  |  | 0 | my $category_name = $1; | 
| 42 | 0 | 0 |  |  |  | 0 | return [] if $wiki_category_blacklist->{$category_name}; | 
| 43 | 0 |  |  |  |  | 0 | my $dom = $self->current_dom; | 
| 44 | 0 |  |  |  |  | 0 | my $subcategories = $dom->find('#mw-subcategories')->[0]; | 
| 45 | 0 |  |  |  |  | 0 | my @category_links = (); | 
| 46 | 0 | 0 |  |  |  | 0 | if( defined $subcategories ) { | 
| 47 | 0 |  |  |  |  | 0 | @category_links = $subcategories->find('a')->each; | 
| 48 | 0 | 0 |  |  |  | 0 | @category_links = grep {defined && /$english_category_test/} map {$_->{href}} @category_links; } | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 49 |  |  |  |  |  |  | # Also add terminal links: | 
| 50 | 0 |  |  |  |  | 0 | my $concepts = $dom->find('#mw-pages')->[0]; | 
| 51 | 0 | 0 |  |  |  | 0 | my @concept_links = $concepts->find('a')->each if defined $concepts; | 
| 52 | 0 | 0 |  |  |  | 0 | @concept_links = grep {defined && /$english_concept_test/} map {$_->{href}} @concept_links; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 53 |  |  |  |  |  |  |  | 
| 54 | 0 |  |  |  |  | 0 | my $candidates = [ map {$wiki_base . $_ } (@category_links, @concept_links) ]; | 
|  | 0 |  |  |  |  | 0 |  | 
| 55 | 0 |  |  |  |  | 0 | return $candidates; | 
| 56 |  |  |  |  |  |  | } else {return [];} # skip leaves | 
| 57 |  |  |  |  |  |  | } | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | # Index a concept page, ignore category pages | 
| 60 |  |  |  |  |  |  | sub index_page { | 
| 61 | 2 |  |  | 2 | 1 | 2 | my ($self) = @_; | 
| 62 | 2 |  |  |  |  | 6 | my $url = $self->current_url; | 
| 63 |  |  |  |  |  |  | # Nothing to do in category pages | 
| 64 | 2 | 50 |  |  |  | 6 | return [] unless $self->leaf_test($url); | 
| 65 | 2 |  |  |  |  | 5 | my $dom = $self->current_dom; | 
| 66 |  |  |  |  |  |  | # We might want to index a leaf page when descending from different categories, so keep them marked as "not visited" | 
| 67 | 2 |  |  |  |  | 5 | delete $self->{visited}->{$url}; | 
| 68 | 2 |  |  |  |  | 7 | my ($concept) = map {/([^\(]+)/; lc(rtrim($1));} $dom->find('span[dir="auto"]')->map('all_text')->each; | 
|  | 2 |  |  |  |  | 85122 |  | 
|  | 2 |  |  |  |  | 11 |  | 
| 69 | 2 |  |  |  |  | 71 | my @synonyms; | 
| 70 |  |  |  |  |  |  | # Bold entries in the first paragraph are typically synonyms. | 
| 71 | 2 |  |  |  |  | 9 | my $first_p = $dom->find('p')->[0]; | 
| 72 | 2 | 50 |  |  |  | 83997 | @synonyms = (grep {(length($_)>4) && ($_ ne $concept)} map {lc $_} $first_p->children('b')->map('all_text')->each) if $first_p; | 
|  | 4 | 50 |  |  |  | 30 |  | 
|  | 4 |  |  |  |  | 2394 |  | 
| 73 | 2 |  | 50 |  |  | 64 | my $categories = $self->current_categories || ['XX-XX']; | 
| 74 |  |  |  |  |  |  |  | 
| 75 | 2 | 50 |  |  |  | 24 | return [{ url => $url, | 
| 76 |  |  |  |  |  |  | concept => $concept, | 
| 77 |  |  |  |  |  |  | scheme => 'wiki', | 
| 78 |  |  |  |  |  |  | categories => $categories, | 
| 79 |  |  |  |  |  |  | @synonyms ? (synonyms => \@synonyms) : () | 
| 80 |  |  |  |  |  |  | }]; | 
| 81 |  |  |  |  |  |  | } | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | sub candidate_categories { | 
| 84 | 2 |  |  | 2 | 1 | 4 | my ($self) = @_; | 
| 85 | 2 | 50 |  |  |  | 8 | if ($self->current_url =~ /$category_test/ ) { | 
| 86 | 0 |  |  |  |  | 0 | return [$1]; | 
| 87 |  |  |  |  |  |  | } else { | 
| 88 | 2 |  |  |  |  | 9 | return $self->current_categories; | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  | } | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | # The subcategories trail into unrelated topics after the 4th level... | 
| 93 | 4 |  |  | 4 | 1 | 16 | sub depth_limit {20;} # But let's bite the bullet and manually strip away the ones that are pointless | 
| 94 | 2 |  |  | 2 | 0 | 16 | sub leaf_test { $_[1] !~ /$category_test/ } | 
| 95 |  |  |  |  |  |  | # Utility: | 
| 96 |  |  |  |  |  |  | # Right trim function to remove trailing whitespace | 
| 97 |  |  |  |  |  |  | sub rtrim { | 
| 98 | 2 |  |  | 2 | 0 | 7 | my $string = shift; | 
| 99 | 2 |  |  |  |  | 6 | $string =~ s/\s+$//; | 
| 100 | 2 |  |  |  |  | 8 | return $string; } | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | 1; | 
| 103 |  |  |  |  |  |  | __END__ |