| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
|
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
|
3
|
|
|
|
|
|
|
# | Concept Discovery Module | # |
|
4
|
|
|
|
|
|
|
# |=====================================================================| # |
|
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
|
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
|
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
|
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
|
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
|
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
|
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
|
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
|
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
|
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
|
17
|
|
|
|
|
|
|
package NNexus::Discover; |
|
18
|
5
|
|
|
5
|
|
20
|
use strict; |
|
|
5
|
|
|
|
|
5
|
|
|
|
5
|
|
|
|
|
150
|
|
|
19
|
5
|
|
|
5
|
|
20
|
use warnings; |
|
|
5
|
|
|
|
|
7
|
|
|
|
5
|
|
|
|
|
95
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
5
|
|
|
5
|
|
16
|
use Exporter; |
|
|
5
|
|
|
|
|
19
|
|
|
|
5
|
|
|
|
|
312
|
|
|
22
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
23
|
|
|
|
|
|
|
our @EXPORT_OK = qw(mine_candidates); |
|
24
|
5
|
|
|
5
|
|
20
|
use Encode qw/is_utf8/; |
|
|
5
|
|
|
|
|
5
|
|
|
|
5
|
|
|
|
|
194
|
|
|
25
|
5
|
|
|
5
|
|
19
|
use utf8; |
|
|
5
|
|
|
|
|
37
|
|
|
|
5
|
|
|
|
|
41
|
|
|
26
|
5
|
|
|
5
|
|
85
|
use Data::Dumper; |
|
|
5
|
|
|
|
|
14
|
|
|
|
5
|
|
|
|
|
252
|
|
|
27
|
5
|
|
|
5
|
|
21
|
use Time::HiRes qw ( time alarm sleep ); |
|
|
5
|
|
|
|
|
4
|
|
|
|
5
|
|
|
|
|
37
|
|
|
28
|
|
|
|
|
|
|
|
|
29
|
5
|
|
|
5
|
|
2526
|
use NNexus::StopWordList qw(stop_words_ref); |
|
|
5
|
|
|
|
|
8
|
|
|
|
5
|
|
|
|
|
275
|
|
|
30
|
5
|
|
|
5
|
|
1441
|
use NNexus::Morphology qw(normalize_word); |
|
|
5
|
|
|
|
|
8
|
|
|
|
5
|
|
|
|
|
285
|
|
|
31
|
5
|
|
|
5
|
|
1885
|
use NNexus::Concepts qw(clone_concepts); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
use HTML::Parser; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Reusable parser object (TODO: What happens if we thread/fork ?)
# Shared, configured once at load time. Per-document state (mined_candidates,
# text_length, state_information, noparse, fresh_skip) is attached to the
# parser object itself before each parse -- not thread/fork safe.
our $HTML_Parser =
  HTML::Parser->new(
  'api_version' => 3,
  # Start-tag handler: suppress concept mining inside non-prose elements
  # (head/style/script/math/...), existing anchors, headings, and any
  # <span class="...nolink..."> region. $self->{noparse} is a nesting depth
  # counter; $self->{fresh_skip} marks that the most recent start tag opened
  # a skipped region (consumed by the end-tag handler below).
  'start_h' => [sub {
    my ($self,$tagname,$attr)=@_;
    if ($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/ ||
        (($tagname eq 'span') && $attr->{class} && ($attr->{class} =~ 'nolink'))) {
      $self->{fresh_skip}=1;
      $self->{noparse}++;
    } else {
      $self->{fresh_skip}=0;
    }
  } , 'self, tagname, attr'],
  # End-tag handler: decrement the skip depth when a skipped element closes.
  # The length($tagname)==0 case handles empty-element/implicit closes that
  # immediately follow a freshly-skipped start tag (fresh_skip == 1).
  'end_h' => [sub {
    my ($self,$tagname)=@_;
    if (($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/) ||
        (((length($tagname)==0)||($tagname eq 'span')) && ($self->{fresh_skip} == 1))) {
      $self->{noparse}--;
      $self->{fresh_skip}=0;
    }
  }, 'self,tagname'],
  # Text handler: the actual concept discovery, with byte offsets so matches
  # can be mapped back into the original document.
  'text_h' => [\&_text_event_handler, 'self,text,offset']
  );
$HTML_Parser->unbroken_text;          # deliver text as single chunks, not fragments
$HTML_Parser->xml_mode;
$HTML_Parser->attr_encoded(1);        # keep attribute values entity-encoded
$HTML_Parser->empty_element_tags(1);  # recognize <tag/> self-closing syntax

# Prepare cache for first-word concept lookup.
# Maps first-word => arrayref of candidate concepts; stop words are seeded
# with empty arrayrefs, so the cache doubles as a stop-word blacklist.
our $first_word_cache_template = {map { ($_,[]) } @{stop_words_ref()}};
|
66
|
|
|
|
|
|
|
# Top-level entry point: discover link candidates in a document chunk.
# State: We need a db object with a properly set database
# Input: We need a string representing the (HTML) body of the chunk we're
# mining on, as well as its URL.
# Optional: Deprecated details such as 'domain' or 'format'.
# Interesting: allow 'nolink' again?
# Named options: db (required), body, format ('html' default, or 'text'),
#   url, domain, nolink, verbosity.
# Returns: ($mined_candidates, $text_length) -- arrayref of candidate
#   hashes (deduplicated, self-links removed) and the mined text length.
# Side effects: may add/refresh the object and its links cache in the DB.
sub mine_candidates {
  my (%options) = @_;
  my ($db,$format,$body,$nolink,$url,$domain) =
    map {$options{$_}} qw(db format body nolink url domain);
  die "The db key is a mandatory parameter for mine_candidates!\n" unless ref $db; # TODO: Maybe raise a better error?
  $format = 'html' unless defined $format;
  return ([],0) unless $body;
  # Prepare data, if we have a URL:
  my $objectid; # undefined if we don't have a URL, we only do MoC for named resources
  if ($url) {
    my $object = $db->select_object_by(url=>$url) || {};
    $objectid = $object->{objectid} || -1;
    $domain = $object->{domain} unless defined $domain;
    # If objectid is -1 , we will also need to add_object on the url
    if ($objectid == -1) {
      # TODO: Extract the domain from the URL, this is unreliable
      $objectid = $db->add_object_by(url=>$options{'url'},domain=>$domain);
    } else {
      # If already known, flush the links_cache for this object
      $db->delete_linkscache_by(objectid=>$objectid);
    }
  }
  # Keep a cache of first words, that will simultaneously act as a blacklist.
  # TODO: Incorporate the single words from 'nolink'
  # Experiment: Use a global first_word cache
  # NOTE(review): the template itself is shared, so lookups cached here
  # persist across calls (and across documents/domains) -- intentional
  # experiment per the comment above, but verify this is still desired.
  $options{first_word_cache} = $first_word_cache_template; # Default are stopwords, keep a global cache
  # $options{first_word_cache} = { %$first_word_cache_template }; # Default are stopwords
  # Always return an embedded annotation with links, as well as a stand-off mined_canidates hash, containing the individual concepts with pointers.
  my $time;
  if ($options{verbosity}) {
    $time = time();
  }
  my $mined_candidates=[];
  my $text_length=0;
  if ($format eq 'html') {
    ($mined_candidates,$text_length) = mine_candidates_html(\%options);
  } elsif ($format eq 'text') {
    ($mined_candidates,$text_length) = mine_candidates_text(\%options);
  } else {
    print STDERR "Error: Unrecognized input format for auto-linking.\n";
  }
  # Only mark-up first found candidate, unless requested otherwise
  # Deduplicate on the (concept, link, category) triple, keeping the first
  # occurrence in document order.
  my @uniq_candidates;
  while (@$mined_candidates) {
    my $candidate = shift @$mined_candidates;
    my $concept = $candidate->{concept};
    my $link = $candidate->{link};
    my $category = $candidate->{category};
    @$mined_candidates = grep {($_->{concept} ne $concept) || ($_->{link} ne $link) || ($_->{category} ne $category)} @$mined_candidates;
    push @uniq_candidates, $candidate;
  }
  # Also, don't add self-links, coming from $url
  @uniq_candidates = grep {$_->{link} ne $url} @uniq_candidates if $url;
  @$mined_candidates = @uniq_candidates;

  #TODO: When do we deal with the nolink settings?
  # next if (inset($concept,@$nolink));
  if ($options{verbosity}) {
    printf STDERR " Discovered %d concepts in %.3f seconds.\n",scalar(@uniq_candidates),time()-$time;
  }

  # Update linkscache:
  if ($objectid) {
    $db->add_linkscache_by(objectid=>$objectid,conceptid=>$_->{conceptid})
      foreach (@$mined_candidates);
  }
  return ($mined_candidates,$text_length);
}
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Mine concept candidates from an HTML body string.
# Input: a single options hashref. Only {body} is consumed directly here;
#   the full hashref is handed to the shared parser as state_information so
#   the text-event handler can reach db/domain/nolink/first_word_cache/class.
# Returns: (\@mined_candidates, $text_length).
# Current HTML Parsing strategy - fire events for all HTML tags and explicitly skip over tags that
# won't be of interest. We need to autolink in all textual elements.
# TODO: Handle MathML better
sub mine_candidates_html {
  my ($options) = @_;
  # Previously this unpacked db/domain/nolink/targetid/class into locals as
  # well, but none of them were used in this sub -- only the body is needed.
  my $body = $options->{body};
  return ([],0) unless $body;

  # Reset per-document accumulators on the shared parser object.
  $HTML_Parser->{mined_candidates} = [];
  $HTML_Parser->{text_length} = 0;
  $HTML_Parser->{state_information}=$options; # Not pretty, but TODO: improve
  $HTML_Parser->parse($body);
  $HTML_Parser->eof();
  return ($HTML_Parser->{mined_candidates},$HTML_Parser->{text_length});
}
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# HTML::Parser text-event callback: runs plain-text concept mining on each
# text chunk that is not inside a skipped ("noparse") element.
# Arguments (bound via the 'self,text,offset' argspec): the parser object,
# the text chunk, and the chunk's byte offset in the original document.
# Accumulates results into $self->{mined_candidates} / $self->{text_length},
# shifting each candidate's offsets by the chunk's document offset.
sub _text_event_handler {
  my ($self,$body,$offset) = @_;
  my $state = $self->{state_information};
  # Skip if in a silly element, or if the chunk has no word characters:
  if (($self->{noparse} && ($self->{noparse}>0)) || ($body !~ /\w/)) {
    return;
  }
  # Otherwise - discover concepts and annotate!
  # (Removed an unused "my $time = time();" whose only consumer was the
  #  commented-out profiling printf below.)
  my ($mined_candidates,$chunk_length) =
    mine_candidates_text({db=>$state->{db},
        nolink=>$state->{nolink},
        body=>$body,
        domain=>$state->{domain},
        first_word_cache=>$state->{first_word_cache},
        class=>$state->{class}});
  #printf STDERR " --processed textual chunk in %.3f seconds\n",time()-$time;
  # Translate chunk-local offsets into document offsets.
  foreach my $candidate(@$mined_candidates) {
    $candidate->{offset_begin}+=$offset;
    $candidate->{offset_end}+=$offset;
  }
  push @{$self->{mined_candidates}}, @$mined_candidates;
  $self->{text_length} += $chunk_length;
}
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Core Data Mining routine - inspects plain-text strings |
|
182
|
|
|
|
|
|
|
# returns back the matches and position of disambiguated links of the supplied text. |
|
183
|
|
|
|
|
|
|
# Core Data Mining routine - inspects plain-text strings
# returns back the matches and position of disambiguated links of the supplied text.
# Algorithm: scan word-by-word; for each word, fetch first-word candidate
# concepts (via a memoizing cache that doubles as a stop-word blacklist),
# then greedily extend to the longest multi-word match using each
# candidate's tailwords. Matches with identical concept/category/domain but
# different URLs are merged into one entry carrying a multilinks array.
# Returns: (\@matches, $text_length).
sub mine_candidates_text {
  my ($options) = @_;
  # NOTE(review): 'nolink' appears twice in this qw() list, so $syns receives
  # $options->{nolink} (and $syns/$targetid/$nolink/$class are never used
  # below). Looks like leftover unpacking from an older signature -- verify
  # before cleaning up.
  my ($db,$domain,$body,$syns,$targetid,$nolink,$class,$first_word_cache) =
    map {$options->{$_}} qw(db domain body nolink targetid nolink class first_word_cache);

  # TODO: We have to make a distinction between "defined concepts" and "candidate concepts" here.
  # Probably just based on whether we find a URL or not?
  my @matches;
  my %termlist = (); # NOTE(review): never populated or read -- dead variable?
  my $offset=0;      # running position in the ORIGINAL text ($body is consumed destructively)
  my $text_length = length($body);
  # Read one (2+ letter) word at a time
  my $concept_word_rex = $NNexus::Morphology::concept_word_rex;
 CONCEPT_TRAVERSAL:
  while ($body =~ s/^(.*?)($concept_word_rex)//s) {
    # $1 = skipped prefix, $2 = the matched word; both are removed from $body.
    $offset += length($1);
    my $offset_begin = $offset;
    $offset += length($2);
    my $offset_end = $offset;
    my $word = lc($2); # lower-case to match stopwords
    # Use a cache for first-word lookups, with the dual-role of a blacklist.
    my $cached = $first_word_cache->{$word};
    my @candidates=();
    if (! (ref $cached )) {
      # Cache miss: normalize word and hit the database.
      my $norm_word = normalize_word($word);
      # get all possible candidates for both posessive and plural forms of $word
      @candidates = $db->select_firstword_matches($norm_word);
      # Cache the candidates (cloned, since matching mutates tailwords below):
      my $saved_candidates = clone_concepts(\@candidates); # Clone the candidates
      $first_word_cache->{$word} = $saved_candidates;
      $first_word_cache->{$norm_word} = $saved_candidates;
    } else {
      # Cached (possibly an empty stop-word entry), clone into a new array
      @candidates = @{ clone_concepts($cached)};
    }
    next CONCEPT_TRAVERSAL unless @candidates; # if there are no candidates skip the word
    # Split tailwords into an array (remaining words of each multi-word concept)
    foreach my $c(@candidates) {
      $c->{tailwords} = [split(/\s+/,$c->{tailwords}||'')]; }
    my $inner_offset = 0;
    my $match_offset = 0; # Record the offset of the current longest match, add to end_position when finalized
    my $inner_body = $body; # A copy of the text to munge around while searching.
    my @inner_matches = grep {@{$_->{tailwords}} == 0} @candidates; # Record the current longest matches here
    # Longest-match:
    # as long as:
    # - there is leftover tail in some candidate(s)
    @candidates = grep {@{$_->{tailwords}} > 0} @candidates;
  CANDIDATE_LOOP:
    while (@candidates) {
      # - AND there are leftover words in current phrase
      if ($inner_body =~ /^(\s+)($concept_word_rex)/s) {
        # then: pull and compare next word, reduce text and tailwords
        # 1. Pull next.
        my $step_offset = length($1) + length($2);
        $inner_offset += $step_offset;
        my $next_word = normalize_word($2);
        # 2. Filter for applicable candidates
        my @inner_candidates = grep { $_->{tailwords}->[0] eq $next_word } @candidates;
        if (@inner_candidates) {
          # We have indeed a longer match, remove the first tailword
          shift @{$_->{tailwords}} foreach @inner_candidates;
          # candidates for next iteration must have leftover tail words
          @candidates = grep {@{$_->{tailwords}} > 0} @inner_candidates;
          # record intermediate longest matches - the current empty tailwords
          my @step_matches = grep {@{$_->{tailwords}} == 0} @inner_candidates;
          if (@step_matches) {
            @inner_matches = @step_matches;
            $match_offset = $inner_offset;
          }
          # Move $step_offset right the text
          substr($inner_body,0,$step_offset)='';
        } else {last CANDIDATE_LOOP;} # Last here as well.
      } else {last CANDIDATE_LOOP;} # Otherwise we are done
    }
    # In the end, do we have one or more matches?
    if (@inner_matches > 0) {
      # Yes!
      # merge multi-links into single match entry
      # multi-link = same concept, category and domain, different URLs
      # CARE: careful not to confuse with cases of different categories, which need disambiguation
      my @merged_matches;
      #print STDERR Dumper(\@inner_matches);
      while (@inner_matches) {
        my $match = shift @inner_matches;
        my $category = $match->{category};
        my $domain = $match->{domain};
        # Collect links of all same-category, same-domain siblings ...
        my @multilinks = map {$_->{link}}
          grep {($_->{category} eq $category) && ($_->{domain} eq $domain)} @inner_matches;
        # ... and remove them from the pool so each group is emitted once.
        @inner_matches = grep {($_->{category} ne $category) || ($_->{domain} ne $domain)} @inner_matches;
        if (@multilinks>0) {
          unshift @multilinks, $match->{link};
          $match->{multilinks} = \@multilinks;
        }
        push @merged_matches, $match;
      }
      @inner_matches = @merged_matches;
      # Record offsets: extend the end position by the extra words consumed
      # by the longest multi-word match (0 for a single-word match).
      $offset += $match_offset;
      $offset_end += $match_offset;
      foreach my $match(@inner_matches) {
        $match->{offset_begin} = $offset_begin;
        $match->{offset_end} = $offset_end;
        delete $match->{tailwords};
      }
      # And push to main matches array
      push @matches, @inner_matches;
      # And move the text forward with the match_offset
      substr($body,0,$match_offset)='' if $match_offset;
    } else { next CONCEPT_TRAVERSAL; } # If not, we just move on to the next word
  }
  return (\@matches,$text_length);
}
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
1; |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
__END__ |