line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
3
|
|
|
|
|
|
|
# | Concept Discovery Module | # |
4
|
|
|
|
|
|
|
# |=====================================================================| # |
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
17
|
|
|
|
|
|
|
package NNexus::Discover; |
18
|
5
|
|
|
5
|
|
21
|
use strict; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
174
|
|
19
|
5
|
|
|
5
|
|
20
|
use warnings; |
|
5
|
|
|
|
|
5
|
|
|
5
|
|
|
|
|
115
|
|
20
|
|
|
|
|
|
|
|
21
|
5
|
|
|
5
|
|
19
|
use Exporter; |
|
5
|
|
|
|
|
23
|
|
|
5
|
|
|
|
|
359
|
|
22
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
23
|
|
|
|
|
|
|
our @EXPORT_OK = qw(mine_candidates); |
24
|
5
|
|
|
5
|
|
21
|
use Encode qw/is_utf8/; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
233
|
|
25
|
5
|
|
|
5
|
|
21
|
use utf8; |
|
5
|
|
|
|
|
24
|
|
|
5
|
|
|
|
|
41
|
|
26
|
5
|
|
|
5
|
|
97
|
use Data::Dumper; |
|
5
|
|
|
|
|
5
|
|
|
5
|
|
|
|
|
236
|
|
27
|
5
|
|
|
5
|
|
19
|
use Time::HiRes qw ( time alarm sleep ); |
|
5
|
|
|
|
|
7
|
|
|
5
|
|
|
|
|
42
|
|
28
|
|
|
|
|
|
|
|
29
|
5
|
|
|
5
|
|
2777
|
use NNexus::StopWordList qw(stop_words_ref); |
|
5
|
|
|
|
|
6
|
|
|
5
|
|
|
|
|
261
|
|
30
|
5
|
|
|
5
|
|
1322
|
use NNexus::Morphology qw(normalize_word); |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
386
|
|
31
|
5
|
|
|
5
|
|
2200
|
use NNexus::Concepts qw(clone_concepts); |
|
5
|
|
|
|
|
13
|
|
|
5
|
|
|
|
|
413
|
|
32
|
|
|
|
|
|
|
|
33
|
5
|
|
|
5
|
|
3080
|
use HTML::Parser; |
|
5
|
|
|
|
|
32068
|
|
|
5
|
|
|
|
|
8442
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Reusable parser object (TODO: What happens if we thread/fork ?) |
36
|
|
|
|
|
|
|
our $HTML_Parser = |
37
|
|
|
|
|
|
|
HTML::Parser->new( |
38
|
|
|
|
|
|
|
'api_version' => 3, |
39
|
|
|
|
|
|
|
'start_h' => [sub { |
40
|
|
|
|
|
|
|
my ($self,$tagname,$attr)=@_; |
41
|
|
|
|
|
|
|
if ($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/ || |
42
|
|
|
|
|
|
|
(($tagname eq 'span') && $attr->{class} && ($attr->{class} =~ 'nolink'))) { |
43
|
|
|
|
|
|
|
$self->{fresh_skip}=1; |
44
|
|
|
|
|
|
|
$self->{noparse}++; |
45
|
|
|
|
|
|
|
} else { |
46
|
|
|
|
|
|
|
$self->{fresh_skip}=0; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
} , 'self, tagname, attr'], |
49
|
|
|
|
|
|
|
'end_h' => [sub { |
50
|
|
|
|
|
|
|
my ($self,$tagname)=@_; |
51
|
|
|
|
|
|
|
if (($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/) || |
52
|
|
|
|
|
|
|
(((length($tagname)==0)||($tagname eq 'span')) && ($self->{fresh_skip} == 1))) { |
53
|
|
|
|
|
|
|
$self->{noparse}--; |
54
|
|
|
|
|
|
|
$self->{fresh_skip}=0; |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
}, 'self,tagname'], |
57
|
|
|
|
|
|
|
'text_h' => [\&_text_event_handler, 'self,text,offset'] |
58
|
|
|
|
|
|
|
); |
59
|
|
|
|
|
|
|
$HTML_Parser->unbroken_text; |
60
|
|
|
|
|
|
|
$HTML_Parser->xml_mode; |
61
|
|
|
|
|
|
|
$HTML_Parser->attr_encoded(1); |
62
|
|
|
|
|
|
|
$HTML_Parser->empty_element_tags(1); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Prepare cache for first-word concept lookup |
65
|
|
|
|
|
|
|
our $first_word_cache_template = {map { ($_,[]) } @{stop_words_ref()}}; |
66
|
|
|
|
|
|
|
sub mine_candidates { |
67
|
8
|
|
|
8
|
0
|
45
|
my (%options) = @_; |
68
|
|
|
|
|
|
|
# State: We need a db object with a properly set database |
69
|
|
|
|
|
|
|
# Input: We need a string representing the (HTML) body of the chunk we're |
70
|
|
|
|
|
|
|
# mining on, as well as its URL. |
71
|
|
|
|
|
|
|
# Optional: Deprecated details such as 'domain' or 'format'. |
72
|
|
|
|
|
|
|
# Interesting: allow 'nolink' again? |
73
|
48
|
|
|
|
|
73
|
my ($db,$format,$body,$nolink,$url,$domain) = |
74
|
8
|
|
|
|
|
20
|
map {$options{$_}} qw(db format body nolink url domain); |
75
|
8
|
50
|
|
|
|
35
|
die "The db key is a mandatory parameter for mine_candidates!\n" unless ref $db; # TODO: Maybe raise a better error? |
76
|
8
|
50
|
|
|
|
22
|
$format = 'html' unless defined $format; |
77
|
8
|
50
|
|
|
|
19
|
return ([],0) unless $body; |
78
|
|
|
|
|
|
|
# Prepare data, if we have a URL: |
79
|
8
|
|
|
|
|
11
|
my $objectid; # undefined if we don't have a URL, we only do MoC for named resources |
80
|
8
|
100
|
|
|
|
20
|
if ($url) { |
81
|
1
|
|
50
|
|
|
7
|
my $object = $db->select_object_by(url=>$url) || {}; |
82
|
1
|
|
50
|
|
|
5
|
$objectid = $object->{objectid} || -1; |
83
|
1
|
50
|
|
|
|
3
|
$domain = $object->{domain} unless defined $domain; |
84
|
|
|
|
|
|
|
# If objectid is -1 , we will also need to add_object on the url |
85
|
1
|
50
|
|
|
|
3
|
if ($objectid == -1) { |
86
|
|
|
|
|
|
|
# TODO: Extract the domain from the URL, this is unreliable |
87
|
1
|
|
|
|
|
6
|
$objectid = $db->add_object_by(url=>$options{'url'},domain=>$domain); |
88
|
|
|
|
|
|
|
} else { |
89
|
|
|
|
|
|
|
# If already known, flush the links_cache for this object |
90
|
0
|
|
|
|
|
0
|
$db->delete_linkscache_by(objectid=>$objectid); |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
# Keep a cache of first words, that will simultaneously act as a blacklist. |
94
|
|
|
|
|
|
|
# TODO: Incorporate the single words from 'nolink' |
95
|
|
|
|
|
|
|
# Experiment: Use a global first_word cache |
96
|
8
|
|
|
|
|
18
|
$options{first_word_cache} = $first_word_cache_template; # Default are stopwords, keep a global cache |
97
|
|
|
|
|
|
|
# $options{first_word_cache} = { %$first_word_cache_template }; # Default are stopwords |
98
|
|
|
|
|
|
|
# Always return an embedded annotation with links, as well as a stand-off mined_canidates hash, containing the individual concepts with pointers. |
99
|
8
|
|
|
|
|
11
|
my $time; |
100
|
8
|
50
|
|
|
|
20
|
if ($options{verbosity}) { |
101
|
0
|
|
|
|
|
0
|
$time = time(); |
102
|
|
|
|
|
|
|
} |
103
|
8
|
|
|
|
|
15
|
my $mined_candidates=[]; |
104
|
8
|
|
|
|
|
10
|
my $text_length=0; |
105
|
8
|
100
|
|
|
|
27
|
if ($format eq 'html') { |
|
|
50
|
|
|
|
|
|
106
|
2
|
|
|
|
|
8
|
($mined_candidates,$text_length) = mine_candidates_html(\%options); |
107
|
|
|
|
|
|
|
} elsif ($format eq 'text') { |
108
|
6
|
|
|
|
|
21
|
($mined_candidates,$text_length) = mine_candidates_text(\%options); |
109
|
|
|
|
|
|
|
} else { |
110
|
0
|
|
|
|
|
0
|
print STDERR "Error: Unrecognized input format for auto-linking.\n"; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
# Only mark-up first found candidate, unless requested otherwise |
113
|
8
|
|
|
|
|
13
|
my @uniq_candidates; |
114
|
8
|
|
|
|
|
30
|
while (@$mined_candidates) { |
115
|
19
|
|
|
|
|
19
|
my $candidate = shift @$mined_candidates; |
116
|
19
|
|
|
|
|
25
|
my $concept = $candidate->{concept}; |
117
|
19
|
|
|
|
|
21
|
my $link = $candidate->{link}; |
118
|
19
|
|
|
|
|
21
|
my $category = $candidate->{category}; |
119
|
19
|
100
|
100
|
|
|
28
|
@$mined_candidates = grep {($_->{concept} ne $concept) || ($_->{link} ne $link) || ($_->{category} ne $category)} @$mined_candidates; |
|
43
|
|
|
|
|
111
|
|
120
|
19
|
|
|
|
|
40
|
push @uniq_candidates, $candidate; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
# Also, don't add self-links, coming from $url |
123
|
8
|
100
|
|
|
|
21
|
@uniq_candidates = grep {$_->{link} ne $url} @uniq_candidates if $url; |
|
1
|
|
|
|
|
4
|
|
124
|
8
|
|
|
|
|
16
|
@$mined_candidates = @uniq_candidates; |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
#TODO: When do we deal with the nolink settings? |
127
|
|
|
|
|
|
|
# next if (inset($concept,@$nolink)); |
128
|
8
|
50
|
|
|
|
23
|
if ($options{verbosity}) { |
129
|
0
|
|
|
|
|
0
|
printf STDERR " Discovered %d concepts in %.3f seconds.\n",scalar(@uniq_candidates),time()-$time; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Update linkscache: |
133
|
8
|
100
|
|
|
|
20
|
if ($objectid) { |
134
|
|
|
|
|
|
|
$db->add_linkscache_by(objectid=>$objectid,conceptid=>$_->{conceptid}) |
135
|
1
|
|
|
|
|
6
|
foreach (@$mined_candidates); |
136
|
|
|
|
|
|
|
} |
137
|
8
|
|
|
|
|
56
|
return ($mined_candidates,$text_length); |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub mine_candidates_html { |
141
|
2
|
|
|
2
|
0
|
5
|
my ($options) = @_; |
142
|
2
|
|
|
|
|
4
|
my ($db,$domain,$body,$syns,$targetid,$class) = map {$options->{$_}} qw(db domain body nolink targetid class); |
|
12
|
|
|
|
|
20
|
|
143
|
|
|
|
|
|
|
# Current HTML Parsing strategy - fire events for all HTML tags and explicitly skip over tags that |
144
|
|
|
|
|
|
|
# won't be of interest. We need to autolink in all textual elements. |
145
|
|
|
|
|
|
|
# TODO: Handle MathML better |
146
|
2
|
50
|
|
|
|
7
|
return ([],0) unless $body; |
147
|
|
|
|
|
|
|
|
148
|
2
|
|
|
|
|
12
|
$HTML_Parser->{mined_candidates} = []; |
149
|
2
|
|
|
|
|
7
|
$HTML_Parser->{text_length} = 0; |
150
|
2
|
|
|
|
|
3
|
$HTML_Parser->{state_information}=$options; # Not pretty, but TODO: improve |
151
|
2
|
|
|
|
|
33
|
$HTML_Parser->parse($body); |
152
|
2
|
|
|
|
|
10
|
$HTML_Parser->eof(); |
153
|
2
|
|
|
|
|
11
|
return ($HTML_Parser->{mined_candidates},$HTML_Parser->{text_length}); |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub _text_event_handler { |
157
|
665
|
|
|
665
|
|
783
|
my ($self,$body,$offset) = @_; |
158
|
665
|
|
|
|
|
730
|
my $state = $self->{state_information}; |
159
|
|
|
|
|
|
|
# Skip if in a silly element: |
160
|
665
|
100
|
66
|
|
|
2898
|
if (($self->{noparse} && ($self->{noparse}>0)) || ($body !~ /\w/)) { |
|
|
|
100
|
|
|
|
|
161
|
580
|
|
|
|
|
2301
|
return; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
# Otherwise - discover concepts and annotate! |
164
|
85
|
|
|
|
|
188
|
my $time = time(); |
165
|
85
|
|
|
|
|
538
|
my ($mined_candidates,$chunk_length) = |
166
|
|
|
|
|
|
|
mine_candidates_text({db=>$state->{db}, |
167
|
|
|
|
|
|
|
nolink=>$state->{nolink}, |
168
|
|
|
|
|
|
|
body=>$body, |
169
|
|
|
|
|
|
|
domain=>$state->{domain}, |
170
|
|
|
|
|
|
|
first_word_cache=>$state->{first_word_cache}, |
171
|
|
|
|
|
|
|
class=>$state->{class}}); |
172
|
|
|
|
|
|
|
#printf STDERR " --processed textual chunk in %.3f seconds\n",time()-$time; |
173
|
85
|
|
|
|
|
270
|
foreach my $candidate(@$mined_candidates) { |
174
|
2
|
|
|
|
|
6
|
$candidate->{offset_begin}+=$offset; |
175
|
2
|
|
|
|
|
6
|
$candidate->{offset_end}+=$offset; |
176
|
|
|
|
|
|
|
} |
177
|
85
|
|
|
|
|
68
|
push @{$self->{mined_candidates}}, @$mined_candidates; |
|
85
|
|
|
|
|
153
|
|
178
|
85
|
|
|
|
|
674
|
$self->{text_length} += $chunk_length; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Core Data Mining routine - inspects plain-text strings |
182
|
|
|
|
|
|
|
# returns back the matches and position of disambiguated links of the supplied text. |
183
|
|
|
|
|
|
|
sub mine_candidates_text { |
184
|
91
|
|
|
91
|
0
|
103
|
my ($options) = @_; |
185
|
728
|
|
|
|
|
942
|
my ($db,$domain,$body,$syns,$targetid,$nolink,$class,$first_word_cache) = |
186
|
91
|
|
|
|
|
143
|
map {$options->{$_}} qw(db domain body nolink targetid nolink class first_word_cache); |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# TODO: We have to make a distinction between "defined concepts" and "candidate concepts" here. |
189
|
|
|
|
|
|
|
# Probably just based on whether we find a URL or not? |
190
|
91
|
|
|
|
|
132
|
my @matches; |
191
|
91
|
|
|
|
|
160
|
my %termlist = (); |
192
|
91
|
|
|
|
|
94
|
my $offset=0; |
193
|
91
|
|
|
|
|
100
|
my $text_length = length($body); |
194
|
|
|
|
|
|
|
# Read one (2+ letter) word at a time |
195
|
91
|
|
|
|
|
96
|
my $concept_word_rex = $NNexus::Morphology::concept_word_rex; |
196
|
|
|
|
|
|
|
CONCEPT_TRAVERSAL: |
197
|
91
|
|
|
|
|
961
|
while ($body =~ s/^(.*?)($concept_word_rex)//s) { |
198
|
401
|
|
|
|
|
812
|
$offset += length($1); |
199
|
401
|
|
|
|
|
389
|
my $offset_begin = $offset; |
200
|
401
|
|
|
|
|
453
|
$offset += length($2); |
201
|
401
|
|
|
|
|
299
|
my $offset_end = $offset; |
202
|
401
|
|
|
|
|
577
|
my $word = lc($2); # lower-case to match stopwords |
203
|
|
|
|
|
|
|
# Use a cache for first-word lookups, with the dual-role of a blacklist. |
204
|
401
|
|
|
|
|
622
|
my $cached = $first_word_cache->{$word}; |
205
|
401
|
|
|
|
|
515
|
my @candidates=(); |
206
|
401
|
100
|
|
|
|
708
|
if (! (ref $cached )) { |
207
|
|
|
|
|
|
|
# Normalize word |
208
|
134
|
|
|
|
|
344
|
my $norm_word = normalize_word($word); |
209
|
|
|
|
|
|
|
# get all possible candidates for both posessive and plural forms of $word |
210
|
134
|
|
|
|
|
454
|
@candidates = $db->select_firstword_matches($norm_word); |
211
|
|
|
|
|
|
|
# Cache the candidates: |
212
|
134
|
|
|
|
|
398
|
my $saved_candidates = clone_concepts(\@candidates); # Clone the candidates |
213
|
134
|
|
|
|
|
271
|
$first_word_cache->{$word} = $saved_candidates; |
214
|
134
|
|
|
|
|
206
|
$first_word_cache->{$norm_word} = $saved_candidates; |
215
|
|
|
|
|
|
|
} else { |
216
|
|
|
|
|
|
|
#Cached, clone into a new array |
217
|
267
|
|
|
|
|
220
|
@candidates = @{ clone_concepts($cached)}; |
|
267
|
|
|
|
|
577
|
|
218
|
|
|
|
|
|
|
} |
219
|
401
|
100
|
|
|
|
3645
|
next CONCEPT_TRAVERSAL unless @candidates; # if there are no candidates skip the word |
220
|
|
|
|
|
|
|
# Split tailwords into an array |
221
|
14
|
|
|
|
|
25
|
foreach my $c(@candidates) { |
222
|
182
|
|
100
|
|
|
477
|
$c->{tailwords} = [split(/\s+/,$c->{tailwords}||'')]; } |
223
|
14
|
|
|
|
|
20
|
my $inner_offset = 0; |
224
|
14
|
|
|
|
|
17
|
my $match_offset = 0; # Record the offset of the current longest match, add to end_position when finalized |
225
|
14
|
|
|
|
|
24
|
my $inner_body = $body; # A copy of the text to munge around while searching. |
226
|
14
|
|
|
|
|
25
|
my @inner_matches = grep {@{$_->{tailwords}} == 0} @candidates; # Record the current longest matches here |
|
182
|
|
|
|
|
90
|
|
|
182
|
|
|
|
|
204
|
|
227
|
|
|
|
|
|
|
# Longest-match: |
228
|
|
|
|
|
|
|
# as long as: |
229
|
|
|
|
|
|
|
# - there is leftover tail in some candidate(s) |
230
|
14
|
|
|
|
|
19
|
@candidates = grep {@{$_->{tailwords}} > 0} @candidates; |
|
182
|
|
|
|
|
99
|
|
|
182
|
|
|
|
|
217
|
|
231
|
|
|
|
|
|
|
CANDIDATE_LOOP: |
232
|
14
|
|
|
|
|
36
|
while (@candidates) { |
233
|
|
|
|
|
|
|
# - AND there are leftover words in current phrase |
234
|
14
|
100
|
|
|
|
252
|
if ($inner_body =~ /^(\s+)($concept_word_rex)/s) { |
|
4
|
|
|
|
|
9
|
|
235
|
|
|
|
|
|
|
# then: pull and compare next word, reduce text and tailwords |
236
|
|
|
|
|
|
|
# 1. Pull next. |
237
|
10
|
|
|
|
|
35
|
my $step_offset = length($1) + length($2); |
238
|
10
|
|
|
|
|
15
|
$inner_offset += $step_offset; |
239
|
10
|
|
|
|
|
40
|
my $next_word = normalize_word($2); |
240
|
|
|
|
|
|
|
# 2. Filter for applicable candidates |
241
|
10
|
|
|
|
|
20
|
my @inner_candidates = grep { $_->{tailwords}->[0] eq $next_word } @candidates; |
|
174
|
|
|
|
|
207
|
|
242
|
10
|
100
|
|
|
|
24
|
if (@inner_candidates) { |
|
4
|
|
|
|
|
10
|
|
243
|
|
|
|
|
|
|
# We have indeed a longer match, remove the first tailword |
244
|
6
|
|
|
|
|
14
|
shift @{$_->{tailwords}} foreach @inner_candidates; |
|
16
|
|
|
|
|
36
|
|
245
|
|
|
|
|
|
|
# candidates for next iteration must have leftover tail words |
246
|
6
|
|
|
|
|
14
|
@candidates = grep {@{$_->{tailwords}} > 0} @inner_candidates; |
|
16
|
|
|
|
|
13
|
|
|
16
|
|
|
|
|
125
|
|
247
|
|
|
|
|
|
|
# record intermediate longest matches - the current empty tailwords |
248
|
6
|
|
|
|
|
12
|
my @step_matches = grep {@{$_->{tailwords}} == 0} @inner_candidates; |
|
16
|
|
|
|
|
14
|
|
|
16
|
|
|
|
|
32
|
|
249
|
6
|
50
|
|
|
|
17
|
if (@step_matches) { |
250
|
6
|
|
|
|
|
13
|
@inner_matches = @step_matches; |
251
|
6
|
|
|
|
|
12
|
$match_offset = $inner_offset; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
# Move $step_offset right the text |
254
|
6
|
|
|
|
|
35
|
substr($inner_body,0,$step_offset)=''; |
255
|
|
|
|
|
|
|
} else {last CANDIDATE_LOOP;} # Last here as well. |
256
|
|
|
|
|
|
|
} else {last CANDIDATE_LOOP;} # Otherwise we are done |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
# In the end, do we have one or more matches? |
259
|
14
|
100
|
|
|
|
32
|
if (@inner_matches > 0) { |
260
|
|
|
|
|
|
|
# Yes! |
261
|
|
|
|
|
|
|
# merge multi-links into single match entry |
262
|
|
|
|
|
|
|
# multi-link = same concept, category and domain, different URLs |
263
|
|
|
|
|
|
|
# CARE: careful not to confuse with cases of different categories, which need disambiguation |
264
|
10
|
|
|
|
|
12
|
my @merged_matches; |
265
|
|
|
|
|
|
|
#print STDERR Dumper(\@inner_matches); |
266
|
10
|
|
|
|
|
23
|
while (@inner_matches) { |
267
|
20
|
|
|
|
|
29
|
my $match = shift @inner_matches; |
268
|
20
|
|
|
|
|
26
|
my $category = $match->{category}; |
269
|
20
|
|
|
|
|
21
|
my $domain = $match->{domain}; |
270
|
0
|
100
|
|
|
|
0
|
my @multilinks = map {$_->{link}} |
|
22
|
|
|
|
|
42
|
|
271
|
20
|
|
|
|
|
25
|
grep {($_->{category} eq $category) && ($_->{domain} eq $domain)} @inner_matches; |
272
|
20
|
100
|
|
|
|
23
|
@inner_matches = grep {($_->{category} ne $category) || ($_->{domain} ne $domain)} @inner_matches; |
|
22
|
|
|
|
|
46
|
|
273
|
20
|
50
|
|
|
|
32
|
if (@multilinks>0) { |
274
|
0
|
|
|
|
|
0
|
unshift @multilinks, $match->{link}; |
275
|
0
|
|
|
|
|
0
|
$match->{multilinks} = \@multilinks; |
276
|
|
|
|
|
|
|
} |
277
|
20
|
|
|
|
|
42
|
push @merged_matches, $match; |
278
|
|
|
|
|
|
|
} |
279
|
10
|
|
|
|
|
15
|
@inner_matches = @merged_matches; |
280
|
|
|
|
|
|
|
# Record offsets: |
281
|
10
|
|
|
|
|
23
|
$offset += $match_offset; |
282
|
10
|
|
|
|
|
9
|
$offset_end += $match_offset; |
283
|
10
|
|
|
|
|
17
|
foreach my $match(@inner_matches) { |
284
|
20
|
|
|
|
|
32
|
$match->{offset_begin} = $offset_begin; |
285
|
20
|
|
|
|
|
27
|
$match->{offset_end} = $offset_end; |
286
|
20
|
|
|
|
|
37
|
delete $match->{tailwords}; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
# And push to main matches array |
289
|
10
|
|
|
|
|
21
|
push @matches, @inner_matches; |
290
|
|
|
|
|
|
|
# And move the text forward with the match_offset |
291
|
10
|
100
|
|
|
|
150
|
substr($body,0,$match_offset)='' if $match_offset; |
292
|
4
|
|
|
|
|
42
|
} else { next CONCEPT_TRAVERSAL; } # If not, we just move on to the next word |
293
|
|
|
|
|
|
|
} |
294
|
91
|
|
|
|
|
300
|
return (\@matches,$text_length); |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
1; |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
__END__ |