| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
|
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
|
3
|
|
|
|
|
|
|
# | Concept Discovery Module | # |
|
4
|
|
|
|
|
|
|
# |=====================================================================| # |
|
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
|
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
|
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
|
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
|
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
|
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
|
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
|
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
|
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
|
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
|
17
|
|
|
|
|
|
|
package NNexus::Discover; |
|
18
|
5
|
|
|
5
|
|
20
|
use strict; |
|
|
5
|
|
|
|
|
5
|
|
|
|
5
|
|
|
|
|
150
|
|
|
19
|
5
|
|
|
5
|
|
20
|
use warnings; |
|
|
5
|
|
|
|
|
7
|
|
|
|
5
|
|
|
|
|
95
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
5
|
|
|
5
|
|
16
|
use Exporter; |
|
|
5
|
|
|
|
|
19
|
|
|
|
5
|
|
|
|
|
312
|
|
|
22
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
23
|
|
|
|
|
|
|
our @EXPORT_OK = qw(mine_candidates); |
|
24
|
5
|
|
|
5
|
|
20
|
use Encode qw/is_utf8/; |
|
|
5
|
|
|
|
|
5
|
|
|
|
5
|
|
|
|
|
194
|
|
|
25
|
5
|
|
|
5
|
|
19
|
use utf8; |
|
|
5
|
|
|
|
|
37
|
|
|
|
5
|
|
|
|
|
41
|
|
|
26
|
5
|
|
|
5
|
|
85
|
use Data::Dumper; |
|
|
5
|
|
|
|
|
14
|
|
|
|
5
|
|
|
|
|
252
|
|
|
27
|
5
|
|
|
5
|
|
21
|
use Time::HiRes qw ( time alarm sleep ); |
|
|
5
|
|
|
|
|
4
|
|
|
|
5
|
|
|
|
|
37
|
|
|
28
|
|
|
|
|
|
|
|
|
29
|
5
|
|
|
5
|
|
2526
|
use NNexus::StopWordList qw(stop_words_ref); |
|
|
5
|
|
|
|
|
8
|
|
|
|
5
|
|
|
|
|
275
|
|
|
30
|
5
|
|
|
5
|
|
1441
|
use NNexus::Morphology qw(normalize_word); |
|
|
5
|
|
|
|
|
8
|
|
|
|
5
|
|
|
|
|
285
|
|
|
31
|
5
|
|
|
5
|
|
1885
|
use NNexus::Concepts qw(clone_concepts); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
use HTML::Parser; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Reusable parser object (TODO: What happens if we thread/fork ?)
# Shared, configured once at load time. Per-document state (mined_candidates,
# text_length, state_information, noparse, fresh_skip) is attached to the
# parser object itself before each parse -- not thread/fork safe.
our $HTML_Parser =
  HTML::Parser->new(
  'api_version' => 3,
  # Start-tag handler: suppress concept mining inside non-prose elements
  # (head/style/script/math/...), existing anchors, headings, and any
  # <span class="...nolink..."> region. $self->{noparse} is a nesting depth
  # counter; $self->{fresh_skip} marks that the most recent start tag opened
  # a skipped region (consumed by the end-tag handler below).
  'start_h' => [sub {
    my ($self,$tagname,$attr)=@_;
    if ($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/ ||
        (($tagname eq 'span') && $attr->{class} && ($attr->{class} =~ 'nolink'))) {
      $self->{fresh_skip}=1;
      $self->{noparse}++;
    } else {
      $self->{fresh_skip}=0;
    }
  } , 'self, tagname, attr'],
  # End-tag handler: decrement the skip depth when a skipped element closes.
  # The length($tagname)==0 case handles empty-element/implicit closes that
  # immediately follow a freshly-skipped start tag (fresh_skip == 1).
  'end_h' => [sub {
    my ($self,$tagname)=@_;
    if (($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/) ||
        (((length($tagname)==0)||($tagname eq 'span')) && ($self->{fresh_skip} == 1))) {
      $self->{noparse}--;
      $self->{fresh_skip}=0;
    }
  }, 'self,tagname'],
  # Text handler: the actual concept discovery, with byte offsets so matches
  # can be mapped back into the original document.
  'text_h' => [\&_text_event_handler, 'self,text,offset']
  );
$HTML_Parser->unbroken_text;          # deliver text as single chunks, not fragments
$HTML_Parser->xml_mode;
$HTML_Parser->attr_encoded(1);        # keep attribute values entity-encoded
$HTML_Parser->empty_element_tags(1);  # recognize <tag/> self-closing syntax

# Prepare cache for first-word concept lookup.
# Maps first-word => arrayref of candidate concepts; stop words are seeded
# with empty arrayrefs, so the cache doubles as a stop-word blacklist.
our $first_word_cache_template = {map { ($_,[]) } @{stop_words_ref()}};
|
66
|
|
|
|
|
|
|
# Top-level entry point: discover link candidates in a document chunk.
# State: We need a db object with a properly set database
# Input: We need a string representing the (HTML) body of the chunk we're
# mining on, as well as its URL.
# Optional: Deprecated details such as 'domain' or 'format'.
# Interesting: allow 'nolink' again?
# Named options: db (required), body, format ('html' default, or 'text'),
#   url, domain, nolink, verbosity.
# Returns: ($mined_candidates, $text_length) -- arrayref of candidate
#   hashes (deduplicated, self-links removed) and the mined text length.
# Side effects: may add/refresh the object and its links cache in the DB.
sub mine_candidates {
  my (%options) = @_;
  my ($db,$format,$body,$nolink,$url,$domain) =
    map {$options{$_}} qw(db format body nolink url domain);
  die "The db key is a mandatory parameter for mine_candidates!\n" unless ref $db; # TODO: Maybe raise a better error?
  $format = 'html' unless defined $format;
  return ([],0) unless $body;
  # Prepare data, if we have a URL:
  my $objectid; # undefined if we don't have a URL, we only do MoC for named resources
  if ($url) {
    my $object = $db->select_object_by(url=>$url) || {};
    $objectid = $object->{objectid} || -1;
    $domain = $object->{domain} unless defined $domain;
    # If objectid is -1 , we will also need to add_object on the url
    if ($objectid == -1) {
      # TODO: Extract the domain from the URL, this is unreliable
      $objectid = $db->add_object_by(url=>$options{'url'},domain=>$domain);
    } else {
      # If already known, flush the links_cache for this object
      $db->delete_linkscache_by(objectid=>$objectid);
    }
  }
  # Keep a cache of first words, that will simultaneously act as a blacklist.
  # TODO: Incorporate the single words from 'nolink'
  # Experiment: Use a global first_word cache
  # NOTE(review): the template itself is shared, so lookups cached here
  # persist across calls (and across documents/domains) -- intentional
  # experiment per the comment above, but verify this is still desired.
  $options{first_word_cache} = $first_word_cache_template; # Default are stopwords, keep a global cache
  # $options{first_word_cache} = { %$first_word_cache_template }; # Default are stopwords
  # Always return an embedded annotation with links, as well as a stand-off mined_canidates hash, containing the individual concepts with pointers.
  my $time;
  if ($options{verbosity}) {
    $time = time();
  }
  my $mined_candidates=[];
  my $text_length=0;
  if ($format eq 'html') {
    ($mined_candidates,$text_length) = mine_candidates_html(\%options);
  } elsif ($format eq 'text') {
    ($mined_candidates,$text_length) = mine_candidates_text(\%options);
  } else {
    print STDERR "Error: Unrecognized input format for auto-linking.\n";
  }
  # Only mark-up first found candidate, unless requested otherwise
  # Deduplicate on the (concept, link, category) triple, keeping the first
  # occurrence in document order.
  my @uniq_candidates;
  while (@$mined_candidates) {
    my $candidate = shift @$mined_candidates;
    my $concept = $candidate->{concept};
    my $link = $candidate->{link};
    my $category = $candidate->{category};
    @$mined_candidates = grep {($_->{concept} ne $concept) || ($_->{link} ne $link) || ($_->{category} ne $category)} @$mined_candidates;
    push @uniq_candidates, $candidate;
  }
  # Also, don't add self-links, coming from $url
  @uniq_candidates = grep {$_->{link} ne $url} @uniq_candidates if $url;
  @$mined_candidates = @uniq_candidates;

  #TODO: When do we deal with the nolink settings?
  # next if (inset($concept,@$nolink));
  if ($options{verbosity}) {
    printf STDERR " Discovered %d concepts in %.3f seconds.\n",scalar(@uniq_candidates),time()-$time;
  }

  # Update linkscache:
  if ($objectid) {
    $db->add_linkscache_by(objectid=>$objectid,conceptid=>$_->{conceptid})
      foreach (@$mined_candidates);
  }
  return ($mined_candidates,$text_length);
}
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Mine concept candidates from an HTML body string.
# Input: a single options hashref. Only {body} is consumed directly here;
#   the full hashref is handed to the shared parser as state_information so
#   the text-event handler can reach db/domain/nolink/first_word_cache/class.
# Returns: (\@mined_candidates, $text_length).
# Current HTML Parsing strategy - fire events for all HTML tags and explicitly skip over tags that
# won't be of interest. We need to autolink in all textual elements.
# TODO: Handle MathML better
sub mine_candidates_html {
  my ($options) = @_;
  # Previously this unpacked db/domain/nolink/targetid/class into locals as
  # well, but none of them were used in this sub -- only the body is needed.
  my $body = $options->{body};
  return ([],0) unless $body;

  # Reset per-document accumulators on the shared parser object.
  $HTML_Parser->{mined_candidates} = [];
  $HTML_Parser->{text_length} = 0;
  $HTML_Parser->{state_information}=$options; # Not pretty, but TODO: improve
  $HTML_Parser->parse($body);
  $HTML_Parser->eof();
  return ($HTML_Parser->{mined_candidates},$HTML_Parser->{text_length});
}
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# HTML::Parser text-event callback: runs plain-text concept mining on each
# text chunk that is not inside a skipped ("noparse") element.
# Arguments (bound via the 'self,text,offset' argspec): the parser object,
# the text chunk, and the chunk's byte offset in the original document.
# Accumulates results into $self->{mined_candidates} / $self->{text_length},
# shifting each candidate's offsets by the chunk's document offset.
sub _text_event_handler {
  my ($self,$body,$offset) = @_;
  my $state = $self->{state_information};
  # Skip if in a silly element, or if the chunk has no word characters:
  if (($self->{noparse} && ($self->{noparse}>0)) || ($body !~ /\w/)) {
    return;
  }
  # Otherwise - discover concepts and annotate!
  # (Removed an unused "my $time = time();" whose only consumer was the
  #  commented-out profiling printf below.)
  my ($mined_candidates,$chunk_length) =
    mine_candidates_text({db=>$state->{db},
        nolink=>$state->{nolink},
        body=>$body,
        domain=>$state->{domain},
        first_word_cache=>$state->{first_word_cache},
        class=>$state->{class}});
  #printf STDERR " --processed textual chunk in %.3f seconds\n",time()-$time;
  # Translate chunk-local offsets into document offsets.
  foreach my $candidate(@$mined_candidates) {
    $candidate->{offset_begin}+=$offset;
    $candidate->{offset_end}+=$offset;
  }
  push @{$self->{mined_candidates}}, @$mined_candidates;
  $self->{text_length} += $chunk_length;
}
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Core Data Mining routine - inspects plain-text strings |
|
182
|
|
|
|
|
|
|
# returns back the matches and position of disambiguated links of the supplied text. |
|
183
|
|
|
|
|
|
|
# Core Data Mining routine - inspects plain-text strings
# returns back the matches and position of disambiguated links of the supplied text.
# Algorithm: scan word-by-word; for each word, fetch first-word candidate
# concepts (via a memoizing cache that doubles as a stop-word blacklist),
# then greedily extend to the longest multi-word match using each
# candidate's tailwords. Matches with identical concept/category/domain but
# different URLs are merged into one entry carrying a multilinks array.
# Returns: (\@matches, $text_length).
sub mine_candidates_text {
  my ($options) = @_;
  # NOTE(review): 'nolink' appears twice in this qw() list, so $syns receives
  # $options->{nolink} (and $syns/$targetid/$nolink/$class are never used
  # below). Looks like leftover unpacking from an older signature -- verify
  # before cleaning up.
  my ($db,$domain,$body,$syns,$targetid,$nolink,$class,$first_word_cache) =
    map {$options->{$_}} qw(db domain body nolink targetid nolink class first_word_cache);

  # TODO: We have to make a distinction between "defined concepts" and "candidate concepts" here.
  # Probably just based on whether we find a URL or not?
  my @matches;
  my %termlist = (); # NOTE(review): never populated or read -- dead variable?
  my $offset=0;      # running position in the ORIGINAL text ($body is consumed destructively)
  my $text_length = length($body);
  # Read one (2+ letter) word at a time
  my $concept_word_rex = $NNexus::Morphology::concept_word_rex;
 CONCEPT_TRAVERSAL:
  while ($body =~ s/^(.*?)($concept_word_rex)//s) {
    # $1 = skipped prefix, $2 = the matched word; both are removed from $body.
    $offset += length($1);
    my $offset_begin = $offset;
    $offset += length($2);
    my $offset_end = $offset;
    my $word = lc($2); # lower-case to match stopwords
    # Use a cache for first-word lookups, with the dual-role of a blacklist.
    my $cached = $first_word_cache->{$word};
    my @candidates=();
    if (! (ref $cached )) {
      # Cache miss: normalize word and hit the database.
      my $norm_word = normalize_word($word);
      # get all possible candidates for both posessive and plural forms of $word
      @candidates = $db->select_firstword_matches($norm_word);
      # Cache the candidates (cloned, since matching mutates tailwords below):
      my $saved_candidates = clone_concepts(\@candidates); # Clone the candidates
      $first_word_cache->{$word} = $saved_candidates;
      $first_word_cache->{$norm_word} = $saved_candidates;
    } else {
      # Cached (possibly an empty stop-word entry), clone into a new array
      @candidates = @{ clone_concepts($cached)};
    }
    next CONCEPT_TRAVERSAL unless @candidates; # if there are no candidates skip the word
    # Split tailwords into an array (remaining words of each multi-word concept)
    foreach my $c(@candidates) {
      $c->{tailwords} = [split(/\s+/,$c->{tailwords}||'')]; }
    my $inner_offset = 0;
    my $match_offset = 0; # Record the offset of the current longest match, add to end_position when finalized
    my $inner_body = $body; # A copy of the text to munge around while searching.
    my @inner_matches = grep {@{$_->{tailwords}} == 0} @candidates; # Record the current longest matches here
    # Longest-match:
    # as long as:
    # - there is leftover tail in some candidate(s)
    @candidates = grep {@{$_->{tailwords}} > 0} @candidates;
  CANDIDATE_LOOP:
    while (@candidates) {
      # - AND there are leftover words in current phrase
      if ($inner_body =~ /^(\s+)($concept_word_rex)/s) {
        # then: pull and compare next word, reduce text and tailwords
        # 1. Pull next.
        my $step_offset = length($1) + length($2);
        $inner_offset += $step_offset;
        my $next_word = normalize_word($2);
        # 2. Filter for applicable candidates
        my @inner_candidates = grep { $_->{tailwords}->[0] eq $next_word } @candidates;
        if (@inner_candidates) {
          # We have indeed a longer match, remove the first tailword
          shift @{$_->{tailwords}} foreach @inner_candidates;
          # candidates for next iteration must have leftover tail words
          @candidates = grep {@{$_->{tailwords}} > 0} @inner_candidates;
          # record intermediate longest matches - the current empty tailwords
          my @step_matches = grep {@{$_->{tailwords}} == 0} @inner_candidates;
          if (@step_matches) {
            @inner_matches = @step_matches;
            $match_offset = $inner_offset;
          }
          # Move $step_offset right the text
          substr($inner_body,0,$step_offset)='';
        } else {last CANDIDATE_LOOP;} # Last here as well.
      } else {last CANDIDATE_LOOP;} # Otherwise we are done
    }
    # In the end, do we have one or more matches?
    if (@inner_matches > 0) {
      # Yes!
      # merge multi-links into single match entry
      # multi-link = same concept, category and domain, different URLs
      # CARE: careful not to confuse with cases of different categories, which need disambiguation
      my @merged_matches;
      #print STDERR Dumper(\@inner_matches);
      while (@inner_matches) {
        my $match = shift @inner_matches;
        my $category = $match->{category};
        my $domain = $match->{domain};
        # Collect links of all same-category, same-domain siblings ...
        my @multilinks = map {$_->{link}}
          grep {($_->{category} eq $category) && ($_->{domain} eq $domain)} @inner_matches;
        # ... and remove them from the pool so each group is emitted once.
        @inner_matches = grep {($_->{category} ne $category) || ($_->{domain} ne $domain)} @inner_matches;
        if (@multilinks>0) {
          unshift @multilinks, $match->{link};
          $match->{multilinks} = \@multilinks;
        }
        push @merged_matches, $match;
      }
      @inner_matches = @merged_matches;
      # Record offsets: extend the end position by the extra words consumed
      # by the longest multi-word match (0 for a single-word match).
      $offset += $match_offset;
      $offset_end += $match_offset;
      foreach my $match(@inner_matches) {
        $match->{offset_begin} = $offset_begin;
        $match->{offset_end} = $offset_end;
        delete $match->{tailwords};
      }
      # And push to main matches array
      push @matches, @inner_matches;
      # And move the text forward with the match_offset
      substr($body,0,$match_offset)='' if $match_offset;
    } else { next CONCEPT_TRAVERSAL; } # If not, we just move on to the next word
  }
  return (\@matches,$text_length);
}
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
1; |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
__END__ |