| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
|
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
|
3
|
|
|
|
|
|
|
# | Text Morphology Module | # |
|
4
|
|
|
|
|
|
|
# |=====================================================================| # |
|
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
|
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
|
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
|
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
|
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
|
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
|
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
|
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
|
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
|
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
|
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
|
17
|
|
|
|
|
|
|
package NNexus::Morphology; |
|
18
|
|
|
|
|
|
|
########################################################################### |
|
19
|
|
|
|
|
|
|
# text morphology |
|
20
|
|
|
|
|
|
|
########################################################################### |
|
21
|
10
|
|
|
10
|
|
19013
|
use strict; |
|
|
10
|
|
|
|
|
13
|
|
|
|
10
|
|
|
|
|
371
|
|
|
22
|
10
|
|
|
10
|
|
47
|
use warnings; |
|
|
10
|
|
|
|
|
16
|
|
|
|
10
|
|
|
|
|
290
|
|
|
23
|
|
|
|
|
|
|
|
|
24
|
10
|
|
|
10
|
|
54
|
use Exporter; |
|
|
10
|
|
|
|
|
11
|
|
|
|
10
|
|
|
|
|
981
|
|
|
25
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
26
|
|
|
|
|
|
|
our @EXPORT_OK = qw(is_possessive is_plural get_nonpossessive get_possessive |
|
27
|
|
|
|
|
|
|
depluralize_word depluralize_phrase root pluralize undetermine_word |
|
28
|
|
|
|
|
|
|
admissible_name firstword_split normalize_word |
|
29
|
|
|
|
|
|
|
canonicalize_url); |
|
30
|
|
|
|
|
|
|
our %EXPORT_TAGS = (all=>qw(is_possessive is_plural get_nonpossessive get_possessive |
|
31
|
|
|
|
|
|
|
depluralize_word depluralize_phrase root pluralize undetermine_word |
|
32
|
|
|
|
|
|
|
admissible_name firstword_split normalize_word |
|
33
|
|
|
|
|
|
|
canonicalize_url)); |
|
34
|
|
|
|
|
|
|
|
|
35
|
10
|
|
|
10
|
|
3093
|
use utf8; |
|
|
10
|
|
|
|
|
82
|
|
|
|
10
|
|
|
|
|
48
|
|
|
36
|
10
|
|
|
10
|
|
3029
|
use Encode qw{is_utf8}; |
|
|
10
|
|
|
|
|
38023
|
|
|
|
10
|
|
|
|
|
705
|
|
|
37
|
10
|
|
|
10
|
|
5559
|
use Text::Unidecode qw/unidecode/; |
|
|
10
|
|
|
|
|
13068
|
|
|
|
10
|
|
|
|
|
18838
|
|
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# TODO: Think about MathML |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# 0. Define what we consider admissible and grammatical words and phrases, for the NNexus use case |
|
42
|
|
|
|
|
|
|
our $concept_word_rex = qr/\w(?:\w|[\-\+\'])*/; |
|
43
|
|
|
|
|
|
|
our $concept_phrase_rex = qr/$concept_word_rex(?:\s+$concept_word_rex)*/; |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# I. Possessives |
|
47
|
|
|
|
|
|
|
# return true if any word is possessive (ends in 's or s') |
|
48
|
5
|
|
|
5
|
1
|
61
|
sub is_possessive { $_[0] =~ /\w('s|s')(\s|$)/; } |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# return phrase without possessive suffix ("Euler's" becomes "Euler") |
|
51
|
|
|
|
|
|
|
sub get_nonpossessive { |
|
52
|
23
|
|
|
23
|
1
|
68
|
my ($word) = @_; |
|
53
|
23
|
100
|
|
|
|
99
|
$word =~ s/'s(\s|$)/$1/ || $word =~ s/s'(\s|$)/s$1/; |
|
54
|
23
|
|
|
|
|
69
|
$word; } |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# return first word with possessive suffix ("Euler" becomes "Euler's") |
|
57
|
|
|
|
|
|
|
sub get_possessive { |
|
58
|
0
|
|
|
0
|
1
|
0
|
my ($word) = @_; |
|
59
|
0
|
|
|
|
|
0
|
$word =~ s/^($concept_word_rex)/$1'/; |
|
60
|
0
|
|
|
|
|
0
|
$word =~ s/^($concept_word_rex[^s])'/$1's/; |
|
61
|
0
|
|
|
|
|
0
|
$word; } |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# II. Plurality |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# predicate for plural or not |
|
66
|
6
|
|
|
6
|
1
|
22
|
sub is_plural { $_[0] ne depluralize_phrase($_[0]); } |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub pluralize { |
|
69
|
|
|
|
|
|
|
# "root of unity" pluralizes as "roots of unity" for example |
|
70
|
0
|
0
|
|
0
|
1
|
0
|
if ($_[0] =~ /($concept_word_rex)(\s+(of|by)\s+.+)/) { return pluralize($1).$2; } |
|
|
0
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# normal pluralization |
|
72
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+ri)x$/) { return "$1ces"; } |
|
73
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+t)ex$/) { return "$1ices"; } |
|
74
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[aeiuo])x$/) { return "$1xes"; } |
|
75
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[^aeiou])y$/) { return "$1ies"; } |
|
76
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ee$/) { return "$1ees"; } |
|
77
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)us$/) { return "$1i"; } |
|
78
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ch$/) { return "$1ches"; } |
|
79
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ss$/) { return "$1sses"; } |
|
80
|
0
|
|
|
|
|
0
|
else { return $_[0].'s'; } } |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# singularize a phrase... remove root and replace |
|
83
|
|
|
|
|
|
|
sub depluralize_phrase { |
|
84
|
|
|
|
|
|
|
# "spaces of functions" depluralizes as "space of functions" for example. |
|
85
|
|
|
|
|
|
|
# also "proofs by induction" |
|
86
|
14
|
100
|
|
14
|
1
|
192
|
if ($_[0] =~ /(^\w[\w\s]+\w)(\s+(of|by)\s+.+)$/) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
87
|
4
|
|
|
|
|
19
|
my ($l,$r) = ($1,$2); |
|
88
|
4
|
|
|
|
|
13
|
return depluralize_phrase($l).$r; } |
|
89
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+ri)ces$/) { return "$1x"; } |
|
90
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+t)ices$/) { return "$1ex"; } |
|
91
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; } |
|
92
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ies$/) { return "$1y"; } |
|
93
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ees$/) { return "$1ee"; } |
|
94
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; } |
|
95
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+o)ci$/) { return "$1cus"; } |
|
96
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; } |
|
97
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+ie)s$/) { return $1; } |
|
98
|
6
|
|
|
|
|
31
|
elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; } |
|
99
|
2
|
|
|
|
|
17
|
elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; } |
|
100
|
2
|
|
|
|
|
10
|
else { return $_[0]; } } |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub depluralize_word { |
|
103
|
20
|
100
|
|
20
|
1
|
181
|
if($_[0] !~ /oci|s$/) { return $_[0]; } |
|
|
12
|
50
|
|
|
|
38
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
104
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+ri)ces$/) { return "$1x"; } |
|
105
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+t)ices$/) { return "$1ex"; } |
|
106
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; } |
|
107
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ies$/) { return "$1y"; } |
|
108
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ees$/) { return "$1ee"; } |
|
109
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; } |
|
110
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; } |
|
111
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+ie)s$/) { return $1; } |
|
112
|
6
|
|
|
|
|
26
|
elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; } |
|
113
|
1
|
|
|
|
|
14
|
elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; } |
|
114
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+o)ci$/) { return "$1cus"; } |
|
115
|
1
|
|
|
|
|
4
|
else { return $_[0]; } } |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# III. Stemming |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# get the non-plural root for a word |
|
120
|
|
|
|
|
|
|
sub root { |
|
121
|
0
|
0
|
|
0
|
1
|
0
|
if($_[0] =~ /(.+ri)ces$/) { return $1; } |
|
|
0
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
122
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; } |
|
123
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ies$/) { return $1; } |
|
124
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; } |
|
125
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+o)ci$/) { return "$1c"; } |
|
126
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; } |
|
127
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; } |
|
128
|
0
|
|
|
|
|
0
|
elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; } |
|
129
|
0
|
|
|
|
|
0
|
else { return $_[0]; } } |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Remove determiners from a word: |
|
132
|
|
|
|
|
|
|
sub undetermine_word { |
|
133
|
8
|
|
|
8
|
1
|
61
|
my ($concept) = @_; |
|
134
|
8
|
|
|
|
|
28
|
$concept =~ s/^(?:an?|the)(?:\s+|$)//; |
|
135
|
8
|
|
|
|
|
27
|
return $concept; |
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# IV. Admissible concept words and high-level api |
|
139
|
5
|
|
|
5
|
1
|
90
|
sub admissible_name {$_[0]=~/^$concept_phrase_rex$/; } |
|
140
|
|
|
|
|
|
|
our %normalized_words = (); |
|
141
|
|
|
|
|
|
|
sub normalize_word { |
|
142
|
2
|
|
|
2
|
1
|
828
|
my ($concept)=@_; |
|
143
|
2
|
|
|
|
|
6
|
my $normalized_concept = $normalized_words{$concept}; |
|
144
|
2
|
50
|
|
|
|
7
|
return $normalized_concept if $normalized_concept; |
|
145
|
2
|
|
|
|
|
8
|
$normalized_concept= |
|
146
|
|
|
|
|
|
|
depluralize_word( |
|
147
|
|
|
|
|
|
|
get_nonpossessive( |
|
148
|
|
|
|
|
|
|
undetermine_word( |
|
149
|
|
|
|
|
|
|
lc( |
|
150
|
|
|
|
|
|
|
unidecode( |
|
151
|
|
|
|
|
|
|
$concept))))); |
|
152
|
2
|
|
|
|
|
8
|
$normalized_words{$concept} = $normalized_concept; |
|
153
|
2
|
|
|
|
|
8
|
return $normalized_concept; } |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub firstword_split { |
|
156
|
2
|
|
|
2
|
1
|
754
|
my ($concept)=@_; |
|
157
|
2
|
50
|
|
|
|
108
|
if ($concept=~/^($concept_word_rex)\s?(.*)$/) { # Grab first word if not provided |
|
158
|
2
|
|
50
|
|
|
21
|
return ($1,($2||'')); |
|
159
|
|
|
|
|
|
|
} |
|
160
|
0
|
|
|
|
|
|
return; } |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Not the ideal place for it but... closest that comes to mind! |
|
163
|
|
|
|
|
|
|
# Internal utilities: |
|
164
|
|
|
|
|
|
|
# Canonicalize absolute URLs, borrowed from LaTeXML::Util::Pathname |
|
165
|
|
|
|
|
|
|
our $PROTOCOL_RE = '(?:https?)(?=:)'; |
|
166
|
|
|
|
|
|
|
sub canonicalize_url { |
|
167
|
0
|
|
|
0
|
1
|
|
my ($pathname) = @_; |
|
168
|
0
|
|
|
|
|
|
my $urlprefix= undef; |
|
169
|
0
|
0
|
|
|
|
|
if($pathname =~ s|^($PROTOCOL_RE)://||){ |
|
170
|
0
|
|
|
|
|
|
$urlprefix = $1; } |
|
171
|
0
|
|
|
|
|
|
$pathname =~ s|/\./|/|g; |
|
172
|
|
|
|
|
|
|
# Collapse any foo/.. patterns, but not ../.. |
|
173
|
0
|
|
|
|
|
|
while($pathname =~ s|/(?!\.\./)[^/]+/\.\.(/\|$)|$1|){} |
|
174
|
0
|
|
|
|
|
|
$pathname =~ s|^\./||; |
|
175
|
0
|
|
|
|
|
|
$pathname =~ s|^www.||; |
|
176
|
|
|
|
|
|
|
# Deprecated: We don't want the prefix, keeps the index smaller |
|
177
|
|
|
|
|
|
|
#(defined $urlprefix ? $urlprefix . $pathname : $pathname); } |
|
178
|
0
|
|
|
|
|
|
$pathname; } |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
1; |
|
181
|
|
|
|
|
|
|
__END__ |