line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# /=====================================================================\ # |
2
|
|
|
|
|
|
|
# | NNexus Autolinker | # |
3
|
|
|
|
|
|
|
# | Text Morphology Module | # |
4
|
|
|
|
|
|
|
# |=====================================================================| # |
5
|
|
|
|
|
|
|
# | Part of the Planetary project: http://trac.mathweb.org/planetary | # |
6
|
|
|
|
|
|
|
# | Research software, produced as part of work done by: | # |
7
|
|
|
|
|
|
|
# | the KWARC group at Jacobs University | # |
8
|
|
|
|
|
|
|
# | Copyright (c) 2012 | # |
9
|
|
|
|
|
|
|
# | Released under the MIT License (MIT) | # |
10
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
11
|
|
|
|
|
|
|
# | Adapted from the original NNexus code by | # |
12
|
|
|
|
|
|
|
# | James Gardner and Aaron Krowne | # |
13
|
|
|
|
|
|
|
# |---------------------------------------------------------------------| # |
14
|
|
|
|
|
|
|
# | Deyan Ginev #_# | # |
15
|
|
|
|
|
|
|
# | http://kwarc.info/people/dginev (o o) | # |
16
|
|
|
|
|
|
|
# \=========================================================ooo==U==ooo=/ # |
17
|
|
|
|
|
|
|
package NNexus::Morphology; |
18
|
|
|
|
|
|
|
########################################################################### |
19
|
|
|
|
|
|
|
# text morphology |
20
|
|
|
|
|
|
|
########################################################################### |
21
|
11
|
|
|
11
|
|
30796
|
use strict; |
|
11
|
|
|
|
|
15
|
|
|
11
|
|
|
|
|
322
|
|
22
|
11
|
|
|
11
|
|
44
|
use warnings; |
|
11
|
|
|
|
|
17
|
|
|
11
|
|
|
|
|
237
|
|
23
|
|
|
|
|
|
|
|
24
|
11
|
|
|
11
|
|
40
|
use Exporter; |
|
11
|
|
|
|
|
13
|
|
|
11
|
|
|
|
|
999
|
|
25
|
|
|
|
|
|
|
# Export interface: nothing is exported by default; callers either name the
# functions they need or request everything through the ":all" tag.
our @ISA       = qw(Exporter);
our @EXPORT_OK = qw(
    is_possessive is_plural get_nonpossessive get_possessive
    depluralize_word depluralize_phrase root pluralize undetermine_word
    admissible_name firstword_split normalize_word
    canonicalize_url
);
# Exporter expects every tag to map to an ARRAY reference. A bare qw() list
# here would be flattened into unrelated key/value pairs and leave ":all"
# pointing at a plain string.
our %EXPORT_TAGS = (all => [@EXPORT_OK]);
34
|
|
|
|
|
|
|
|
35
|
11
|
|
|
11
|
|
3119
|
use utf8; |
|
11
|
|
|
|
|
65
|
|
|
11
|
|
|
|
|
55
|
|
36
|
11
|
|
|
11
|
|
2813
|
use Encode qw{is_utf8}; |
|
11
|
|
|
|
|
46396
|
|
|
11
|
|
|
|
|
755
|
|
37
|
11
|
|
|
11
|
|
6110
|
use Text::Unidecode qw/unidecode/; |
|
11
|
|
|
|
|
16699
|
|
|
11
|
|
|
|
|
20632
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# TODO: Think about MathML |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# 0. Define what we consider admissible and grammatical words and phrases, for the NNexus use case |
42
|
|
|
|
|
|
|
# A concept word starts with a word character and may continue with word
# characters, hyphens, plus signs and apostrophes.
our $concept_word_rex = qr/\w(?:\w|[\-\+\'])*/;

# A concept phrase is one or more concept words separated by whitespace.
our $concept_phrase_rex = qr/$concept_word_rex(?:\s+$concept_word_rex)*/;
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# I. Possessives |
47
|
|
|
|
|
|
|
# return true if any word is possessive (ends in 's or s') |
48
|
5
|
|
|
5
|
1
|
58
|
# True when some word of the phrase carries a possessive ending, i.e. a word
# character followed by 's or s' right before whitespace or the end of the
# phrase.
sub is_possessive {
    my ($phrase) = @_;
    return $phrase =~ /\w('s|s')(\s|$)/;
}
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# return phrase without possessive suffix ("Euler's" becomes "Euler") |
51
|
|
|
|
|
|
|
# Strip one possessive ending from the phrase: the first "'s" that closes a
# word is removed ("Euler's" -> "Euler"); only when there is none, the first
# "s'" that closes a word loses its apostrophe ("Gauss'" -> "Gauss").
sub get_nonpossessive {
    my $phrase = shift;
    unless ($phrase =~ s/'s(\s|$)/$1/) {
        # No "'s" ending was found, fall back to the "s'" form.
        $phrase =~ s/s'(\s|$)/s$1/;
    }
    return $phrase;
}
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# return first word with possessive suffix ("Euler" becomes "Euler's") |
57
|
|
|
|
|
|
|
# Return the phrase with its first word turned into a possessive: "Euler"
# becomes "Euler's", while a first word that already ends in "s" only gains
# an apostrophe ("Gauss" becomes "Gauss'"). Everything after the first word
# is kept as is, and a phrase that does not start with a concept word is
# returned unchanged.
sub get_possessive {
    my ($word) = @_;
    if ($word =~ /^($concept_word_rex)/) {
        my $first_word = $1;
        # Pick the suffix by looking at the word itself instead of
        # re-matching the already modified string; this also covers
        # one-letter words ("x" -> "x's") and words that contain an
        # apostrophe of their own.
        my $suffix = ($first_word =~ /s$/) ? q{'} : q{'s};
        # Zero-length replacement: insert the suffix right after the word.
        substr($word, length($first_word), 0, $suffix);
    }
    return $word;
}
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# II. Plurality |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# predicate for plural or not |
66
|
6
|
|
|
6
|
1
|
22
|
# A phrase counts as plural exactly when depluralizing it changes it.
sub is_plural {
    my ($phrase) = @_;
    my $singular = depluralize_phrase($phrase);
    return $phrase ne $singular;
}
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Pluralize a word or phrase with a small set of English suffix heuristics.
#
# For phrases of the form "<head> of <rest>" or "<head> by <rest>" only the
# head is inflected ("root of unity" -> "roots of unity"). Otherwise the
# suffix rules are applied to the end of the string.
sub pluralize {
    my ($phrase) = @_;

    # Split at the first " of " / " by ". The head is captured from the very
    # start of the string, so words in front of the inflected one are kept
    # ("square root of unity" -> "square roots of unity").
    if ($phrase =~ /^(.+?)(\s+(?:of|by)\s+.+)$/) {
        my ($head, $tail) = ($1, $2);    # copy before recursing
        return pluralize($head) . $tail;
    }

    # Suffix rules, tried in order; the first match wins.
    return $1 . 'ces'  if $phrase =~ /(.+ri)x$/;          # matrix   -> matrices
    return $1 . 'ices' if $phrase =~ /(.+t)ex$/;          # vertex   -> vertices
    return $1 . 'xes'  if $phrase =~ /(.+[aeiuo])x$/;     # box      -> boxes
    return $1 . 'ies'  if $phrase =~ /(.+[^aeiou])y$/;    # category -> categories
    return $1 . 'ees'  if $phrase =~ /(.+)ee$/;           # tree     -> trees
    return $1 . 'i'    if $phrase =~ /(.+)us$/;           # locus    -> loci
    return $1 . 'ches' if $phrase =~ /(.+)ch$/;           # branch   -> branches
    return $1 . 'sses' if $phrase =~ /(.+)ss$/;           # class    -> classes

    # Default: plain "s".
    return $phrase . 's';
}
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# singularize a phrase... remove root and replace |
83
|
|
|
|
|
|
|
# Singularize a phrase with a small set of English suffix heuristics.
#
# For phrases of the form "<head> of <rest>" or "<head> by <rest>" only the
# head is singularized ("spaces of functions" -> "space of functions",
# "proofs by induction" -> "proof by induction"). Otherwise the suffix rules
# are applied to the end of the string. A phrase that matches no rule is
# returned unchanged.
sub depluralize_phrase {
    my ($phrase) = @_;

    # Split at the first " of " / " by ". The head may be any text, so
    # hyphenated or apostrophised heads ("semi-groups of matrices") are
    # singularized in the head rather than in the tail.
    if ($phrase =~ /^(.+?)(\s+(?:of|by)\s+.+)$/) {
        my ($head, $tail) = ($1, $2);    # copy before recursing
        return depluralize_phrase($head) . $tail;
    }

    # Suffix rules, tried in order; the first match wins.
    return $1 . 'x'   if $phrase =~ /(.+ri)ces$/;         # matrices   -> matrix
    return $1 . 'ex'  if $phrase =~ /(.+t)ices$/;         # vertices   -> vertex
    return $1         if $phrase =~ /(.+[aeiuo]x)es$/;    # boxes      -> box
    return $1 . 'y'   if $phrase =~ /(.+)ies$/;           # categories -> category
    return $1 . 'ee'  if $phrase =~ /(.+)ees$/;           # trees      -> tree
    return $1 . 'ch'  if $phrase =~ /(.+)ches$/;          # branches   -> branch
    return $1 . 'cus' if $phrase =~ /(.+o)ci$/;           # loci       -> locus
    return $1 . 'ss'  if $phrase =~ /(.+)sses$/;          # classes    -> class
    return $1         if $phrase =~ /(.+[^eiuos])s$/;     # groups     -> group
    return $1 . 'e'   if $phrase =~ /(.+[^aeio])es$/;     # curves     -> curve

    return $phrase;
}
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Singularize a single word with a small set of English suffix heuristics.
# A word that matches no rule is returned unchanged.
sub depluralize_word {
    my ($word) = @_;

    # Fast path: every rule below needs the word to end in "s" or "oci".
    return $word unless $word =~ /(?:oci|s)$/;

    # Suffix rules, tried in order; the first match wins.
    return $1 . 'x'   if $word =~ /(.+ri)ces$/;         # matrices   -> matrix
    return $1 . 'ex'  if $word =~ /(.+t)ices$/;         # vertices   -> vertex
    return $1         if $word =~ /(.+[aeiuo]x)es$/;    # boxes      -> box
    return $1 . 'y'   if $word =~ /(.+)ies$/;           # categories -> category
    return $1 . 'ee'  if $word =~ /(.+)ees$/;           # trees      -> tree
    return $1 . 'ch'  if $word =~ /(.+)ches$/;          # branches   -> branch
    return $1 . 'ss'  if $word =~ /(.+)sses$/;          # classes    -> class
    return $1         if $word =~ /(.+[^eiuos])s$/;     # groups     -> group
    return $1 . 'e'   if $word =~ /(.+[^aeio])es$/;     # curves     -> curve
    return $1 . 'cus' if $word =~ /(.+o)ci$/;           # loci       -> locus

    return $word;
}
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# III. Stemming |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# get the non-plural root for a word |
120
|
|
|
|
|
|
|
# Reduce a plural word to a root by stripping its plural ending. Each rule
# pairs a pattern with the text that is appended to the captured stem; the
# first matching rule wins, and a word that matches none is returned as is.
sub root {
    my ($word) = @_;

    my @suffix_rules = (
        [ qr/(.+ri)ces$/,      ''   ],    # matrices   -> matri
        [ qr/(.+[aeiuo]x)es$/, ''   ],    # boxes      -> box
        [ qr/(.+)ies$/,        ''   ],    # categories -> categor
        [ qr/(.+)ches$/,       'ch' ],    # branches   -> branch
        [ qr/(.+o)ci$/,        'c'  ],    # loci       -> loc
        [ qr/(.+)sses$/,       'ss' ],    # classes    -> class
        [ qr/(.+[^eiuos])s$/,  ''   ],    # groups     -> group
        [ qr/(.+[^aeio])es$/,  'e'  ],    # curves     -> curve
    );

    for my $rule (@suffix_rules) {
        my ($pattern, $ending) = @{$rule};
        return $1 . $ending if $word =~ $pattern;
    }
    return $word;
}
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Remove determiners from a word: |
132
|
|
|
|
|
|
|
# Drop a leading determiner ("a", "an" or "the", lower case only) together
# with the whitespace that follows it. A phrase consisting of just the
# determiner becomes the empty string.
sub undetermine_word {
    my $phrase = shift;
    if ($phrase =~ /^(?:an?|the)(?:\s+|$)/) {
        # The match is anchored at the start, so $+[0] (the end offset of
        # the match) is exactly the number of characters to cut off.
        substr($phrase, 0, $+[0], '');
    }
    return $phrase;
}
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# IV. Admissible concept words and high-level api |
139
|
23
|
|
|
23
|
1
|
296
|
# True when the whole name is a well-formed concept phrase, i.e. it matches
# $concept_phrase_rex from start to end.
sub admissible_name {
    my ($name) = @_;
    return $name =~ /^$concept_phrase_rex$/;
}
140
|
|
|
|
|
|
|
# Cache of already normalized words, keyed by the raw input.
our %normalized_words = ();

# Normalize a concept word: transliterate it with unidecode(), lower-case it,
# strip a leading determiner, strip a possessive ending and singularize it,
# in that order. Results are memoized in %normalized_words.
sub normalize_word {
    my ($concept) = @_;

    # Look the key up with exists() rather than by truthiness, so words that
    # normalize to "" or "0" are served from the cache as well.
    return $normalized_words{$concept} if exists $normalized_words{$concept};

    my $ascii      = unidecode($concept);
    my $bare       = undetermine_word(lc($ascii));
    my $normalized = depluralize_word(get_nonpossessive($bare));

    $normalized_words{$concept} = $normalized;
    return $normalized;
}
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Split a concept into its first word and the remainder.
# Returns (first_word, rest), where at most one whitespace character between
# the two is dropped and rest is "" when nothing follows. Returns an empty
# list (undef in scalar context) when the concept does not start with a
# concept word.
sub firstword_split {
    my ($concept) = @_;
    if ($concept =~ /^($concept_word_rex)\s?(.*)$/) {
        # The second group always takes part in a successful match, so it is
        # defined (possibly empty). Returning it as is keeps a remainder of
        # "0" intact, e.g. for "characteristic 0".
        my ($first_word, $rest) = ($1, $2);
        return ($first_word, $rest);
    }
    return;
}
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Not the ideal place for it but... closest that comes to mind! |
163
|
|
|
|
|
|
|
# Internal utilities: |
164
|
|
|
|
|
|
|
# Canonicalize absolute URLs, borrowed from LaTeXML::Util::Pathname |
165
|
|
|
|
|
|
|
# URL schemes that are recognized (and stripped) by canonicalize_url.
our $PROTOCOL_RE = '(?:https?)(?=:)';

# Canonicalize an absolute URL into a compact form:
#   * the "http://" / "https://" scheme is removed
#     (we don't want the prefix, it keeps the index smaller),
#   * "/./" segments are collapsed,
#   * "/foo/.." pairs are collapsed, but "../.." is left alone,
#   * a leading "./" and a leading "www." are removed.
# Returns the canonicalized string.
sub canonicalize_url {
    my ($pathname) = @_;

    $pathname =~ s{^(?:$PROTOCOL_RE)://}{};

    # Collapse whole runs of "./" at once, so "a/././b" becomes "a/b".
    $pathname =~ s{/(?:\./)+}{/}g;

    # Collapse any foo/.. patterns, but not ../..
    1 while $pathname =~ s{/(?!\.\./)[^/]+/\.\.(/|$)}{$1};

    $pathname =~ s{^\./}{};

    # Only a literal "www." host prefix is dropped; hosts such as
    # "www2.example.com" are kept intact.
    $pathname =~ s{^www\.}{};

    return $pathname;
}
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
1; |
181
|
|
|
|
|
|
|
__END__ |