File Coverage

blib/lib/NNexus/Morphology.pm

Criterion	Covered	Total	%
statement	47	100	47.0
branch	34	90	37.7
condition	1	2	50.0
subroutine	15	19	78.9
pod	13	13	100.0
total	110	224	49.1

line	stmt	bran	cond	sub	pod	time	code
1							# /=====================================================================\ #
2							# \| NNexus Autolinker \| #
3							# \| Text Morphology Module \| #
4							# \|=====================================================================\| #
5							# \| Part of the Planetary project: http://trac.mathweb.org/planetary \| #
6							# \| Research software, produced as part of work done by: \| #
7							# \| the KWARC group at Jacobs University \| #
8							# \| Copyright (c) 2012 \| #
9							# \| Released under the MIT License (MIT) \| #
10							# \|---------------------------------------------------------------------\| #
11							# \| Adapted from the original NNexus code by \| #
12							# \| James Gardner and Aaron Krowne \| #
13							# \|---------------------------------------------------------------------\| #
14							# \| Deyan Ginev #_# \| #
15							# \| http://kwarc.info/people/dginev (o o) \| #
16							# \=========================================================ooo==U==ooo=/ #
17							package NNexus::Morphology;
18							###########################################################################
19							# text morphology
20							###########################################################################
21	10			10		19013	use strict;
	10					13
	10					371
22	10			10		47	use warnings;
	10					16
	10					290
23
24	10			10		54	use Exporter;
	10					11
	10					981
25							our @ISA = qw(Exporter);
26							our @EXPORT_OK = qw(is_possessive is_plural get_nonpossessive get_possessive
27							depluralize_word depluralize_phrase root pluralize undetermine_word
28							admissible_name firstword_split normalize_word
29							canonicalize_url);
30							our %EXPORT_TAGS = (all=>qw(is_possessive is_plural get_nonpossessive get_possessive
31							depluralize_word depluralize_phrase root pluralize undetermine_word
32							admissible_name firstword_split normalize_word
33							canonicalize_url));
34
35	10			10		3093	use utf8;
	10					82
	10					48
36	10			10		3029	use Encode qw{is_utf8};
	10					38023
	10					705
37	10			10		5559	use Text::Unidecode qw/unidecode/;
	10					13068
	10					18838
38
39							# TODO: Think about MathML
40
41							# 0. Define what we consider admissible and grammatical words and phrases, for the NNexus use case
42							our $concept_word_rex = qr/\w(?:\w\|[\-\+\'])*/;
43							our $concept_phrase_rex = qr/$concept_word_rex(?:\s+$concept_word_rex)*/;
44
45
46							# I. Possessives
47							# return true if any word is possessive (ends in 's or s')
48	5			5	1	61	sub is_possessive { $_[0] =~ /\w('s\|s')(\s\|$)/; }
49
50							# return phrase without possessive suffix ("Euler's" becomes "Euler")
51							sub get_nonpossessive {
52	23			23	1	68	my ($word) = @_;
53	23	100				99	$word =~ s/'s(\s\|$)/$1/ \|\| $word =~ s/s'(\s\|$)/s$1/;
54	23					69	$word; }
55
56							# return first word with possessive suffix ("Euler" becomes "Euler's")
57							sub get_possessive {
58	0			0	1	0	my ($word) = @_;
59	0					0	$word =~ s/^($concept_word_rex)/$1'/;
60	0					0	$word =~ s/^($concept_word_rex[^s])'/$1's/;
61	0					0	$word; }
62
63							# II. Plurality
64
65							# predicate for plural or not
66	6			6	1	22	sub is_plural { $_[0] ne depluralize_phrase($_[0]); }
67
68							sub pluralize {
69							# "root of unity" pluralizes as "roots of unity" for example
70	0	0		0	1	0	if ($_[0] =~ /($concept_word_rex)(\s+(of\|by)\s+.+)/) { return pluralize($1).$2; }
	0	0				0
		0
		0
		0
		0
		0
		0
		0
71							# normal pluralization
72	0					0	elsif($_[0] =~ /(.+ri)x$/) { return "$1ces"; }
73	0					0	elsif($_[0] =~ /(.+t)ex$/) { return "$1ices"; }
74	0					0	elsif($_[0] =~ /(.+[aeiuo])x$/) { return "$1xes"; }
75	0					0	elsif($_[0] =~ /(.+[^aeiou])y$/) { return "$1ies"; }
76	0					0	elsif($_[0] =~ /(.+)ee$/) { return "$1ees"; }
77	0					0	elsif($_[0] =~ /(.+)us$/) { return "$1i"; }
78	0					0	elsif($_[0] =~ /(.+)ch$/) { return "$1ches"; }
79	0					0	elsif($_[0] =~ /(.+)ss$/) { return "$1sses"; }
80	0					0	else { return $_[0].'s'; } }
81
82							# singularize a phrase... remove root and replace
83							sub depluralize_phrase {
84							# "spaces of functions" depluralizes as "space of functions" for example.
85							# also "proofs by induction"
86	14	100		14	1	192	if ($_[0] =~ /(^\w[\w\s]+\w)(\s+(of\|by)\s+.+)$/) {
		50
		50
		50
		50
		50
		50
		50
		50
		50
		100
		100
87	4					19	my ($l,$r) = ($1,$2);
88	4					13	return depluralize_phrase($l).$r; }
89	0					0	elsif($_[0] =~ /(.+ri)ces$/) { return "$1x"; }
90	0					0	elsif($_[0] =~ /(.+t)ices$/) { return "$1ex"; }
91	0					0	elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; }
92	0					0	elsif($_[0] =~ /(.+)ies$/) { return "$1y"; }
93	0					0	elsif($_[0] =~ /(.+)ees$/) { return "$1ee"; }
94	0					0	elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; }
95	0					0	elsif($_[0] =~ /(.+o)ci$/) { return "$1cus"; }
96	0					0	elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; }
97	0					0	elsif($_[0] =~ /(.+ie)s$/) { return $1; }
98	6					31	elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; }
99	2					17	elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; }
100	2					10	else { return $_[0]; } }
101
102							sub depluralize_word {
103	20	100		20	1	181	if($_[0] !~ /oci\|s$/) { return $_[0]; }
	12	50				38
		50
		50
		50
		50
		50
		50
		50
		100
		100
		50
104	0					0	elsif($_[0] =~ /(.+ri)ces$/) { return "$1x"; }
105	0					0	elsif($_[0] =~ /(.+t)ices$/) { return "$1ex"; }
106	0					0	elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; }
107	0					0	elsif($_[0] =~ /(.+)ies$/) { return "$1y"; }
108	0					0	elsif($_[0] =~ /(.+)ees$/) { return "$1ee"; }
109	0					0	elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; }
110	0					0	elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; }
111	0					0	elsif($_[0] =~ /(.+ie)s$/) { return $1; }
112	6					26	elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; }
113	1					14	elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; }
114	0					0	elsif($_[0] =~ /(.+o)ci$/) { return "$1cus"; }
115	1					4	else { return $_[0]; } }
116
117							# III. Stemming
118
119							# get the non-plural root for a word
120							sub root {
121	0	0		0	1	0	if($_[0] =~ /(.+ri)ces$/) { return $1; }
	0	0				0
		0
		0
		0
		0
		0
		0
122	0					0	elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; }
123	0					0	elsif($_[0] =~ /(.+)ies$/) { return $1; }
124	0					0	elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; }
125	0					0	elsif($_[0] =~ /(.+o)ci$/) { return "$1c"; }
126	0					0	elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; }
127	0					0	elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; }
128	0					0	elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; }
129	0					0	else { return $_[0]; } }
130
131							# Remove determiners from a word:
132							sub undetermine_word {
133	8			8	1	61	my ($concept) = @_;
134	8					28	$concept =~ s/^(?:an?\|the)(?:\s+\|$)//;
135	8					27	return $concept;
136							}
137
138							# IV. Admissible concept words and high-level api
139	5			5	1	90	sub admissible_name {$_[0]=~/^$concept_phrase_rex$/; }
140							our %normalized_words = ();
141							sub normalize_word {
142	2			2	1	828	my ($concept)=@_;
143	2					6	my $normalized_concept = $normalized_words{$concept};
144	2	50				7	return $normalized_concept if $normalized_concept;
145	2					8	$normalized_concept=
146							depluralize_word(
147							get_nonpossessive(
148							undetermine_word(
149							lc(
150							unidecode(
151							$concept)))));
152	2					8	$normalized_words{$concept} = $normalized_concept;
153	2					8	return $normalized_concept; }
154
155							sub firstword_split {
156	2			2	1	754	my ($concept)=@_;
157	2	50				108	if ($concept=~/^($concept_word_rex)\s?(.*)$/) { # Grab first word if not provided
158	2		50			21	return ($1,($2\|\|''));
159							}
160	0						return; }
161
162							# Not the ideal place for it but... closest that comes to mind!
163							# Internal utilities:
164							# Canonicalize absolute URLs, borrowed from LaTeXML::Util::Pathname
165							our $PROTOCOL_RE = '(?:https?)(?=:)';
166							sub canonicalize_url {
167	0			0	1		my ($pathname) = @_;
168	0						my $urlprefix= undef;
169	0	0					if($pathname =~ s\|^($PROTOCOL_RE)://\|\|){
170	0						$urlprefix = $1; }
171	0						$pathname =~ s\|/\./\|/\|g;
172							# Collapse any foo/.. patterns, but not ../..
173	0						while($pathname =~ s\|/(?!\.\./)[^/]+/\.\.(/\\|$)\|$1\|){}
174	0						$pathname =~ s\|^\./\|\|;
175	0						$pathname =~ s\|^www.\|\|;
176							# Deprecated: We don't want the prefix, keeps the index smaller
177							#(defined $urlprefix ? $urlprefix . $pathname : $pathname); }
178	0						$pathname; }
179
180							1;
181							__END__