File Coverage

blib/lib/NNexus/Morphology.pm

Criterion	Covered	Total	%
statement	56	100	56.0
branch	37	90	41.1
condition	2	2	100.0
subroutine	16	19	84.2
pod	13	13	100.0
total	124	224	55.3

line	stmt	bran	cond	sub	pod	time	code
1							# /=====================================================================\ #
2							# \| NNexus Autolinker \| #
3							# \| Text Morphology Module \| #
4							# \|=====================================================================\| #
5							# \| Part of the Planetary project: http://trac.mathweb.org/planetary \| #
6							# \| Research software, produced as part of work done by: \| #
7							# \| the KWARC group at Jacobs University \| #
8							# \| Copyright (c) 2012 \| #
9							# \| Released under the MIT License (MIT) \| #
10							# \|---------------------------------------------------------------------\| #
11							# \| Adapted from the original NNexus code by \| #
12							# \| James Gardner and Aaron Krowne \| #
13							# \|---------------------------------------------------------------------\| #
14							# \| Deyan Ginev #_# \| #
15							# \| http://kwarc.info/people/dginev (o o) \| #
16							# \=========================================================ooo==U==ooo=/ #
17							package NNexus::Morphology;
18							###########################################################################
19							# text morphology
20							###########################################################################
21	11			11		30796	use strict;
	11					15
	11					322
22	11			11		44	use warnings;
	11					17
	11					237
23
24	11			11		40	use Exporter;
	11					13
	11					999
25							our @ISA = qw(Exporter);
26							our @EXPORT_OK = qw(is_possessive is_plural get_nonpossessive get_possessive
27							depluralize_word depluralize_phrase root pluralize undetermine_word
28							admissible_name firstword_split normalize_word
29							canonicalize_url);
30							our %EXPORT_TAGS = (all=>qw(is_possessive is_plural get_nonpossessive get_possessive
31							depluralize_word depluralize_phrase root pluralize undetermine_word
32							admissible_name firstword_split normalize_word
33							canonicalize_url));
34
35	11			11		3119	use utf8;
	11					65
	11					55
36	11			11		2813	use Encode qw{is_utf8};
	11					46396
	11					755
37	11			11		6110	use Text::Unidecode qw/unidecode/;
	11					16699
	11					20632
38
39							# TODO: Think about MathML
40
41							# 0. Define what we consider admissible and grammatical words and phrases, for the NNexus use case
42							our $concept_word_rex = qr/\w(?:\w\|[\-\+\'])*/;
43							our $concept_phrase_rex = qr/$concept_word_rex(?:\s+$concept_word_rex)*/;
44
45
46							# I. Possessives
47							# return true if any word is possessive (ends in 's or s')
48	5			5	1	58	sub is_possessive { $_[0] =~ /\w('s\|s')(\s\|$)/; }
49
50							# return phrase without possessive suffix ("Euler's" becomes "Euler")
51							sub get_nonpossessive {
52	181			181	1	227	my ($word) = @_;
53	181	100				565	$word =~ s/'s(\s\|$)/$1/ \|\| $word =~ s/s'(\s\|$)/s$1/;
54	181					371	$word; }
55
56							# return first word with possessive suffix ("Euler" becomes "Euler's")
57							sub get_possessive {
58	0			0	1	0	my ($word) = @_;
59	0					0	$word =~ s/^($concept_word_rex)/$1'/;
60	0					0	$word =~ s/^($concept_word_rex[^s])'/$1's/;
61	0					0	$word; }
62
63							# II. Plurality
64
65							# predicate for plural or not
66	6			6	1	22	sub is_plural { $_[0] ne depluralize_phrase($_[0]); }
67
68							sub pluralize {
69							# "root of unity" pluralizes as "roots of unity" for example
70	0	0		0	1	0	if ($_[0] =~ /($concept_word_rex)(\s+(of\|by)\s+.+)/) { return pluralize($1).$2; }
	0	0				0
		0
		0
		0
		0
		0
		0
		0
71							# normal pluralization
72	0					0	elsif($_[0] =~ /(.+ri)x$/) { return "$1ces"; }
73	0					0	elsif($_[0] =~ /(.+t)ex$/) { return "$1ices"; }
74	0					0	elsif($_[0] =~ /(.+[aeiuo])x$/) { return "$1xes"; }
75	0					0	elsif($_[0] =~ /(.+[^aeiou])y$/) { return "$1ies"; }
76	0					0	elsif($_[0] =~ /(.+)ee$/) { return "$1ees"; }
77	0					0	elsif($_[0] =~ /(.+)us$/) { return "$1i"; }
78	0					0	elsif($_[0] =~ /(.+)ch$/) { return "$1ches"; }
79	0					0	elsif($_[0] =~ /(.+)ss$/) { return "$1sses"; }
80	0					0	else { return $_[0].'s'; } }
81
82							# singularize a phrase... remove root and replace
83							sub depluralize_phrase {
84							# "spaces of functions" depluralizes as "space of functions" for example.
85							# also "proofs by induction"
86	14	100		14	1	256	if ($_[0] =~ /(^\w[\w\s]+\w)(\s+(of\|by)\s+.+)$/) {
		50
		50
		50
		50
		50
		50
		50
		50
		50
		100
		100
87	4					11	my ($l,$r) = ($1,$2);
88	4					13	return depluralize_phrase($l).$r; }
89	0					0	elsif($_[0] =~ /(.+ri)ces$/) { return "$1x"; }
90	0					0	elsif($_[0] =~ /(.+t)ices$/) { return "$1ex"; }
91	0					0	elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; }
92	0					0	elsif($_[0] =~ /(.+)ies$/) { return "$1y"; }
93	0					0	elsif($_[0] =~ /(.+)ees$/) { return "$1ee"; }
94	0					0	elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; }
95	0					0	elsif($_[0] =~ /(.+o)ci$/) { return "$1cus"; }
96	0					0	elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; }
97	0					0	elsif($_[0] =~ /(.+ie)s$/) { return $1; }
98	6					32	elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; }
99	2					17	elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; }
100	2					9	else { return $_[0]; } }
101
102							sub depluralize_word {
103	178	100		178	1	1020	if($_[0] !~ /oci\|s$/) { return $_[0]; }
	153	50				335
		50
		50
		50
		50
		50
		50
		50
		100
		100
		50
104	0					0	elsif($_[0] =~ /(.+ri)ces$/) { return "$1x"; }
105	0					0	elsif($_[0] =~ /(.+t)ices$/) { return "$1ex"; }
106	0					0	elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; }
107	0					0	elsif($_[0] =~ /(.+)ies$/) { return "$1y"; }
108	0					0	elsif($_[0] =~ /(.+)ees$/) { return "$1ee"; }
109	0					0	elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; }
110	0					0	elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; }
111	0					0	elsif($_[0] =~ /(.+ie)s$/) { return $1; }
112	17					62	elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; }
113	5					35	elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; }
114	0					0	elsif($_[0] =~ /(.+o)ci$/) { return "$1cus"; }
115	3					9	else { return $_[0]; } }
116
117							# III. Stemming
118
119							# get the non-plural root for a word
120							sub root {
121	0	0		0	1	0	if($_[0] =~ /(.+ri)ces$/) { return $1; }
	0	0				0
		0
		0
		0
		0
		0
		0
122	0					0	elsif($_[0] =~ /(.+[aeiuo]x)es$/) { return $1; }
123	0					0	elsif($_[0] =~ /(.+)ies$/) { return $1; }
124	0					0	elsif($_[0] =~ /(.+)ches$/) { return "$1ch"; }
125	0					0	elsif($_[0] =~ /(.+o)ci$/) { return "$1c"; }
126	0					0	elsif($_[0] =~ /(.+)sses$/) { return "$1ss"; }
127	0					0	elsif($_[0] =~ /(.+[^eiuos])s$/) { return $1; }
128	0					0	elsif($_[0] =~ /(.+[^aeio])es$/) { return "$1e"; }
129	0					0	else { return $_[0]; } }
130
131							# Remove determiners from a word:
132							sub undetermine_word {
133	166			166	1	2720	my ($concept) = @_;
134	166					358	$concept =~ s/^(?:an?\|the)(?:\s+\|$)//;
135	166					376	return $concept;
136							}
137
138							# IV. Admissible concept words and high-level api
139	23			23	1	296	sub admissible_name {$_[0]=~/^$concept_phrase_rex$/; }
140							our %normalized_words = ();
141							sub normalize_word {
142	178			178	1	752	my ($concept)=@_;
143	178					253	my $normalized_concept = $normalized_words{$concept};
144	178	100				334	return $normalized_concept if $normalized_concept;
145	160					381	$normalized_concept=
146							depluralize_word(
147							get_nonpossessive(
148							undetermine_word(
149							lc(
150							unidecode(
151							$concept)))));
152	160					412	$normalized_words{$concept} = $normalized_concept;
153	160					378	return $normalized_concept; }
154
155							sub firstword_split {
156	21			21	1	654	my ($concept)=@_;
157	21	50				710	if ($concept=~/^($concept_word_rex)\s?(.*)$/) { # Grab first word if not provided
158	21		100			173	return ($1,($2\|\|''));
159							}
160	0					0	return; }
161
162							# Not the ideal place for it but... closest that comes to mind!
163							# Internal utilities:
164							# Canonicalize absolute URLs, borrowed from LaTeXML::Util::Pathname
165							our $PROTOCOL_RE = '(?:https?)(?=:)';
166							sub canonicalize_url {
167	46			46	1	52	my ($pathname) = @_;
168	46					77	my $urlprefix= undef;
169	46	100				353	if($pathname =~ s\|^($PROTOCOL_RE)://\|\|){
170	32					55	$urlprefix = $1; }
171	46					77	$pathname =~ s\|/\./\|/\|g;
172							# Collapse any foo/.. patterns, but not ../..
173	46					126	while($pathname =~ s\|/(?!\.\./)[^/]+/\.\.(/\\|$)\|$1\|){}
174	46					51	$pathname =~ s\|^\./\|\|;
175	46					76	$pathname =~ s\|^www.\|\|;
176							# Deprecated: We don't want the prefix, keeps the index smaller
177							#(defined $urlprefix ? $urlprefix . $pathname : $pathname); }
178	46					276	$pathname; }
179
180							1;
181							__END__