| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Treex::Tool::EnglishMorpho::Lemmatizer; |
|
2
|
|
|
|
|
|
|
$Treex::Tool::EnglishMorpho::Lemmatizer::VERSION = '2.20151102'; |
|
3
|
1
|
|
|
1
|
|
834
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
40
|
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
5
|
1
|
|
|
1
|
|
853
|
use Moose; |
|
|
1
|
|
|
|
|
164054
|
|
|
|
1
|
|
|
|
|
9
|
|
|
6
|
1
|
|
|
1
|
|
8002
|
use Treex::Core::Common; |
|
|
1
|
|
|
|
|
570075
|
|
|
|
1
|
|
|
|
|
5
|
|
|
7
|
1
|
|
|
1
|
|
6200
|
use Treex::Core::Resource qw(require_file_from_share); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
60
|
|
|
8
|
1
|
|
|
1
|
|
5
|
use File::Slurp; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
74
|
|
|
9
|
1
|
|
|
1
|
|
5
|
use utf8; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
5
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
has 'exceptions_filename' => ( |
|
12
|
|
|
|
|
|
|
is => 'ro', |
|
13
|
|
|
|
|
|
|
init_arg => undef, |
|
14
|
|
|
|
|
|
|
default => sub { |
|
15
|
|
|
|
|
|
|
return require_file_from_share('data/models/lemmatizer/en/exceptions2014_11_21.tsv'); |
|
16
|
|
|
|
|
|
|
}, |
|
17
|
|
|
|
|
|
|
); |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
has 'negation_filename' => ( |
|
20
|
|
|
|
|
|
|
is => 'ro', |
|
21
|
|
|
|
|
|
|
init_arg => undef, |
|
22
|
|
|
|
|
|
|
default => sub { |
|
23
|
|
|
|
|
|
|
return require_file_from_share('data/models/lemmatizer/en/negation'); |
|
24
|
|
|
|
|
|
|
}, |
|
25
|
|
|
|
|
|
|
); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
has 'exceptions' => ( |
|
28
|
|
|
|
|
|
|
is => 'ro', |
|
29
|
|
|
|
|
|
|
builder => '_build_exceptions', |
|
30
|
|
|
|
|
|
|
init_arg => undef, |
|
31
|
|
|
|
|
|
|
lazy => 1, |
|
32
|
|
|
|
|
|
|
); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has 'negation' => ( |
|
35
|
|
|
|
|
|
|
is => 'ro', |
|
36
|
|
|
|
|
|
|
builder => '_build_negation', |
|
37
|
|
|
|
|
|
|
init_arg => undef, |
|
38
|
|
|
|
|
|
|
lazy => 1, |
|
39
|
|
|
|
|
|
|
); |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
has 'cut_negation' => ( |
|
42
|
|
|
|
|
|
|
isa => 'Bool', |
|
43
|
|
|
|
|
|
|
default => 1, |
|
44
|
|
|
|
|
|
|
reader => 'cut_negation', |
|
45
|
|
|
|
|
|
|
); |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
has 'lowercase_proper_names' => ( |
|
48
|
|
|
|
|
|
|
isa => 'Bool', |
|
49
|
|
|
|
|
|
|
default => 0, |
|
50
|
|
|
|
|
|
|
reader => 'lowercase_proper_names', |
|
51
|
|
|
|
|
|
|
); |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $V = qr/[aeiou]/; |
|
54
|
|
|
|
|
|
|
my $VY = qr/[aeiouy]/; |
|
55
|
|
|
|
|
|
|
my $C = qr/[bcdfghjklmnpqrstvwxyz]/; |
|
56
|
|
|
|
|
|
|
my $CXY = qr/[bcdfghjklmnpqrstvwxz]/; |
|
57
|
|
|
|
|
|
|
my $S = qr/([sxz]|[cs]h)/; |
|
58
|
|
|
|
|
|
|
my $S2 = qr/(ss|zz)/; |
|
59
|
|
|
|
|
|
|
my $PRE = qr/(be|ex|in|mis|pre|pro|re)/; |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
#The most important sub: |
|
62
|
|
|
|
|
|
|
#Input: word form and POS tag (Penn style) |
|
63
|
|
|
|
|
|
|
#Output: lemma and was_negative_prefix |
|
64
|
|
|
|
|
|
|
sub lemmatize { |
|
65
|
1
|
|
|
1
|
1
|
225132
|
my ( $self, $word, $tag ) = @_; |
|
66
|
1
|
|
|
|
|
3
|
my $negative_prefix = 0; |
|
67
|
|
|
|
|
|
|
|
|
68
|
1
|
50
|
33
|
|
|
21
|
if ( ( $tag !~ /^NNP/ || $self->lowercase_proper_names ) && $word ne 'I' ) { |
|
|
|
|
33
|
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
$word = lc $word; |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
1
|
|
|
|
|
57
|
my $entry = $self->exceptions->{$tag}{$word}; |
|
73
|
1
|
50
|
|
|
|
6
|
if ($entry) { |
|
74
|
0
|
|
|
|
|
0
|
return @$entry; |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
else { |
|
78
|
1
|
50
|
|
|
|
56
|
if ( $self->cut_negation ) { |
|
79
|
1
|
|
|
|
|
10
|
( $word, $negative_prefix ) = $self->_cut_negative_prefix( $word, $tag ); |
|
80
|
|
|
|
|
|
|
} |
|
81
|
1
|
|
|
|
|
7
|
return ( $self->_lemmatize_by_rules( $word, $tag ), $negative_prefix ); |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub _cut_negative_prefix { |
|
86
|
1
|
|
|
1
|
|
4
|
my ( $self, $word, $tag ) = @_; |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# We are interested only in adjectives, adverbs and nouns. |
|
89
|
|
|
|
|
|
|
# English verbs are negated usually by "not" (don't,...). |
|
90
|
|
|
|
|
|
|
# Proper nouns (NNP,NNPS) are also left unchanged (Disney, Intel, Irvin... Non-IBM). |
|
91
|
1
|
50
|
33
|
|
|
11
|
if ( $tag =~ /^(J.*|R.*|NN|NNS)$/ and $word =~ $self->negation ) { |
|
92
|
0
|
|
|
|
|
0
|
$word =~ s/^(un|in|im|non-?|dis-?|il|ir)//; |
|
93
|
0
|
|
|
|
|
0
|
return ( $word, 1 ); |
|
94
|
|
|
|
|
|
|
} |
|
95
|
1
|
|
|
|
|
4
|
return ( $word, 0 ); |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub _lemmatize_NNS_NNPS { |
|
99
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
|
100
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/men$/man/; #over 600 words (in BNC) |
|
101
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/shoes$/shoe/; |
|
102
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/wives$/wife/; |
|
103
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}us)es$/$1/; #buses bonuses |
|
104
|
|
|
|
|
|
|
|
|
105
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${V}se)s$/$1/; |
|
106
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${CXY}z)es$/$1/; |
|
107
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}ze)s$/$1/; |
|
108
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)es$/$1/; |
|
109
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}rse)s$/$1/; |
|
110
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/onses$/onse/; |
|
111
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S)es$/$1/; |
|
112
|
|
|
|
|
|
|
|
|
113
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.$C)ies$/$1y/; #ponies vs ties |
|
114
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}o)es$/$1/; |
|
115
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/s$//; |
|
116
|
0
|
|
|
|
|
0
|
return $word; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub _lemmatize_VBG { ## no critic (Subroutines::ProhibitExcessComplexity) this is complex |
|
120
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
|
121
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}z)ing$/$1/; |
|
122
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}z)ing$/$1e/; |
|
123
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)ing$/$1/; |
|
124
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}ll)ing$/$1/; |
|
125
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}($CXY)\2)ing$/$1/; #cancel-ling vs call-ing - exception is needed |
|
126
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($CXY)ing$/$1/; |
|
127
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C${V}ng)ing$/$1/; |
|
128
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/icking$/ick/; |
|
129
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}in)ing$/$1e/; |
|
130
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])ing$/$1/; |
|
131
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(qu$V${C})ing$/$1e/; |
|
132
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(u${V}d)ing$/$1e/; |
|
133
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}let)ing$/$1e/; |
|
134
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+[ei]t)ing$/$1e/; |
|
135
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([ei]t)ing$/$1/; |
|
136
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($PRE$CXY${CXY}eat)ing$/$1/; |
|
137
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$CXY${CXY}eat)ing$/$1e/; |
|
138
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[eo]at)ing$/$1/; #treating vs creating |
|
139
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}at)ing$/$1e/; |
|
140
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V[cgsv])ing$/$1e/; #announcing increasing |
|
141
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)ing$/$1/; |
|
142
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[rw]l)ing$/$1/; |
|
143
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.th)ing$/$1e/; |
|
144
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY[cglsv])ing$/$1e/; #involving |
|
145
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY$CXY)ing$/$1/; #reporting |
|
146
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/uing$/ue/; |
|
147
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($VY$VY)ing$/$1/; |
|
148
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ying$/y/; |
|
149
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}o)ing$/$1/; |
|
150
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+or)ing$/$1e/; |
|
151
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C[clt]or)ing$/$1e/; |
|
152
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([eo]r)ing$/$1/; #offering |
|
153
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ing$/e/; |
|
154
|
0
|
|
|
|
|
0
|
return $word; |
|
155
|
|
|
|
|
|
|
} |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub _lemmatize_VBD_VBN { ## no critic (Subroutines::ProhibitExcessComplexity) this is complex |
|
158
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
|
159
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/en$/e/; |
|
160
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}z)ed$/$1/; |
|
161
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}z)ed$/$1e/; |
|
162
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)ed$/$1/; |
|
163
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}ll)ed$/$1/; |
|
164
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}($CXY)\2)ed$/$1/; #cancel-led vs call-ed - wordlist is needed |
|
165
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($CXY)ed$/$1/; |
|
166
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C${V}ng)ed$/$1/; |
|
167
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/icked$/ick/; |
|
168
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}(in|[clnt]or))ed$/$1e/; |
|
169
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])ed$/$1/; |
|
170
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+or)ed$/$1e/; |
|
171
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([eo]r)ed$/$1/; |
|
172
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C})ied$/$1y/; |
|
173
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(qu$V${C})ed$/$1e/; |
|
174
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(u${V}d)ed$/$1e/; |
|
175
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}let)ed$/$1e/; |
|
176
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+[ei]t)ed$/$1e/; |
|
177
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([ei]t)ed$/$1/; |
|
178
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($PRE$CXY${CXY}eat)ed$/$1/; |
|
179
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$CXY${CXY}eat)ed$/$1e/; |
|
180
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[eo]at)ed$/$1/; #treated vs created |
|
181
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}at)ed$/$1e/; |
|
182
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V[cgsv])ed$/$1e/; #announced |
|
183
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)ed$/$1/; |
|
184
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[rw]l)ed$/$1/; |
|
185
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.th)ed$/$1e/; |
|
186
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ued$/ue/; |
|
187
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY[cglsv])ed$/$1e/; #involved |
|
188
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY$CXY)ed$/$1/; #reported |
|
189
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($VY$VY)ed$/$1/; |
|
190
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ed$/e/; |
|
191
|
0
|
|
|
|
|
0
|
return $word; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub _lemmatize_VBZ { |
|
195
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
|
196
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${V}se)s$/$1/; |
|
197
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${CXY}z)es$/$1/; |
|
198
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}ze)s$/$1/; |
|
199
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)es$/$1/; |
|
200
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}rse)s$/$1/; |
|
201
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/onses$/onse/; |
|
202
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S)es$/$1/; |
|
203
|
|
|
|
|
|
|
|
|
204
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.$C)ies$/$1y/; #tries, relies vs lies |
|
205
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}o)es$/$1/; #does, undergoes |
|
206
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.)s$/$1/; |
|
207
|
0
|
|
|
|
|
0
|
return $word; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
sub _lemmatize_JJR_RBR { |
|
211
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
|
212
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([^e]ll)er$/$1/; #smaller |
|
213
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C)\1er$/$1/; #bigger |
|
214
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ier$/y/; #earlier |
|
215
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)er$/$1/; #weaker |
|
216
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])er$/$1/; #lower |
|
217
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$C)er$/$1e/; #nicer wider |
|
218
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([bcdfghjklmpqrstvwxz][cglsv])er$/$1e/; #larger,stranger vs stronger, younger |
|
219
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([ue])er$/$1e/; #freer |
|
220
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/er$//; #harder |
|
221
|
0
|
|
|
|
|
0
|
return $word; |
|
222
|
|
|
|
|
|
|
} |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub _lemmatize_JJS_RBS { |
|
225
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
|
226
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([^e]ll)est$/$1/; #smallest |
|
227
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.)\1est$/$1/; #biggest |
|
228
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/iest$/y/; #earliest |
|
229
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)est$/$1/; #weakest |
|
230
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])est$/$1/; #lowest |
|
231
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$C)est$/$1e/; #nicest widest |
|
232
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([bcdfghjklmpqrstvwxz][cglsv])est$/$1e/; #largest vs strongest |
|
233
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.{3,})est$/$1/; #hardest |
|
234
|
0
|
|
|
|
|
0
|
return $word; |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub _lemmatize_by_rules { |
|
238
|
1
|
|
|
1
|
|
4
|
my ( $self, $word, $tag ) = @_; |
|
239
|
|
|
|
|
|
|
|
|
240
|
1
|
50
|
|
|
|
18
|
my $lemma = $tag =~ /NNP?S/ |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
? $self->_lemmatize_NNS_NNPS($word) |
|
242
|
|
|
|
|
|
|
: $tag =~ /^VBG/ ? $self->_lemmatize_VBG($word) |
|
243
|
|
|
|
|
|
|
: $tag =~ /VB[DN]/ ? $self->_lemmatize_VBD_VBN($word) |
|
244
|
|
|
|
|
|
|
: $tag eq 'VBZ' ? $self->_lemmatize_VBZ($word) |
|
245
|
|
|
|
|
|
|
: $tag =~ /JJR|RBR/ ? $self->_lemmatize_JJR_RBR($word) |
|
246
|
|
|
|
|
|
|
: $tag =~ /JJS|RBS/ ? $self->_lemmatize_JJS_RBS($word) |
|
247
|
|
|
|
|
|
|
: $word |
|
248
|
|
|
|
|
|
|
; |
|
249
|
1
|
50
|
|
|
|
5
|
return $word if $lemma eq ''; # Otherwise e.g. "est"->"" |
|
250
|
1
|
|
|
|
|
8
|
return $lemma; |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub _build_exceptions { |
|
254
|
1
|
|
|
1
|
|
28
|
my $self = shift; |
|
255
|
1
|
|
|
|
|
2
|
my %exceptions; |
|
256
|
1
|
|
|
|
|
58
|
log_debug( $self->exceptions_filename ); |
|
257
|
1
|
50
|
|
|
|
79
|
open my $ex_file, "<:encoding(utf-8)", $self->exceptions_filename or log_fatal($!); |
|
258
|
1
|
|
|
|
|
126
|
while (<$ex_file>) { |
|
259
|
5610
|
|
|
|
|
11535
|
chomp; |
|
260
|
5610
|
|
|
|
|
24033
|
my ( $word, $tag, $lemma, $negative_prefix ) = split /\t/; |
|
261
|
|
|
|
|
|
|
|
|
262
|
5610
|
|
33
|
|
|
15788
|
$negative_prefix = ( defined $negative_prefix and $negative_prefix eq '1' ); |
|
263
|
5610
|
|
|
|
|
35333
|
$exceptions{$tag}{$word} = [ $lemma, $negative_prefix ]; |
|
264
|
|
|
|
|
|
|
} |
|
265
|
1
|
|
|
|
|
45
|
close $ex_file; |
|
266
|
1
|
|
|
|
|
91
|
return \%exceptions; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub _build_negation { |
|
270
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
271
|
0
|
|
|
|
|
|
my $pattern = ''; |
|
272
|
0
|
|
|
|
|
|
my @lines = read_file( $self->negation_filename, binmode => ':encoding(utf-8)', err_mode => 'log_fatal' ); |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# or log_fatal('Cannot load lemmatization exceptions from ' . $self->negation_filename); |
|
275
|
0
|
|
|
|
|
|
chomp(@lines); |
|
276
|
0
|
|
|
|
|
|
$pattern = join '|', @lines; |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
#$pattern =~ s/-/\-/g; |
|
279
|
0
|
|
|
|
|
|
my $negation = qr/^($pattern)/; |
|
280
|
0
|
|
|
|
|
|
return $negation; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
1; |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
__END__ |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Cutting off negative prefixes is quite debatable. |
|
288
|
|
|
|
|
|
|
Even if we filter out cases when: |
|
289
|
|
|
|
|
|
|
a) a word starts with (un|in|im|dis|il|ir) but it is not a prefix (Intel, disaster,...) |
|
290
|
|
|
|
|
|
|
b) it is a prefix but not negative (indoor, impress,...) |
|
291
|
|
|
|
|
|
|
Still, there are other cases where it is etymologically a negative prefix, but... |
|
292
|
|
|
|
|
|
|
unease, uneasily, uneasiness,... are definitely not negations of ease, easily, easiness |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
indiscriminately ?? |
|
295
|
|
|
|
|
|
|
indiscriminate ?? |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=pod |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=head1 NAME |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
Treex::Tool::EnglishMorpho::Lemmatizer - rule based lemmatizer for English |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
=head1 VERSION |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
version 2.20151102 |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
use Treex::Tool::EnglishMorpho::Lemmatizer; |
|
310
|
|
|
|
|
|
|
my $lemmatizer = Treex::Tool::EnglishMorpho::Lemmatizer->new(); |
|
311
|
|
|
|
|
|
|
my ($word, $tag) = qw( goes VBZ ); |
|
312
|
|
|
|
|
|
|
my ($lemma, $neg) = $lemmatizer->lemmatize($word, $tag); |
|
313
|
|
|
|
|
|
|
# $lemma = 'go', $neg = 0 |
|
314
|
|
|
|
|
|
|
($lemma, $neg) = $lemmatizer->lemmatize('unhappy', 'JJ'); |
|
315
|
|
|
|
|
|
|
# $lemma = 'happy', $neg = 1 |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 METHODS |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=over 4 |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item lemmatize |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
Accepts a pair of word and tag. |
|
324
|
|
|
|
|
|
|
Produces a pair: the lemma and a flag indicating whether a negative prefix was cut off. |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=back |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Covers: |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=over |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=item * noun -s (dogs -> dog, ponies -> pony,..., mice -> mouse) |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=item * verb -s (does -> do,...) |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=item * verb -ing |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
=item * verb -ed, -en |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item * adjective/adverb -er |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=item * adjective/adverb -est |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=item * cut off negative prefixes (un|in|im|non|dis|il|ir) |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=back |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head2 Input requirements |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=over |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=item Tokenization |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
I<doesn't> should be tokenized as two words: I<does> and I<n't> |
|
357
|
|
|
|
|
|
|
(It will be lemmatized as I<do> and I<not>). |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=item Tagging |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
Correct tagging (Penn style) is quite crucial for Lemmatizer to work. |
|
362
|
|
|
|
|
|
|
For example it doesn't change words with tags NN and NNP |
|
363
|
|
|
|
|
|
|
(it changes only NNS and NNPS). So (I<pence>, NN) -> I<pence>, |
|
364
|
|
|
|
|
|
|
but (I<pence>, NNS) -> I<penny>. |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=back |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head2 Differences from the previous implementation |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
Module C<PEDT::MorphologyAnalysis> uses Morpha (written in Flex) |
|
371
|
|
|
|
|
|
|
and in some cases gives different lemmatization. |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=over |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=item Adverbs and adjectives. |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
Morpha leaves comparatives and superlatives unchanged. |
|
378
|
|
|
|
|
|
|
C<PEDT::MorphologyAnalysis> does only basic analysis (I<later> -> I<lat>). |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=item Capitalization of proper names |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=item Changes of NN |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=item Latin words |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
Declension of words of Latin origin is not covered by any Lemmatizer |
|
387
|
|
|
|
|
|
|
rules on purpose. |
|
388
|
|
|
|
|
|
|
There are a few widely known English words of Latin origin which are |
|
389
|
|
|
|
|
|
|
(or should be) covered by exception files (e.g. indices NNS -> index). |
|
390
|
|
|
|
|
|
|
In my opinion, it is better, especially for translation purposes, |
|
391
|
|
|
|
|
|
|
to leave the other Latin words unchanged. Mostly they will have the same |
|
392
|
|
|
|
|
|
|
form also in the target language (biological terms like Spheniscidae). |
|
393
|
|
|
|
|
|
|
BTW: Errors made by Morpha's Latin fallbacks are sometimes funny: |
|
394
|
|
|
|
|
|
|
sci-fi -> sci-fus, Mitsubishi -> mitsubishus, Shanghai -> shanghaus,... |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=back |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head1 TODO |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=over |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item * this POD documentation !!! |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=item * better list of exceptions |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=item * change exceptions format from tsv to stored perl hash |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=back |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head1 AUTHOR |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
Martin Popel <popel@ufal.mff.cuni.cz> |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Copyright © 2008 - 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |