line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Tool::EnglishMorpho::Lemmatizer; |
2
|
|
|
|
|
|
|
$Treex::Tool::EnglishMorpho::Lemmatizer::VERSION = '2.20151102'; |
3
|
1
|
|
|
1
|
|
834
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
40
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
5
|
1
|
|
|
1
|
|
853
|
use Moose; |
|
1
|
|
|
|
|
164054
|
|
|
1
|
|
|
|
|
9
|
|
6
|
1
|
|
|
1
|
|
8002
|
use Treex::Core::Common; |
|
1
|
|
|
|
|
570075
|
|
|
1
|
|
|
|
|
5
|
|
7
|
1
|
|
|
1
|
|
6200
|
use Treex::Core::Resource qw(require_file_from_share); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
60
|
|
8
|
1
|
|
|
1
|
|
5
|
use File::Slurp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
74
|
|
9
|
1
|
|
|
1
|
|
5
|
use utf8; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
5
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
has 'exceptions_filename' => ( |
12
|
|
|
|
|
|
|
is => 'ro', |
13
|
|
|
|
|
|
|
init_arg => undef, |
14
|
|
|
|
|
|
|
default => sub { |
15
|
|
|
|
|
|
|
return require_file_from_share('data/models/lemmatizer/en/exceptions2014_11_21.tsv'); |
16
|
|
|
|
|
|
|
}, |
17
|
|
|
|
|
|
|
); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
has 'negation_filename' => ( |
20
|
|
|
|
|
|
|
is => 'ro', |
21
|
|
|
|
|
|
|
init_arg => undef, |
22
|
|
|
|
|
|
|
default => sub { |
23
|
|
|
|
|
|
|
return require_file_from_share('data/models/lemmatizer/en/negation'); |
24
|
|
|
|
|
|
|
}, |
25
|
|
|
|
|
|
|
); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
has 'exceptions' => ( |
28
|
|
|
|
|
|
|
is => 'ro', |
29
|
|
|
|
|
|
|
builder => '_build_exceptions', |
30
|
|
|
|
|
|
|
init_arg => undef, |
31
|
|
|
|
|
|
|
lazy => 1, |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has 'negation' => ( |
35
|
|
|
|
|
|
|
is => 'ro', |
36
|
|
|
|
|
|
|
builder => '_build_negation', |
37
|
|
|
|
|
|
|
init_arg => undef, |
38
|
|
|
|
|
|
|
lazy => 1, |
39
|
|
|
|
|
|
|
); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
has 'cut_negation' => ( |
42
|
|
|
|
|
|
|
isa => 'Bool', |
43
|
|
|
|
|
|
|
default => 1, |
44
|
|
|
|
|
|
|
reader => 'cut_negation', |
45
|
|
|
|
|
|
|
); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
has 'lowercase_proper_names' => ( |
48
|
|
|
|
|
|
|
isa => 'Bool', |
49
|
|
|
|
|
|
|
default => 0, |
50
|
|
|
|
|
|
|
reader => 'lowercase_proper_names', |
51
|
|
|
|
|
|
|
); |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $V = qr/[aeiou]/; |
54
|
|
|
|
|
|
|
my $VY = qr/[aeiouy]/; |
55
|
|
|
|
|
|
|
my $C = qr/[bcdfghjklmnpqrstvwxyz]/; |
56
|
|
|
|
|
|
|
my $CXY = qr/[bcdfghjklmnpqrstvwxz]/; |
57
|
|
|
|
|
|
|
my $S = qr/([sxz]|[cs]h)/; |
58
|
|
|
|
|
|
|
my $S2 = qr/(ss|zz)/; |
59
|
|
|
|
|
|
|
my $PRE = qr/(be|ex|in|mis|pre|pro|re)/; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
#The most important sub: |
62
|
|
|
|
|
|
|
#Input: word form and POS tag (Penn style) |
63
|
|
|
|
|
|
|
#Output: lemma and was_negative_prefix |
64
|
|
|
|
|
|
|
sub lemmatize { |
65
|
1
|
|
|
1
|
1
|
225132
|
my ( $self, $word, $tag ) = @_; |
66
|
1
|
|
|
|
|
3
|
my $negative_prefix = 0; |
67
|
|
|
|
|
|
|
|
68
|
1
|
50
|
33
|
|
|
21
|
if ( ( $tag !~ /^NNP/ || $self->lowercase_proper_names ) && $word ne 'I' ) { |
|
|
|
33
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
$word = lc $word; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
1
|
|
|
|
|
57
|
my $entry = $self->exceptions->{$tag}{$word}; |
73
|
1
|
50
|
|
|
|
6
|
if ($entry) { |
74
|
0
|
|
|
|
|
0
|
return @$entry; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
else { |
78
|
1
|
50
|
|
|
|
56
|
if ( $self->cut_negation ) { |
79
|
1
|
|
|
|
|
10
|
( $word, $negative_prefix ) = $self->_cut_negative_prefix( $word, $tag ); |
80
|
|
|
|
|
|
|
} |
81
|
1
|
|
|
|
|
7
|
return ( $self->_lemmatize_by_rules( $word, $tag ), $negative_prefix ); |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub _cut_negative_prefix { |
86
|
1
|
|
|
1
|
|
4
|
my ( $self, $word, $tag ) = @_; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# We are interested only in adjectives, adverbs and nouns. |
89
|
|
|
|
|
|
|
# English verbs are negated usually by "not" (don't,...). |
90
|
|
|
|
|
|
|
# Proper nouns (NNP,NNPS) are also left unchanged (Disney, Intel, Irvin... Non-IBM). |
91
|
1
|
50
|
33
|
|
|
11
|
if ( $tag =~ /^(J.*|R.*|NN|NNS)$/ and $word =~ $self->negation ) { |
92
|
0
|
|
|
|
|
0
|
$word =~ s/^(un|in|im|non-?|dis-?|il|ir)//; |
93
|
0
|
|
|
|
|
0
|
return ( $word, 1 ); |
94
|
|
|
|
|
|
|
} |
95
|
1
|
|
|
|
|
4
|
return ( $word, 0 ); |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub _lemmatize_NNS_NNPS { |
99
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
100
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/men$/man/; #over 600 words (in BNC) |
101
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/shoes$/shoe/; |
102
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/wives$/wife/; |
103
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}us)es$/$1/; #buses bonuses |
104
|
|
|
|
|
|
|
|
105
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${V}se)s$/$1/; |
106
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${CXY}z)es$/$1/; |
107
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}ze)s$/$1/; |
108
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)es$/$1/; |
109
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}rse)s$/$1/; |
110
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/onses$/onse/; |
111
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S)es$/$1/; |
112
|
|
|
|
|
|
|
|
113
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.$C)ies$/$1y/; #ponies vs ties |
114
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}o)es$/$1/; |
115
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/s$//; |
116
|
0
|
|
|
|
|
0
|
return $word; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub _lemmatize_VBG { ## no critic (Subroutines::ProhibitExcessComplexity) this is complex |
120
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
121
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}z)ing$/$1/; |
122
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}z)ing$/$1e/; |
123
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)ing$/$1/; |
124
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}ll)ing$/$1/; |
125
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}($CXY)\2)ing$/$1/; #cancel-ling vs call-ing - exception is needed |
126
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($CXY)ing$/$1/; |
127
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C${V}ng)ing$/$1/; |
128
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/icking$/ick/; |
129
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}in)ing$/$1e/; |
130
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])ing$/$1/; |
131
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(qu$V${C})ing$/$1e/; |
132
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(u${V}d)ing$/$1e/; |
133
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}let)ing$/$1e/; |
134
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+[ei]t)ing$/$1e/; |
135
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([ei]t)ing$/$1/; |
136
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($PRE$CXY${CXY}eat)ing$/$1/; |
137
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$CXY${CXY}eat)ing$/$1e/; |
138
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[eo]at)ing$/$1/; #treating vs creating |
139
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}at)ing$/$1e/; |
140
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V[cgsv])ing$/$1e/; #announcing increasing |
141
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)ing$/$1/; |
142
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[rw]l)ing$/$1/; |
143
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.th)ing$/$1e/; |
144
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY[cglsv])ing$/$1e/; #involving |
145
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY$CXY)ing$/$1/; #reporting |
146
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/uing$/ue/; |
147
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($VY$VY)ing$/$1/; |
148
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ying$/y/; |
149
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}o)ing$/$1/; |
150
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+or)ing$/$1e/; |
151
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C[clt]or)ing$/$1e/; |
152
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([eo]r)ing$/$1/; #offering |
153
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ing$/e/; |
154
|
0
|
|
|
|
|
0
|
return $word; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub _lemmatize_VBD_VBN { ## no critic (Subroutines::ProhibitExcessComplexity) this is complex |
158
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
159
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/en$/e/; |
160
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}z)ed$/$1/; |
161
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}z)ed$/$1e/; |
162
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)ed$/$1/; |
163
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}ll)ed$/$1/; |
164
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C${V}($CXY)\2)ed$/$1/; #cancel-led vs call-ed - wordlist is needed |
165
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($CXY)ed$/$1/; |
166
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C${V}ng)ed$/$1/; |
167
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/icked$/ick/; |
168
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}(in|[clnt]or))ed$/$1e/; |
169
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])ed$/$1/; |
170
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+or)ed$/$1e/; |
171
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([eo]r)ed$/$1/; |
172
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C})ied$/$1y/; |
173
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(qu$V${C})ed$/$1e/; |
174
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(u${V}d)ed$/$1e/; |
175
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${C}let)ed$/$1e/; |
176
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/^($PRE*$C+[ei]t)ed$/$1e/; |
177
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([ei]t)ed$/$1/; |
178
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($PRE$CXY${CXY}eat)ed$/$1/; |
179
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$CXY${CXY}eat)ed$/$1e/; |
180
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[eo]at)ed$/$1/; #treated vs created |
181
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}at)ed$/$1e/; |
182
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V[cgsv])ed$/$1e/; #announced |
183
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)ed$/$1/; |
184
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.[rw]l)ed$/$1/; |
185
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.th)ed$/$1e/; |
186
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ued$/ue/; |
187
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY[cglsv])ed$/$1e/; #involved |
188
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($CXY$CXY)ed$/$1/; #reported |
189
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($VY$VY)ed$/$1/; |
190
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ed$/e/; |
191
|
0
|
|
|
|
|
0
|
return $word; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub _lemmatize_VBZ { |
195
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
196
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${V}se)s$/$1/; |
197
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${CXY}z)es$/$1/; |
198
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${VY}ze)s$/$1/; |
199
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S2)es$/$1/; |
200
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.${V}rse)s$/$1/; |
201
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/onses$/onse/; |
202
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($S)es$/$1/; |
203
|
|
|
|
|
|
|
|
204
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.$C)ies$/$1y/; #tries, relies vs lies |
205
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(${CXY}o)es$/$1/; #does, undergoes |
206
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.)s$/$1/; |
207
|
0
|
|
|
|
|
0
|
return $word; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
sub _lemmatize_JJR_RBR { |
211
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
212
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([^e]ll)er$/$1/; #smaller |
213
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C)\1er$/$1/; #bigger |
214
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/ier$/y/; #earlier |
215
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)er$/$1/; #weaker |
216
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])er$/$1/; #lower |
217
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$C)er$/$1e/; #nicer wider |
218
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([bcdfghjklmpqrstvwxz][cglsv])er$/$1e/; #larger,stranger vs stronger, younger |
219
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([ue])er$/$1e/; #freer |
220
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/er$//; #harder |
221
|
0
|
|
|
|
|
0
|
return $word; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub _lemmatize_JJS_RBS { |
225
|
0
|
|
|
0
|
|
0
|
my ( $self, $word ) = @_; |
226
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([^e]ll)est$/$1/; #smallest |
227
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.)\1est$/$1/; #biggest |
228
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/iest$/y/; #earliest |
229
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$V$C)est$/$1/; #weakest |
230
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($C$V[npwx])est$/$1/; #lowest |
231
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/($V$C)est$/$1e/; #nicest widest |
232
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/([bcdfghjklmpqrstvwxz][cglsv])est$/$1e/; #largest vs strongest |
233
|
0
|
0
|
|
|
|
0
|
return $word if $word =~ s/(.{3,})est$/$1/; #hardest |
234
|
0
|
|
|
|
|
0
|
return $word; |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub _lemmatize_by_rules { |
238
|
1
|
|
|
1
|
|
4
|
my ( $self, $word, $tag ) = @_; |
239
|
|
|
|
|
|
|
|
240
|
1
|
50
|
|
|
|
18
|
my $lemma = $tag =~ /NNP?S/ |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
241
|
|
|
|
|
|
|
? $self->_lemmatize_NNS_NNPS($word) |
242
|
|
|
|
|
|
|
: $tag =~ /^VBG/ ? $self->_lemmatize_VBG($word) |
243
|
|
|
|
|
|
|
: $tag =~ /VB[DN]/ ? $self->_lemmatize_VBD_VBN($word) |
244
|
|
|
|
|
|
|
: $tag eq 'VBZ' ? $self->_lemmatize_VBZ($word) |
245
|
|
|
|
|
|
|
: $tag =~ /JJR|RBR/ ? $self->_lemmatize_JJR_RBR($word) |
246
|
|
|
|
|
|
|
: $tag =~ /JJS|RBS/ ? $self->_lemmatize_JJS_RBS($word) |
247
|
|
|
|
|
|
|
: $word |
248
|
|
|
|
|
|
|
; |
249
|
1
|
50
|
|
|
|
5
|
return $word if $lemma eq ''; # Otherwise e.g. "est"->"" |
250
|
1
|
|
|
|
|
8
|
return $lemma; |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub _build_exceptions { |
254
|
1
|
|
|
1
|
|
28
|
my $self = shift; |
255
|
1
|
|
|
|
|
2
|
my %exceptions; |
256
|
1
|
|
|
|
|
58
|
log_debug( $self->exceptions_filename ); |
257
|
1
|
50
|
|
|
|
79
|
open my $ex_file, "<:encoding(utf-8)", $self->exceptions_filename or log_fatal($!); |
258
|
1
|
|
|
|
|
126
|
while (<$ex_file>) { |
259
|
5610
|
|
|
|
|
11535
|
chomp; |
260
|
5610
|
|
|
|
|
24033
|
my ( $word, $tag, $lemma, $negative_prefix ) = split /\t/; |
261
|
|
|
|
|
|
|
|
262
|
5610
|
|
33
|
|
|
15788
|
$negative_prefix = ( defined $negative_prefix and $negative_prefix eq '1' ); |
263
|
5610
|
|
|
|
|
35333
|
$exceptions{$tag}{$word} = [ $lemma, $negative_prefix ]; |
264
|
|
|
|
|
|
|
} |
265
|
1
|
|
|
|
|
45
|
close $ex_file; |
266
|
1
|
|
|
|
|
91
|
return \%exceptions; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub _build_negation { |
270
|
0
|
|
|
0
|
|
|
my $self = shift; |
271
|
0
|
|
|
|
|
|
my $pattern = ''; |
272
|
0
|
|
|
|
|
|
my @lines = read_file( $self->negation_filename, binmode => ':encoding(utf-8)', err_mode => 'log_fatal' ); |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# or log_fatal('Cannot load lemmatization exceptions from ' . $self->negation_filename); |
275
|
0
|
|
|
|
|
|
chomp(@lines); |
276
|
0
|
|
|
|
|
|
$pattern = join '|', @lines; |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
#$pattern =~ s/-/\-/g; |
279
|
0
|
|
|
|
|
|
my $negation = qr/^($pattern)/; |
280
|
0
|
|
|
|
|
|
return $negation; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
1; |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
__END__ |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Cutting off negative prefixes is quite debatable. |
288
|
|
|
|
|
|
|
Even if we filter out cases when: |
289
|
|
|
|
|
|
|
a) a word starts with (un|in|im|dis|il|ir) but it is not a prefix (Intel, disaster,...) |
290
|
|
|
|
|
|
|
b) it is a prefix but not negative (indoor, impress,...) |
291
|
|
|
|
|
|
|
Still, there are other cases where it is etymologically a negative prefix, but... |
292
|
|
|
|
|
|
|
unease, uneasily, uneasiness,... are definitely not negations of ease, easily, easiness |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
indiscriminately ?? |
295
|
|
|
|
|
|
|
indiscriminate ?? |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=pod |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=head1 NAME |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
Treex::Tool::EnglishMorpho::Lemmatizer - rule based lemmatizer for English |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
=head1 VERSION |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
version 2.20151102 |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head1 SYNOPSIS |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
use Treex::Tool::EnglishMorpho::Lemmatizer; |
310
|
|
|
|
|
|
|
my $lemmatizer = Treex::Tool::EnglishMorpho::Lemmatizer->new(); |
311
|
|
|
|
|
|
|
my ($word, $tag) = qw( goes VBZ ); |
312
|
|
|
|
|
|
|
my ($lemma, $neg) = $lemmatizer->lemmatize($word, $tag); |
313
|
|
|
|
|
|
|
# $lemma = 'go', $neg = 0 |
314
|
|
|
|
|
|
|
($lemma, $neg) = $lemmatizer->lemmatize('unhappy', 'JJ'); |
315
|
|
|
|
|
|
|
# $lemma = 'happy', $neg = 1 |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 METHODS |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=over 4 |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item lemmatize |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
Accepts a word form and its POS tag (Penn style). |
324
|
|
|
|
|
|
|
Returns a pair: the lemma and a flag indicating whether the word had a negative prefix. |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=back |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=head1 DESCRIPTION |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Covers: |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=over |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=item * noun -s (dogs -> dog, ponies -> pony,..., mice -> mouse) |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=item * verb -s (does -> do,...) |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=item * verb -ing |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
=item * verb -ed, -en |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item * adjective/adverb -er |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=item * adjective/adverb -est |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=item * cut off negative prefixes (un|in|im|non|dis|il|ir) |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=back |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head2 Input requirements |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=over |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=item Tokenization |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
I<doesn't> should be tokenized as two words: I<does> and I<n't> |
357
|
|
|
|
|
|
|
(It will be lemmatized as I<do> and I<not>). |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=item Tagging |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
Correct tagging (Penn style) is quite crucial for Lemmatizer to work. |
362
|
|
|
|
|
|
|
For example it doesn't change words with tags NN and NNP |
363
|
|
|
|
|
|
|
(it changes only NNS and NNPS). So (I<pence>, NN) -> I<pence>, |
364
|
|
|
|
|
|
|
but (I<pence>, NNS) -> I<penny>. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=back |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head2 Differences from the previous implementation |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
The module C<PEDT::MorphologyAnalysis> uses Morpha (written in Flex) |
371
|
|
|
|
|
|
|
and in some cases gives different lemmatization. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=over |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=item Adverbs and adjectives. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
Morpha leaves comparatives and superlatives unchanged. |
378
|
|
|
|
|
|
|
C<PEDT::MorphologyAnalysis> does only basic analysis (I<later> -> I<lat>). |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=item Capitalization of proper names |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=item Changes of NN |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=item Latin words |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
Declension of words of Latin origin is not covered by any Lemmatizer |
387
|
|
|
|
|
|
|
rules on purpose. |
388
|
|
|
|
|
|
|
There are a few widely known English words of Latin origin which are |
389
|
|
|
|
|
|
|
(or should be) covered by exception files (e.g. indices NNS -> index). |
390
|
|
|
|
|
|
|
In my opinion, it is better, especially for translation purposes, |
391
|
|
|
|
|
|
|
to leave the other Latin words unchanged. Mostly they will have the same |
392
|
|
|
|
|
|
|
form also in the target language (biological terms like Spheniscidae). |
393
|
|
|
|
|
|
|
BTW: Errors made by Morpha's Latin fallbacks are sometimes funny: |
394
|
|
|
|
|
|
|
sci-fi -> sci-fus, Mitsubishi -> mitsubishus, Shanghai -> shanghaus,... |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=back |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head1 TODO |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=over |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item * this POD documentation !!! |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=item * better list of exceptions |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=item * change exceptions format from tsv to stored perl hash |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=back |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head1 AUTHOR |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
Martin Popel <popel@ufal.mff.cuni.cz> |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 COPYRIGHT |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Copyright © 2008 - 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |