| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Treex::Tool::Parser::MSTperl::FeaturesControl; |
|
2
|
|
|
|
|
|
|
{ |
|
3
|
|
|
|
|
|
|
$Treex::Tool::Parser::MSTperl::FeaturesControl::VERSION = '0.11949'; |
|
4
|
|
|
|
|
|
|
} |
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
2433
|
use Moose; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use autodie; |
|
8
|
|
|
|
|
|
|
use Carp; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use Treex::Tool::Parser::MSTperl::ModelAdditional; |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# TODO dynamic features |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Parser configuration (used here for field name lookup and distance
# buckets). Stored as a weak reference, so this object does not keep the
# configuration alive on its own.
has 'config' => (
    is       => 'ro',
    isa      => 'Treex::Tool::Parser::MSTperl::Config',
    required => '1',
    weak_ref => '1',
);

# FEATURES

# number of registered features (set in BUILD)
has 'feature_count' => (
    isa => 'Int',
    is  => 'rw',
);

# feature codes as given by the configuration; BUILD registers each of them
has 'feature_codes_from_config' => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { return [] },
);

# registered feature codes, position in the array = feature index
has 'feature_codes' => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { return [] },
);

# feature code => 1 for every registered feature
has 'feature_codes_hash' => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# feature code => feature index
has 'feature_indexes' => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# for each feature (by feature index) holds a reference to an array
# which contains the indexes of all its simple features
has 'feature_simple_features_indexes' => (
    isa     => 'ArrayRef[ArrayRef[Int]]',
    is      => 'rw',
    default => sub { return [] },
);

# features that contain at least one array simple feature
# (feature index => 1)
has 'array_features' => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);

# features that contain at least one dynamic simple feature
# (feature index => 1)
has 'dynamic_features' => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);

# SIMPLE FEATURES

# number of registered simple features (set in BUILD)
has 'simple_feature_count' => (
    isa => 'Int',
    is  => 'rw',
);

# registered simple feature codes,
# position in the array = simple feature index
has 'simple_feature_codes' => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { return [] },
);

# simple feature code => 1 for every registered simple feature
has 'simple_feature_codes_hash' => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# simple feature code => simple feature index
has 'simple_feature_indexes' => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# code references computing the simple features (by simple feature index)
has 'simple_feature_subs' => (
    isa     => 'ArrayRef',
    is      => 'rw',
    default => sub { return [] },
);

# arguments passed to the corresponding entry of simple_feature_subs
# (by simple feature index)
has 'simple_feature_sub_arguments' => (
    isa     => 'ArrayRef',
    is      => 'rw',
    default => sub { return [] },
);

# simple features that return an array of values
# (simple feature index => 1)
has 'array_simple_features' => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);

# simple features that must always be recomputed
# because their value cannot always be computed from the input data
# (for the labeller - parent's label, brother's label etc.);
# simple feature index => 1
has 'dynamic_simple_features' => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);

# # simple features that get more than 1 argument as input
# has 'multiarg_simple_features' => (
#     is => 'rw',
#     isa => 'HashRef[Int]',
#     default => sub { {} },
# );

# CACHING

# whether get_all_features should cache its results
has 'use_edge_features_cache' => (
    isa     => 'Bool',
    is      => 'ro',
    default => '0',
);

# the cache is turned off by default to fit into RAM;
# turn it on when training with a lot of RAM or on small training data;
# it should stay off when parsing (it does not make any sense for parsing)

# cached results of get_all_features (array refs of feature strings),
# looked up by a key derived from the edge signature
has 'edge_features_cache' => (
    isa     => 'HashRef[ArrayRef[Str]]',
    is      => 'rw',
    default => sub { return {} },
);

# optional additional model, undefined unless set by the caller
has 'pmi_model' => (
    isa     => 'Maybe[Treex::Tool::Parser::MSTperl::ModelAdditional]',
    is      => 'rw',
    default => undef,
);

# optional additional model, undefined unless set by the caller
has 'cprob_model' => (
    isa     => 'Maybe[Treex::Tool::Parser::MSTperl::ModelAdditional]',
    is      => 'rw',
    default => undef,
);
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# Moose construction hook: registers every feature code listed in
# feature_codes_from_config and then stores the resulting numbers of
# features and simple features in feature_count / simple_feature_count.
sub BUILD {
    my ($self) = @_;

    # ignore some settings if in parsing-only mode
    # if ( !$self->training ) {
    #     $self->use_edge_features_cache(0);
    # }

    # features
    my $configured_codes = $self->feature_codes_from_config;
    foreach my $configured_code ( @{$configured_codes} ) {
        $self->set_feature($configured_code);
    }

    # the counts are only known once all the codes have been registered
    my $registered_features = scalar @{ $self->feature_codes };
    $self->feature_count($registered_features);
    my $registered_simple_features = scalar @{ $self->simple_feature_codes };
    $self->simple_feature_count($registered_simple_features);

    return;
}
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Registers one feature given by its code, i.e. by the codes of its simple
# features joined with '|'.
# A feature code that is already registered is skipped with a warning, and so
# is a simple feature repeated within one feature code. Simple features that
# are not known yet get registered through set_simple_feature.
# The feature is marked as an array (dynamic) feature if at least one of its
# simple features is an array (dynamic) simple feature.
# Returns nothing.
sub set_feature {
    my ( $self, $feature_code ) = @_;

    if ( $self->feature_codes_hash->{$feature_code} ) {
        warn "Feature '$feature_code' is defined more than once; " .
            "disregarding its later definitions.\n";
        return;
    }

    # collect the simple features the feature consists of
    my ( $has_array_part, $has_dynamic_part ) = ( 0, 0 );
    my @part_indexes;
    my %part_seen;
    foreach my $part_code ( split( /\|/, $feature_code ) ) {

        # checks
        if ( $part_seen{$part_code} ) {
            warn "Simple feature '$part_code' " .
                "is used more than once in '$feature_code'; " .
                "disregarding its later uses.\n";
            next;
        }

        # a simple feature that has not been used at all yet
        # has to be registered first
        $self->set_simple_feature($part_code)
            if !$self->simple_feature_codes_hash->{$part_code};

        # save
        my $part_index = $self->simple_feature_indexes->{$part_code};
        $part_seen{$part_code} = 1;
        $has_array_part   = 1 if $self->array_simple_features->{$part_index};
        $has_dynamic_part = 1 if $self->dynamic_simple_features->{$part_index};
        push @part_indexes, $part_index;
    }

    # save; the new feature takes the first free index
    my $feature_index = scalar( @{ $self->feature_codes } );
    $self->feature_codes_hash->{$feature_code} = 1;
    $self->feature_indexes->{$feature_code}    = $feature_index;
    push @{ $self->feature_codes },                   $feature_code;
    push @{ $self->feature_simple_features_indexes }, [@part_indexes];
    $self->array_features->{$feature_index}   = 1 if $has_array_part;
    $self->dynamic_features->{$feature_index} = 1 if $has_dynamic_part;

    return;
}
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Registers one simple feature given by its code. Recognized formats:
#   field          - field of the child node (all lowercase)
#   FIELD          - field of the parent node (all UPPERCASE)
#   X.field        - field of another node, X being one of 1, 2, g, G, l, r
#   function(...)  - function feature with zero, one or more
#                    comma-separated arguments
# Dies (or croaks) if the code has none of these formats or names an unknown
# function.
# Stores the code, its index, the code reference computing the feature and
# the arguments for that code reference (field names translated by the config
# method field_name2index). Returns nothing.
sub set_simple_feature {
    my ( $self, $simple_feature_code ) = @_;

    # the new simple feature takes the first free index
    my $simple_feature_index = scalar @{ $self->simple_feature_codes };

    # code reference computing the feature and the field name(s) it takes
    my ( $feature_sub, $feature_field );

    if ( $simple_feature_code =~ /^([a-zA-Z0-9_]+)$/ ) {

        # simple parent/child feature
        if ( $simple_feature_code =~ /^([a-z0-9_]+)$/ ) {

            # child feature
            ( $feature_sub, $feature_field ) = ( \&feature_child, $1 );
        }
        elsif ( $simple_feature_code =~ /^([A-Z0-9_]+)$/ ) {

            # parent feature
            ( $feature_sub, $feature_field ) = ( \&feature_parent, lc($1) );
        }
        else {
            die "Incorrect simple feature format '$simple_feature_code'. " .
                "Use lowercase (" . lc($simple_feature_code) .
                ") for child node and UPPERCASE (" . uc($simple_feature_code) .
                ") for parent node.\n";
        }
    }
    elsif ( $simple_feature_code =~ /^([12gGlr])\.([a-z0-9_]+)$/ ) {

        # first/second/(left sibling)/(right sibling)/Grandparent/grandchildren
        # node feature
        my ( $node_prefix, $field_name ) = ( $1, $2 );
        my %sub_for_prefix = (
            '1' => \&feature_first,            # first node
            '2' => \&feature_second,           # second node
            'g' => \&feature_grandchildren,    # grandchildren nodes
            'G' => \&feature_grandparent,      # grandparent node
            'l' => \&feature_left_sibling,     # left sibling edge child
            'r' => \&feature_right_sibling,    # right sibling edge child
        );

        $feature_field = $field_name;
        $feature_sub   = $sub_for_prefix{$node_prefix}
            or croak "Assertion failed!";
    }
    elsif (
        $simple_feature_code
        =~ /^([12gGlr\.a-z]+|[A-Z]+)\([-a-z0-9_,]*\)$/
        )
    {

        # function feature
        my $function_name = $1;
        $feature_sub =
            $self->get_simple_feature_sub_reference($function_name);

        # array function
        my $returns_array = $function_name eq 'between'
            || $function_name eq 'foreach'
            || substr( $function_name, 0, 2 ) eq 'g.';
        $self->array_simple_features->{$simple_feature_index} = 1
            if $returns_array;

        # dynamic feature
        my $is_dynamic = grep { $function_name eq $_ }
            'LABEL', 'l.label', 'prevlabel', 'G.label', 'g.label';
        $self->dynamic_simple_features->{$simple_feature_index} = 1
            if $is_dynamic;

        # the argument list determines the field(s)
        if ( $simple_feature_code =~ /$function_name\(\)$/ ) {

            # no-arg function feature
            $feature_field = [];
        }
        elsif ( $simple_feature_code =~ /$function_name\(([-a-z0-9_]+)\)$/ ) {

            # one-arg function feature
            $feature_field = $1;
        }
        elsif (
            $simple_feature_code
            =~ /$function_name\(([-a-z0-9_,]+)\)$/
            )
        {

            # multiarg function feature
            $feature_field = [ split /,/, $1 ];
        }
        else {
            die "Incorrect simple function feature format " .
                "'$simple_feature_code'.\n";
        }
    }
    else {
        die "Incorrect simple feature format '$simple_feature_code'.\n";
    }

    # According to the original author's note, field_name2index also accepts
    # a reference to an array of field names (returning an array of field
    # indexes) and keeps an integer argument unchanged.
    my $sub_arguments = $self->config->field_name2index($feature_field);

    # save
    $self->simple_feature_codes_hash->{$simple_feature_code} = 1;
    $self->simple_feature_indexes->{$simple_feature_code} =
        $simple_feature_index;
    push @{ $self->simple_feature_codes },         $simple_feature_code;
    push @{ $self->simple_feature_subs },          $feature_sub;
    push @{ $self->simple_feature_sub_arguments }, $sub_arguments;

    return;
}
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# FEATURES COMPUTATION |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# Returns (a reference to) an array of all features of the edge,
# in the form of "feature_index:values_string" strings,
# where feature_index is the index of the feature
# (index in feature_codes, translatable via feature_indexes)
# and values_string are the values of the corresponding simple features,
# joined together by '|'
# (if any of the simple features does not return a value, the whole feature
# is not present).
# An array feature contributes one string per value it returns.
# TODO maybe not returning a value is still a valuable information -> include?
#
# Parameters:
#   $edge                  - the edge to compute the features for
#   $only_dynamic_features - 0 (or undef): all features,
#                            1: only dynamic features,
#                            -1: only non-dynamic features
#
# If use_edge_features_cache is on, results are stored in edge_features_cache.
# The cache key is the edge signature; for a non-zero $only_dynamic_features
# the mode is appended to it, so that a result computed for one mode is never
# returned for another one (with the bare signature as the key, a call asking
# for only the non-dynamic features would poison later "all features" calls).
# NOTE(review): whether the edge signature changes when the values of dynamic
# features change is not visible here - confirm before caching dynamic
# features.
sub get_all_features {
    my ( $self, $edge, $only_dynamic_features ) = @_;

    # try to get features from cache
    my $cache_key;
    if ( $self->use_edge_features_cache ) {
        $cache_key = $edge->signature();
        if ($only_dynamic_features) {
            $cache_key .= "\tonly_dynamic=$only_dynamic_features";
        }

        my $cached_features = $self->edge_features_cache->{$cache_key};
        if ($cached_features) {
            return $cached_features;
        }
    }

    # cache not used or edge features not found in cache
    my $simple_feature_values = $self->get_simple_feature_values_array($edge);
    my @features;
    my $features_count = $self->feature_count;
    FEATURE:
    foreach my $feature_index ( 0 .. $features_count - 1 ) {

        # filter the features by the requested mode
        if ($only_dynamic_features) {
            my $is_dynamic = $self->dynamic_features->{$feature_index};
            next FEATURE if $only_dynamic_features == 1  && !$is_dynamic;
            next FEATURE if $only_dynamic_features == -1 && $is_dynamic;
        }

        my $feature_value =
            $self->get_feature_value( $feature_index, $simple_feature_values );
        if ( $self->array_features->{$feature_index} ) {

            # an array feature, the returned value is an array reference
            foreach my $value ( @{$feature_value} ) {
                push @features, "$feature_index:$value";
            }
        }
        elsif ( $feature_value ne '' ) {

            # not an array feature, the returned value is a string
            # ('' means that the feature has no value)
            push @features, "$feature_index:$feature_value";
        }
    }

    # save result in cache
    if ( $self->use_edge_features_cache ) {
        $self->edge_features_cache->{$cache_key} = \@features;
    }

    return \@features;
}
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# Returns the value of the feature with the given index, computed from the
# already known values of the simple features ($simple_feature_values,
# indexed by simple feature index):
# - for an ordinary feature, the simple feature values joined by '|',
#   or '' if any of them is undefined or empty;
# - for an array feature, whatever get_array_feature_value returns
#   (a reference to an array of such joined strings), or a reference to
#   an empty array if it returns a false value.
sub get_feature_value {
    my ( $self, $feature_index, $simple_feature_values ) = @_;

    my $part_indexes =
        $self->feature_simple_features_indexes->[$feature_index];

    if ( $self->array_features->{$feature_index} ) {
        my $combinations = $self->get_array_feature_value(
            $part_indexes, $simple_feature_values, 0
        );
        return $combinations ? $combinations : [];
    }

    my @parts;
    foreach my $part_index ( @{$part_indexes} ) {
        my $part = $simple_feature_values->[$part_index];

        # one missing simple feature value makes the whole feature empty
        return '' if !defined $part || $part eq '';
        push @parts, $part;
    }

    return join '|', @parts;
}
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# For features containing simple features that return an array of values.
# Recursively combines the values of the simple features listed in
# $simple_features_indexes, starting at position $start_from: every value of
# the simple feature at that position is joined by '|' with every combination
# produced for the positions that follow.
# The value of a simple feature that is not an array simple feature is treated
# as a one-element array.
# Returns a reference to the array of combinations; for the last position the
# value itself is returned (an array reference, or whatever the array simple
# feature returned).
sub get_array_feature_value {
    my (
        $self,
        $simple_features_indexes,
        $simple_feature_values,
        $start_from
    ) = @_;

    # get the value at this position (position = $start_from)
    my $index_here = $simple_features_indexes->[$start_from];
    my $heads      = $simple_feature_values->[$index_here];
    if ( !$self->array_simple_features->{$index_here} ) {

        # a plain value: wrap it to get an array reference
        $heads = [$heads];
    }

    # bottom of the recursion: nothing to append to the last simple feature
    my $last_position = scalar( @{$simple_features_indexes} ) - 1;
    if ( !( $start_from < $last_position ) ) {
        return $heads;
    }

    # not the last simple feature => have to recurse
    my $tails = $self->get_array_feature_value(
        $simple_features_indexes,
        $simple_feature_values, $start_from + 1
    );

    # foreach loops are used on purpose: unlike map, foreach silently
    # treats an undefined $heads or $tails as an empty array
    my @combinations;
    foreach my $head ( @{$heads} ) {
        foreach my $tail ( @{$tails} ) {
            push @combinations, "$head|$tail";
        }
    }

    return \@combinations;
}
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
# SIMPLE FEATURES |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# Computes the values of all registered simple features for the given edge.
# Returns a reference to an array of the values, indexed by simple feature
# index (simple_feature_count entries).
sub get_simple_feature_values_array {
    my ( $self, $edge ) = @_;

    my @values;
    my $last_index = $self->simple_feature_count - 1;
    foreach my $index ( 0 .. $last_index ) {
        my $compute = $self->simple_feature_subs->[$index];

        # If the simple feature has one parameter,
        # then $arguments is the one argument;
        # if the simple feature has more than one parameter,
        # then $arguments is a reference to an array of arguments.
        my $arguments = $self->simple_feature_sub_arguments->[$index];

        # called in scalar context so that each simple feature
        # takes exactly one slot of the result
        my $value = $compute->( $self, $edge, $arguments );
        push @values, $value;
    }

    return \@values;
}
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# function name (as used in the "function(arguments)" simple feature codes)
# => reference to the sub computing the feature
my %simple_feature_sub_references = (
    'LABEL'             => \&feature_parent_label,
    'prevlabel'         => \&feature_previous_label,
    'l.label'           => \&feature_previous_label,
    'G.label'           => \&feature_grandparent_label,
    'g.label'           => \&feature_grandchildren_label,
    'distance'          => \&feature_distance,
    'G.distance'        => \&feature_grandparent_distance,
    'attdir'            => \&feature_attachement_direction,
    'G.attdir'          => \&feature_grandparent_attachement_direction,    # grandparent to child
    'preceding'         => \&feature_preceding_child,
    'PRECEDING'         => \&feature_preceding_parent,
    '1.preceding'       => \&feature_preceding_first,
    '2.preceding'       => \&feature_preceding_second,
    'following'         => \&feature_following_child,
    'FOLLOWING'         => \&feature_following_parent,
    '1.following'       => \&feature_following_first,
    '2.following'       => \&feature_following_second,
    'between'           => \&feature_between,
    'foreach'           => \&feature_foreach,
    'equals'            => \&feature_equals,
    'equalspc'          => \&feature_equals_pc,
    'equalspcat'        => \&feature_equals_pc_at,
    'arrayat'           => \&feature_array_at_child,
    'ARRAYAT'           => \&feature_array_at_parent,
    'arrayatcp'         => \&feature_array_at_cp,
    'isfirst'           => \&feature_child_is_first_in_sentence,
    'ISFIRST'           => \&feature_parent_is_first_in_sentence,
    'islast'            => \&feature_child_is_last_in_sentence,
    'ISLAST'            => \&feature_parent_is_last_in_sentence,
    'isfirstchild'      => \&feature_child_is_first_child,
    'islastchild'       => \&feature_child_is_last_child,
    'islastleftchild'   => \&feature_child_is_last_left_child,
    'isfirstrightchild' => \&feature_child_is_first_right_child,
    'childno'           => \&feature_number_of_childs_children,
    'CHILDNO'           => \&feature_number_of_parents_children,
    'substr'            => \&feature_substr_child,
    'SUBSTR'            => \&feature_substr_parent,
    'pmi'               => \&feature_pmi,
    'pmibucketed'       => \&feature_pmi_bucketed,
    'pmirounded'        => \&feature_pmi_rounded,
    'pmid'              => \&feature_pmi_d,
    'cprob'             => \&feature_cprob,
    'cprobbucketed'     => \&feature_cprob_bucketed,
    'cprobrounded'      => \&feature_cprob_rounded,

    # obsolete
    # 'pmitworounded' => \&{feature_pmi_2_rounded},
    # 'pmithreerounded' => \&{feature_pmi_3_rounded},
    # 'cprobtworounded' => \&{feature_cprob_2_rounded},
    # 'cprobthreerounded' => \&{feature_cprob_3_rounded},
);

# Returns the reference to the sub computing the function feature of the
# given name; croaks if there is no function feature of that name.
sub get_simple_feature_sub_reference {
    my ( $self, $simple_feature_function ) = @_;

    my $sub_reference =
        $simple_feature_sub_references{$simple_feature_function};
    croak "Unknown feature function '$simple_feature_function'!"
        if !$sub_reference;

    return $sub_reference;
}
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
# Returns the parent of the parent node of the edge;
# that is undef if there is no grandparent, i.e. the parent is the root.
sub get_grandparent {
    my ( $self, $edge ) = @_;

    my $parent = $edge->parent;

    return $parent->parent;
}
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
# Simple feature 'distance': the (bucketed) distance between the parent and
# the child of the edge, as computed by feature_distance_generic.
sub feature_distance {
    my $self = shift;
    my $edge = shift;

    my @endpoints = ( $edge->parent, $edge->child );

    return $self->feature_distance_generic(@endpoints);
}
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# Simple feature 'G.distance': the (bucketed) distance between the
# grandparent (the parent of the edge's parent) and the child of the edge,
# as computed by feature_distance_generic.
# Returns '#novalue#' if there is no grandparent.
sub feature_grandparent_distance {
    my ( $self, $edge ) = @_;

    my $grandparent = $self->get_grandparent($edge);
    if ( !defined $grandparent ) {
        return '#novalue#';
    }

    # Measure from the grandparent; measuring from $edge->parent (as this sub
    # used to do) only duplicated the plain 'distance' feature.
    # NOTE(review): this changes the values of G.distance, so models trained
    # with the old values have to be retrained.
    return $self->feature_distance_generic( $grandparent, $edge->child );
}
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# Returns the bucketed distance between two nodes.
# The distance is the ord of $node1 minus the ord of $node2; it is translated
# by the distance2bucket table of the config. A distance missing in the table
# is represented by the config's minBucket if it is less than or equal to
# minBucket, and by maxBucket otherwise.
sub feature_distance_generic {
    my ( $self, $node1, $node2 ) = @_;

    my $distance = $node1->ord - $node2->ord;
    my $config   = $self->config;

    my $bucket = $config->distance2bucket->{$distance};
    return $bucket if defined $bucket;

    # out of the range of the table: use one of the outermost buckets
    return $distance <= $config->minBucket
        ? $config->minBucket
        : $config->maxBucket;
}
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
# Simple feature 'attdir': the direction of attachment of the child to the
# parent of the edge, as computed by feature_attachement_direction_generic.
sub feature_attachement_direction {
    my $self = shift;
    my $edge = shift;

    my @endpoints = ( $edge->parent, $edge->child );

    return $self->feature_attachement_direction_generic(@endpoints);
}
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# Simple feature 'G.attdir': the direction from the grandparent (the parent
# of the edge's parent) to the child of the edge, as computed by
# feature_attachement_direction_generic.
# Returns '#novalue#' if there is no grandparent.
sub feature_grandparent_attachement_direction {
    my ( $self, $edge ) = @_;

    my $grandparent = $self->get_grandparent($edge);
    if ( !defined $grandparent ) {
        return '#novalue#';
    }

    # Compare the grandparent with the child; comparing $edge->parent with
    # the child (as this sub used to do) only duplicated the plain 'attdir'
    # feature.
    # NOTE(review): this changes the values of G.attdir, so models trained
    # with the old values have to be retrained.
    return $self->feature_attachement_direction_generic(
        $grandparent, $edge->child
    );
}
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# Relative order of two nodes as a direction flag:
# -1 when ord(node1) is smaller than ord(node2), 1 otherwise
# (including the case of equal ords).
sub feature_attachement_direction_generic {
    my ( $self, $node1, $node2 ) = @_;

    return $node1->ord < $node2->ord ? -1 : 1;
}
|
700
|
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
# Value of the field with the given index on the edge's child node.
sub feature_child {
    my ( $self, $edge, $field_index ) = @_;

    my $child_fields = $edge->child->fields;

    return $child_fields->[$field_index];
}
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
# Value of the field with the given index on the edge's parent node.
sub feature_parent {
    my ( $self, $edge, $field_index ) = @_;

    my $parent_fields = $edge->parent->fields;

    return $parent_fields->[$field_index];
}
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# Value of the field with the given index on the grandparent node
# (as returned by get_grandparent), or '#novalue#' when get_grandparent
# yields nothing defined.
sub feature_grandparent {
    my ( $self, $edge, $field_index ) = @_;

    my $grandparent = $self->get_grandparent($edge);

    return '#novalue#' unless defined $grandparent;

    return $grandparent->fields->[$field_index];
}
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
# Label stored on the edge's parent node.
sub feature_parent_label {
    my ( $self, $edge ) = @_;

    my $head = $edge->parent;

    return $head->label;
}
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# Label of the child of the closest left sibling edge (as returned by
# get_left_sibling). When there is no left sibling, the config's
# SEQUENCE_BOUNDARY_LABEL is returned instead.
sub feature_previous_label {
    my ( $self, $edge ) = @_;

    my $left_sibling = $self->get_left_sibling($edge);

    return $self->config->SEQUENCE_BOUNDARY_LABEL
        unless defined $left_sibling;

    return $left_sibling->child->label;
}
|
737
|
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# Label of the grandparent node (as returned by get_grandparent), or
# '#novalue#' when get_grandparent yields nothing defined.
sub feature_grandparent_label {
    my ( $self, $edge ) = @_;

    my $grandparent = $self->get_grandparent($edge);

    return '#novalue#' unless defined $grandparent;

    return $grandparent->label;
}
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# Value of the field with the given index on the node returned by
# $edge->first.
sub feature_first {
    my ( $self, $edge, $field_index ) = @_;

    my $first_fields = $edge->first->fields;

    return $first_fields->[$field_index];
}
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# Value of the field with the given index on the node returned by
# $edge->second.
sub feature_second {
    my ( $self, $edge, $field_index ) = @_;

    my $second_fields = $edge->second->fields;

    return $second_fields->[$field_index];
}
|
758
|
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# Field value (by index) of the child of the closest left sibling edge,
# as returned by get_left_sibling; '#start#' when there is no such
# sibling.
sub feature_left_sibling {
    my ( $self, $edge, $field_index ) = @_;

    my $left_sibling = $self->get_left_sibling($edge);

    return '#start#' unless defined $left_sibling;

    return $left_sibling->child->fields->[$field_index];
}
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
# Field value (by index) of the child of the closest right sibling edge,
# as returned by get_right_sibling; '#end#' when there is no such
# sibling.
sub feature_right_sibling {
    my ( $self, $edge, $field_index ) = @_;

    my $right_sibling = $self->get_right_sibling($edge);

    return '#end#' unless defined $right_sibling;

    return $right_sibling->child->fields->[$field_index];
}
|
780
|
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
# Finds the sibling edge immediately preceding $edge in the list
# $edge->parent->children.
#
# Entries of that list are matched against $edge by comparing the ord of
# their child node. Returns nothing (bare return) when $edge matches the
# first entry; otherwise returns the entry just before the matching one.
sub get_left_sibling {
    my ( $self, $edge ) = @_;

    my $siblings   = $edge->parent->children;
    my $target_ord = $edge->child->ord;

    # the first entry has nothing to its left
    return if $siblings->[0]->child->ord == $target_ord;

    # scan forward from the second entry until the edge itself is found
    my $position = 1;
    $position++ until $siblings->[$position]->child->ord == $target_ord;

    return $siblings->[ $position - 1 ];
}
|
802
|
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
# Finds the sibling edge immediately following $edge in the list
# $edge->parent->children.
#
# Entries of that list are matched against $edge by comparing the ord of
# their child node. Returns nothing (bare return) when $edge matches the
# last entry; otherwise returns the entry just after the matching one.
sub get_right_sibling {
    my ( $self, $edge ) = @_;

    my $siblings   = $edge->parent->children;
    my $target_ord = $edge->child->ord;
    my $last_index = $#{$siblings};

    # the last entry has nothing to its right
    return if $siblings->[$last_index]->child->ord == $target_ord;

    # scan backwards from the last-but-one entry until the edge is found
    my $position = $last_index - 1;
    $position-- until $siblings->[$position]->child->ord == $target_ord;

    return $siblings->[ $position + 1 ];
}
|
829
|
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the child's ord.
#
# Returns '#start#' when the sentence has no (true) node at that ord,
# '#mid#' when that node has the same ord as the edge's parent (no gap
# between the two nodes), and the node's field at $field_index otherwise.
sub feature_preceding_child {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->child->ord - 1 );

    return '#start#' unless $neighbour;
    return '#mid#' if $edge->parent->ord == $neighbour->ord;

    return $neighbour->fields->[$field_index];
}
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the parent's ord.
#
# Returns '#start#' when the sentence has no (true) node at that ord,
# '#mid#' when that node has the same ord as the edge's child (no gap
# between the two nodes), and the node's field at $field_index otherwise.
sub feature_preceding_parent {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->parent->ord - 1 );

    return '#start#' unless $neighbour;
    return '#mid#' if $edge->child->ord == $neighbour->ord;

    return $neighbour->fields->[$field_index];
}
|
867
|
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the child's ord.
#
# Returns '#end#' when the sentence has no (true) node at that ord,
# '#mid#' when that node has the same ord as the edge's parent (no gap
# between the two nodes), and the node's field at $field_index otherwise.
sub feature_following_child {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->child->ord + 1 );

    return '#end#' unless $neighbour;
    return '#mid#' if $edge->parent->ord == $neighbour->ord;

    return $neighbour->fields->[$field_index];
}
|
886
|
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the parent's ord.
#
# Returns '#end#' when the sentence has no (true) node at that ord,
# '#mid#' when that node has the same ord as the edge's child (no gap
# between the two nodes), and the node's field at $field_index otherwise.
sub feature_following_parent {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->parent->ord + 1 );

    return '#end#' unless $neighbour;
    return '#mid#' if $edge->child->ord == $neighbour->ord;

    return $neighbour->fields->[$field_index];
}
|
905
|
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the ord of
# $edge->first; '#start#' when the sentence has no (true) node there.
sub feature_preceding_first {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->first->ord - 1 );

    return $neighbour
        ? $neighbour->fields->[$field_index]
        : '#start#';
}
|
918
|
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the ord of
# $edge->second.
#
# Returns '#start#' when the sentence has no (true) node at that ord,
# '#mid#' when that node has the same ord as $edge->first (the node
# preceding the second node is the first node), and the node's field at
# $field_index otherwise.
sub feature_preceding_second {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->second->ord - 1 );

    return '#start#' unless $neighbour;
    return '#mid#' if $edge->first->ord == $neighbour->ord;

    return $neighbour->fields->[$field_index];
}
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the ord of
# $edge->first.
#
# Returns '#end#' when the sentence has no (true) node at that ord,
# '#mid#' when that node has the same ord as $edge->second (the node
# following the first node is the second node), and the node's field at
# $field_index otherwise.
sub feature_following_first {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->first->ord + 1 );

    return '#end#' unless $neighbour;
    return '#mid#' if $edge->second->ord == $neighbour->ord;

    return $neighbour->fields->[$field_index];
}
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the ord of
# $edge->second; '#end#' when the sentence has no (true) node there.
sub feature_following_second {
    my ( $self, $edge, $field_index ) = @_;

    my $neighbour = $edge->sentence->getNodeByOrd( $edge->second->ord + 1 );

    return $neighbour
        ? $neighbour->fields->[$field_index]
        : '#end#';
}
|
969
|
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
# Field values of all nodes lying strictly between the edge's parent and
# child.
#
# The ord range runs from the smaller of the two ords plus one to the
# larger one minus one; each node is fetched with
# $edge->sentence->getNodeByOrd. Returns a reference to a fresh array of
# the field values at $field_index, in ascending ord order (an empty
# array when there is nothing in between).
#
# TODO: use precomputed values instead
sub feature_between {
    my ( $self, $edge, $field_index ) = @_;

    my $parent_ord = $edge->parent->ord;
    my $child_ord  = $edge->child->ord;

    my ( $lower, $upper ) =
        $parent_ord < $child_ord
        ? ( $parent_ord, $child_ord )
        : ( $child_ord,  $parent_ord );

    my @between;
    foreach my $ord ( $lower + 1 .. $upper - 1 ) {
        my $node = $edge->sentence->getNodeByOrd($ord);
        push @between, $node->fields->[$field_index];
    }

    return \@between;
}
|
1001
|
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
# Splits the child's field at $field_index on single spaces.
#
# Returns a reference to an array of the pieces when the field holds a
# true value, and the empty string '' when it is false (undefined,
# empty, or '0').
sub feature_foreach {
    my ( $self, $edge, $field_index ) = @_;

    my $raw = $edge->child->fields->[$field_index];

    return '' unless $raw;

    my @pieces = split / /, $raw;
    return \@pieces;
}
|
1013
|
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
# Tests whether two space-separated fields of the child share a value.
#
# $field_indexes must reference exactly two field indexes, otherwise the
# sub croaks. Both fields are read from the edge's child.
# Returns -1 when either field is undefined or empty, 1 when at least
# one space-separated value occurs in both fields, and 0 otherwise.
sub feature_equals {
    my ( $self, $edge, $field_indexes ) = @_;

    # equals takes two arguments
    croak "equals() takes TWO arguments!!!"
        if @{$field_indexes} != 2;

    my ( $index_a, $index_b ) = @{$field_indexes};
    my $field_a = $edge->child->fields->[$index_a];
    my $field_b = $edge->child->fields->[$index_b];

    # undefined and empty fields are reported as "unknown"
    return -1
        unless defined $field_a
        && $field_a ne ''
        && defined $field_b
        && $field_b ne '';

    my @candidates = split / /, $field_b;
    foreach my $wanted ( split / /, $field_a ) {

        # one match is enough
        return 1 if grep { $_ eq $wanted } @candidates;
    }

    return 0;
}
|
1051
|
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
# Tests whether a space-separated field of the parent and one of the
# child share a value. Works like feature_equals except that the first
# field is read from the edge's PARENT (the second from the child).
#
# $field_indexes must reference exactly two field indexes, otherwise the
# sub croaks.
# Returns -1 when either field is undefined or empty, 1 when at least
# one space-separated value occurs in both fields, and 0 otherwise.
sub feature_equals_pc {
    my ( $self, $edge, $field_indexes ) = @_;

    # equals takes two arguments
    croak "equals() takes TWO arguments!!!"
        if @{$field_indexes} != 2;

    my ( $parent_index, $child_index ) = @{$field_indexes};
    my $parent_field = $edge->parent->fields->[$parent_index];
    my $child_field  = $edge->child->fields->[$child_index];

    # undefined and empty fields are reported as "unknown"
    return -1
        unless defined $parent_field
        && $parent_field ne ''
        && defined $child_field
        && $child_field ne '';

    my @candidates = split / /, $child_field;
    foreach my $wanted ( split / /, $parent_field ) {

        # one match is enough
        return 1 if grep { $_ eq $wanted } @candidates;
    }

    return 0;
}
|
1091
|
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
# sub equalsat - does not make sense |
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
# Whether the character at the given position of the given field is the
# same in the parent and in the child.
#
# $arguments must reference exactly two items, ( field index, position ),
# otherwise the sub croaks.
# Returns -1 when either field is undefined or not longer than the
# position, 1 when the two characters are equal, and 0 otherwise.
sub feature_equals_pc_at {
    my ( $self, $edge, $arguments ) = @_;

    # equals takes two arguments
    croak "equals() takes TWO arguments!!!"
        if @{$arguments} != 2;

    my ( $field_index, $position ) = @{$arguments};
    my $parent_text = $edge->parent->fields->[$field_index];
    my $child_text  = $edge->child->fields->[$field_index];

    # undefined and too short fields are reported as "unknown"
    return -1
        unless defined $parent_text
        && length($parent_text) > $position
        && defined $child_text
        && length($child_text) > $position;

    my $parent_char = substr $parent_text, $position, 1;
    my $child_char  = substr $child_text,  $position, 1;

    return $parent_char eq $child_char ? 1 : 0;
}
|
1127
|
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
# substring (field, start, length) on the child
#
# $arguments must reference two or three items, ( field index, start
# [, length] ), otherwise the sub croaks.
# Returns '' when the child's field is undefined; otherwise the result
# of substr on the field, with the length applied only when it is
# defined.
sub feature_substr_child {
    my ( $self, $edge, $arguments ) = @_;

    # substr takes two or three arguments
    my $argument_count = @{$arguments};
    croak "substr() takes THREE or TWO arguments!!!"
        unless $argument_count == 2 || $argument_count == 3;

    my ( $field_index, $start, $length ) = @{$arguments};
    my $text = $edge->child->fields->[$field_index];

    return '' unless defined $text;

    return defined $length
        ? substr( $text, $start, $length )
        : substr( $text, $start );
}
|
1151
|
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
# substring (field, start, length) on the parent
#
# $arguments must reference two or three items, ( field index, start
# [, length] ), otherwise the sub croaks.
# Returns '' when the parent's field is undefined; otherwise the result
# of substr on the field, with the length applied only when it is
# defined.
sub feature_substr_parent {
    my ( $self, $edge, $arguments ) = @_;

    # substr takes two or three arguments
    my $argument_count = @{$arguments};
    croak "substr() takes THREE or TWO arguments!!!"
        unless $argument_count == 2 || $argument_count == 3;

    my ( $field_index, $start, $length ) = @{$arguments};
    my $text = $edge->parent->fields->[$field_index];

    return '' unless defined $text;

    return defined $length
        ? substr( $text, $start, $length )
        : substr( $text, $start );
}
|
1175
|
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
# arrayat (array, index) on the child
#
# $arguments must reference exactly two field indexes, otherwise the sub
# croaks. The first field of the child is split on single spaces into a
# list, the second field of the child is used as the position in that
# list. Returns the item at that position, or '' when it is undefined.
sub feature_array_at_child {
    my ( $self, $edge, $arguments ) = @_;

    # arrayat takes two arguments
    croak "arrayat() takes TWO arguments!!!"
        if @{$arguments} != 2;

    my ( $array_field, $index_field ) = @{$arguments};
    my $packed   = $edge->child->fields->[$array_field];
    my $position = $edge->child->fields->[$index_field];

    my @items = split / /, $packed;
    my $item  = $items[$position];

    return defined $item ? $item : '';
}
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
# arrayat (array, index) on the parent
#
# $arguments must reference exactly two field indexes, otherwise the sub
# croaks. The first field of the parent is split on single spaces into a
# list, the second field of the parent is used as the position in that
# list. Returns the item at that position, or '' when it is undefined.
sub feature_array_at_parent {
    my ( $self, $edge, $arguments ) = @_;

    # arrayat takes two arguments
    croak "arrayat() takes TWO arguments!!!"
        if @{$arguments} != 2;

    my ( $array_field, $index_field ) = @{$arguments};
    my $packed   = $edge->parent->fields->[$array_field];
    my $position = $edge->parent->fields->[$index_field];

    my @items = split / /, $packed;
    my $item  = $items[$position];

    return defined $item ? $item : '';
}
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# arrayatcp (array, index)
#
# $arguments must reference exactly two field indexes, otherwise the sub
# croaks. The first field is read from the CHILD and split on single
# spaces into a list; the second field is read from the PARENT and used
# as the position in that list. Returns the item at that position, or ''
# when it is undefined.
sub feature_array_at_cp {
    my ( $self, $edge, $arguments ) = @_;

    # arrayat takes two arguments
    croak "arrayat() takes TWO arguments!!!"
        if @{$arguments} != 2;

    my ( $array_field, $index_field ) = @{$arguments};
    my $packed   = $edge->child->fields->[$array_field];
    my $position = $edge->parent->fields->[$index_field];

    my @items = split / /, $packed;
    my $item  = $items[$position];

    return defined $item ? $item : '';
}
|
1240
|
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
# 1 when the edge's child has ord 1, 0 otherwise.
sub feature_child_is_first_in_sentence {
    my ( $self, $edge ) = @_;

    return $edge->child->ord == 1 ? 1 : 0;
}
|
1250
|
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
# 1 when the edge's parent has ord 1, 0 otherwise.
sub feature_parent_is_first_in_sentence {
    my ( $self, $edge ) = @_;

    return $edge->parent->ord == 1 ? 1 : 0;
}
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
# 1 when the child's ord equals the number of elements in
# $edge->sentence->nodes, 0 otherwise.
sub feature_child_is_last_in_sentence {
    my ( $self, $edge ) = @_;

    # last ord = number of nodes (because ords are 1-based, 0 is the root node)
    my $node_count = @{ $edge->sentence->nodes };

    return $edge->child->ord == $node_count ? 1 : 0;
}
|
1271
|
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
# 1 when the parent's ord equals the number of elements in
# $edge->sentence->nodes, 0 otherwise.
sub feature_parent_is_last_in_sentence {
    my ( $self, $edge ) = @_;

    # last ord = number of nodes (because ords are 1-based, 0 is the root node)
    my $node_count = @{ $edge->sentence->nodes };

    return $edge->parent->ord == $node_count ? 1 : 0;
}
|
1282
|
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
# 1 when the first entry of $edge->parent->children has a child with the
# same ord as the edge's child, 0 otherwise.
sub feature_child_is_first_child {
    my ( $self, $edge ) = @_;

    my $first_sibling = $edge->parent->children->[0];

    return $first_sibling->child->ord == $edge->child->ord ? 1 : 0;
}
|
1293
|
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
# 1 when the last entry of $edge->parent->children has a child with the
# same ord as the edge's child, 0 otherwise.
sub feature_child_is_last_child {
    my ( $self, $edge ) = @_;

    my $siblings     = $edge->parent->children;
    my $last_sibling = $siblings->[ $#{$siblings} ];

    return $last_sibling->child->ord == $edge->child->ord ? 1 : 0;
}
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
# 1 when the edge's child is the first of its parent's children that
# lies to the right of the parent, 0 otherwise.
#
# A child counts as "right" when the parent's ord is smaller than its
# own. Among right children, the result is 1 when the child matches the
# first entry of $edge->parent->children, or when the entry just before
# it in that list has a child whose ord is smaller than the parent's.
sub feature_child_is_first_right_child {
    my ( $self, $edge ) = @_;

    my $parent_ord = $edge->parent->ord;
    my $child_ord  = $edge->child->ord;

    # is left
    return 0 unless $parent_ord < $child_ord;

    my $siblings = $edge->parent->children;

    # is right & is first (= leftmost) of all siblings
    return 1 if $siblings->[0]->child->ord == $child_ord;

    # find my position among parent's children (is at least 1)
    my $position = 1;
    $position++ until $siblings->[$position]->child->ord == $child_ord;

    # ($position - 1) is the index of my (closest) left sibling;
    # I am the first right child iff that sibling is left of the parent
    my $closest_left = $siblings->[ $position - 1 ];

    return $closest_left->child->ord < $parent_ord ? 1 : 0;
}
|
1347
|
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
# 1 when the edge's child is the last of its parent's children that
# lies to the left of the parent, 0 otherwise.
#
# A child counts as "left" when its ord is smaller than the parent's.
# Among left children, the result is 1 when the child matches the last
# entry of $edge->parent->children, or when the entry just after it in
# that list has a child whose ord is greater than the parent's.
sub feature_child_is_last_left_child {
    my ( $self, $edge ) = @_;

    my $child_ord  = $edge->child->ord;
    my $parent_ord = $edge->parent->ord;

    # is right
    return 0 unless $child_ord < $parent_ord;

    my $siblings   = $edge->parent->children;
    my $last_index = $#{$siblings};

    # is left & is last of all siblings
    return 1 if $siblings->[$last_index]->child->ord == $child_ord;

    # find my position among parent's children
    # (is at most $last_index - 1)
    my $position = $last_index - 1;
    $position-- until $siblings->[$position]->child->ord == $child_ord;

    # ($position + 1) is the index of my (closest) right sibling;
    # I am the last left child iff that sibling is right of the parent
    my $closest_right = $siblings->[ $position + 1 ];

    return $parent_ord < $closest_right->child->ord ? 1 : 0;
}
|
1394
|
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
# Number of entries in $edge->child->children; 0 when that accessor
# returns a false value or an empty array.
sub feature_number_of_childs_children {
    my ( $self, $edge ) = @_;

    my $children = $edge->child->children;

    return 0 unless $children;

    return scalar @{$children};
}
|
1405
|
|
|
|
|
|
|
|
|
1406
|
|
|
|
|
|
|
# Number of entries in $edge->parent->children; 0 when that accessor
# returns a false value or an empty array.
sub feature_number_of_parents_children {
    my ( $self, $edge ) = @_;

    my $children = $edge->parent->children;

    return 0 unless $children;

    return scalar @{$children};
}
|
1416
|
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
# Looks the (child field, parent field) pair up in an additional model.
#
# Reads the field at $field_index from both the child and the parent and
# returns $model->get_value( child value, parent value ). Croaks when
# either of the two field values is undefined.
sub feature_additional_model {
    my ( $self, $edge, $field_index, $model ) = @_;

    my $child_value  = $edge->child->fields->[$field_index];
    my $parent_value = $edge->parent->fields->[$field_index];

    croak "Either child or parent is undefined in additional model feature, " .
        "this should not happen!"
        unless defined $child_value && defined $parent_value;

    return $model->get_value( $child_value, $parent_value );
}
|
1430
|
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
# Same lookup as feature_additional_model, but asks the model for its
# bucketed value: returns $model->get_bucketed_value( child, parent ) for
# the field values found at $field_index on the edge's child and parent.
# Croaks if either of the two field values is undefined.
sub feature_additional_model_bucketed {
    my ( $self, $edge, $field_index, $model ) = @_;

    my $child_value  = $edge->child->fields->[$field_index];
    my $parent_value = $edge->parent->fields->[$field_index];

    # both values are required for the lookup
    if ( !defined $child_value || !defined $parent_value ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_bucketed_value( $child_value, $parent_value );
}
|
1444
|
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
# Rounded variant of the additional model lookup.
# $parameters is (a reference to) an array ( $field_index, $rounding ).
# Returns $model->get_rounded_value( child, parent, $rounding ) for the field
# values found at $field_index on the edge's child and parent nodes.
# Croaks if either of the two field values is undefined.
sub feature_additional_model_rounded {
    my ( $self, $edge, $parameters, $model ) = @_;

    my ( $field_index, $rounding ) = @{$parameters};

    my $child_value  = $edge->child->fields->[$field_index];
    my $parent_value = $edge->parent->fields->[$field_index];

    # both values are required for the lookup
    if ( !defined $child_value || !defined $parent_value ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_rounded_value( $child_value, $parent_value, $rounding );
}
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
# Additional model lookup with separate field indexes for child and parent.
# $parameters is (a reference to) an array ( $field_index_c, $field_index_p );
# the child value is read at $field_index_c, the parent value at
# $field_index_p. Croaks if either of the two field values is undefined.
# NOTE(review): get_rounded_value is called here without a rounding argument,
# unlike in feature_additional_model_rounded - confirm against
# ModelAdditional whether this is intended (or whether get_value was meant).
sub feature_additional_model_d {
    my ( $self, $edge, $parameters, $model ) = @_;

    my ( $field_index_c, $field_index_p ) = @{$parameters};

    my $child_value  = $edge->child->fields->[$field_index_c];
    my $parent_value = $edge->parent->fields->[$field_index_p];

    # both values are required for the lookup
    if ( !defined $child_value || !defined $parent_value ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_rounded_value( $child_value, $parent_value );
}
|
1474
|
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
# Simple feature: feature_additional_model evaluated on the model
# returned by $self->pmi_model.
sub feature_pmi {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    return $self->feature_additional_model(
        $edge,
        $field_index,
        $self->pmi_model
    );
}
|
1480
|
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
# Simple feature: feature_additional_model_bucketed evaluated on the model
# returned by $self->pmi_model.
sub feature_pmi_bucketed {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    return $self->feature_additional_model_bucketed(
        $edge,
        $field_index,
        $self->pmi_model
    );
}
|
1486
|
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
# Simple feature: feature_additional_model_rounded evaluated on the model
# returned by $self->pmi_model; $parameters is passed through unchanged.
sub feature_pmi_rounded {
    my $self = shift;
    my ( $edge, $parameters ) = @_;

    return $self->feature_additional_model_rounded(
        $edge,
        $parameters,
        $self->pmi_model
    );
}
|
1492
|
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
# Simple feature: feature_additional_model_d evaluated on the model
# returned by $self->pmi_model; $parameters is passed through unchanged.
sub feature_pmi_d {
    my $self = shift;
    my ( $edge, $parameters ) = @_;

    return $self->feature_additional_model_d(
        $edge,
        $parameters,
        $self->pmi_model
    );
}
|
1498
|
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
# Simple feature: feature_pmi_rounded on $field_index with the rounding
# parameter fixed to 1.
sub feature_pmi_2_rounded {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    # parameters are ( field index, rounding )
    return $self->feature_pmi_rounded( $edge, [ $field_index, 1 ] );
}
|
1505
|
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
# Simple feature: feature_pmi_rounded on $field_index with the rounding
# parameter fixed to 2.
sub feature_pmi_3_rounded {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    # parameters are ( field index, rounding )
    return $self->feature_pmi_rounded( $edge, [ $field_index, 2 ] );
}
|
1512
|
|
|
|
|
|
|
|
|
1513
|
|
|
|
|
|
|
# Simple feature: feature_additional_model evaluated on the model
# returned by $self->cprob_model.
sub feature_cprob {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    return $self->feature_additional_model(
        $edge,
        $field_index,
        $self->cprob_model
    );
}
|
1518
|
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
# Simple feature: feature_additional_model_bucketed evaluated on the model
# returned by $self->cprob_model.
sub feature_cprob_bucketed {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    return $self->feature_additional_model_bucketed(
        $edge,
        $field_index,
        $self->cprob_model
    );
}
|
1524
|
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
# Simple feature: feature_additional_model_rounded evaluated on the model
# returned by $self->cprob_model; $parameters is passed through unchanged.
sub feature_cprob_rounded {
    my $self = shift;
    my ( $edge, $parameters ) = @_;

    return $self->feature_additional_model_rounded(
        $edge,
        $parameters,
        $self->cprob_model
    );
}
|
1530
|
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
# Simple feature: feature_cprob_rounded on $field_index with the rounding
# parameter fixed to 1.
sub feature_cprob_2_rounded {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    # parameters are ( field index, rounding )
    return $self->feature_cprob_rounded( $edge, [ $field_index, 1 ] );
}
|
1537
|
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
# Simple feature: feature_cprob_rounded on $field_index with the rounding
# parameter fixed to 2.
sub feature_cprob_3_rounded {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    # parameters are ( field index, rounding )
    return $self->feature_cprob_rounded( $edge, [ $field_index, 2 ] );
}
|
1544
|
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
1; |
|
1546
|
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
__END__ |
|
1548
|
|
|
|
|
|
|
|
|
1549
|
|
|
|
|
|
|
=pod |
|
1550
|
|
|
|
|
|
|
|
|
1551
|
|
|
|
|
|
|
=for Pod::Coverage BUILD |
|
1552
|
|
|
|
|
|
|
|
|
1553
|
|
|
|
|
|
|
=encoding utf-8 |
|
1554
|
|
|
|
|
|
|
|
|
1555
|
|
|
|
|
|
|
=head1 NAME |
|
1556
|
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
Treex::Tool::Parser::MSTperl::FeaturesControl |
|
1558
|
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
=head1 VERSION |
|
1560
|
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
version 0.11949 |
|
1562
|
|
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
1564
|
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
Controls the features used in the model. |
|
1566
|
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
=head2 Features |
|
1568
|
|
|
|
|
|
|
|
|
1569
|
|
|
|
|
|
|
TODO: outdated, superseded by use of config file -> rewrite |
|
1570
|
|
|
|
|
|
|
|
|
1571
|
|
|
|
|
|
|
Each feature has a form C<code:value>. The code describes the information which |
|
1572
|
|
|
|
|
|
|
is relevant for the feature, and the value is the information retained from |
|
1573
|
|
|
|
|
|
|
the dependency edge (and possibly other parts of the sentence |
|
1574
|
|
|
|
|
|
|
(L<Treex::Tool::Parser::MSTperl::Sentence>) stored in C<sentence> field). |
|
1575
|
|
|
|
|
|
|
|
|
1576
|
|
|
|
|
|
|
For example, the feature C<L|l:být|pes> means that the lemma of the parent node |
|
1577
|
|
|
|
|
|
|
(the governing word) is "být" and the lemma of its child node (the dependent |
|
1578
|
|
|
|
|
|
|
node) is "pes". |
|
1579
|
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
Each (proper) feature is composed of several simple features. In the |
|
1581
|
|
|
|
|
|
|
aforementioned example, the simple feature codes were C<L> and C<l> and their |
|
1582
|
|
|
|
|
|
|
values "být" and "pes", respectively. Each simple feature code is a string |
|
1583
|
|
|
|
|
|
|
(case sensitive) and its value is also a string. The simple feature codes are |
|
1584
|
|
|
|
|
|
|
joined together by the C<|> sign to form the code of the proper feature, and |
|
1585
|
|
|
|
|
|
|
similarly, the simple feature values joined by C<|> form the proper feature |
|
1586
|
|
|
|
|
|
|
value. Then, the proper feature code and value are joined together by C<:>. |
|
1587
|
|
|
|
|
|
|
(Therefore, the codes and values of the simple features must not contain the |
|
1588
|
|
|
|
|
|
|
C<|> and the C<:> signs.) |
|
1589
|
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
By a naming convention, |
|
1591
|
|
|
|
|
|
|
if the same simple feature can be computed for both the parent node and its |
|
1592
|
|
|
|
|
|
|
child node, their codes are the same but for the case, which is upper for the |
|
1593
|
|
|
|
|
|
|
parent and lower for the child. If this is not applicable, an uppercase |
|
1594
|
|
|
|
|
|
|
code is used. |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
For higher effectiveness the simple feature codes are translated to integers |
|
1597
|
|
|
|
|
|
|
(see C<simple_feature_codes>). |
|
1598
|
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
In reality the feature codes are translated to integers as well (see |
|
1600
|
|
|
|
|
|
|
C<feature_codes>), but this is only an internal issue. You can see these |
|
1601
|
|
|
|
|
|
|
numbers in the model file if you use the default L<Data::Dumper> format (see |
|
1602
|
|
|
|
|
|
|
C<load> and C<store>). However, if you use the tsv format (see C<load_tsv>, |
|
1603
|
|
|
|
|
|
|
C<store_tsv>), you will see the real string feature codes. |
|
1604
|
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
Currently the following simple features are available. Any subset of them can |
|
1606
|
|
|
|
|
|
|
be used to form a proper feature, but their order should follow their order of |
|
1607
|
|
|
|
|
|
|
appearance in this list (still, this is only a cleanliness and readability |
|
1608
|
|
|
|
|
|
|
thing, it does not affect the function of the parser in any way). |
|
1609
|
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
=over 4 |
|
1611
|
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
=item Distance (D) |
|
1613
|
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
Distance of the two nodes in the sentence, computed as order of the parent |
|
1615
|
|
|
|
|
|
|
minus the order of the child. Eg. for the sentence "To je prima pes ." and the |
|
1616
|
|
|
|
|
|
|
feature D computed on nodes "je" and "pes" (parent and child respectively), |
|
1617
|
|
|
|
|
|
|
the order of "je" is 2 and the order of "pes" is 4, yielding the feature value |
|
1618
|
|
|
|
|
|
|
of 2 - 4 = -2. This leads to a feature C<D:-2>. |
|
1619
|
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
=item Form (F, f) |
|
1621
|
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
The form of the node, i.e. the word exactly as it appears in the sentence text. |
|
1623
|
|
|
|
|
|
|
|
|
1624
|
|
|
|
|
|
|
Currently not used as it has not led to any improvement in the parsing. |
|
1625
|
|
|
|
|
|
|
|
|
1626
|
|
|
|
|
|
|
=item Lemma (L, l) |
|
1627
|
|
|
|
|
|
|
|
|
1628
|
|
|
|
|
|
|
The morphological lemma of the node. |
|
1629
|
|
|
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
=item preceding tag (S, s) |
|
1631
|
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
The morphological tag (or POS tag if you like) of the node preceding (ord-wise) |
|
1633
|
|
|
|
|
|
|
the node. |
|
1634
|
|
|
|
|
|
|
|
|
1635
|
|
|
|
|
|
|
=item Tag (T, t) |
|
1636
|
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
The morphological tag of the node. |
|
1638
|
|
|
|
|
|
|
|
|
1639
|
|
|
|
|
|
|
=item following tag (U, u) |
|
1640
|
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
The morphological tag of the node following (ord-wise) the node. |
|
1642
|
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
=item between tag (B) |
|
1644
|
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
The morphological tag of each node between (ord-wise) the parent node and the |
|
1646
|
|
|
|
|
|
|
child node. This simple feature returns (a reference to) an array of values. |
|
1647
|
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
=back |
|
1649
|
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
Some of the simple features can return an empty string in case they are not |
|
1651
|
|
|
|
|
|
|
applicable (eg. C<U> for the last node in the sentence), then the whole |
|
1652
|
|
|
|
|
|
|
feature is not present for the edge. |
|
1653
|
|
|
|
|
|
|
|
|
1654
|
|
|
|
|
|
|
Some of the simple features return an array of values (eg. the C<B> simple |
|
1655
|
|
|
|
|
|
|
feature). This can result in several instances of the feature with the same |
|
1656
|
|
|
|
|
|
|
code for one edge to appear in the result. |
|
1657
|
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
=head1 FIELDS |
|
1659
|
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
=head2 Features |
|
1661
|
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
TODO: slightly outdated |
|
1663
|
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
The examples used here are consistent throughout this part of documentation, |
|
1665
|
|
|
|
|
|
|
i.e. if several simple features are listed in C<simple_feature_codes> and |
|
1666
|
|
|
|
|
|
|
then simple feature with index 9 is referred to in C<array_simple_features>, |
|
1667
|
|
|
|
|
|
|
it really means the C<B> simple feature which is on the 9th position in |
|
1668
|
|
|
|
|
|
|
C<simple_feature_codes>. |
|
1669
|
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
=over 4 |
|
1671
|
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
=item feature_count (Int) |
|
1673
|
|
|
|
|
|
|
|
|
1674
|
|
|
|
|
|
|
Alias of C<scalar @{feature_codes}> (but the integer is really |
|
1675
|
|
|
|
|
|
|
stored in the field for faster access). |
|
1676
|
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
=item feature_codes (ArrayRef[Str]) |
|
1678
|
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
Codes of all features to be computed. Their |
|
1680
|
|
|
|
|
|
|
indexes in this array are used to refer to them in the code. Eg.: |
|
1681
|
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
feature_codes ( [( 'L|T', 'l|t', 'L|T|l|t', 'T|B|t')] ) |
|
1683
|
|
|
|
|
|
|
|
|
1684
|
|
|
|
|
|
|
=item feature_codes_hash (HashRef[Str]) |
|
1685
|
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
1 for each feature code to easily check if a feature exists |
|
1687
|
|
|
|
|
|
|
|
|
1688
|
|
|
|
|
|
|
=item feature_indexes (HashRef[Str]) |
|
1689
|
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
Index of each feature code in feature_codes (for conversion of feature code to |
|
1691
|
|
|
|
|
|
|
feature index) |
|
1692
|
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
=item feature_simple_features_indexes (ArrayRef[ArrayRef[Int]]) |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
For each feature contains (a reference to) an array which contains all its |
|
1696
|
|
|
|
|
|
|
simple feature indexes (corresponding to positions in C<simple_feature_codes> |
|
1697
|
|
|
|
|
|
|
). Eg. for the 4 features (0 to 3) listed in C<feature_codes> and the 10 |
|
1698
|
|
|
|
|
|
|
simple features listed in C<simple_feature_codes> (0 to 9): |
|
1699
|
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
feature_simple_features_indexes ( [( |
|
1701
|
|
|
|
|
|
|
[ (1, 5) ], |
|
1702
|
|
|
|
|
|
|
[ (2, 6) ], |
|
1703
|
|
|
|
|
|
|
[ (1, 5, 2, 6) ], |
|
1704
|
|
|
|
|
|
|
[ (5, 9, 6) ], |
|
1705
|
|
|
|
|
|
|
)] ) |
|
1706
|
|
|
|
|
|
|
|
|
1707
|
|
|
|
|
|
|
|
|
1708
|
|
|
|
|
|
|
=item array_features (HashRef) |
|
1709
|
|
|
|
|
|
|
|
|
1710
|
|
|
|
|
|
|
Indexes of features containing array simple features (see |
|
1711
|
|
|
|
|
|
|
C<array_simple_features>). Eg.: |
|
1712
|
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
array_features( { 3 => 1} ) |
|
1714
|
|
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
as the feature with index 3 (C<'T|B|t'>) contains the C<B> simple feature |
|
1716
|
|
|
|
|
|
|
which is an array simple feature. |
|
1717
|
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
=back |
|
1719
|
|
|
|
|
|
|
|
|
1720
|
|
|
|
|
|
|
=head2 Simple features |
|
1721
|
|
|
|
|
|
|
|
|
1722
|
|
|
|
|
|
|
=over 4 |
|
1723
|
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
=item simple_feature_count (Int) |
|
1725
|
|
|
|
|
|
|
|
|
1726
|
|
|
|
|
|
|
Alias of C<scalar @{simple_feature_codes}> (but the integer is really |
|
1727
|
|
|
|
|
|
|
stored in the field for faster access). |
|
1728
|
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
=item simple_feature_codes (ArrayRef[Str]) |
|
1730
|
|
|
|
|
|
|
|
|
1731
|
|
|
|
|
|
|
Codes of all simple features to be computed. Their order is important as their |
|
1732
|
|
|
|
|
|
|
indexes in this array are used to refer to them in the code, especially in the |
|
1733
|
|
|
|
|
|
|
C<get_simple_feature> method. Eg.: |
|
1734
|
|
|
|
|
|
|
|
|
1735
|
|
|
|
|
|
|
simple_feature_codes ( [('D', 'L', 'l', 'S', 's', 'T', 't', 'U', 'u', 'B')]) |
|
1736
|
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
=item simple_feature_codes_hash (HashRef[Str]) |
|
1738
|
|
|
|
|
|
|
|
|
1739
|
|
|
|
|
|
|
1 for each simple feature code to easily check if a simple feature exists |
|
1740
|
|
|
|
|
|
|
|
|
1741
|
|
|
|
|
|
|
=item simple_feature_indexes (HashRef[Str]) |
|
1742
|
|
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
|
Index of each simple feature code in simple_feature_codes (for conversion of |
|
1744
|
|
|
|
|
|
|
simple feature code to simple feature index) |
|
1745
|
|
|
|
|
|
|
|
|
1746
|
|
|
|
|
|
|
=item simple_feature_sub_arguments (ArrayRef) |
|
1747
|
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
For each simple feature (on the corresponding index) contains the index of the |
|
1749
|
|
|
|
|
|
|
field (in C<field_names>), which is used to compute the simple feature value |
|
1750
|
|
|
|
|
|
|
(together with a subroutine from C<simple_feature_subs>). |
|
1751
|
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
If the simple feature takes more than one argument (called a multiarg feature |
|
1753
|
|
|
|
|
|
|
here), then instead of a single field index there is a reference to an array |
|
1754
|
|
|
|
|
|
|
of field indexes. |
|
1755
|
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
If the simple feature takes other arguments than fields (especially integers), |
|
1757
|
|
|
|
|
|
|
then these arguments are stored here instead of field indexes. |
|
1758
|
|
|
|
|
|
|
|
|
1759
|
|
|
|
|
|
|
=item simple_feature_subs (ArrayRef) |
|
1760
|
|
|
|
|
|
|
|
|
1761
|
|
|
|
|
|
|
For faster run, the simple features are internally not represented by their |
|
1762
|
|
|
|
|
|
|
string codes, which would have to be parsed repeatedly. Instead their codes |
|
1763
|
|
|
|
|
|
|
are parsed once only (in C<set_simple_feature>) and they are represented as |
|
1764
|
|
|
|
|
|
|
an integer index of the field which is used to compute the feature (it is the |
|
1765
|
|
|
|
|
|
|
actual index of the field in the input file line, accessible through |
|
1766
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Node/fields>) and a reference to a subroutine |
|
1767
|
|
|
|
|
|
|
(one of the C<feature_*> subs, see below) which computes the feature value |
|
1768
|
|
|
|
|
|
|
based on the field index and the edge (L<Treex::Tool::Parser::MSTperl::Edge>). |
|
1769
|
|
|
|
|
|
|
The referenced subroutine is then invoked in C<get_simple_feature_values_array>. |
|
1770
|
|
|
|
|
|
|
|
|
1771
|
|
|
|
|
|
|
=item array_simple_features (HashRef[Int]) |
|
1772
|
|
|
|
|
|
|
|
|
1773
|
|
|
|
|
|
|
Indexes of simple features that return an array of values instead of a single |
|
1774
|
|
|
|
|
|
|
string value. Eg.: |
|
1775
|
|
|
|
|
|
|
|
|
1776
|
|
|
|
|
|
|
array_simple_features( { 9 => 1} ) |
|
1777
|
|
|
|
|
|
|
|
|
1778
|
|
|
|
|
|
|
because in the aforementioned example the C<B> simple feature returns an array |
|
1779
|
|
|
|
|
|
|
of values and has the index C<9>. |
|
1780
|
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
|
|
1782
|
|
|
|
|
|
|
=back |
|
1783
|
|
|
|
|
|
|
|
|
1784
|
|
|
|
|
|
|
=head2 Other |
|
1785
|
|
|
|
|
|
|
|
|
1786
|
|
|
|
|
|
|
=over 4 |
|
1787
|
|
|
|
|
|
|
|
|
1788
|
|
|
|
|
|
|
=item edge_features_cache (HashRef[ArrayRef[Str]]) |
|
1789
|
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
If caching is turned on (see below), all features of any edge computed by the |
|
1791
|
|
|
|
|
|
|
C<get_feature_simple_features_indexes> method are computed once only, stored |
|
1792
|
|
|
|
|
|
|
in this cache and then retrieved when needed. |
|
1793
|
|
|
|
|
|
|
|
|
1794
|
|
|
|
|
|
|
The key of the hash is the edge signature (see |
|
1795
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Edge/signature>), the value is |
|
1796
|
|
|
|
|
|
|
(a reference to) an array of features and their values. |
|
1797
|
|
|
|
|
|
|
|
|
1798
|
|
|
|
|
|
|
=back |
|
1799
|
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
=head1 METHODS |
|
1801
|
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
=head2 Settings |
|
1803
|
|
|
|
|
|
|
|
|
1804
|
|
|
|
|
|
|
The best source of information about all the possible settings is the |
|
1805
|
|
|
|
|
|
|
configuration file itself (usually called C<config.txt>), as it is richly |
|
1806
|
|
|
|
|
|
|
commented and accompanied by real examples at the same time. |
|
1807
|
|
|
|
|
|
|
|
|
1808
|
|
|
|
|
|
|
=over 4 |
|
1809
|
|
|
|
|
|
|
|
|
1810
|
|
|
|
|
|
|
=item my $featuresControl = |
|
1811
|
|
|
|
|
|
|
Treex::Tool::Parser::MSTperl::FeaturesControl->new( |
|
1812
|
|
|
|
|
|
|
'config' => $config, |
|
1813
|
|
|
|
|
|
|
'feature_codes_from_config' => $feature_codes_array_reference, |
|
1814
|
|
|
|
|
|
|
'use_edge_features_cache' => $use_edge_features_cache, |
|
1815
|
|
|
|
|
|
|
) |
|
1816
|
|
|
|
|
|
|
|
|
1817
|
|
|
|
|
|
|
Parses feature codes and creates their in-memory representations. |
|
1818
|
|
|
|
|
|
|
|
|
1819
|
|
|
|
|
|
|
=item set_feature ($feature_code) |
|
1820
|
|
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
|
Parses the feature code and (if no errors are encountered) creates its |
|
1822
|
|
|
|
|
|
|
representation in the fields of this package (all C<feature_>* fields and |
|
1823
|
|
|
|
|
|
|
possibly also the C<array_features> field). |
|
1824
|
|
|
|
|
|
|
|
|
1825
|
|
|
|
|
|
|
=item set_simple_feature ($simple_feature_code) |
|
1826
|
|
|
|
|
|
|
|
|
1827
|
|
|
|
|
|
|
Parses the simple feature code and creates its representation in the fields of |
|
1828
|
|
|
|
|
|
|
this package (all C<simple_feature_>* fields and possibly also the |
|
1829
|
|
|
|
|
|
|
C<array_simple_features> field). |
|
1830
|
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
=back |
|
1832
|
|
|
|
|
|
|
|
|
1833
|
|
|
|
|
|
|
=head2 Computing (proper) features |
|
1834
|
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
=over 4 |
|
1836
|
|
|
|
|
|
|
|
|
1837
|
|
|
|
|
|
|
=item my $features_array_rf = $model->get_all_features($edge) |
|
1838
|
|
|
|
|
|
|
|
|
1839
|
|
|
|
|
|
|
Returns (a reference to) an array which contains all features of the edge |
|
1840
|
|
|
|
|
|
|
(according to settings). |
|
1841
|
|
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
If caching is turned on, tries to look the features up in the cache before |
|
1843
|
|
|
|
|
|
|
computing them. If they are not cached yet, they are computed and stored into |
|
1844
|
|
|
|
|
|
|
the cache. |
|
1845
|
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
The value of a feature is computed by C<get_feature_value>. Values of simple |
|
1847
|
|
|
|
|
|
|
features are precomputed (by calling C<get_simple_feature_values_array>) and |
|
1848
|
|
|
|
|
|
|
passed to the C<get_feature_value> method. |
|
1849
|
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
=item my $feature_value = get_feature_value(3, $simple_feature_values) |
|
1851
|
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
Returns the value of the feature with the given index. |
|
1853
|
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
If it is an array feature (see C<array_features>), its value is (a reference |
|
1855
|
|
|
|
|
|
|
to) an array of all (string) values of the feature (a reference to an empty |
|
1856
|
|
|
|
|
|
|
array if there are no values). |
|
1857
|
|
|
|
|
|
|
|
|
1858
|
|
|
|
|
|
|
If it is not an array feature, its value is composed from the simple feature |
|
1859
|
|
|
|
|
|
|
values. If some of the simple features do not have a value defined, an empty |
|
1860
|
|
|
|
|
|
|
string (C<''>) is returned. |
|
1861
|
|
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
=item my $feature_value = get_array_feature_value ($simple_features_indexes, |
|
1863
|
|
|
|
|
|
|
$simple_feature_values, $start_from) |
|
1864
|
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
Recursively calls itself to compose an array of all values of the feature |
|
1866
|
|
|
|
|
|
|
(composed of the simple features given in C<$simple_features_indexes> array |
|
1867
|
|
|
|
|
|
|
reference), which is a cartesian product on all values of the simple features. |
|
1868
|
|
|
|
|
|
|
The C<$start_from> variable should be C<0> when this method is called and is |
|
1869
|
|
|
|
|
|
|
incremented in the recursive calls. |
|
1870
|
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
=back |
|
1872
|
|
|
|
|
|
|
|
|
1873
|
|
|
|
|
|
|
=head2 Computing simple features |
|
1874
|
|
|
|
|
|
|
|
|
1875
|
|
|
|
|
|
|
=over 4 |
|
1876
|
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
=item my $simple_feature_values = get_simple_feature_values_array($edge) |
|
1878
|
|
|
|
|
|
|
|
|
1879
|
|
|
|
|
|
|
Returns (a reference to) an array of values of all simple features (see |
|
1880
|
|
|
|
|
|
|
C<simple_feature_codes>). For each simple feature, its value can be found |
|
1881
|
|
|
|
|
|
|
on the position in the returned array corresponding to its position in |
|
1882
|
|
|
|
|
|
|
C<simple_feature_codes>. |
|
1883
|
|
|
|
|
|
|
|
|
1884
|
|
|
|
|
|
|
=item my $sub = get_simple_feature_sub_reference ('distance') |
|
1885
|
|
|
|
|
|
|
|
|
1886
|
|
|
|
|
|
|
Translates the feature function string name (eg. C<distance>) to its reference |
|
1887
|
|
|
|
|
|
|
(eg. C<\&feature_distance>). |
|
1888
|
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
=item my $value = get_simple_feature_value ($edge, 9) |
|
1890
|
|
|
|
|
|
|
|
|
1891
|
|
|
|
|
|
|
Returns the value of the simple feature with the given index by calling an |
|
1892
|
|
|
|
|
|
|
appropriate C<feature_*> method on the edge |
|
1893
|
|
|
|
|
|
|
(see L<Treex::Tool::Parser::MSTperl::Edge>). If |
|
1894
|
|
|
|
|
|
|
the feature cannot be computed, an empty string (C<''>) is returned (or a |
|
1895
|
|
|
|
|
|
|
reference to an empty array for array simple features - see |
|
1896
|
|
|
|
|
|
|
C<array_simple_features>). |
|
1897
|
|
|
|
|
|
|
|
|
1898
|
|
|
|
|
|
|
=item feature_distance |
|
1899
|
|
|
|
|
|
|
|
|
1900
|
|
|
|
|
|
|
=item feature_child |
|
1901
|
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
=item feature_parent |
|
1903
|
|
|
|
|
|
|
|
|
1904
|
|
|
|
|
|
|
=item feature_first |
|
1905
|
|
|
|
|
|
|
|
|
1906
|
|
|
|
|
|
|
=item feature_second |
|
1907
|
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
=item feature_preceding_child |
|
1909
|
|
|
|
|
|
|
|
|
1910
|
|
|
|
|
|
|
=item feature_preceding_parent |
|
1911
|
|
|
|
|
|
|
|
|
1912
|
|
|
|
|
|
|
=item feature_following_child |
|
1913
|
|
|
|
|
|
|
|
|
1914
|
|
|
|
|
|
|
=item feature_following_parent |
|
1915
|
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
=item feature_preceding_first |
|
1917
|
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
=item feature_preceding_second |
|
1919
|
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
=item feature_following_first |
|
1921
|
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
=item feature_following_second |
|
1923
|
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
=item feature_between |
|
1925
|
|
|
|
|
|
|
|
|
1926
|
|
|
|
|
|
|
=item feature_foreach |
|
1927
|
|
|
|
|
|
|
|
|
1928
|
|
|
|
|
|
|
=item feature_equals, feature_equals_pc, feature_equals_pc_at |
|
1929
|
|
|
|
|
|
|
|
|
1930
|
|
|
|
|
|
|
A simple feature function C<equals(field_1,field_2)> |
|
1931
|
|
|
|
|
|
|
with "at least once" semantics for multiple values |
|
1932
|
|
|
|
|
|
|
(there can be multiple alignments) |
|
1933
|
|
|
|
|
|
|
with a special output value if one of the fields is unknown |
|
1934
|
|
|
|
|
|
|
(maybe it suffices to emit an undef, as this would occur iff at least |
|
1935
|
|
|
|
|
|
|
one of the arguments is undef; but maybe not and eg. "-1" should be given) |
|
1936
|
|
|
|
|
|
|
|
|
1937
|
|
|
|
|
|
|
This makes it possible to have a simple feature which behaves like this: |
|
1938
|
|
|
|
|
|
|
|
|
1939
|
|
|
|
|
|
|
=over 4 |
|
1940
|
|
|
|
|
|
|
|
|
1941
|
|
|
|
|
|
|
=item returns 1 if the edge between child and parent is also present in the |
|
1942
|
|
|
|
|
|
|
English tree |
|
1943
|
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
=item returns 0 if not |
|
1945
|
|
|
|
|
|
|
|
|
1946
|
|
|
|
|
|
|
=item returns -1 if cannot decide (alignment info is missing for some of the |
|
1947
|
|
|
|
|
|
|
nodes) |
|
1948
|
|
|
|
|
|
|
|
|
1949
|
|
|
|
|
|
|
=back |
|
1950
|
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
Because if the parser has (the ord of the en child node and) |
|
1952
|
|
|
|
|
|
|
the ord of en child's parent and the ord of the en parent node |
|
1953
|
|
|
|
|
|
|
(and the ord of the en parent's parent), the feature can check whether |
|
1954
|
|
|
|
|
|
|
en_parent->ord = en_child->parentOrd |
|
1955
|
|
|
|
|
|
|
|
|
1956
|
|
|
|
|
|
|
C<equalspc(en->ord, en->parent->ord)> |
|
1957
|
|
|
|
|
|
|
|
|
1958
|
|
|
|
|
|
|
=back |
|
1959
|
|
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
|
=head1 AUTHORS |
|
1961
|
|
|
|
|
|
|
|
|
1962
|
|
|
|
|
|
|
Rudolf Rosa <rosa@ufal.mff.cuni.cz> |
|
1963
|
|
|
|
|
|
|
|
|
1964
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
1965
|
|
|
|
|
|
|
|
|
1966
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, |
|
1967
|
|
|
|
|
|
|
Charles University in Prague |
|
1968
|
|
|
|
|
|
|
|
|
1969
|
|
|
|
|
|
|
This module is free software; |
|
1970
|
|
|
|
|
|
|
you can redistribute it and/or modify it under the same terms as Perl itself. |