line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Tool::Parser::MSTperl::FeaturesControl; |
2
|
|
|
|
|
|
|
{ |
3
|
|
|
|
|
|
|
$Treex::Tool::Parser::MSTperl::FeaturesControl::VERSION = '0.11949'; |
4
|
|
|
|
|
|
|
} |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
2433
|
use Moose; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use autodie; |
8
|
|
|
|
|
|
|
use Carp; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use Treex::Tool::Parser::MSTperl::ModelAdditional; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# TODO dynamic features |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Configuration object (Treex::Tool::Parser::MSTperl::Config).
# Must be passed to the constructor; it is read-only and stored
# as a weak reference.
has config => (
    is       => 'ro',
    isa      => 'Treex::Tool::Parser::MSTperl::Config',
    weak_ref => '1',
    required => '1',
);
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# FEATURES

# Number of registered features; BUILD sets it to the length
# of feature_codes.
has feature_count => (
    isa => 'Int',
    is  => 'rw',
);

# Feature codes as supplied to the constructor; BUILD passes each of them
# to set_feature.
has feature_codes_from_config => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { return [] },
);

# Registered feature codes; set_feature appends each new code, so the
# position of a code in this array is its feature index.
has feature_codes => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { return [] },
);

# feature code => 1 for every registered feature
# (used by set_feature to detect repeated definitions).
has feature_codes_hash => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# feature code => feature index (the inverse of feature_codes).
has feature_indexes => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# For each feature (by feature index) holds a reference to an array
# which contains the indexes of all its simple features.
has feature_simple_features_indexes => (
    isa     => 'ArrayRef[ArrayRef[Int]]',
    is      => 'rw',
    default => sub { return [] },
);

# feature index => 1 for features containing at least one
# array simple feature.
has array_features => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);

# feature index => 1 for features containing at least one
# dynamic simple feature.
has dynamic_features => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# SIMPLE FEATURES

# Number of registered simple features; BUILD sets it to the length
# of simple_feature_codes.
has simple_feature_count => (
    isa => 'Int',
    is  => 'rw',
);

# Registered simple feature codes; set_simple_feature appends each new code,
# so the position of a code in this array is its simple feature index.
has simple_feature_codes => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { return [] },
);

# simple feature code => 1 for every registered simple feature.
has simple_feature_codes_hash => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# simple feature code => simple feature index.
has simple_feature_indexes => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { return {} },
);

# For each simple feature (by index) the code reference that computes it.
has simple_feature_subs => (
    isa     => 'ArrayRef',
    is      => 'rw',
    default => sub { return [] },
);

# For each simple feature (by index) the argument(s) passed to its sub,
# as returned by the config's field_name2index.
has simple_feature_sub_arguments => (
    isa     => 'ArrayRef',
    is      => 'rw',
    default => sub { return [] },
);

# simple feature index => 1 for simple features
# that return an array of values.
has array_simple_features => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);

# simple feature index => 1 for simple features that must be always
# recomputed because their value cannot be always computed from input data
# (for labeller - parent's label, brother's label etc.).
has dynamic_simple_features => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { return {} },
);
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# # simple features that get more than 1 argument as input |
128
|
|
|
|
|
|
|
# has 'multiarg_simple_features' => ( |
129
|
|
|
|
|
|
|
# is => 'rw', |
130
|
|
|
|
|
|
|
# isa => 'HashRef[Int]', |
131
|
|
|
|
|
|
|
# default => sub { {} }, |
132
|
|
|
|
|
|
|
# ); |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# CACHING

# Whether get_all_features stores and reuses its results.
# Using cache is turned off by default to fit into RAM;
# turn on if training with a lot of RAM or on small training data;
# turned off when parsing (does not make any sense for parsing).
has use_edge_features_cache => (
    isa     => 'Bool',
    is      => 'ro',
    default => '0',
);

# edge signature => array (ref) of the edge's feature strings,
# filled by get_all_features when the cache is enabled.
has edge_features_cache => (
    isa     => 'HashRef[ArrayRef[Str]]',
    is      => 'rw',
    default => sub { return {} },
);

# Optional additional model, undef unless set.
# NOTE(review): presumably used by the pmi* feature functions - confirm.
has pmi_model => (
    isa     => 'Maybe[Treex::Tool::Parser::MSTperl::ModelAdditional]',
    is      => 'rw',
    default => undef,
);

# Optional additional model, undef unless set.
# NOTE(review): presumably used by the cprob* feature functions - confirm.
has cprob_model => (
    isa     => 'Maybe[Treex::Tool::Parser::MSTperl::ModelAdditional]',
    is      => 'rw',
    default => undef,
);
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# Post-construction hook: registers every feature code found in
# feature_codes_from_config (via set_feature) and then stores the resulting
# numbers of features and simple features in feature_count and
# simple_feature_count.
sub BUILD {
    my ($self) = @_;

    # ignore some settings if in parsing-only mode
    # (currently disabled)
    #    if ( !$self->training ) {
    #        $self->use_edge_features_cache(0);
    #    }

    # register the configured features one by one, in the given order
    for my $configured_code ( @{ $self->feature_codes_from_config } ) {
        $self->set_feature($configured_code);
    }

    # the counts are simply the lengths of the filled code arrays
    my $registered_features = scalar @{ $self->feature_codes };
    $self->feature_count($registered_features);

    my $registered_simple_features = scalar @{ $self->simple_feature_codes };
    $self->simple_feature_count($registered_simple_features);

    return;
}
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Registers one feature given by its code, i.e. by the codes of its simple
# features joined by '|'.
# A feature code that has already been registered is skipped with a warning,
# and so is a simple feature repeated inside one feature code.
# Simple features not seen before are registered via set_simple_feature.
# The feature is marked as an array (dynamic) feature if at least one of its
# simple features is an array (dynamic) simple feature.
sub set_feature {
    my ( $self, $feature_code ) = @_;

    # only the first definition of a feature counts
    if ( $self->feature_codes_hash->{$feature_code} ) {
        warn "Feature '$feature_code' is defined more than once; " .
            "disregarding its later definitions.\n";
        return;
    }

    # collect the simple features the feature consists of
    my $has_array_part   = 0;
    my $has_dynamic_part = 0;
    my %part_seen;
    my @part_indexes;

    PART:
    for my $part_code ( split /\|/, $feature_code ) {

        # each simple feature is used at most once within a feature
        if ( $part_seen{$part_code} ) {
            warn "Simple feature '$part_code' " .
                "is used more than once in '$feature_code'; " .
                "disregarding its later uses.\n";
            next PART;
        }

        # this simple feature has not been used at all yet
        $self->set_simple_feature($part_code)
            unless $self->simple_feature_codes_hash->{$part_code};

        my $part_index = $self->simple_feature_indexes->{$part_code};
        $part_seen{$part_code} = 1;
        $has_array_part   = 1 if $self->array_simple_features->{$part_index};
        $has_dynamic_part = 1 if $self->dynamic_simple_features->{$part_index};
        push @part_indexes, $part_index;
    }

    # save; the feature is appended, so its index is the current count
    my $new_feature_index = scalar @{ $self->feature_codes };
    $self->feature_codes_hash->{$feature_code} = 1;
    $self->feature_indexes->{$feature_code}    = $new_feature_index;
    push @{ $self->feature_codes },                   $feature_code;
    push @{ $self->feature_simple_features_indexes }, [@part_indexes];
    $self->array_features->{$new_feature_index}   = 1 if $has_array_part;
    $self->dynamic_features->{$new_feature_index} = 1 if $has_dynamic_part;

    return;
}
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Registers one simple feature given by its code: finds the sub that computes
# it and the argument(s) to pass to that sub, and stores them together with
# the code at the next free simple feature index.
#
# Recognized code formats:
#   field        - field of the child node (lowercase)
#   FIELD        - field of the parent node (UPPERCASE)
#   X.field      - field of another node, X being one of
#                  1 (first), 2 (second), g (grandchildren), G (grandparent),
#                  l (left sibling), r (right sibling)
#   function(), function(arg), function(arg1,arg2,...)
#                - function feature; the sub is looked up by
#                  get_simple_feature_sub_reference
#
# Dies (or croaks for an unknown function) if the code is not valid.
sub set_simple_feature {
    my ( $self, $simple_feature_code ) = @_;

    # the simple feature is appended, so its index is the current count
    my $simple_feature_index = scalar @{ $self->simple_feature_codes };
    my $simple_feature_sub;
    my $simple_feature_field;

    if ( $simple_feature_code =~ /^([a-zA-Z0-9_]+)$/ ) {

        # simple parent/child feature
        my $field_name = $1;
        if ( $field_name =~ /^[a-z0-9_]+$/ ) {

            # child feature
            $simple_feature_sub   = \&feature_child;
            $simple_feature_field = $field_name;
        } elsif ( $field_name =~ /^[A-Z0-9_]+$/ ) {

            # parent feature
            $simple_feature_sub   = \&feature_parent;
            $simple_feature_field = lc $field_name;
        } else {

            # mixed case is ambiguous
            die "Incorrect simple feature format '$simple_feature_code'. " .
                "Use lowercase (" . lc($simple_feature_code) .
                ") for child node and UPPERCASE (" . uc($simple_feature_code) .
                ") for parent node.\n";
        }

    } elsif ( $simple_feature_code =~ /^([12gGlr])\.([a-z0-9_]+)$/ ) {

        # first/second/(left sibling)/(right sibling)/Grandparent/grandchildren
        # node feature
        my ( $node_code, $field_name ) = ( $1, $2 );
        my %node_sub_for = (
            '1' => \&feature_first,
            '2' => \&feature_second,
            'g' => \&feature_grandchildren,
            'G' => \&feature_grandparent,
            'l' => \&feature_left_sibling,
            'r' => \&feature_right_sibling,
        );

        $simple_feature_field = $field_name;
        $simple_feature_sub   = $node_sub_for{$node_code}
            or croak "Assertion failed!";

    } elsif (
        $simple_feature_code
        =~ /^([12gGlr\.a-z]+|[A-Z]+)\(([-a-z0-9_,]*)\)$/
        )
    {

        # function feature; the argument list is captured here once, so the
        # function name (which may contain '.') never has to be interpolated
        # into another regular expression
        my ( $function_name, $argument_list ) = ( $1, $2 );
        $simple_feature_sub =
            $self->get_simple_feature_sub_reference($function_name);

        # array function
        if ($function_name eq 'between'
            || $function_name eq 'foreach'
            || substr( $function_name, 0, 2 ) eq 'g.'
            )
        {
            $self->array_simple_features->{$simple_feature_index} = 1;
        }

        # dynamic feature
        my %is_dynamic_function = map { $_ => 1 }
            ( 'LABEL', 'l.label', 'prevlabel', 'G.label', 'g.label' );
        if ( $is_dynamic_function{$function_name} ) {
            $self->dynamic_simple_features->{$simple_feature_index} = 1;
        }

        # set $simple_feature_field
        if ( $argument_list eq '' ) {

            # no-arg function feature
            $simple_feature_field = [];
        } elsif ( index( $argument_list, ',' ) < 0 ) {

            # one-arg function feature
            $simple_feature_field = $argument_list;
        } else {

            # multiarg function feature
            my @fields = split /,/, $argument_list;
            $simple_feature_field = \@fields;
        }

    } else {
        die "Incorrect simple feature format '$simple_feature_code'.\n";
    }

    # if $simple_feature_field is (a ref to) an array of field names,
    # handles that correctly by iterating over the array and returning
    # an array of field indexes;
    # if there is an integer argument instead of a field name,
    # detects that and keeps that integer unchanged
    my $simple_feature_sub_arguments =
        $self->config->field_name2index($simple_feature_field);

    # save
    $self->simple_feature_codes_hash->{$simple_feature_code} = 1;
    $self->simple_feature_indexes->{$simple_feature_code} =
        $simple_feature_index;
    push @{ $self->simple_feature_codes }, $simple_feature_code;
    push @{ $self->simple_feature_subs },  $simple_feature_sub;
    push @{ $self->simple_feature_sub_arguments },
        $simple_feature_sub_arguments;

    return;
}
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# FEATURES COMPUTATION |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# Returns an array (ref) of all features of the edge,
# in the form of "feature_index:values_string" strings,
# where feature_index is the index of the feature
# (index in feature_codes, translatable via feature_indexes)
# and values_string are values of corresponding simple features,
# joined together by '|'
# (if any of the simple features does not return a value, the whole feature
# is not present).
# TODO maybe not returning a value is still a valuable information -> include?
#
# Parameters:
#   $edge                  - the edge to compute the features for
#   $only_dynamic_features - 0 (or undef): all features,
#                            1: only dynamic features,
#                           -1: only non-dynamic features;
#                            any other value is treated as 0
#
# If use_edge_features_cache is on, results are stored in
# edge_features_cache and reused. The cache key includes the requested
# filter, so that a list computed for one filter is never returned for
# another one; for the "all features" case the key is the plain edge signature.
# TODO: cache not used now and probably does not even work:
# check&fix or remove
sub get_all_features {
    my ( $self, $edge, $only_dynamic_features ) = @_;

    # normalise the filter to 1, -1 or 0
    my $filter = 0;
    if ($only_dynamic_features) {
        if ( $only_dynamic_features == 1 ) {
            $filter = 1;
        } elsif ( $only_dynamic_features == -1 ) {
            $filter = -1;
        }
    }

    # try to get features from cache
    my $cache_key;
    if ( $self->use_edge_features_cache ) {
        my $edge_signature = $edge->signature();
        $cache_key = $filter
            ? "$edge_signature\t$filter"
            : $edge_signature;

        my $cache_features = $self->edge_features_cache->{$cache_key};
        if ($cache_features) {
            return $cache_features;
        }
    }

    # cache not used or edge features not found in cache
    my $simple_feature_values = $self->get_simple_feature_values_array($edge);
    my @features;
    my $features_count = $self->feature_count;

    FEATURE:
    for my $feature_index ( 0 .. $features_count - 1 ) {

        # apply the dynamic / non-dynamic filter
        my $is_dynamic = $self->dynamic_features->{$feature_index};
        next FEATURE if $filter == 1  && !$is_dynamic;
        next FEATURE if $filter == -1 && $is_dynamic;

        my $feature_value =
            $self->get_feature_value( $feature_index, $simple_feature_values );

        if ( $self->array_features->{$feature_index} ) {

            # it is an array feature, the returned value is an array reference
            foreach my $value ( @{$feature_value} ) {
                push @features, "$feature_index:$value";
            }
        } elsif ( $feature_value ne '' ) {

            # it is not an array feature, the returned value is a string
            # ('' means the feature has no value for this edge)
            push @features, "$feature_index:$feature_value";
        }
    }

    # save result in cache
    if ( $self->use_edge_features_cache ) {
        $self->edge_features_cache->{$cache_key} = \@features;
    }

    return \@features;
}
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# Returns the value of the feature with the given index, computed from
# the already computed values of simple features:
# - for an ordinary feature, the values of its simple features joined
#   by '|', or '' if any of them is undefined or empty;
# - for an array feature, an array (ref) of such joined strings as returned
#   by get_array_feature_value, or an empty array (ref) if that returns
#   a false value.
sub get_feature_value {
    my ( $self, $feature_index, $simple_feature_values ) = @_;

    my $part_indexes =
        $self->feature_simple_features_indexes->[$feature_index];

    # array feature: delegate, starting from the first simple feature
    if ( $self->array_features->{$feature_index} ) {
        my $combinations = $self->get_array_feature_value(
            $part_indexes, $simple_feature_values, 0
        );
        return $combinations || [];
    }

    # ordinary feature: every simple feature must provide a value
    my @parts;
    for my $part_index ( @{$part_indexes} ) {
        my $part_value = $simple_feature_values->[$part_index];
        return '' if !defined $part_value || $part_value eq '';
        push @parts, $part_value;
    }

    return join '|', @parts;
}
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# For features containing subfeatures that return an array of values:
# returns an array (ref) of all combinations of the values of the simple
# features at positions $start_from .. last, each combination being
# the values joined by '|' in the order of the simple features.
# A simple feature that is not an array simple feature contributes
# its single value. The sub recurses over the positions; at the last
# position it returns the value(s) of that simple feature as they are.
sub get_array_feature_value {
    my (
        $self,
        $simple_features_indexes,
        $simple_feature_values,
        $start_from
    ) = @_;

    # value(s) at this position (position = $start_from)
    my $current_index = $simple_features_indexes->[$start_from];
    my $heads         = $simple_feature_values->[$current_index];

    # a non-array simple feature has a single value => wrap it
    $heads = [$heads]
        unless $self->array_simple_features->{$current_index};

    # bottom of recursion: the last simple feature
    my $last_position = scalar( @{$simple_features_indexes} ) - 1;
    return $heads if $start_from >= $last_position;

    # not the last simple feature => combine with the rest
    my $tails = $self->get_array_feature_value(
        $simple_features_indexes,
        $simple_feature_values, $start_from + 1
    );

    my @combined;
    foreach my $head ( @{$heads} ) {
        foreach my $tail ( @{$tails} ) {
            push @combined, "$head|$tail";
        }
    }

    return \@combined;
}
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
# SIMPLE FEATURES |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# Computes the values of all registered simple features for the given edge
# and returns them as an array (ref) indexed by simple feature index.
# Each simple feature sub is called as $sub->( $self, $edge, $arguments ).
sub get_simple_feature_values_array {
    my ( $self, $edge ) = @_;

    my @values;
    my $last_index = $self->simple_feature_count - 1;

    for my $index ( 0 .. $last_index ) {
        my $feature_sub = $self->simple_feature_subs->[$index];

        # If the simple feature has one parameter,
        # then $feature_args is the one argument;
        # if the simple feature has more than one parameter,
        # then $feature_args is a reference to an array of arguments.
        my $feature_args = $self->simple_feature_sub_arguments->[$index];

        # scalar assignment: each sub contributes exactly one value
        my $feature_value = $feature_sub->( $self, $edge, $feature_args );
        push @values, $feature_value;
    }

    return \@values;
}
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# Dispatch table for function features:
# function name as used in a simple feature code => sub computing the feature
# (looked up by get_simple_feature_sub_reference).
my %simple_feature_sub_references = (

    # labels
    'LABEL'     => \&feature_parent_label,
    'prevlabel' => \&feature_previous_label,
    'l.label'   => \&feature_previous_label,
    'G.label'   => \&feature_grandparent_label,
    'g.label'   => \&feature_grandchildren_label,

    # distance and attachment direction
    'distance'   => \&feature_distance,
    'G.distance' => \&feature_grandparent_distance,
    'attdir'     => \&feature_attachement_direction,
    'G.attdir'   => \&feature_grandparent_attachement_direction,    # grandparent to child

    # preceding and following nodes
    'preceding'   => \&feature_preceding_child,
    'PRECEDING'   => \&feature_preceding_parent,
    '1.preceding' => \&feature_preceding_first,
    '2.preceding' => \&feature_preceding_second,
    'following'   => \&feature_following_child,
    'FOLLOWING'   => \&feature_following_parent,
    '1.following' => \&feature_following_first,
    '2.following' => \&feature_following_second,

    # array functions
    'between' => \&feature_between,
    'foreach' => \&feature_foreach,

    # equality and array access
    'equals'     => \&feature_equals,
    'equalspc'   => \&feature_equals_pc,
    'equalspcat' => \&feature_equals_pc_at,
    'arrayat'    => \&feature_array_at_child,
    'ARRAYAT'    => \&feature_array_at_parent,
    'arrayatcp'  => \&feature_array_at_cp,

    # position in the sentence and among the children
    'isfirst'           => \&feature_child_is_first_in_sentence,
    'ISFIRST'           => \&feature_parent_is_first_in_sentence,
    'islast'            => \&feature_child_is_last_in_sentence,
    'ISLAST'            => \&feature_parent_is_last_in_sentence,
    'isfirstchild'      => \&feature_child_is_first_child,
    'islastchild'       => \&feature_child_is_last_child,
    'islastleftchild'   => \&feature_child_is_last_left_child,
    'isfirstrightchild' => \&feature_child_is_first_right_child,
    'childno'           => \&feature_number_of_childs_children,
    'CHILDNO'           => \&feature_number_of_parents_children,

    # substrings
    'substr' => \&feature_substr_child,
    'SUBSTR' => \&feature_substr_parent,

    # additional models
    'pmi'           => \&feature_pmi,
    'pmibucketed'   => \&feature_pmi_bucketed,
    'pmirounded'    => \&feature_pmi_rounded,
    'pmid'          => \&feature_pmi_d,
    'cprob'         => \&feature_cprob,
    'cprobbucketed' => \&feature_cprob_bucketed,
    'cprobrounded'  => \&feature_cprob_rounded,

    # obsolete
    #    'pmitworounded'     => \&feature_pmi_2_rounded,
    #    'pmithreerounded'   => \&feature_pmi_3_rounded,
    #    'cprobtworounded'   => \&feature_cprob_2_rounded,
    #    'cprobthreerounded' => \&feature_cprob_3_rounded,
);
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
# Returns the code reference registered in %simple_feature_sub_references
# for the given function name; croaks if there is none.
sub get_simple_feature_sub_reference {
    my ( $self, $simple_feature_function ) = @_;

    my $sub_reference =
        $simple_feature_sub_references{$simple_feature_function};

    croak "Unknown feature function '$simple_feature_function'!"
        if !$sub_reference;

    return $sub_reference;
}
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
# Returns the parent of the edge's parent node;
# returns undef if there is no grandparent, i.e. the parent is the root.
sub get_grandparent {
    my ( $self, $edge ) = @_;

    my $parent_node = $edge->parent;
    return $parent_node->parent;
}
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
# Simple feature 'distance': the value computed by feature_distance_generic
# for the edge's parent and child nodes.
sub feature_distance {
    my ( $self, $edge ) = @_;

    my @endpoints = ( $edge->parent, $edge->child );
    return $self->feature_distance_generic(@endpoints);
}
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# Simple feature 'G.distance': the value computed by feature_distance_generic
# for the grandparent (the parent of the edge's parent) and the edge's child;
# returns '#novalue#' if there is no grandparent.
sub feature_grandparent_distance {
    my ( $self, $edge ) = @_;

    my $grandparent = $self->get_grandparent($edge);
    if ( defined $grandparent ) {

        # measure from the grandparent; measuring from the parent
        # would only duplicate the 'distance' feature
        return $self->feature_distance_generic( $grandparent, $edge->child );
    } else {
        return '#novalue#';
    }
}
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# Returns the bucket for the signed distance ord($node1) - ord($node2):
# the bucket the config's distance2bucket assigns to that distance if it has
# one, otherwise the config's minBucket for distances not greater than
# minBucket and the config's maxBucket for all other distances.
sub feature_distance_generic {
    my ( $self, $node1, $node2 ) = @_;

    my $signed_distance = $node1->ord - $node2->ord;

    my $known_bucket = $self->config->distance2bucket->{$signed_distance};
    return $known_bucket if defined $known_bucket;

    # out of the range covered by distance2bucket => clamp
    return $signed_distance <= $self->config->minBucket
        ? $self->config->minBucket
        : $self->config->maxBucket;    # $distance >= $self->maxBucket
}
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
# Simple feature 'attdir': the value computed by
# feature_attachement_direction_generic for the edge's parent and child nodes.
sub feature_attachement_direction {
    my ( $self, $edge ) = @_;

    my @endpoints = ( $edge->parent, $edge->child );
    return $self->feature_attachement_direction_generic(@endpoints);
}
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# Simple feature 'G.attdir' (grandparent to child): the value computed by
# feature_attachement_direction_generic for the grandparent (the parent of
# the edge's parent) and the edge's child;
# returns '#novalue#' if there is no grandparent.
sub feature_grandparent_attachement_direction {
    my ( $self, $edge ) = @_;

    my $grandparent = $self->get_grandparent($edge);
    if ( defined $grandparent ) {

        # compare the grandparent with the child; comparing the parent
        # with the child would only duplicate the 'attdir' feature
        return $self->feature_attachement_direction_generic(
            $grandparent, $edge->child
        );
    } else {
        return '#novalue#';
    }
}
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# Returns -1 if $node1 precedes $node2 (its ord is lower),
# and 1 otherwise (including equal ords).
sub feature_attachement_direction_generic {
    my ( $self, $node1, $node2 ) = @_;

    my $first_precedes = $node1->ord < $node2->ord;
    return $first_precedes ? -1 : 1;
}
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
# Simple feature: the value of the field with the given index
# in the edge's child node.
sub feature_child {
    my ( $self, $edge, $field_index ) = @_;

    my $child_fields = $edge->child->fields;
    return $child_fields->[$field_index];
}
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
# Simple feature: the value of the field with the given index
# in the edge's parent node.
sub feature_parent {
    my ( $self, $edge, $field_index ) = @_;

    my $parent_fields = $edge->parent->fields;
    return $parent_fields->[$field_index];
}
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# Simple feature: the value of the field with the given index in the
# grandparent node (the parent of the edge's parent);
# returns '#novalue#' if there is no grandparent.
sub feature_grandparent {
    my ( $self, $edge, $field_index ) = @_;

    my $grandparent = $self->get_grandparent($edge);
    return '#novalue#' if !defined $grandparent;

    my $grandparent_fields = $grandparent->fields;
    return $grandparent_fields->[$field_index];
}
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
# Label of the edge's parent node.
sub feature_parent_label {
    my ( $self, $edge ) = @_;

    my $parent = $edge->parent;
    return $parent->label;
}
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# Label of the child of the left sibling edge (see get_left_sibling);
# when there is no left sibling, the config's SEQUENCE_BOUNDARY_LABEL.
sub feature_previous_label {
    my ( $self, $edge ) = @_;

    my $left_sibling = $self->get_left_sibling($edge);
    return $self->config->SEQUENCE_BOUNDARY_LABEL
        if !defined $left_sibling;

    return $left_sibling->child->label;
}
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# Label of the grandparent found by get_grandparent,
# or '#novalue#' when no grandparent is found.
sub feature_grandparent_label {
    my ( $self, $edge ) = @_;

    my $grandparent = $self->get_grandparent($edge);
    return '#novalue#' if !defined $grandparent;

    return $grandparent->label;
}
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# Value of the given field of the edge's first node.
sub feature_first {
    my ( $self, $edge, $field_index ) = @_;

    my $first = $edge->first;
    return $first->fields->[$field_index];
}
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# Value of the given field of the edge's second node.
sub feature_second {
    my ( $self, $edge, $field_index ) = @_;

    my $second = $edge->second;
    return $second->fields->[$field_index];
}
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# Value of the given field of the child of the left sibling edge
# (see get_left_sibling), or '#start#' when there is no left sibling.
sub feature_left_sibling {
    my ( $self, $edge, $field_index ) = @_;

    my $left_sibling = $self->get_left_sibling($edge);
    return '#start#' if !defined $left_sibling;

    return $left_sibling->child->fields->[$field_index];
}
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
# Value of the given field of the child of the right sibling edge
# (see get_right_sibling), or '#end#' when there is no right sibling.
sub feature_right_sibling {
    my ( $self, $edge, $field_index ) = @_;

    my $right_sibling = $self->get_right_sibling($edge);
    return '#end#' if !defined $right_sibling;

    return $right_sibling->child->fields->[$field_index];
}
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
# Returns the entry of the parent's children list that immediately precedes
# the entry whose child has the same ord as the edge's child.
# Returns nothing when the edge's child is the first entry of that list.
sub get_left_sibling {
    my ( $self, $edge ) = @_;

    my $siblings = $edge->parent->children;

    # there is no left sibling to the leftmost node
    return if $siblings->[0]->child->ord == $edge->child->ord;

    # walk rightwards from index 1 until the edge's own child is reached
    my $position = 1;
    $position++
        until $siblings->[$position]->child->ord == $edge->child->ord;

    # the entry just before that position is the closest left sibling
    return $siblings->[ $position - 1 ];
}
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
# Returns the entry of the parent's children list that immediately follows
# the entry whose child has the same ord as the edge's child.
# Returns nothing when the edge's child is the last entry of that list.
sub get_right_sibling {
    my ( $self, $edge ) = @_;

    my $siblings           = $edge->parent->children;
    my $last_sibling_index = scalar(@$siblings) - 1;

    # there is no right sibling to the rightmost node
    return
        if $siblings->[$last_sibling_index]->child->ord
            == $edge->child->ord;

    # walk leftwards from the last-but-one entry until the edge's own
    # child is reached
    my $position = $last_sibling_index - 1;
    $position--
        until $siblings->[$position]->child->ord == $edge->child->ord;

    # the entry just after that position is the closest right sibling
    return $siblings->[ $position + 1 ];
}
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the child's ord.
# Returns '#start#' when the sentence gives no such node and '#mid#' when
# that node is the edge's parent.
sub feature_preceding_child {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->child->ord - 1 );

    # $node may be undef
    return '#start#' if !$node;

    # no gap between nodes
    return '#mid#' if $edge->parent->ord == $node->ord;

    return $node->fields->[$field_index];
}
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the parent's ord.
# Returns '#start#' when the sentence gives no such node and '#mid#' when
# that node is the edge's child.
sub feature_preceding_parent {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->parent->ord - 1 );

    # $node may be undef
    return '#start#' if !$node;

    # no gap between nodes
    return '#mid#' if $edge->child->ord == $node->ord;

    return $node->fields->[$field_index];
}
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the child's ord.
# Returns '#end#' when the sentence gives no such node and '#mid#' when
# that node is the edge's parent.
sub feature_following_child {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->child->ord + 1 );

    # $node may be undef
    return '#end#' if !$node;

    # no gap between nodes
    return '#mid#' if $edge->parent->ord == $node->ord;

    return $node->fields->[$field_index];
}
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the parent's ord.
# Returns '#end#' when the sentence gives no such node and '#mid#' when
# that node is the edge's child.
sub feature_following_parent {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->parent->ord + 1 );

    # $node may be undef
    return '#end#' if !$node;

    # no gap between nodes
    return '#mid#' if $edge->child->ord == $node->ord;

    return $node->fields->[$field_index];
}
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the first node's ord,
# or '#start#' when the sentence gives no such node.
sub feature_preceding_first {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->first->ord - 1 );

    # $node may be undef
    return $node ? $node->fields->[$field_index] : '#start#';
}
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
# Field value of the node whose ord is one less than the second node's ord.
# Returns '#start#' when the sentence gives no such node and '#mid#' when
# that node is the edge's first node.
sub feature_preceding_second {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->second->ord - 1 );

    # $node may be undef
    return '#start#' if !$node;

    # node preceding second node is first node
    return '#mid#' if $edge->first->ord == $node->ord;

    return $node->fields->[$field_index];
}
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the first node's ord.
# Returns '#end#' when the sentence gives no such node and '#mid#' when
# that node is the edge's second node.
sub feature_following_first {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->first->ord + 1 );

    # $node may be undef
    return '#end#' if !$node;

    # node following first node is second node
    return '#mid#' if $edge->second->ord == $node->ord;

    return $node->fields->[$field_index];
}
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
# Field value of the node whose ord is one more than the second node's ord,
# or '#end#' when the sentence gives no such node.
sub feature_following_second {
    my ( $self, $edge, $field_index ) = @_;

    my $node = $edge->sentence->getNodeByOrd( $edge->second->ord + 1 );

    # $node may be undef
    return $node ? $node->fields->[$field_index] : '#end#';
}
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
# Collects the given field of every node lying strictly between the edge's
# parent and child (by ord) and returns the values as an array reference,
# in ascending ord order. The array is empty when the nodes are adjacent.
sub feature_between {
    my ( $self, $edge, $field_index ) = @_;

    # the span starts right after the leftmost endpoint
    # and ends right before the rightmost one
    my ( $from, $to ) =
        $edge->parent->ord < $edge->child->ord
        ? ( $edge->parent->ord + 1, $edge->child->ord - 1 )
        : ( $edge->child->ord + 1,  $edge->parent->ord - 1 );

    # TODO: use precomputed values instead, i.e. with $len = $to - $from:
    #   $edge->sentence->betweenFeatureValues->
    #       {$field_index}->[$from]->[$len]
    #   (and a bare return when $len < 0)
    my @values;
    foreach my $ord ( $from .. $to ) {
        push @values,
            $edge->sentence->getNodeByOrd($ord)->fields->[$field_index];
    }

    return \@values;
}
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
# Splits the given field of the edge's child on single spaces and returns
# the parts as an array reference. Returns the empty string when the field
# is undefined or empty.
sub feature_foreach {
    my ( $self, $edge, $field_index ) = @_;

    my $values = $edge->child->fields->[$field_index];

    # Test for definedness and emptiness explicitly: a plain truth test
    # would also reject the legitimate single value '0'.
    if ( defined $values && $values ne '' ) {
        return [ split / /, $values ];
    }

    return '';
}
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
# Compares two fields of the edge's child, each taken as a space-separated
# list of values.
# Returns 1 when the lists share at least one value, 0 when they share none,
# and -1 when either field is undefined or empty.
# Croaks unless exactly two field indexes are given.
sub feature_equals {
    my ( $self, $edge, $field_indexes ) = @_;

    # equals takes two arguments
    croak "equals() takes TWO arguments!!!" if @{$field_indexes} != 2;

    my ( $field_index_1, $field_index_2 ) = @{$field_indexes};
    my $values_1 = $edge->child->fields->[$field_index_1];
    my $values_2 = $edge->child->fields->[$field_index_2];

    # we handle undefines and empties specially
    if (
        !defined $values_1
        || $values_1 eq ''
        || !defined $values_2
        || $values_2 eq ''
        )
    {
        return -1;    # undef
    }

    # try to find a match; one match is enough
    foreach my $left ( split / /, $values_1 ) {
        foreach my $right ( split / /, $values_2 ) {
            return 1 if $left eq $right;
        }
    }

    return 0;         # not equal
}
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
# only difference to equals is the line:
# my $values_1 = $edge->PARENT->fields->[$field_index_1];
#
# Compares a field of the edge's parent (first index) with a field of the
# edge's child (second index), each taken as a space-separated list.
# Returns 1 when the lists share at least one value, 0 when they share none,
# and -1 when either field is undefined or empty.
# Croaks unless exactly two field indexes are given.
sub feature_equals_pc {
    my ( $self, $edge, $field_indexes ) = @_;

    # equals takes two arguments
    croak "equals() takes TWO arguments!!!" if @{$field_indexes} != 2;

    my ( $field_index_1, $field_index_2 ) = @{$field_indexes};
    my $values_1 = $edge->parent->fields->[$field_index_1];
    my $values_2 = $edge->child->fields->[$field_index_2];

    # we handle undefines and empties specially
    if (
        !defined $values_1
        || $values_1 eq ''
        || !defined $values_2
        || $values_2 eq ''
        )
    {
        return -1;    # undef
    }

    # try to find a match; one match is enough
    foreach my $parent_value ( split / /, $values_1 ) {
        foreach my $child_value ( split / /, $values_2 ) {
            return 1 if $parent_value eq $child_value;
        }
    }

    return 0;         # not equal
}
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
# sub equalsat - does not make sense |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
# whether the character at the given position of the given field
# equals in parent and in child
#
# $arguments holds the field index and the character position.
# Returns 1 when the characters are equal, 0 when they differ, and -1 when
# either field is undefined or not longer than the position.
# Croaks unless exactly two arguments are given.
sub feature_equals_pc_at {
    my ( $self, $edge, $arguments ) = @_;

    # equals takes two arguments
    croak "equals() takes TWO arguments!!!" if @{$arguments} != 2;

    my ( $field_index, $position ) = @{$arguments};
    my $field_parent = $edge->parent->fields->[$field_index];
    my $field_child  = $edge->child->fields->[$field_index];

    # we handle undefines and too short fields specially
    if (
        !defined $field_parent
        || length($field_parent) <= $position
        || !defined $field_child
        || length($field_child) <= $position
        )
    {
        return -1;    # undef
    }

    my $same_character =
        substr( $field_parent, $position, 1 ) eq
        substr( $field_child, $position, 1 );
    return $same_character ? 1 : 0;
}
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
# substring (field, start, length)
#
# Returns the substring of the given field of the edge's child, starting at
# $start and $length characters long (up to the end of the field when no
# length is given). Returns the empty string when the field is undefined or
# when the requested range lies outside the field.
# Croaks unless two or three arguments are given.
sub feature_substr_child {
    my ( $self, $edge, $arguments ) = @_;

    # substr takes two or three arguments
    if ( @{$arguments} != 3 && @{$arguments} != 2 ) {
        croak "substr() takes THREE or TWO arguments!!!";
    }

    my ( $field_index, $start, $length ) = @{$arguments};
    my $field = $edge->child->fields->[$field_index];
    return '' if !defined $field;

    my $value;
    {

        # a field shorter than $start is handled below, so the
        # "substr outside of string" warning would only be noise
        no warnings 'substr';
        $value =
            defined $length
            ? substr( $field, $start, $length )
            : substr( $field, $start );
    }

    # substr yields undef when the range is outside the field; always
    # return a defined string, like the undefined-field case above
    return defined $value ? $value : '';
}
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
# substring (field, start, length)
#
# Returns the substring of the given field of the edge's parent, starting at
# $start and $length characters long (up to the end of the field when no
# length is given). Returns the empty string when the field is undefined or
# when the requested range lies outside the field.
# Croaks unless two or three arguments are given.
sub feature_substr_parent {
    my ( $self, $edge, $arguments ) = @_;

    # substr takes two or three arguments
    if ( @{$arguments} != 3 && @{$arguments} != 2 ) {
        croak "substr() takes THREE or TWO arguments!!!";
    }

    my ( $field_index, $start, $length ) = @{$arguments};
    my $field = $edge->parent->fields->[$field_index];
    return '' if !defined $field;

    my $value;
    {

        # a field shorter than $start is handled below, so the
        # "substr outside of string" warning would only be noise
        no warnings 'substr';
        $value =
            defined $length
            ? substr( $field, $start, $length )
            : substr( $field, $start );
    }

    # substr yields undef when the range is outside the field; always
    # return a defined string, like the undefined-field case above
    return defined $value ? $value : '';
}
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
# arrayat (array, index)
#
# Both arguments are field indexes of the edge's child: the first field is
# split on single spaces into a list, the second field is used as the
# position in that list. Returns the element found there, or the empty
# string when there is none.
# Croaks unless exactly two arguments are given.
sub feature_array_at_child {
    my ( $self, $edge, $arguments ) = @_;

    # arrayat takes two arguments
    croak "arrayat() takes TWO arguments!!!" if @{$arguments} != 2;

    my ( $array_field, $index_field ) = @{$arguments};
    my $array = $edge->child->fields->[$array_field];
    my $index = $edge->child->fields->[$index_field];

    my @elements = split / /, $array;
    my $element  = $elements[$index];

    return defined $element ? $element : '';
}
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
# Both arguments are field indexes of the edge's parent: the first field is
# split on single spaces into a list, the second field is used as the
# position in that list. Returns the element found there, or the empty
# string when there is none.
# Croaks unless exactly two arguments are given.
sub feature_array_at_parent {
    my ( $self, $edge, $arguments ) = @_;

    # arrayat takes two arguments
    croak "arrayat() takes TWO arguments!!!" if @{$arguments} != 2;

    my ( $array_field, $index_field ) = @{$arguments};
    my $array = $edge->parent->fields->[$array_field];
    my $index = $edge->parent->fields->[$index_field];

    my @elements = split / /, $array;
    my $element  = $elements[$index];

    return defined $element ? $element : '';
}
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# arrayatcp (array, index)
#
# The first argument is a field index of the edge's child; that field is
# split on single spaces into a list. The second argument is a field index
# of the edge's parent; that field is used as the position in the list.
# Returns the element found there, or the empty string when there is none.
# Croaks unless exactly two arguments are given.
sub feature_array_at_cp {
    my ( $self, $edge, $arguments ) = @_;

    # arrayat takes two arguments
    croak "arrayat() takes TWO arguments!!!" if @{$arguments} != 2;

    my ( $array_field, $index_field ) = @{$arguments};
    my $array = $edge->child->fields->[$array_field];
    my $index = $edge->parent->fields->[$index_field];

    my @elements = split / /, $array;
    my $element  = $elements[$index];

    return defined $element ? $element : '';
}
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
# 1 when the ord of the edge's child is 1, 0 otherwise.
sub feature_child_is_first_in_sentence {
    my ( $self, $edge ) = @_;

    return $edge->child->ord == 1 ? 1 : 0;
}
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
# 1 when the ord of the edge's parent is 1, 0 otherwise.
sub feature_parent_is_first_in_sentence {
    my ( $self, $edge ) = @_;

    return $edge->parent->ord == 1 ? 1 : 0;
}
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
# 1 when the ord of the edge's child equals the number of elements in the
# sentence's nodes list, 0 otherwise.
sub feature_child_is_last_in_sentence {
    my ( $self, $edge ) = @_;

    my $child_ord = $edge->child->ord;

    # last ord = number of nodes (because ords are 1-based, 0 is the root node)
    my $last_ord = scalar( @{ $edge->sentence->nodes } );

    return $child_ord == $last_ord ? 1 : 0;
}
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
# 1 when the ord of the edge's parent equals the number of elements in the
# sentence's nodes list, 0 otherwise.
sub feature_parent_is_last_in_sentence {
    my ( $self, $edge ) = @_;

    my $parent_ord = $edge->parent->ord;

    # last ord = number of nodes (because ords are 1-based, 0 is the root node)
    my $last_ord = scalar( @{ $edge->sentence->nodes } );

    return $parent_ord == $last_ord ? 1 : 0;
}
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
# 1 when the first entry of the parent's children list has a child with the
# same ord as the edge's child, 0 otherwise.
sub feature_child_is_first_child {
    my ( $self, $edge ) = @_;

    my $children = $edge->parent->children;

    return $children->[0]->child->ord == $edge->child->ord ? 1 : 0;
}
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
# 1 when the last entry of the parent's children list has a child with the
# same ord as the edge's child, 0 otherwise.
sub feature_child_is_last_child {
    my ( $self, $edge ) = @_;

    my $children     = $edge->parent->children;
    my $childrenNum  = scalar(@$children);
    my $last_sibling = $children->[ $childrenNum - 1 ];

    return $last_sibling->child->ord == $edge->child->ord ? 1 : 0;
}
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
# 1 when the edge's child lies to the right of its parent (by ord) and
# either is the first entry of the parent's children list or has its
# closest left sibling to the left of the parent; 0 otherwise.
sub feature_child_is_first_right_child {
    my ( $self, $edge ) = @_;

    # is left
    return 0 if !( $edge->parent->ord < $edge->child->ord );

    my $siblings = $edge->parent->children;

    # is right & is first (= leftmost) of all siblings
    return 1 if $siblings->[0]->child->ord == $edge->child->ord;

    # find my position among parent's children (is at least 1)
    my $position = 1;
    $position++
        until $siblings->[$position]->child->ord == $edge->child->ord;

    # now ($position-1) is the index of my (closest) left sibling;
    # the child is the first right one exactly when that sibling is left
    my $left_neighbour = $siblings->[ $position - 1 ];
    return $left_neighbour->child->ord < $edge->parent->ord ? 1 : 0;
}
1347
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
# 1 when the edge's child lies to the left of its parent (by ord) and
# either is the last entry of the parent's children list or has its
# closest right sibling to the right of the parent; 0 otherwise.
sub feature_child_is_last_left_child {
    my ( $self, $edge ) = @_;

    # is right
    return 0 if !( $edge->child->ord < $edge->parent->ord );

    my $siblings           = $edge->parent->children;
    my $last_sibling_index = scalar(@$siblings) - 1;

    # is left & is last of all siblings
    return 1
        if $siblings->[$last_sibling_index]->child->ord
            == $edge->child->ord;

    # find my position among parent's children
    # (is at most $last_sibling_index - 1)
    my $position = $last_sibling_index - 1;
    $position--
        until $siblings->[$position]->child->ord == $edge->child->ord;

    # now ($position+1) is the index of my (closest) right sibling;
    # the child is the last left one exactly when that sibling is right
    my $right_neighbour = $siblings->[ $position + 1 ];
    return $edge->parent->ord < $right_neighbour->child->ord ? 1 : 0;
}
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
# Number of entries in the children list of the edge's child;
# 0 when that list is missing or empty.
sub feature_number_of_childs_children {
    my ( $self, $edge ) = @_;

    my $children = $edge->child->children;
    return 0 if !$children;

    return scalar(@$children);
}
1405
|
|
|
|
|
|
|
|
1406
|
|
|
|
|
|
|
# Number of entries in the children list of the edge's parent;
# 0 when that list is missing or empty.
sub feature_number_of_parents_children {
    my ( $self, $edge ) = @_;

    my $children = $edge->parent->children;
    return 0 if !$children;

    return scalar(@$children);
}
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
# Looks up the given field in both the child and the parent of the edge and
# returns $model->get_value for that (child, parent) pair.
# Croaks when either field value is undefined.
sub feature_additional_model {
    my ( $self, $edge, $field_index, $model ) = @_;

    my $child  = $edge->child->fields->[$field_index];
    my $parent = $edge->parent->fields->[$field_index];

    unless ( defined $child && defined $parent ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_value( $child, $parent );
}
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
# Looks up the given field in both the child and the parent of the edge and
# returns $model->get_bucketed_value for that (child, parent) pair.
# Croaks when either field value is undefined.
sub feature_additional_model_bucketed {
    my ( $self, $edge, $field_index, $model ) = @_;

    my $child  = $edge->child->fields->[$field_index];
    my $parent = $edge->parent->fields->[$field_index];

    unless ( defined $child && defined $parent ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_bucketed_value( $child, $parent );
}
1444
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
# $parameters holds a field index and a rounding value. Looks up that field
# in both the child and the parent of the edge and returns
# $model->get_rounded_value for the (child, parent) pair and the rounding.
# Croaks when either field value is undefined.
sub feature_additional_model_rounded {
    my ( $self, $edge, $parameters, $model ) = @_;

    my ( $field_index, $rounding ) = @$parameters;
    my $child  = $edge->child->fields->[$field_index];
    my $parent = $edge->parent->fields->[$field_index];

    unless ( defined $child && defined $parent ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_rounded_value( $child, $parent, $rounding );
}
1459
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
# $parameters holds two field indexes: the first is looked up in the edge's
# child, the second in the edge's parent. Returns $model->get_rounded_value
# for that (child, parent) pair.
# Croaks when either field value is undefined.
#
# NOTE(review): get_rounded_value is called without a rounding argument
# here, unlike in feature_additional_model_rounded - confirm it is intended.
sub feature_additional_model_d {
    my ( $self, $edge, $parameters, $model ) = @_;

    my ( $field_index_c, $field_index_p ) = @$parameters;
    my $child  = $edge->child->fields->[$field_index_c];
    my $parent = $edge->parent->fields->[$field_index_p];

    unless ( defined $child && defined $parent ) {
        croak "Either child or parent is undefined in additional model feature, " .
            "this should not happen!";
    }

    return $model->get_rounded_value( $child, $parent );
}
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
# feature_additional_model for the given field,
# evaluated with the model returned by $self->pmi_model.
sub feature_pmi {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    return $self->feature_additional_model(
        $edge, $field_index, $self->pmi_model
    );
}
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
# feature_additional_model_bucketed for the given field,
# evaluated with the model returned by $self->pmi_model.
sub feature_pmi_bucketed {
    my $self = shift;
    my ( $edge, $field_index ) = @_;

    return $self->feature_additional_model_bucketed(
        $edge, $field_index, $self->pmi_model
    );
}
1486
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
# feature_additional_model_rounded for the given parameters,
# evaluated with the model returned by $self->pmi_model.
sub feature_pmi_rounded {
    my $self = shift;
    my ( $edge, $parameters ) = @_;

    return $self->feature_additional_model_rounded(
        $edge, $parameters, $self->pmi_model
    );
}
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
# Two-field variant of the simple feature backed by the pmi_model accessor:
# forwards the edge and the $parameters array reference unchanged to
# feature_additional_model_d, together with that model.
sub feature_pmi_d {
    my ( $self, $edge, $parameters ) = @_;

    my @delegate_args = ( $edge, $parameters, $self->pmi_model );
    return $self->feature_additional_model_d(@delegate_args);
}
1498
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
# Convenience wrapper around feature_pmi_rounded: calls it with the parameter
# list ( $field_index, 1 ).
# NOTE(review): the constant 1 presumably ends up as the rounding parameter of
# the rounded additional-model feature -- confirm its exact meaning there.
sub feature_pmi_2_rounded {
    my ( $self, $edge, $field_index ) = @_;

    return $self->feature_pmi_rounded( $edge, [ $field_index, 1 ] );
}
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
# Convenience wrapper around feature_pmi_rounded: calls it with the parameter
# list ( $field_index, 2 ).
# NOTE(review): the constant 2 presumably ends up as the rounding parameter of
# the rounded additional-model feature -- confirm its exact meaning there.
sub feature_pmi_3_rounded {
    my ( $self, $edge, $field_index ) = @_;

    return $self->feature_pmi_rounded( $edge, [ $field_index, 2 ] );
}
1512
|
|
|
|
|
|
|
|
1513
|
|
|
|
|
|
|
# Simple feature backed by the model returned by the cprob_model accessor:
# delegates to feature_additional_model with the edge, the field index
# and that model, and returns whatever the delegate returns.
sub feature_cprob {
    my ( $self, $edge, $field_index ) = @_;

    my @delegate_args = ( $edge, $field_index, $self->cprob_model );
    return $self->feature_additional_model(@delegate_args);
}
1518
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
# Bucketed variant of the simple feature backed by the cprob_model accessor:
# delegates to feature_additional_model_bucketed with the edge, the field
# index and that model, and returns whatever the delegate returns.
sub feature_cprob_bucketed {
    my ( $self, $edge, $field_index ) = @_;

    my @delegate_args = ( $edge, $field_index, $self->cprob_model );
    return $self->feature_additional_model_bucketed(@delegate_args);
}
1524
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
# Rounded variant of the simple feature backed by the cprob_model accessor:
# forwards the edge and the $parameters array reference unchanged to
# feature_additional_model_rounded, together with that model.
sub feature_cprob_rounded {
    my ( $self, $edge, $parameters ) = @_;

    my @delegate_args = ( $edge, $parameters, $self->cprob_model );
    return $self->feature_additional_model_rounded(@delegate_args);
}
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
# Convenience wrapper around feature_cprob_rounded: calls it with the
# parameter list ( $field_index, 1 ).
# NOTE(review): the constant 1 presumably ends up as the rounding parameter of
# the rounded additional-model feature -- confirm its exact meaning there.
sub feature_cprob_2_rounded {
    my ( $self, $edge, $field_index ) = @_;

    return $self->feature_cprob_rounded( $edge, [ $field_index, 1 ] );
}
1537
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
# Convenience wrapper around feature_cprob_rounded: calls it with the
# parameter list ( $field_index, 2 ).
# NOTE(review): the constant 2 presumably ends up as the rounding parameter of
# the rounded additional-model feature -- confirm its exact meaning there.
sub feature_cprob_3_rounded {
    my ( $self, $edge, $field_index ) = @_;

    return $self->feature_cprob_rounded( $edge, [ $field_index, 2 ] );
}
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
1; |
1546
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
__END__ |
1548
|
|
|
|
|
|
|
|
1549
|
|
|
|
|
|
|
=pod |
1550
|
|
|
|
|
|
|
|
1551
|
|
|
|
|
|
|
=for Pod::Coverage BUILD |
1552
|
|
|
|
|
|
|
|
1553
|
|
|
|
|
|
|
=encoding utf-8 |
1554
|
|
|
|
|
|
|
|
1555
|
|
|
|
|
|
|
=head1 NAME |
1556
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
Treex::Tool::Parser::MSTperl::FeaturesControl |
1558
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
=head1 VERSION |
1560
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
version 0.11949 |
1562
|
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
=head1 DESCRIPTION |
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
Controls the features used in the model. |
1566
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
=head2 Features |
1568
|
|
|
|
|
|
|
|
1569
|
|
|
|
|
|
|
TODO: outdated, superseded by use of config file -> rewrite |
1570
|
|
|
|
|
|
|
|
1571
|
|
|
|
|
|
|
Each feature has a form C<code:value>. The code describes the information which |
1572
|
|
|
|
|
|
|
is relevant for the feature, and the value is the information retained from |
1573
|
|
|
|
|
|
|
the dependency edge (and possibly other parts of the sentence |
1574
|
|
|
|
|
|
|
(L<Treex::Tool::Parser::MSTperl::Sentence>) stored in C<sentence> field). |
1575
|
|
|
|
|
|
|
|
1576
|
|
|
|
|
|
|
For example, the feature C<L|l:být|pes> means that the lemma of the parent node |
1577
|
|
|
|
|
|
|
(the governing word) is "být" and the lemma of its child node (the dependent |
1578
|
|
|
|
|
|
|
node) is "pes". |
1579
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
Each (proper) feature is composed of several simple features. In the |
1581
|
|
|
|
|
|
|
aforementioned example, the simple feature codes were C<L> and C<l> and their |
1582
|
|
|
|
|
|
|
values "být" and "pes", respectively. Each simple feature code is a string |
1583
|
|
|
|
|
|
|
(case sensitive) and its value is also a string. The simple feature codes are |
1584
|
|
|
|
|
|
|
joined together by the C<|> sign to form the code of the proper feature, and |
1585
|
|
|
|
|
|
|
similarly, the simple feature values joined by C<|> form the proper feature |
1586
|
|
|
|
|
|
|
value. Then, the proper feature code and value are joined together by C<:>. |
1587
|
|
|
|
|
|
|
(Therefore, the codes and values of the simple features must not contain the |
1588
|
|
|
|
|
|
|
C<|> and the C<:> signs.) |
1589
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
By a naming convention, |
1591
|
|
|
|
|
|
|
if the same simple feature can be computed for both the parent node and its |
1592
|
|
|
|
|
|
|
child node, their codes are the same but for the case, which is upper for the |
1593
|
|
|
|
|
|
|
parent and lower for the child. If this is not applicable, an uppercase |
1594
|
|
|
|
|
|
|
code is used. |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
For higher effectiveness the simple feature codes are translated to integers |
1597
|
|
|
|
|
|
|
(see C<simple_feature_codes>). |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
In reality the feature codes are translated to integers as well (see |
1600
|
|
|
|
|
|
|
C<feature_codes>), but this is only an internal issue. You can see these |
1601
|
|
|
|
|
|
|
numbers in the model file if you use the default L<Data::Dumper> format (see |
1602
|
|
|
|
|
|
|
C<load> and C<store>). However, if you use the tsv format (see C<load_tsv>, |
1603
|
|
|
|
|
|
|
C<store_tsv>), you will see the real string feature codes. |
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
Currently the following simple features are available. Any subset of them can |
1606
|
|
|
|
|
|
|
be used to form a proper feature, but their order should follow their order of |
1607
|
|
|
|
|
|
|
appearance in this list (still, this is only a cleanliness and readability |
1608
|
|
|
|
|
|
|
thing, it does not affect the function of the parser in any way). |
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
=over 4 |
1611
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
=item Distance (D) |
1613
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
Distance of the two nodes in the sentence, computed as order of the parent |
1615
|
|
|
|
|
|
|
minus the order of the child. Eg. for the sentence "To je prima pes ." and the |
1616
|
|
|
|
|
|
|
feature D computed on nodes "je" and "pes" (parent and child respectively), |
1617
|
|
|
|
|
|
|
the order of "je" is 2 and the order of "pes" is 4, yielding the feature value |
1618
|
|
|
|
|
|
|
of 2 - 4 = -2. This leads to a feature C<D:-2>. |
1619
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
=item Form (F, f) |
1621
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
The form of the node, i.e. the word exactly as it appears in the sentence text. |
1623
|
|
|
|
|
|
|
|
1624
|
|
|
|
|
|
|
Currently not used as it has not led to any improvement in the parsing. |
1625
|
|
|
|
|
|
|
|
1626
|
|
|
|
|
|
|
=item Lemma (L, l) |
1627
|
|
|
|
|
|
|
|
1628
|
|
|
|
|
|
|
The morphological lemma of the node. |
1629
|
|
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
=item preceding tag (S, s) |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
The morphological tag (or POS tag if you like) of the node preceding (ord-wise) |
1633
|
|
|
|
|
|
|
the node. |
1634
|
|
|
|
|
|
|
|
1635
|
|
|
|
|
|
|
=item Tag (T, t) |
1636
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
The morphological tag of the node. |
1638
|
|
|
|
|
|
|
|
1639
|
|
|
|
|
|
|
=item following tag (U, u) |
1640
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
The morphological tag of the node following (ord-wise) the node. |
1642
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
=item between tag (B) |
1644
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
The morphological tag of each node between (ord-wise) the parent node and the |
1646
|
|
|
|
|
|
|
child node. This simple feature returns (a reference to) an array of values. |
1647
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
=back |
1649
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
Some of the simple features can return an empty string in case they are not |
1651
|
|
|
|
|
|
|
applicable (eg. C<U> for the last node in the sentence), then the whole |
1652
|
|
|
|
|
|
|
feature is not present for the edge. |
1653
|
|
|
|
|
|
|
|
1654
|
|
|
|
|
|
|
Some of the simple features return an array of values (eg. the C<B> simple |
1655
|
|
|
|
|
|
|
feature). This can result in several instances of the feature with the same |
1656
|
|
|
|
|
|
|
code for one edge to appear in the result. |
1657
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
=head1 FIELDS |
1659
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
=head2 Features |
1661
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
TODO: slightly outdated |
1663
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
The examples used here are consistent throughout this part of documentation, |
1665
|
|
|
|
|
|
|
i.e. if several simple features are listed in C<simple_feature_codes> and |
1666
|
|
|
|
|
|
|
then simple feature with index 9 is referred to in C<array_simple_features>, |
1667
|
|
|
|
|
|
|
it really means the C<B> simple feature which is on the 9th position in |
1668
|
|
|
|
|
|
|
C<simple_feature_codes>. |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
=over 4 |
1671
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
=item feature_count (Int) |
1673
|
|
|
|
|
|
|
|
1674
|
|
|
|
|
|
|
Alias of C<scalar @{feature_codes}> (but the integer is really |
1675
|
|
|
|
|
|
|
stored in the field for faster access). |
1676
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
=item feature_codes (ArrayRef[Str]) |
1678
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
Codes of all features to be computed. Their |
1680
|
|
|
|
|
|
|
indexes in this array are used to refer to them in the code. Eg.: |
1681
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
feature_codes ( [( 'L|T', 'l|t', 'L|T|l|t', 'T|B|t')] ) |
1683
|
|
|
|
|
|
|
|
1684
|
|
|
|
|
|
|
=item feature_codes_hash (HashRef[Str]) |
1685
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
1 for each feature code to easily check if a feature exists |
1687
|
|
|
|
|
|
|
|
1688
|
|
|
|
|
|
|
=item feature_indexes (HashRef[Str]) |
1689
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
Index of each feature code in feature_codes (for conversion of feature code to |
1691
|
|
|
|
|
|
|
feature index) |
1692
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
=item feature_simple_features_indexes (ArrayRef[ArrayRef[Int]]) |
1694
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
For each feature contains (a reference to) an array which contains all its |
1696
|
|
|
|
|
|
|
simple feature indexes (corresponding to positions in C<simple_feature_codes> |
1697
|
|
|
|
|
|
|
). Eg. for the 4 features (0 to 3) listed in C<feature_codes> and the 10 |
1698
|
|
|
|
|
|
|
simple features listed in C<simple_feature_codes> (0 to 9): |
1699
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
feature_simple_features_indexes ( [( |
1701
|
|
|
|
|
|
|
[ (1, 5) ], |
1702
|
|
|
|
|
|
|
[ (2, 6) ], |
1703
|
|
|
|
|
|
|
[ (1, 5, 2, 6) ], |
1704
|
|
|
|
|
|
|
[ (5, 9, 6) ], |
1705
|
|
|
|
|
|
|
)] ) |
1706
|
|
|
|
|
|
|
|
1707
|
|
|
|
|
|
|
|
1708
|
|
|
|
|
|
|
=item array_features (HashRef) |
1709
|
|
|
|
|
|
|
|
1710
|
|
|
|
|
|
|
Indexes of features containing array simple features (see |
1711
|
|
|
|
|
|
|
C<array_simple_features>). Eg.: |
1712
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
array_features( { 3 => 1} ) |
1714
|
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
as the feature with index 3 (C<'T|B|t'>) contains the C<B> simple feature |
1716
|
|
|
|
|
|
|
which is an array simple feature. |
1717
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
=back |
1719
|
|
|
|
|
|
|
|
1720
|
|
|
|
|
|
|
=head2 Simple features |
1721
|
|
|
|
|
|
|
|
1722
|
|
|
|
|
|
|
=over 4 |
1723
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
=item simple_feature_count (Int) |
1725
|
|
|
|
|
|
|
|
1726
|
|
|
|
|
|
|
Alias of C<scalar @{simple_feature_codes}> (but the integer is really |
1727
|
|
|
|
|
|
|
stored in the field for faster access). |
1728
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
=item simple_feature_codes (ArrayRef[Str]) |
1730
|
|
|
|
|
|
|
|
1731
|
|
|
|
|
|
|
Codes of all simple features to be computed. Their order is important as their |
1732
|
|
|
|
|
|
|
indexes in this array are used to refer to them in the code, especially in the |
1733
|
|
|
|
|
|
|
C<get_simple_feature> method. Eg.: |
1734
|
|
|
|
|
|
|
|
1735
|
|
|
|
|
|
|
simple_feature_codes ( [('D', 'L', 'l', 'S', 's', 'T', 't', 'U', 'u', 'B')]) |
1736
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
=item simple_feature_codes_hash (HashRef[Str]) |
1738
|
|
|
|
|
|
|
|
1739
|
|
|
|
|
|
|
1 for each simple feature code to easily check if a simple feature exists |
1740
|
|
|
|
|
|
|
|
1741
|
|
|
|
|
|
|
=item simple_feature_indexes (HashRef[Str]) |
1742
|
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
|
Index of each simple feature code in simple_feature_codes (for conversion of |
1744
|
|
|
|
|
|
|
simple feature code to simple feature index) |
1745
|
|
|
|
|
|
|
|
1746
|
|
|
|
|
|
|
=item simple_feature_sub_arguments (ArrayRef) |
1747
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
For each simple feature (on the corresponding index) contains the index of the |
1749
|
|
|
|
|
|
|
field (in C<field_names>), which is used to compute the simple feature value |
1750
|
|
|
|
|
|
|
(together with a subroutine from C<simple_feature_subs>). |
1751
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
If the simple feature takes more than one argument (called a multiarg feature |
1753
|
|
|
|
|
|
|
here), then instead of a single field index there is a reference to an array |
1754
|
|
|
|
|
|
|
of field indexes. |
1755
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
If the simple feature takes other arguments than fields (especially integers), |
1757
|
|
|
|
|
|
|
then these arguments are stored here instead of field indexes. |
1758
|
|
|
|
|
|
|
|
1759
|
|
|
|
|
|
|
=item simple_feature_subs (ArrayRef) |
1760
|
|
|
|
|
|
|
|
1761
|
|
|
|
|
|
|
For faster run, the simple features are internally not represented by their |
1762
|
|
|
|
|
|
|
string codes, which would have to be parsed repeatedly. Instead their codes |
1763
|
|
|
|
|
|
|
are parsed once only (in C<set_simple_feature>) and they are represented as |
1764
|
|
|
|
|
|
|
an integer index of the field which is used to compute the feature (it is the |
1765
|
|
|
|
|
|
|
actual index of the field in the input file line, accessible through |
1766
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Node/fields>) and a reference to a subroutine |
1767
|
|
|
|
|
|
|
(one of the C<feature_*> subs, see below) which computes the feature value |
1768
|
|
|
|
|
|
|
based on the field index and the edge (L<Treex::Tool::Parser::MSTperl::Edge>). |
1769
|
|
|
|
|
|
|
The referenced subroutine is then invoked in C<get_simple_feature_values_array>. |
1770
|
|
|
|
|
|
|
|
1771
|
|
|
|
|
|
|
=item array_simple_features (HashRef[Int]) |
1772
|
|
|
|
|
|
|
|
1773
|
|
|
|
|
|
|
Indexes of simple features that return an array of values instead of a single |
1774
|
|
|
|
|
|
|
string value. Eg.: |
1775
|
|
|
|
|
|
|
|
1776
|
|
|
|
|
|
|
array_simple_features( { 9 => 1} ) |
1777
|
|
|
|
|
|
|
|
1778
|
|
|
|
|
|
|
because in the aforementioned example the C<B> simple feature returns an array |
1779
|
|
|
|
|
|
|
of values and has the index C<9>. |
1780
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
|
1782
|
|
|
|
|
|
|
=back |
1783
|
|
|
|
|
|
|
|
1784
|
|
|
|
|
|
|
=head2 Other |
1785
|
|
|
|
|
|
|
|
1786
|
|
|
|
|
|
|
=over 4 |
1787
|
|
|
|
|
|
|
|
1788
|
|
|
|
|
|
|
=item edge_features_cache (HashRef[ArrayRef[Str]]) |
1789
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
If caching is turned on (see below), all features of any edge computed by the |
1791
|
|
|
|
|
|
|
C<get_feature_simple_features_indexes> method are computed once only, stored |
1792
|
|
|
|
|
|
|
in this cache and then retrieved when needed. |
1793
|
|
|
|
|
|
|
|
1794
|
|
|
|
|
|
|
The key of the hash is the edge signature (see |
1795
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Edge/signature>), the value is |
1796
|
|
|
|
|
|
|
(a reference to) an array of features and their values. |
1797
|
|
|
|
|
|
|
|
1798
|
|
|
|
|
|
|
=back |
1799
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
=head1 METHODS |
1801
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
=head2 Settings |
1803
|
|
|
|
|
|
|
|
1804
|
|
|
|
|
|
|
The best source of information about all the possible settings is the |
1805
|
|
|
|
|
|
|
configuration file itself (usually called C<config.txt>), as it is richly |
1806
|
|
|
|
|
|
|
commented and accompanied by real examples at the same time. |
1807
|
|
|
|
|
|
|
|
1808
|
|
|
|
|
|
|
=over 4 |
1809
|
|
|
|
|
|
|
|
1810
|
|
|
|
|
|
|
=item my $featuresControl = |
1811
|
|
|
|
|
|
|
Treex::Tool::Parser::MSTperl::FeaturesControl->new( |
1812
|
|
|
|
|
|
|
'config' => $config, |
1813
|
|
|
|
|
|
|
'feature_codes_from_config' => $feature_codes_array_reference, |
1814
|
|
|
|
|
|
|
'use_edge_features_cache' => $use_edge_features_cache, |
1815
|
|
|
|
|
|
|
) |
1816
|
|
|
|
|
|
|
|
1817
|
|
|
|
|
|
|
Parses feature codes and creates their in-memory representations. |
1818
|
|
|
|
|
|
|
|
1819
|
|
|
|
|
|
|
=item set_feature ($feature_code) |
1820
|
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
|
Parses the feature code and (if no errors are encountered) creates its |
1822
|
|
|
|
|
|
|
representation in the fields of this package (all C<feature_>* fields and |
1823
|
|
|
|
|
|
|
possibly also the C<array_features> field). |
1824
|
|
|
|
|
|
|
|
1825
|
|
|
|
|
|
|
=item set_simple_feature ($simple_feature_code) |
1826
|
|
|
|
|
|
|
|
1827
|
|
|
|
|
|
|
Parses the simple feature code and creates its representation in the fields of |
1828
|
|
|
|
|
|
|
this package (all C<simple_feature_>* fields and possibly also the |
1829
|
|
|
|
|
|
|
C<array_simple_features> field). |
1830
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
=back |
1832
|
|
|
|
|
|
|
|
1833
|
|
|
|
|
|
|
=head2 Computing (proper) features |
1834
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
=over 4 |
1836
|
|
|
|
|
|
|
|
1837
|
|
|
|
|
|
|
=item my $features_array_rf = $model->get_all_features($edge) |
1838
|
|
|
|
|
|
|
|
1839
|
|
|
|
|
|
|
Returns (a reference to) an array which contains all features of the edge |
1840
|
|
|
|
|
|
|
(according to settings). |
1841
|
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
If caching is turned on, tries to look the features up in the cache before |
1843
|
|
|
|
|
|
|
computing them. If they are not cached yet, they are computed and stored into |
1844
|
|
|
|
|
|
|
the cache. |
1845
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
The value of a feature is computed by C<get_feature_value>. Values of simple |
1847
|
|
|
|
|
|
|
features are precomputed (by calling C<get_simple_feature_values_array>) and |
1848
|
|
|
|
|
|
|
passed to the C<get_feature_value> method. |
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
=item my $feature_value = get_feature_value(3, $simple_feature_values) |
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
Returns the value of the feature with the given index. |
1853
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
If it is an array feature (see C<array_features>), its value is (a reference |
1855
|
|
|
|
|
|
|
to) an array of all (string) values of the feature (a reference to an empty |
1856
|
|
|
|
|
|
|
array if there are no values). |
1857
|
|
|
|
|
|
|
|
1858
|
|
|
|
|
|
|
If it is not an array feature, its value is composed from the simple feature |
1859
|
|
|
|
|
|
|
values. If some of the simple features do not have a value defined, an empty |
1860
|
|
|
|
|
|
|
string (C<''>) is returned. |
1861
|
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
=item my $feature_value = get_array_feature_value ($simple_features_indexes, |
1863
|
|
|
|
|
|
|
$simple_feature_values, $start_from) |
1864
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
Recursively calls itself to compose an array of all values of the feature |
1866
|
|
|
|
|
|
|
(composed of the simple features given in C<$simple_features_indexes> array |
1867
|
|
|
|
|
|
|
reference), which is a cartesian product on all values of the simple features. |
1868
|
|
|
|
|
|
|
The C<$start_from> variable should be C<0> when this method is called and is |
1869
|
|
|
|
|
|
|
incremented in the recursive calls. |
1870
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
=back |
1872
|
|
|
|
|
|
|
|
1873
|
|
|
|
|
|
|
=head2 Computing simple features |
1874
|
|
|
|
|
|
|
|
1875
|
|
|
|
|
|
|
=over 4 |
1876
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
=item my $simple_feature_values = get_simple_feature_values_array($edge) |
1878
|
|
|
|
|
|
|
|
1879
|
|
|
|
|
|
|
Returns (a reference to) an array of values of all simple features (see |
1880
|
|
|
|
|
|
|
C<simple_feature_codes>). For each simple feature, its value can be found |
1881
|
|
|
|
|
|
|
on the position in the returned array corresponding to its position in |
1882
|
|
|
|
|
|
|
C<simple_feature_codes>. |
1883
|
|
|
|
|
|
|
|
1884
|
|
|
|
|
|
|
=item my $sub = get_simple_feature_sub_reference ('distance') |
1885
|
|
|
|
|
|
|
|
1886
|
|
|
|
|
|
|
Translates the feature function string name (eg. C<distance>) to its reference |
1887
|
|
|
|
|
|
|
(eg. C<\&feature_distance>). |
1888
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
=item my $value = get_simple_feature_value ($edge, 9) |
1890
|
|
|
|
|
|
|
|
1891
|
|
|
|
|
|
|
Returns the value of the simple feature with the given index by calling an |
1892
|
|
|
|
|
|
|
appropriate C<feature_*> method on the edge |
1893
|
|
|
|
|
|
|
(see L<Treex::Tool::Parser::MSTperl::Edge>). If |
1894
|
|
|
|
|
|
|
the feature cannot be computed, an empty string (C<''>) is returned (or a |
1895
|
|
|
|
|
|
|
reference to an empty array for array simple features - see |
1896
|
|
|
|
|
|
|
C<array_simple_features>). |
1897
|
|
|
|
|
|
|
|
1898
|
|
|
|
|
|
|
=item feature_distance |
1899
|
|
|
|
|
|
|
|
1900
|
|
|
|
|
|
|
=item feature_child |
1901
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
=item feature_parent |
1903
|
|
|
|
|
|
|
|
1904
|
|
|
|
|
|
|
=item feature_first |
1905
|
|
|
|
|
|
|
|
1906
|
|
|
|
|
|
|
=item feature_second |
1907
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
=item feature_preceding_child |
1909
|
|
|
|
|
|
|
|
1910
|
|
|
|
|
|
|
=item feature_preceding_parent |
1911
|
|
|
|
|
|
|
|
1912
|
|
|
|
|
|
|
=item feature_following_child |
1913
|
|
|
|
|
|
|
|
1914
|
|
|
|
|
|
|
=item feature_following_parent |
1915
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
=item feature_preceding_first |
1917
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
=item feature_preceding_second |
1919
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
=item feature_following_first |
1921
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
=item feature_following_second |
1923
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
=item feature_between |
1925
|
|
|
|
|
|
|
|
1926
|
|
|
|
|
|
|
=item feature_foreach |
1927
|
|
|
|
|
|
|
|
1928
|
|
|
|
|
|
|
=item feature_equals, feature_equals_pc, feature_equals_pc_at |
1929
|
|
|
|
|
|
|
|
1930
|
|
|
|
|
|
|
A simple feature function C<equals(field_1,field_2)> |
1931
|
|
|
|
|
|
|
with "at least once" semantics for multiple values |
1932
|
|
|
|
|
|
|
(there can be multiple alignments) |
1933
|
|
|
|
|
|
|
with a special output value if one of the fields is unknown |
1934
|
|
|
|
|
|
|
(maybe it suffices to emit an undef, as this would occur iff at least |
1935
|
|
|
|
|
|
|
one of the arguments is undef; but maybe not and eg. "-1" should be given) |
1936
|
|
|
|
|
|
|
|
1937
|
|
|
|
|
|
|
This makes it possible to have a simple feature which behaves like this: |
1938
|
|
|
|
|
|
|
|
1939
|
|
|
|
|
|
|
=over 4 |
1940
|
|
|
|
|
|
|
|
1941
|
|
|
|
|
|
|
=item returns 1 if the edge between child and parent is also present in the |
1942
|
|
|
|
|
|
|
English tree |
1943
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
=item returns 0 if not |
1945
|
|
|
|
|
|
|
|
1946
|
|
|
|
|
|
|
=item returns -1 if cannot decide (alignment info is missing for some of the |
1947
|
|
|
|
|
|
|
nodes) |
1948
|
|
|
|
|
|
|
|
1949
|
|
|
|
|
|
|
=back |
1950
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
Because if the parser has (the ord of the en child node and) |
1952
|
|
|
|
|
|
|
the ord of en child's parent and the ord of the en parent node |
1953
|
|
|
|
|
|
|
(and the ord of the en parent's parent), the feature can check whether |
1954
|
|
|
|
|
|
|
en_parent->ord = en_child->parentOrd |
1955
|
|
|
|
|
|
|
|
1956
|
|
|
|
|
|
|
C<equalspc(en->ord, en->parent->ord)> |
1957
|
|
|
|
|
|
|
|
1958
|
|
|
|
|
|
|
=back |
1959
|
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
|
=head1 AUTHORS |
1961
|
|
|
|
|
|
|
|
1962
|
|
|
|
|
|
|
Rudolf Rosa <rosa@ufal.mff.cuni.cz> |
1963
|
|
|
|
|
|
|
|
1964
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1965
|
|
|
|
|
|
|
|
1966
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, |
1967
|
|
|
|
|
|
|
Charles University in Prague |
1968
|
|
|
|
|
|
|
|
1969
|
|
|
|
|
|
|
This module is free software; |
1970
|
|
|
|
|
|
|
you can redistribute it and/or modify it under the same terms as Perl itself. |