line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Tool::Parser::MSTperl::Sentence; |
2
|
|
|
|
|
|
|
{ |
3
|
|
|
|
|
|
|
$Treex::Tool::Parser::MSTperl::Sentence::VERSION = '0.11949'; |
4
|
|
|
|
|
|
|
} |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
2216
|
use Moose; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use Treex::Tool::Parser::MSTperl::Node; |
9
|
|
|
|
|
|
|
use Treex::Tool::Parser::MSTperl::RootNode; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
has config => ( |
12
|
|
|
|
|
|
|
isa => 'Treex::Tool::Parser::MSTperl::Config', |
13
|
|
|
|
|
|
|
is => 'ro', |
14
|
|
|
|
|
|
|
required => '1', |
15
|
|
|
|
|
|
|
); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# unique for each sentence, where sentence means sequence of words |
18
|
|
|
|
|
|
|
# (i.e. stays the same for copies of the same sentence) |
19
|
|
|
|
|
|
|
# needed for caching of features when training the parser |
20
|
|
|
|
|
|
|
# (can be undef if not needed) |
21
|
|
|
|
|
|
|
has id => ( |
22
|
|
|
|
|
|
|
is => 'ro', |
23
|
|
|
|
|
|
|
isa => 'Maybe[Int]', |
24
|
|
|
|
|
|
|
); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
has nodes => ( |
27
|
|
|
|
|
|
|
is => 'rw', |
28
|
|
|
|
|
|
|
isa => 'ArrayRef[Treex::Tool::Parser::MSTperl::Node]', |
29
|
|
|
|
|
|
|
required => 1, |
30
|
|
|
|
|
|
|
); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# root node added |
33
|
|
|
|
|
|
|
has nodes_with_root => ( |
34
|
|
|
|
|
|
|
is => 'rw', |
35
|
|
|
|
|
|
|
isa => 'ArrayRef[Treex::Tool::Parser::MSTperl::Node]', |
36
|
|
|
|
|
|
|
); |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# used only in unlabelled parsing |
39
|
|
|
|
|
|
|
has features => ( |
40
|
|
|
|
|
|
|
is => 'rw', |
41
|
|
|
|
|
|
|
isa => 'Maybe[ArrayRef[Str]]', |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# TODO |
45
|
|
|
|
|
|
|
# has betweenFeatureValues => ( |
46
|
|
|
|
|
|
|
# isa => 'HashRef', |
47
|
|
|
|
|
|
|
# is => 'rw', |
48
|
|
|
|
|
|
|
# default => sub { {} }, |
49
|
|
|
|
|
|
|
# ); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
has edges => ( |
52
|
|
|
|
|
|
|
is => 'rw', |
53
|
|
|
|
|
|
|
isa => 'Maybe[ArrayRef[Treex::Tool::Parser::MSTperl::Edge]]', |
54
|
|
|
|
|
|
|
); |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
sub BUILD { |
57
|
|
|
|
|
|
|
my ($self) = @_; |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
#add root |
60
|
|
|
|
|
|
|
my $root = Treex::Tool::Parser::MSTperl::RootNode->new( |
61
|
|
|
|
|
|
|
fields => $self->config->root_field_values, |
62
|
|
|
|
|
|
|
config => $self->config |
63
|
|
|
|
|
|
|
); |
64
|
|
|
|
|
|
|
my @nodes_with_root; |
65
|
|
|
|
|
|
|
push @nodes_with_root, $root; |
66
|
|
|
|
|
|
|
push @nodes_with_root, @{ $self->nodes }; |
67
|
|
|
|
|
|
|
$self->nodes_with_root( [@nodes_with_root] ); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# fill node ords |
70
|
|
|
|
|
|
|
my $ord = 1; |
71
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
72
|
|
|
|
|
|
|
$node->ord($ord); |
73
|
|
|
|
|
|
|
$ord++; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
return; |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub fill_fields_after_parse { |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
my ($self) = @_; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
#compute edges |
84
|
|
|
|
|
|
|
$self->compute_edges(); |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
#compute features |
87
|
|
|
|
|
|
|
$self->compute_features( $self->config->unlabelledFeaturesControl ); |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
return; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub fill_fields_before_labelling { |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
my ($self) = @_; |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
if ( $self->config->DEBUG >= 3 ) { |
97
|
|
|
|
|
|
|
print $self->id . " fill_fields_before_labelling()\n"; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
#compute edges |
101
|
|
|
|
|
|
|
$self->compute_edges(); |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
#compute features |
104
|
|
|
|
|
|
|
$self->compute_features( $self->config->labelledFeaturesControl ); |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
return; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# sub fill_fields_after_labelling { |
110
|
|
|
|
|
|
|
# |
111
|
|
|
|
|
|
|
# my ($self) = @_; |
112
|
|
|
|
|
|
|
# |
113
|
|
|
|
|
|
|
# seems there is nothing to compute here |
114
|
|
|
|
|
|
|
# (provided that the labels are somewhat extra, |
115
|
|
|
|
|
|
|
# i.e. not part of the feature values) |
116
|
|
|
|
|
|
|
# |
117
|
|
|
|
|
|
|
# return; |
118
|
|
|
|
|
|
|
# } |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# compute node parents and the array of edges |
121
|
|
|
|
|
|
|
# used both in fill_fields_after_parse and fill_fields_before_labelling methods |
122
|
|
|
|
|
|
|
sub compute_edges { |
123
|
|
|
|
|
|
|
my ($self) = @_; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
my @edges; |
126
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# fill node parent |
129
|
|
|
|
|
|
|
# (it can be set either in parent or in parentOrd field) |
130
|
|
|
|
|
|
|
if ( $node->parent ) { |
131
|
|
|
|
|
|
|
$node->parentOrd( $node->parent->ord ); |
132
|
|
|
|
|
|
|
} else { # $node->parentOrd |
133
|
|
|
|
|
|
|
$node->parent( $self->getNodeByOrd( $node->parentOrd ) ); |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
if ( $self->config->DEBUG >= 3 ) { |
137
|
|
|
|
|
|
|
print $node->parentOrd |
138
|
|
|
|
|
|
|
. '(' . $node->parent->fields->[1] . ')' |
139
|
|
|
|
|
|
|
. ' -> ' |
140
|
|
|
|
|
|
|
. $node->ord |
141
|
|
|
|
|
|
|
. '(' . $node->fields->[1] . ')' |
142
|
|
|
|
|
|
|
. "\n"; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# add a new edge |
146
|
|
|
|
|
|
|
my $edge = Treex::Tool::Parser::MSTperl::Edge->new( |
147
|
|
|
|
|
|
|
child => $node, |
148
|
|
|
|
|
|
|
parent => $node->parent, |
149
|
|
|
|
|
|
|
sentence => $self |
150
|
|
|
|
|
|
|
); |
151
|
|
|
|
|
|
|
push @edges, $edge; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# add edge to the parent's list of children |
154
|
|
|
|
|
|
|
push @{ $node->parent->children }, $edge; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
$self->edges( [@edges] ); |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
return; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# compute edge features and join them into sentence features |
163
|
|
|
|
|
|
|
sub compute_features { |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
my ( $self, $featuresControl ) = @_; |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
my @features; |
168
|
|
|
|
|
|
|
foreach my $edge ( @{ $self->edges } ) { |
169
|
|
|
|
|
|
|
my $edge_features; |
170
|
|
|
|
|
|
|
my $ALGORITHM = $self->config->labeller_algorithm; |
171
|
|
|
|
|
|
|
if ( $ALGORITHM < 20 ) { |
172
|
|
|
|
|
|
|
$edge_features = $featuresControl->get_all_features($edge); |
173
|
|
|
|
|
|
|
} else { |
174
|
|
|
|
|
|
|
$edge_features = $featuresControl->get_all_features( $edge, -1 ); |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
$edge->features($edge_features); |
177
|
|
|
|
|
|
|
push @features, @{$edge_features}; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# (TODO) used only in unlabelled parsing |
181
|
|
|
|
|
|
|
$self->features( [@features] ); |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
return; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
sub clear_parse { |
187
|
|
|
|
|
|
|
my ($self) = @_; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
#clear node parents and labels |
190
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
191
|
|
|
|
|
|
|
$node->parent(undef); |
192
|
|
|
|
|
|
|
$node->parentOrd(0); |
193
|
|
|
|
|
|
|
$node->label('_'); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
#clear edges |
197
|
|
|
|
|
|
|
$self->edges(undef); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
#clear features |
200
|
|
|
|
|
|
|
$self->features(undef); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
return; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub copy_nonparsed { |
206
|
|
|
|
|
|
|
my ($self) = @_; |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
#copy nodes |
209
|
|
|
|
|
|
|
my @nodes; |
210
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
211
|
|
|
|
|
|
|
my $node_copy = $node->copy_nonparsed(); |
212
|
|
|
|
|
|
|
push @nodes, $node_copy; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
#create a new instance |
216
|
|
|
|
|
|
|
my $copy = Treex::Tool::Parser::MSTperl::Sentence->new( |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# TODO: maybe should get a different ID for the sake of labelling |
219
|
|
|
|
|
|
|
# (but this is curently not used anyway) |
220
|
|
|
|
|
|
|
id => $self->id, |
221
|
|
|
|
|
|
|
nodes => [@nodes], |
222
|
|
|
|
|
|
|
config => $self->config, |
223
|
|
|
|
|
|
|
); |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
return $copy; |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub copy_nonlabelled { |
229
|
|
|
|
|
|
|
my ($self) = @_; |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
#copy nodes |
232
|
|
|
|
|
|
|
my @nodes; |
233
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
234
|
|
|
|
|
|
|
my $node_copy = $node->copy_nonlabelled(); |
235
|
|
|
|
|
|
|
push @nodes, $node_copy; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
#create a new instance |
239
|
|
|
|
|
|
|
my $copy = Treex::Tool::Parser::MSTperl::Sentence->new( |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# TODO: maybe should get a different ID for the sake of labelling |
242
|
|
|
|
|
|
|
# (but this is curently not used anyway) |
243
|
|
|
|
|
|
|
id => $self->id, |
244
|
|
|
|
|
|
|
nodes => [@nodes], |
245
|
|
|
|
|
|
|
config => $self->config, |
246
|
|
|
|
|
|
|
); |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
return $copy; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
sub setChildParent { |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
# (Int $childOrd, Int $parentOrd) |
254
|
|
|
|
|
|
|
my ( $self, $childOrd, $parentOrd ) = @_; |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
my $child = $self->getNodeByOrd($childOrd); |
257
|
|
|
|
|
|
|
my $parent = $self->getNodeByOrd($parentOrd); |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
$child->parent($parent); |
260
|
|
|
|
|
|
|
$child->parentOrd($parentOrd); |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
return; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
sub len { |
266
|
|
|
|
|
|
|
my ($self) = @_; |
267
|
|
|
|
|
|
|
return scalar( @{ $self->nodes } ) |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
sub getNodeByOrd { |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# (Int $ord) |
273
|
|
|
|
|
|
|
my ( $self, $ord ) = @_; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
if ( $ord >= 0 && $ord <= $self->len() ) { |
276
|
|
|
|
|
|
|
return $self->nodes_with_root->[$ord]; |
277
|
|
|
|
|
|
|
} else { |
278
|
|
|
|
|
|
|
return; # undef |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
sub count_errors_attachement { |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence) |
285
|
|
|
|
|
|
|
my ( $self, $correct_sentence ) = @_; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
my $errors = 0; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
#assert that nodes in the sentences with the same ords |
290
|
|
|
|
|
|
|
# are corresponding nodes |
291
|
|
|
|
|
|
|
foreach my $my_node ( @{ $self->nodes } ) { |
292
|
|
|
|
|
|
|
my $my_parent = $my_node->parentOrd; |
293
|
|
|
|
|
|
|
my $correct_node = $correct_sentence->getNodeByOrd( $my_node->ord ); |
294
|
|
|
|
|
|
|
my $correct_parent = $correct_node->parentOrd; |
295
|
|
|
|
|
|
|
if ( $my_parent != $correct_parent ) { |
296
|
|
|
|
|
|
|
if ( $self->config->lossFunction ) { |
297
|
|
|
|
|
|
|
$errors += |
298
|
|
|
|
|
|
|
$self->attachement_error( |
299
|
|
|
|
|
|
|
$my_node, $my_node->parent, $correct_node->parent |
300
|
|
|
|
|
|
|
); |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
else { |
303
|
|
|
|
|
|
|
$errors++; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
return $errors; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
sub attachement_error { |
312
|
|
|
|
|
|
|
my ( $self, $node, $assignedParent, $correctParent ) = @_; |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
# TODO how do the undefines happen? |
315
|
|
|
|
|
|
|
# they only seem to occur during testing, not during training |
316
|
|
|
|
|
|
|
return 1 if ( !defined $assignedParent || !defined $correctParent ); |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
my $error = 1; |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
my $lossFunction = $self->config->lossFunction; |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
if ( $lossFunction eq 'J' ) { |
323
|
|
|
|
|
|
|
if ( defined $correctParent ) { |
324
|
|
|
|
|
|
|
if ( $correctParent->fields->[4] =~ /^J/ ) { |
325
|
|
|
|
|
|
|
$error = 10; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'A' ) { |
330
|
|
|
|
|
|
|
if ( $node->fields->[4] =~ /^A/ ) { |
331
|
|
|
|
|
|
|
$error = 10; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'NA' ) { |
335
|
|
|
|
|
|
|
if ( defined $correctParent ) { |
336
|
|
|
|
|
|
|
if ($node->fields->[4] =~ /^A/ |
337
|
|
|
|
|
|
|
&& $correctParent->fields->[4] =~ /^N/ |
338
|
|
|
|
|
|
|
) |
339
|
|
|
|
|
|
|
{ |
340
|
|
|
|
|
|
|
$error = 10; |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'NA2' ) { |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# if the child is A |
347
|
|
|
|
|
|
|
if ( $node->fields->[4] =~ /^A/ ) { |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# and the assigned or correct parent is N |
350
|
|
|
|
|
|
|
if ( $correctParent->fields->[4] =~ /^N/ || $assignedParent->fields->[4] =~ /^N/ ) { |
351
|
|
|
|
|
|
|
$error = 10; |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
} |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'JNA' ) { |
356
|
|
|
|
|
|
|
if ( defined $correctParent ) { |
357
|
|
|
|
|
|
|
if ($node->fields->[4] =~ /^A/ |
358
|
|
|
|
|
|
|
&& |
359
|
|
|
|
|
|
|
( |
360
|
|
|
|
|
|
|
$correctParent->fields->[4] =~ /^N/ |
361
|
|
|
|
|
|
|
|| $correctParent->fields->[4] =~ /^J/ |
362
|
|
|
|
|
|
|
) |
363
|
|
|
|
|
|
|
) |
364
|
|
|
|
|
|
|
{ |
365
|
|
|
|
|
|
|
$error = 10; |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'NR2' ) { |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# if the child is N |
372
|
|
|
|
|
|
|
if ( $node->fields->[4] =~ /^N/ ) { |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# and the assigned or correct parent is R |
375
|
|
|
|
|
|
|
if ( $correctParent->fields->[4] =~ /^R/ || $assignedParent->fields->[4] =~ /^R/ ) { |
376
|
|
|
|
|
|
|
$error = 10; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'JNR2' ) { |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# if the child is N or J |
383
|
|
|
|
|
|
|
if ( $node->fields->[4] =~ /^[NJ]/ ) { |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
# and the assigned or correct parent is R |
386
|
|
|
|
|
|
|
if ( $correctParent->fields->[4] =~ /^R/ || $assignedParent->fields->[4] =~ /^R/ ) { |
387
|
|
|
|
|
|
|
$error = 10; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
elsif ( $lossFunction eq 'J2' ) { |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# if the assigned or correct parent is J |
394
|
|
|
|
|
|
|
if ( $correctParent->fields->[4] =~ /^J/ || $assignedParent->fields->[4] =~ /^J/ ) { |
395
|
|
|
|
|
|
|
$error = 10; |
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
return $error; |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
sub count_errors_labelling { |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence) |
405
|
|
|
|
|
|
|
my ( $self, $correct_sentence ) = @_; |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
my $errors = 0; |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
my @correct_labels = |
410
|
|
|
|
|
|
|
map { $_->label } @{ $correct_sentence->nodes }; |
411
|
|
|
|
|
|
|
my @my_labels = |
412
|
|
|
|
|
|
|
map { $_->label } @{ $self->nodes }; |
413
|
|
|
|
|
|
|
for ( my $i = 0; $i < @correct_labels; $i++ ) { |
414
|
|
|
|
|
|
|
if ( $correct_labels[$i] ne $my_labels[$i] ) { |
415
|
|
|
|
|
|
|
$errors++; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
return $errors; |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
sub count_errors_attachement_and_labelling { |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence) |
425
|
|
|
|
|
|
|
my ( $self, $correct_sentence ) = @_; |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
my $errors = 0; |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
#assert that nodes in the sentences with the same ords |
430
|
|
|
|
|
|
|
# are corresponding nodes |
431
|
|
|
|
|
|
|
foreach my $my_node ( @{ $self->nodes } ) { |
432
|
|
|
|
|
|
|
my $my_parent = $my_node->parentOrd; |
433
|
|
|
|
|
|
|
my $my_label = $my_node->label; |
434
|
|
|
|
|
|
|
my $correct_node = $correct_sentence->getNodeByOrd( $my_node->ord ); |
435
|
|
|
|
|
|
|
my $correct_parent = $correct_node->parentOrd; |
436
|
|
|
|
|
|
|
my $correct_label = $correct_node->label; |
437
|
|
|
|
|
|
|
if ( $my_parent != $correct_parent || $my_label ne $correct_label ) { |
438
|
|
|
|
|
|
|
$errors++; |
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
return $errors; |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
sub toParentOrdsArray { |
446
|
|
|
|
|
|
|
my ($self) = @_; |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
my @parents; |
449
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
450
|
|
|
|
|
|
|
push @parents, $node->parentOrd; |
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
return [@parents]; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
sub toLabelsArray { |
457
|
|
|
|
|
|
|
my ($self) = @_; |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
my @labels; |
460
|
|
|
|
|
|
|
foreach my $node ( @{ $self->nodes } ) { |
461
|
|
|
|
|
|
|
push @labels, $node->label; |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
return [@labels]; |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
1; |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
__END__ |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=pod |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=for Pod::Coverage BUILD |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=encoding utf-8 |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=head1 NAME |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Treex::Tool::Parser::MSTperl::Sentence |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=head1 VERSION |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
version 0.11949 |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=head1 DESCRIPTION |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
Represents a sentence, both parsed an unparsed. |
488
|
|
|
|
|
|
|
Contains an array of nodes which represent the words in the sentence. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
The nodes are ordered, their C<ord> is their 1-based position in the sentence. |
491
|
|
|
|
|
|
|
The C<0 ord> value is reserved for the (technical) sentence root. |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head1 FIELDS |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=over 4 |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=item id (Int) |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
An integer id unique for each sentence (in its proper sense, where sentence |
500
|
|
|
|
|
|
|
is a sequence of tokens - i.e. C<id> stays the same for copies of the same |
501
|
|
|
|
|
|
|
sentence). |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=item nodes (ArrayRef[Treex::Tool::Parser::MSTperl::Node]) |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
(A reference to) an array of nodes (C<Treex::Tool::Parser::MSTperl::Node>) of |
506
|
|
|
|
|
|
|
the sentence. |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
A node represents both a token of the sentence (usually this is a word) and a |
509
|
|
|
|
|
|
|
node in the parse tree of the sentence as well (if the sentence have been |
510
|
|
|
|
|
|
|
parsed). |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=item nodes_with_root (ArrayRef[Treex::Tool::Parser::MSTperl::Node]) |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
Copy of C<nodes> field with a root node |
515
|
|
|
|
|
|
|
(L<Treex::Tool::Parser::MSTperl::RootNode>) added at the beginning. As the |
516
|
|
|
|
|
|
|
root node's C<ord> is C<0> by definition, the position of the nodes in this |
517
|
|
|
|
|
|
|
array exactly corresponds to its C<ord>. |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=item edges (Maybe[ArrayRef[Treex::Tool::Parser::MSTperl::Edge]]) |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
If the sentence is parsed (i.e. the nodes know their parents), this field |
522
|
|
|
|
|
|
|
contains (a reference to) an array of all edges |
523
|
|
|
|
|
|
|
(L<Treex::Tool::Parser::MSTperl::Edge>) in the parse tree of the sentence. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
This field is set by the C<sub> C<fill_fields_after_parse>. |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
If the sentence is not parsed, this field is C<undef>. |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=item features (Maybe[ArrayRef[Str]]) |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
If the sentence is parsed, this field |
532
|
|
|
|
|
|
|
contains (a reference to) an array of all features of all edges in the parse |
533
|
|
|
|
|
|
|
tree of the sentence. If some of the features are repeated in the sentence |
534
|
|
|
|
|
|
|
(i.e. they are present in severeal edges or even repeated in one edge), they |
535
|
|
|
|
|
|
|
are repeated here as well, i.e. this is not a set in mathematical sense but a |
536
|
|
|
|
|
|
|
(generally unordered) list. |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
This field is set by the C<sub> C<fill_fields_after_parse>. |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
If the sentence is not parsed, this field is C<undef>. |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=back |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 METHODS |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
=head2 Constructor |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=over 4 |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
=item my $sentence = Treex::Tool::Parser::MSTperl::Sentence->new( |
551
|
|
|
|
|
|
|
id => 12, nodes => [$node1, $node2, $node3, ...]); |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Creates a new sentence. The C<id> must be unique (but copies of the same |
554
|
|
|
|
|
|
|
sentence are to share the same id). It is used for edge signature generation |
555
|
|
|
|
|
|
|
(L<Treex::Tool::Parser::MSTperl::Edge/signature>) in edge features caching (and |
556
|
|
|
|
|
|
|
therefore does not have to be set if caching is disabled). |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
The order of the nodes denotes their order in the sentence, starting from the |
559
|
|
|
|
|
|
|
node with C<ord> 1, i.e. the technical root |
560
|
|
|
|
|
|
|
(L<Treex::Tool::Parser::MSTperl::RootNode>) is not to be included as it is |
561
|
|
|
|
|
|
|
generated automatically in the constructor. |
562
|
|
|
|
|
|
|
The C<ord>s of the nodes (L<Treex::Tool::Parser::MSTperl::Node/ord>) do not |
563
|
|
|
|
|
|
|
have to (and actually shouldn't) be filled in. If they are, they are checked |
564
|
|
|
|
|
|
|
and a warning on STDERR is issued if they do not correspond to the position of |
565
|
|
|
|
|
|
|
the nodes in the array. If they are not, they are filled in automatically |
566
|
|
|
|
|
|
|
during the sentence creation. |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
Other fields (C<nodes_with_root>, C<edges> and C<features>) should usually not |
569
|
|
|
|
|
|
|
be set. C<nodes_with_root> are set automatically during sentence creation (and |
570
|
|
|
|
|
|
|
any value set to it is discarded). C<edges> and C<features> are to be set only |
571
|
|
|
|
|
|
|
if the sentence is parsed (i.e. the nodes know their parents, see |
572
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Node/parent> and |
573
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Node/parentOrd>) by calling the |
574
|
|
|
|
|
|
|
C<fill_fields_after_parse> method. |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
So, if the sentence is already parsed, you should call the |
577
|
|
|
|
|
|
|
C<fill_fields_after_parse> method immediately after creaion of the sentence. |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=item my $unparsed_sentence_copy = $sentence->copy_nonparsed(); |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Creates a new instance of the same sentence with the same C<id> and with |
582
|
|
|
|
|
|
|
copies of the nodes but without any parsing information (like after calling |
583
|
|
|
|
|
|
|
C<clear_parse>). The nodes are copied by calling |
584
|
|
|
|
|
|
|
L<Treex::Tool::Parser::MSTperl::Node/copy_nonparsed>. |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
=back |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=head2 Action methods |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=over 4 |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=item $sentence->setChildParent(5, 3) |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
Sets the parent of the node with the first C<ord> to be the node with the second |
595
|
|
|
|
|
|
|
C<ord> - eg. here, the 3rd node is the parent of the 5th node. |
596
|
|
|
|
|
|
|
It only sets the C<parent> and C<parentOrd> fields in the child node |
597
|
|
|
|
|
|
|
(i.e. it does not create or modify any edges). |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
When all nodes' parents have been set, C<fill_fields_after_parse> can be called. |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
=item $sentence->fill_fields_after_parse() |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
Fills the fields of the sentence and fields of its nodes which can be filled |
604
|
|
|
|
|
|
|
only for a sentence that has already been parsed (i.e. if the nodes' C<parent> |
605
|
|
|
|
|
|
|
or C<parentOrd> fields are filled). |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
The fields which are filled by this subroutine are C<edges> and C<features> |
608
|
|
|
|
|
|
|
for the sentence and C<parent> or C<parentOrd> for each of the sentence nodes |
609
|
|
|
|
|
|
|
which do not have the field set. |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
=item $sentence->clear_parse() |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
Is kind of an inversion of the C<fill_fields_after_parse> method. It clears |
614
|
|
|
|
|
|
|
the C<edges> and C<features> fields and also unsets the parents of all nodes |
615
|
|
|
|
|
|
|
(by setting their C<parent> field to C<undef> and C<parentOrd> to C<0>). |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
=back |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
=head2 Information methods |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
=over 4 |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=item $sentence->len() |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
Returns length of the sentence, i.e. number of nodes in the sentence. |
626
|
|
|
|
|
|
|
Each node corresponds to one word (one token to be more precise). |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
=item $sentence->count_errors_attachement($correct_sentence) |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
Compares the parse tree of the sentence with its correct parse tree, |
631
|
|
|
|
|
|
|
represented by an instance of the same sentence containing its correct parse. |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
An error is considered to be an incorrectly assigned governing node. So, the |
634
|
|
|
|
|
|
|
parents of all nodes (obviously not including the root node) are compared and |
635
|
|
|
|
|
|
|
if they are different, it is counted as an error. This leads to a minimum |
636
|
|
|
|
|
|
|
number of errors equal to 0 and maximum number equal to the length of the |
637
|
|
|
|
|
|
|
sentence. |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=item $sentence->count_errors_labelling($correct_sentence) |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
Compares the labelling of the sentence with its correct labelling, |
642
|
|
|
|
|
|
|
represented by an instance of the same sentence containing the correct labels. |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
An error is considered to be an incorrectly assigned label. So, the |
645
|
|
|
|
|
|
|
labels of all edges (technically stored in the child nodes) are compared and |
646
|
|
|
|
|
|
|
if they are different, it is counted as an error. This leads to a minimum |
647
|
|
|
|
|
|
|
number of errors equal to 0 and maximum number equal to the length of the |
648
|
|
|
|
|
|
|
sentence. |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=item $sentence->getNodeByOrd(6) |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
Returns the node with this C<ord> (it can also be the root node if the C<ord> |
653
|
|
|
|
|
|
|
is 0) or C<undef> if the C<ord> is out of range. |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
=item $sentence->toString() |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
Returns forms of the nodes joined by spaces (i.e. the sentence as a text but |
658
|
|
|
|
|
|
|
with a space between each two adjacent tokens). |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=item $sentence->toParentOrdsArray() |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
Returns (a reference to) an array of node parent ords, i.e. for the sentence |
663
|
|
|
|
|
|
|
"Tom is big", where "is" is a child of the root node and "Tom" and "big" are |
664
|
|
|
|
|
|
|
children of "is", this method returns C<[2, 0, 2]>. |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
=back |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=head1 AUTHORS |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
Rudolf Rosa <rosa@ufal.mff.cuni.cz> |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles |
675
|
|
|
|
|
|
|
University in Prague |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under |
678
|
|
|
|
|
|
|
the same terms as Perl itself. |