line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Core::Document; |
2
|
|
|
|
|
|
|
$Treex::Core::Document::VERSION = '2.20210102'; |
3
|
24
|
|
|
24
|
|
967342
|
use Moose; |
|
24
|
|
|
|
|
7923477
|
|
|
24
|
|
|
|
|
196
|
|
4
|
24
|
|
|
24
|
|
188370
|
use Treex::Core::Common; |
|
24
|
|
|
|
|
93
|
|
|
24
|
|
|
|
|
154
|
|
5
|
24
|
|
|
24
|
|
153259
|
use Treex::Core::Config; |
|
24
|
|
|
|
|
87
|
|
|
24
|
|
|
|
|
896
|
|
6
|
24
|
|
|
24
|
|
13473
|
use Treex::Core::DocZone; |
|
24
|
|
|
|
|
95
|
|
|
24
|
|
|
|
|
1164
|
|
7
|
24
|
|
|
24
|
|
14547
|
use Treex::Core::Bundle; |
|
24
|
|
|
|
|
107
|
|
|
24
|
|
|
|
|
1218
|
|
8
|
|
|
|
|
|
|
|
9
|
24
|
|
|
24
|
|
232
|
use Treex::PML; |
|
24
|
|
|
|
|
62
|
|
|
24
|
|
|
|
|
3824
|
|
10
|
|
|
|
|
|
|
Treex::PML::UseBackends('PMLBackend'); |
11
|
|
|
|
|
|
|
Treex::PML::AddResourcePath( Treex::Core::Config->pml_schema_dir() ); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
with 'Treex::Core::WildAttr'; |
14
|
|
|
|
|
|
|
|
15
|
24
|
|
|
24
|
|
199
|
use Scalar::Util qw( weaken reftype ); |
|
24
|
|
|
|
|
70
|
|
|
24
|
|
|
|
|
1583
|
|
16
|
|
|
|
|
|
|
|
17
|
24
|
|
|
24
|
|
183
|
use PerlIO::via::gzip; |
|
24
|
|
|
|
|
65
|
|
|
24
|
|
|
|
|
669
|
|
18
|
24
|
|
|
24
|
|
159
|
use Storable; |
|
24
|
|
|
|
|
58
|
|
|
24
|
|
|
|
|
1938
|
|
19
|
24
|
|
|
24
|
|
191
|
use Digest::MD5 qw(md5_hex); |
|
24
|
|
|
|
|
66
|
|
|
24
|
|
|
|
|
1196
|
|
20
|
24
|
|
|
24
|
|
183
|
use Lingua::Interset::FeatureStructure; |
|
24
|
|
|
|
|
56
|
|
|
24
|
|
|
|
|
11791
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
has loaded_from => ( is => 'rw', isa => 'Str', default => '' ); |
23
|
|
|
|
|
|
|
has path => ( is => 'rw', isa => 'Str' ); |
24
|
|
|
|
|
|
|
has file_stem => ( is => 'rw', isa => 'Str', default => 'noname' ); |
25
|
|
|
|
|
|
|
has file_number => ( is => 'rw', isa => 'Str', builder => 'build_file_number' ); |
26
|
|
|
|
|
|
|
has compress => ( is => 'rw', isa => 'Bool', default => undef, documentation => 'compression to .gz' ); |
27
|
|
|
|
|
|
|
has storable => ( |
28
|
|
|
|
|
|
|
is => 'rw', |
29
|
|
|
|
|
|
|
isa => 'Bool', |
30
|
|
|
|
|
|
|
default => undef, |
31
|
|
|
|
|
|
|
documentation => 'using Storable with gz compression instead of Treex::PML' |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has _hash => ( is => 'rw', isa => 'Str' ); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub get_hash { |
37
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
38
|
0
|
0
|
|
|
|
0
|
if ( ! defined($self->_hash) ) { |
39
|
0
|
|
|
|
|
0
|
$Storable::canonical = 1; |
40
|
0
|
|
|
|
|
0
|
$self->_set_hash(md5_hex(Storable::nfreeze($self))); |
41
|
0
|
|
|
|
|
0
|
$Storable::canonical = 0; |
42
|
|
|
|
|
|
|
} |
43
|
0
|
|
|
|
|
0
|
return $self->_hash; |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
sub set_hash { |
47
|
0
|
|
|
0
|
0
|
0
|
my ($self, $hash) = @_; |
48
|
|
|
|
|
|
|
|
49
|
0
|
|
|
|
|
0
|
$self->_set_hash($hash); |
50
|
|
|
|
|
|
|
|
51
|
0
|
|
|
|
|
0
|
return; |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
has _pmldoc => ( |
55
|
|
|
|
|
|
|
isa => 'Treex::PML::Document', |
56
|
|
|
|
|
|
|
is => 'rw', |
57
|
|
|
|
|
|
|
init_arg => 'pml_doc', |
58
|
|
|
|
|
|
|
writer => '_set_pmldoc', |
59
|
|
|
|
|
|
|
handles => { |
60
|
|
|
|
|
|
|
set_filename => 'changeFilename', |
61
|
|
|
|
|
|
|
map { $_ => $_ } |
62
|
|
|
|
|
|
|
qw( clone writeFile writeTo filename URL |
63
|
|
|
|
|
|
|
changeFilename changeURL fileFormat changeFileFormat |
64
|
|
|
|
|
|
|
backend changeBackend encoding changeEncoding userData |
65
|
|
|
|
|
|
|
changeUserData metaData changeMetaData listMetaData |
66
|
|
|
|
|
|
|
appData changeAppData listAppData |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
documentRootData |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
FS changeFS |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
hint changeHint pattern_count pattern patterns |
73
|
|
|
|
|
|
|
changePatterns tail changeTail |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
trees changeTrees treeList tree delete_tree lastTreeNo notSaved |
76
|
|
|
|
|
|
|
currentTreeNo currentNode nodes value_line value_line_list |
77
|
|
|
|
|
|
|
determine_node_type ) |
78
|
|
|
|
|
|
|
}, |
79
|
|
|
|
|
|
|
builder => '_create_empty_pml_doc', |
80
|
|
|
|
|
|
|
); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
has _index => ( |
83
|
|
|
|
|
|
|
is => 'rw', |
84
|
|
|
|
|
|
|
default => sub { return {} }, |
85
|
|
|
|
|
|
|
); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
has _backref => ( |
88
|
|
|
|
|
|
|
is => 'rw', |
89
|
|
|
|
|
|
|
default => sub { return {} }, |
90
|
|
|
|
|
|
|
); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
has _latest_node_number => ( # for generating document-unique IDs |
93
|
|
|
|
|
|
|
is => 'rw', |
94
|
|
|
|
|
|
|
default => 0, |
95
|
|
|
|
|
|
|
); |
96
|
|
|
|
|
|
|
|
97
|
24
|
|
|
24
|
|
230
|
use Treex::PML::Factory; |
|
24
|
|
|
|
|
56
|
|
|
24
|
|
|
|
|
99543
|
|
98
|
|
|
|
|
|
|
my $factory = Treex::PML::Factory->new(); |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
my $highest_file_number = 1; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# the description attribute is stored inside the meta structures of pml documents, |
103
|
|
|
|
|
|
|
# that is why it is not realized as a regular Moose attribute |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub set_description { |
106
|
2
|
|
|
2
|
0
|
15
|
my ( $self, $attr_value ) = @_; |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
return Treex::PML::Node::set_attr( |
109
|
|
|
|
|
|
|
$self->metaData('pml_root')->{meta}, |
110
|
2
|
|
|
|
|
10
|
'description', $attr_value |
111
|
|
|
|
|
|
|
); |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub description { |
115
|
4
|
|
|
4
|
1
|
178
|
my $self = shift; |
116
|
4
|
|
|
|
|
20
|
return Treex::PML::Node::attr( $self->metaData('pml_root')->{meta}, 'description' ); |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub build_file_number { |
120
|
31
|
|
|
31
|
0
|
1063
|
return sprintf "%03d", $highest_file_number++; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Full filename without the extension |
124
|
|
|
|
|
|
|
sub full_filename { |
125
|
2
|
50
|
|
2
|
1
|
18
|
log_fatal 'Incorrect number of arguments' if @_ != 1; |
126
|
2
|
|
|
|
|
6
|
my $self = shift; |
127
|
2
|
|
|
|
|
5
|
my $path = ''; |
128
|
2
|
50
|
33
|
|
|
68
|
if (defined $self->path && $self->path ne ''){ |
129
|
0
|
|
|
|
|
0
|
$path = $self->path; |
130
|
0
|
0
|
|
|
|
0
|
$path .= '/' if $path !~ m{/$}; |
131
|
|
|
|
|
|
|
} |
132
|
2
|
|
|
|
|
94
|
return $path . $self->file_stem . $self->file_number; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub BUILD { |
136
|
32
|
|
|
32
|
0
|
92
|
my $self = shift; |
137
|
32
|
|
|
|
|
115
|
my ($params_rf) = @_; |
138
|
32
|
|
|
|
|
76
|
my $pmldoc; |
139
|
|
|
|
|
|
|
|
140
|
32
|
50
|
|
|
|
127
|
if ( defined $params_rf ) { |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# creating Treex::Core::Document from an already existing Treex::PML::Document instance |
143
|
32
|
50
|
|
|
|
247
|
if ( $params_rf->{pmldoc} ) { |
|
|
100
|
|
|
|
|
|
144
|
0
|
|
|
|
|
0
|
$pmldoc = $params_rf->{pmldoc}; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# loading Treex::Core::Document from a file |
148
|
|
|
|
|
|
|
elsif ( $params_rf->{filename} ) { |
149
|
|
|
|
|
|
|
|
150
|
10
|
50
|
|
|
|
63
|
if ( $params_rf->{filename} =~ /.streex$/ ) { |
151
|
0
|
|
|
|
|
0
|
log_fatal 'Storable (.streex) docs must be retrieved by Treex::Core::Document->retrieve_storable($filename)'; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
else { |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# If the file contains invalid PML (e.g. unknown afun value) |
157
|
|
|
|
|
|
|
# Treex::PML fails with die. |
158
|
|
|
|
|
|
|
# TODO: we should rather catch the die message and report it via log_fatal |
159
|
10
|
|
|
|
|
28
|
$pmldoc = eval { |
160
|
|
|
|
|
|
|
# In r10421, ZŽ added here recover => 1: |
161
|
|
|
|
|
|
|
# $factory->createDocumentFromFile( $params_rf->{filename}, { recover => 1 }); |
162
|
|
|
|
|
|
|
# However, if the file contains invalid PML (e.g. unknown afun value), the recover=>1 option |
163
|
|
|
|
|
|
|
# results in returning a $pmldoc which seems to be OK, but it contains no bundles, |
164
|
|
|
|
|
|
|
# so Treex crashes on subsequent blocks which is misleading for users. |
165
|
|
|
|
|
|
|
# If we really want to be fault-tolerant, it seems we would need to set Treex::PML::Instance::Reader::STRICT=0, |
166
|
|
|
|
|
|
|
# but I don't know enough about PML internals and I think it's better to make such errors fatal. |
167
|
|
|
|
|
|
|
# Martin Popel |
168
|
|
|
|
|
|
|
# 2018-03-05: Dan Zeman: adding recover=>1 again. |
169
|
|
|
|
|
|
|
# Random strange errors happen with visually correct files on cluster in the PML backend ("extra content after document end"). |
170
|
|
|
|
|
|
|
# It is paralyzing my work. |
171
|
10
|
|
|
|
|
97
|
$factory->createDocumentFromFile( $params_rf->{filename}, {'recover' => 1} ); |
172
|
|
|
|
|
|
|
}; |
173
|
10
|
50
|
|
|
|
8865133
|
if ($Treex::PML::FSError) { |
174
|
0
|
|
|
|
|
0
|
log_warn "Treex::PML::FSError: $Treex::PML::FSError"; |
175
|
0
|
|
|
|
|
0
|
log_warn "Trying to process the document anyway."; |
176
|
|
|
|
|
|
|
} |
177
|
10
|
0
|
|
|
|
60
|
log_fatal "Error while loading " . $params_rf->{filename} . ( $@ ? "\n$@" : '' ) |
|
|
50
|
|
|
|
|
|
178
|
|
|
|
|
|
|
if !defined $pmldoc; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# constructing treex document from an existing file |
184
|
32
|
100
|
|
|
|
136
|
if ($pmldoc) { |
185
|
10
|
|
|
|
|
645
|
$self->_set_pmldoc($pmldoc); |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# ensuring Treex::Core types (partially copied from the factory) |
188
|
|
|
|
|
|
|
# $doczone hashref will be reused as the blessed instance variable |
189
|
10
|
|
|
|
|
549
|
for my $doczone ($self->get_all_zones()){ |
190
|
3
|
|
|
|
|
820
|
Treex::Core::DocZone->new($doczone); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
10
|
|
|
|
|
308
|
$self->_rebless_and_index(); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
32
|
|
|
|
|
222
|
$self->deserialize_wild; |
197
|
32
|
|
|
|
|
173
|
foreach my $bundle ( $self->get_bundles ) { |
198
|
12
|
|
|
|
|
246
|
$bundle->deserialize_wild; |
199
|
12
|
|
|
|
|
68
|
foreach my $bundlezone ( $bundle->get_all_zones ) { |
200
|
14
|
|
|
|
|
1269
|
foreach my $tree ( $bundlezone->get_all_trees ){ |
201
|
32
|
100
|
|
|
|
2684
|
my $ordered = $tree->type->get_structure_name =~ /[at]-(root|node)/ ? 1 : 0; |
202
|
32
|
|
|
|
|
574
|
my $correct_ord = 0; |
203
|
32
|
100
|
|
|
|
235
|
my @nodes = $tree->get_descendants( { add_self => 1, ($ordered ? (ordered => 1) : ()) } ); |
204
|
32
|
|
|
|
|
119
|
foreach my $node (@nodes){ |
205
|
|
|
|
|
|
|
# normalize ord, so there are no gaps |
206
|
42
|
100
|
|
|
|
553
|
if ($ordered){ |
207
|
28
|
|
|
|
|
921
|
$node->_set_ord($correct_ord); |
208
|
28
|
|
|
|
|
60
|
$correct_ord++; |
209
|
|
|
|
|
|
|
} |
210
|
42
|
|
|
|
|
248
|
$node->deserialize_wild; |
211
|
|
|
|
|
|
|
# Interset, if present, must be deserialized after wild because it relies on wild to take care of the 'other' feature. |
212
|
42
|
100
|
|
|
|
861
|
if ( $node->DOES('Treex::Core::Node::Interset') ) { |
213
|
28
|
|
|
|
|
11880
|
$node->deserialize_iset; |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
32
|
|
|
|
|
3263
|
return; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
sub _rebless_and_index { |
224
|
10
|
|
|
10
|
|
34
|
my $self = shift; |
225
|
10
|
|
|
|
|
54
|
foreach my $bundle ( $self->get_bundles ) { |
226
|
12
|
|
|
|
|
298
|
bless $bundle, 'Treex::Core::Bundle'; |
227
|
|
|
|
|
|
|
|
228
|
12
|
|
|
|
|
416
|
$bundle->_set_document($self); |
229
|
|
|
|
|
|
|
|
230
|
12
|
100
|
|
|
|
57
|
if ( defined $bundle->{zones} ) { |
231
|
8
|
|
|
|
|
68
|
foreach my $zone ( map { $_->value() } $bundle->{zones}->elements ) { |
|
14
|
|
|
|
|
163
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# $zone hashref will be reused as the blessed instance variable |
234
|
14
|
|
|
|
|
166
|
Treex::Core::BundleZone->new($zone); |
235
|
14
|
|
|
|
|
3423
|
$zone->_set_bundle($bundle); |
236
|
|
|
|
|
|
|
|
237
|
14
|
|
|
|
|
66
|
foreach my $tree ( $zone->get_all_trees ) { |
238
|
32
|
|
|
|
|
66
|
my $layer; |
239
|
32
|
50
|
|
|
|
144
|
if ( $tree->type->get_structure_name =~ /(\S)-(root|node|nonterminal|terminal)/ ) { |
240
|
32
|
|
|
|
|
672
|
$layer = uc($1); |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
else { |
243
|
0
|
|
|
|
|
0
|
log_fatal "Unexpected member in zone structure: " . $tree->type->get_structure_name; |
244
|
|
|
|
|
|
|
} |
245
|
32
|
|
|
|
|
132
|
foreach my $node ( $tree, $tree->descendants ) { # must still call Treex::PML::Node's API |
246
|
42
|
|
|
|
|
15695
|
bless $node, "Treex::Core::Node::$layer"; |
247
|
42
|
|
|
|
|
243
|
$self->index_node_by_id( $node->get_id, $node ); |
248
|
42
|
100
|
100
|
|
|
193
|
if ($layer eq 'A' && $node->{iset}){ |
249
|
3
|
|
|
|
|
10
|
$node->{iset} = Lingua::Interset::FeatureStructure->new(%{$node->{iset}}); |
|
3
|
|
|
|
|
38
|
|
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
} |
252
|
32
|
|
|
|
|
30116
|
$tree->_set_zone($zone); |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
} |
257
|
10
|
|
|
|
|
63
|
return; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
sub _pml_attribute_hash { |
261
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
262
|
0
|
|
|
|
|
0
|
return $self->metaData('pml_root')->{meta}; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
#my $_treex_schema_file = Treex::PML::ResolvePath( '.', 'treex_schema.xml', 1 ); |
266
|
|
|
|
|
|
|
my $_treex_schema_file = Treex::Core::Config->pml_schema_dir . "/" . 'treex_schema.xml'; |
267
|
|
|
|
|
|
|
if ( not -f $_treex_schema_file ) { |
268
|
|
|
|
|
|
|
log_fatal "Can't find PML schema $_treex_schema_file"; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
my $_treex_schema = Treex::PML::Schema->new( { filename => $_treex_schema_file } ); |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
sub _create_empty_pml_doc { ## no critic (ProhibitUnusedPrivateSubroutines) |
274
|
32
|
|
|
32
|
|
504
|
my $fsfile = Treex::PML::Document->create |
275
|
|
|
|
|
|
|
( |
276
|
|
|
|
|
|
|
name => "x", #$filename, ??? |
277
|
|
|
|
|
|
|
FS => Treex::PML::FSFormat->new( |
278
|
|
|
|
|
|
|
{ |
279
|
|
|
|
|
|
|
'deepord' => ' N' # ??? |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
), |
282
|
|
|
|
|
|
|
trees => [], |
283
|
|
|
|
|
|
|
backend => 'PMLBackend', |
284
|
|
|
|
|
|
|
encoding => "utf-8", |
285
|
|
|
|
|
|
|
); |
286
|
|
|
|
|
|
|
|
287
|
32
|
|
|
|
|
14980
|
$fsfile->changeMetaData( 'schema-url', 'treex_schema.xml' ); |
288
|
32
|
|
|
|
|
427
|
$fsfile->changeMetaData( 'schema', $_treex_schema ); |
289
|
32
|
|
|
|
|
463
|
$fsfile->changeMetaData( 'pml_root', { meta => {}, bundles => undef, } ); |
290
|
32
|
|
|
|
|
1305
|
return $fsfile; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# --- INDEXING |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
sub index_node_by_id { |
296
|
414
|
|
|
414
|
1
|
8711
|
my $self = shift; |
297
|
414
|
|
|
|
|
1953
|
my ( $id, $node ) = pos_validated_list( |
298
|
|
|
|
|
|
|
\@_, |
299
|
|
|
|
|
|
|
{ isa => 'Treex::Type::Id' }, |
300
|
|
|
|
|
|
|
{ isa => 'Maybe[Treex::Core::Node]' }, #jde to takhle? |
301
|
|
|
|
|
|
|
); |
302
|
414
|
|
|
|
|
12337
|
my $index = $self->_index; |
303
|
414
|
100
|
|
|
|
1034
|
if ( defined $node ) { |
304
|
303
|
|
|
|
|
948
|
$index->{$id} = $node; |
305
|
303
|
|
|
|
|
1313
|
weaken $index->{$id}; |
306
|
|
|
|
|
|
|
|
307
|
303
|
|
|
|
|
1179
|
my $refs = $node->_get_referenced_ids; |
308
|
303
|
|
|
|
|
527
|
foreach my $type ( keys %{$refs} ) { |
|
303
|
|
|
|
|
885
|
|
309
|
4
|
|
|
|
|
19
|
$self->index_backref( $type, $id, $refs->{$type} ); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
else { |
313
|
111
|
|
|
|
|
310
|
delete $index->{$id}; |
314
|
|
|
|
|
|
|
} |
315
|
414
|
|
|
|
|
1247
|
return; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Add references to the reversed references list |
319
|
|
|
|
|
|
|
sub index_backref { |
320
|
24
|
|
|
24
|
0
|
78
|
my ( $self, $type, $source, $targets ) = @_; |
321
|
24
|
|
|
|
|
671
|
my $backref = $self->_backref; |
322
|
|
|
|
|
|
|
|
323
|
24
|
|
|
|
|
51
|
foreach my $target ( @{$targets} ) { |
|
24
|
|
|
|
|
64
|
|
324
|
30
|
100
|
|
|
|
80
|
next if ( !defined($target) ); |
325
|
25
|
|
100
|
|
|
125
|
my $target_backrefs = $backref->{$target} // {}; |
326
|
25
|
|
|
|
|
65
|
$backref->{$target} = $target_backrefs; |
327
|
|
|
|
|
|
|
|
328
|
25
|
100
|
|
|
|
86
|
$target_backrefs->{$type} = [] if ( !$target_backrefs->{$type} ); |
329
|
25
|
|
|
|
|
41
|
push @{ $target_backrefs->{$type} }, $source; |
|
25
|
|
|
|
|
78
|
|
330
|
|
|
|
|
|
|
} |
331
|
24
|
|
|
|
|
76
|
return; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Remove references from the reversed references list |
335
|
|
|
|
|
|
|
sub remove_backref { |
336
|
10
|
|
|
10
|
0
|
35
|
my ( $self, $type, $source, $targets ) = @_; |
337
|
10
|
|
|
|
|
298
|
my $backref = $self->_backref; |
338
|
|
|
|
|
|
|
|
339
|
10
|
|
|
|
|
23
|
foreach my $target ( @{$targets} ) { |
|
10
|
|
|
|
|
25
|
|
340
|
16
|
50
|
|
|
|
43
|
next if ( !defined($target) ); |
341
|
16
|
|
|
|
|
34
|
my $target_backrefs = $backref->{$target}; |
342
|
16
|
50
|
|
|
|
37
|
next if ( !$target_backrefs ); |
343
|
|
|
|
|
|
|
|
344
|
16
|
|
|
|
|
22
|
$target_backrefs->{$type} = [ grep { $_ ne $source } @{ $target_backrefs->{$type} } ]; |
|
14
|
|
|
|
|
50
|
|
|
16
|
|
|
|
|
42
|
|
345
|
|
|
|
|
|
|
} |
346
|
10
|
|
|
|
|
33
|
return; |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# Return a hash of references ( type->[nodes] ) leading to the node with the given id |
350
|
|
|
|
|
|
|
sub get_references_to_id { |
351
|
24
|
|
|
24
|
1
|
65
|
my ( $self, $id ) = @_; |
352
|
24
|
|
|
|
|
645
|
my $backref = $self->_backref; |
353
|
|
|
|
|
|
|
|
354
|
24
|
100
|
|
|
|
92
|
return if ( !$backref->{$id} ); |
355
|
22
|
|
|
|
|
65
|
return $backref->{$id}; # TODO clone ? |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
# Remove all references and backreferences leading to the $node (calls remove_reference() on the source nodes) |
359
|
|
|
|
|
|
|
sub _remove_references_to_node { |
360
|
69
|
|
|
69
|
|
134
|
my ( $self, $node ) = @_; |
361
|
69
|
|
|
|
|
1633
|
my $id = $node->id; |
362
|
69
|
|
|
|
|
1803
|
my $backref = $self->_backref; |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# First, delete backreferences to the $node |
365
|
69
|
|
|
|
|
208
|
my $refs = $node->_get_referenced_ids(); |
366
|
69
|
|
|
|
|
114
|
foreach my $type ( keys %{$refs} ) { |
|
69
|
|
|
|
|
197
|
|
367
|
1
|
|
|
|
|
6
|
$self->remove_backref( $type, $id, $refs->{$type} ); |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Second, delete references to the $node |
371
|
69
|
100
|
|
|
|
248
|
return if ( !$backref->{$id} ); |
372
|
4
|
|
|
|
|
9
|
my $node_backref = $backref->{$id}; |
373
|
|
|
|
|
|
|
|
374
|
4
|
|
|
|
|
8
|
foreach my $type ( keys %{$node_backref} ) { |
|
4
|
|
|
|
|
18
|
|
375
|
6
|
|
|
|
|
11
|
foreach my $source ( @{ $node_backref->{$type} } ) { |
|
6
|
|
|
|
|
19
|
|
376
|
6
|
|
|
|
|
19
|
$self->get_node_by_id($source)->remove_reference( $type, $id ); |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# Third, delete backreferences from the $node |
381
|
4
|
|
|
|
|
13
|
delete $backref->{$id}; |
382
|
4
|
|
|
|
|
18
|
return; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
sub id_is_indexed { |
386
|
183
|
|
|
183
|
1
|
311
|
my $self = shift; |
387
|
183
|
|
|
|
|
826
|
my ($id) = pos_validated_list( |
388
|
|
|
|
|
|
|
\@_, |
389
|
|
|
|
|
|
|
{ isa => 'Treex::Type::Id' }, |
390
|
|
|
|
|
|
|
); |
391
|
183
|
|
|
|
|
5705
|
return ( defined $self->_index->{$id} ); |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
sub get_node_by_id { |
395
|
71
|
|
|
71
|
1
|
1100
|
my $self = shift; |
396
|
71
|
|
|
|
|
298
|
my ($id) = pos_validated_list( |
397
|
|
|
|
|
|
|
\@_, |
398
|
|
|
|
|
|
|
{ isa => 'Treex::Type::Id' }, |
399
|
|
|
|
|
|
|
); |
400
|
71
|
50
|
|
|
|
2122
|
if ( defined $self->_index->{$id} ) { |
401
|
71
|
|
|
|
|
1804
|
return $self->_index->{$id}; |
402
|
|
|
|
|
|
|
} |
403
|
|
|
|
|
|
|
else { |
404
|
0
|
|
|
|
|
0
|
log_fatal "ID not indexed: id=\"$id\""; |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
# This is something very fatal. Treex assumes every node ID to |
407
|
|
|
|
|
|
|
# be valid and pointing to a node *in the given document*. |
408
|
|
|
|
|
|
|
# (It is fine to have a node with no a/lex.rf |
409
|
|
|
|
|
|
|
# attribute, but if the attribute is there, the value |
410
|
|
|
|
|
|
|
# has to be an ID within the document.) |
411
|
|
|
|
|
|
|
# |
412
|
|
|
|
|
|
|
# If your data violates the requirement and your IDs point to |
413
|
|
|
|
|
|
|
# a different document, the only hack we suggest is to drop such |
414
|
|
|
|
|
|
|
# references... |
415
|
|
|
|
|
|
|
} |
416
|
0
|
|
|
|
|
0
|
return; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub get_all_node_ids { |
420
|
0
|
0
|
|
0
|
1
|
0
|
log_fatal('Incorrect number of arguments') if @_ != 1; |
421
|
0
|
|
|
|
|
0
|
my $self = shift; |
422
|
0
|
|
|
|
|
0
|
return ( keys %{ $self->_index } ); |
|
0
|
|
|
|
|
0
|
|
423
|
|
|
|
|
|
|
} |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# -------------------------------------- ACCESS TO BUNDLES ------------------- |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
sub get_bundles { |
428
|
98
|
50
|
|
98
|
1
|
5269
|
log_fatal('Incorrect number of arguments') if @_ != 1; |
429
|
98
|
|
|
|
|
207
|
my $self = shift; |
430
|
98
|
|
|
|
|
524
|
return $self->trees; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
sub create_bundle { |
434
|
28
|
|
|
28
|
1
|
2455
|
my ( $self, $arg_ref ) = @_; |
435
|
28
|
|
|
|
|
782
|
my $fsfile = $self->_pmldoc(); |
436
|
28
|
|
|
|
|
79
|
my $new_bundle; |
437
|
|
|
|
|
|
|
my $position_of_new; |
438
|
|
|
|
|
|
|
|
439
|
28
|
100
|
66
|
|
|
139
|
if ( $arg_ref and ( $arg_ref->{after} or $arg_ref->{before} ) ) { |
|
|
|
66
|
|
|
|
|
440
|
2
|
100
|
|
|
|
7
|
my $reference_bundle = ( $arg_ref->{after} ) ? $arg_ref->{after} : $arg_ref->{before}; |
441
|
2
|
|
|
|
|
9
|
my $position_of_reference = $reference_bundle->get_position; |
442
|
2
|
100
|
|
|
|
6
|
$position_of_new = $position_of_reference + ( $arg_ref->{after} ? 1 : 0 ); |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
else { # default: append at the end of the document |
446
|
26
|
|
|
|
|
97
|
$position_of_new = scalar( $self->get_bundles() ); |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
|
449
|
28
|
|
|
|
|
596
|
$new_bundle = $fsfile->new_tree($position_of_new); |
450
|
28
|
|
|
|
|
1981
|
$new_bundle->set_type_by_name( $fsfile->metaData('schema'), 'bundle.type' ); |
451
|
28
|
|
|
|
|
2375
|
bless $new_bundle, "Treex::Core::Bundle"; # is this correct/sufficient with Moose ???? |
452
|
28
|
|
|
|
|
1073
|
$new_bundle->_set_document($self); |
453
|
|
|
|
|
|
|
|
454
|
28
|
|
|
|
|
153
|
$new_bundle->set_id( "s" . ( $fsfile->lastTreeNo + 1 ) ); |
455
|
|
|
|
|
|
|
|
456
|
28
|
|
|
|
|
114
|
return $new_bundle; |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
# ----------------------- ACCESS TO ZONES ------------------------------------ |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
sub create_zone { |
462
|
4
|
|
|
4
|
1
|
685
|
my $self = shift; |
463
|
4
|
|
|
|
|
48
|
my ( $language, $selector ) = pos_validated_list( |
464
|
|
|
|
|
|
|
\@_, |
465
|
|
|
|
|
|
|
{ isa => 'Treex::Type::LangCode' }, |
466
|
|
|
|
|
|
|
{ isa => 'Treex::Type::Selector', default => '' }, |
467
|
|
|
|
|
|
|
); |
468
|
|
|
|
|
|
|
|
469
|
4
|
|
|
|
|
59
|
my $new_zone = Treex::Core::DocZone->new( |
470
|
|
|
|
|
|
|
{ |
471
|
|
|
|
|
|
|
'language' => $language, |
472
|
|
|
|
|
|
|
'selector' => $selector |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
); |
475
|
|
|
|
|
|
|
|
476
|
4
|
|
|
|
|
1199
|
my $new_element = Treex::PML::Seq::Element->new( 'zone', $new_zone ); |
477
|
|
|
|
|
|
|
|
478
|
4
|
|
|
|
|
43
|
my $meta = $self->metaData('pml_root')->{meta}; |
479
|
4
|
100
|
|
|
|
75
|
if ( defined $meta->{zones} ) { |
480
|
2
|
|
|
|
|
11
|
$meta->{zones}->unshift_element_obj($new_element); |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
else { |
483
|
2
|
|
|
|
|
20
|
$meta->{zones} = Treex::PML::Seq->new( [$new_element] ); |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
|
486
|
4
|
|
|
|
|
81
|
return $new_zone; |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub get_all_zones { |
490
|
15
|
|
|
15
|
0
|
48
|
my $self = shift; |
491
|
15
|
|
|
|
|
101
|
my $meta = $self->metaData('pml_root')->{meta}; |
492
|
15
|
100
|
|
|
|
384
|
return if !$meta->{zones}; |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
# Each element is a pair [$name, $value]. We need just the values. |
495
|
6
|
|
|
|
|
34
|
return map {$_->[1]} $meta->{zones}->elements; |
|
16
|
|
|
|
|
118
|
|
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub get_zone { |
499
|
4
|
|
|
4
|
1
|
16
|
my $self = shift; |
500
|
4
|
|
|
|
|
35
|
my ( $language, $selector ) = pos_validated_list( |
501
|
|
|
|
|
|
|
\@_, |
502
|
|
|
|
|
|
|
{ isa => 'Treex::Type::LangCode' }, |
503
|
|
|
|
|
|
|
{ isa => 'Treex::Type::Selector', default => '' }, |
504
|
|
|
|
|
|
|
); |
505
|
|
|
|
|
|
|
|
506
|
4
|
|
|
|
|
18
|
foreach my $zone ($self->get_all_zones()) { |
507
|
12
|
100
|
66
|
|
|
311
|
return $zone if $zone->language eq $language && $zone->selector eq $selector; |
508
|
|
|
|
|
|
|
} |
509
|
0
|
|
|
|
|
0
|
return; |
510
|
|
|
|
|
|
|
} |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
sub get_or_create_zone { |
513
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
514
|
0
|
|
|
|
|
0
|
my ( $language, $selector ) = pos_validated_list( |
515
|
|
|
|
|
|
|
\@_, |
516
|
|
|
|
|
|
|
{ isa => 'Treex::Type::LangCode' }, |
517
|
|
|
|
|
|
|
{ isa => 'Treex::Type::Selector', default => '' }, |
518
|
|
|
|
|
|
|
); |
519
|
|
|
|
|
|
|
|
520
|
0
|
|
|
|
|
0
|
my $fs_zone = $self->get_zone( $language, $selector ); |
521
|
0
|
0
|
|
|
|
0
|
if ( not defined $fs_zone ) { |
522
|
0
|
|
|
|
|
0
|
$fs_zone = $self->create_zone( $language, $selector ); |
523
|
|
|
|
|
|
|
} |
524
|
0
|
|
|
|
|
0
|
return $fs_zone; |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# -------------------- LOADING AND SAVING ------------------------------------ |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
sub load { |
530
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
531
|
0
|
|
|
|
|
0
|
return $self->_pmldoc->load(@_); |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# TODO: this is unfinished: should be somehow connected with the code in BUILD |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
sub save { |
537
|
11
|
|
|
11
|
1
|
1180
|
my $self = shift; |
538
|
11
|
|
|
|
|
38
|
my ($filename) = @_; |
539
|
|
|
|
|
|
|
|
540
|
11
|
100
|
|
|
|
79
|
if ( $filename =~ /\.streex$/ ) { |
541
|
1
|
50
|
|
1
|
|
55
|
open( my $F, ">:via(gzip)", $filename ) or log_fatal $!; |
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
8
|
|
542
|
1
|
|
|
|
|
3957
|
print $F Storable::nfreeze($self); |
543
|
1
|
|
|
|
|
17617
|
close $F; |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# using Storable::nstore_fd($self,*$F) emits 'Inappropriate ioctl for device' |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
else { |
549
|
10
|
|
|
|
|
56
|
$self->_serialize_all_wild(); |
550
|
10
|
|
|
|
|
295
|
return $self->_pmldoc->save(@_); |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
|
553
|
1
|
|
|
|
|
1443
|
return; |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
sub _serialize_all_wild { |
557
|
10
|
|
|
10
|
|
31
|
my ($self) = @_; |
558
|
10
|
|
|
|
|
67
|
$self->serialize_wild; |
559
|
10
|
|
|
|
|
45
|
foreach my $bundle ( $self->get_bundles ) { |
560
|
12
|
|
|
|
|
307
|
$bundle->serialize_wild; |
561
|
12
|
|
|
|
|
63
|
foreach my $bundlezone ( $bundle->get_all_zones ) { |
562
|
14
|
|
|
|
|
116
|
foreach my $node ( map { $_->get_descendants( { add_self => 1 } ) } $bundlezone->get_all_trees ) { |
|
32
|
|
|
|
|
201
|
|
563
|
|
|
|
|
|
|
# Interset, if present, must be serialized before wild because it relies on wild to take care of the 'other' feature. |
564
|
42
|
100
|
|
|
|
1079
|
if ( $node->DOES('Treex::Core::Node::Interset') ) { |
565
|
28
|
|
|
|
|
13301
|
$node->serialize_iset; |
566
|
|
|
|
|
|
|
} |
567
|
42
|
|
|
|
|
5846
|
$node->serialize_wild; |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
} |
571
|
10
|
|
|
|
|
68
|
return; |
572
|
|
|
|
|
|
|
} |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
sub retrieve_storable { |
575
|
1
|
|
|
1
|
1
|
14
|
my ( $class, $file ) = @_; # $file stands for a file name, but it can be also file handle (needed by the TrEd backend for .streex) |
576
|
|
|
|
|
|
|
|
577
|
1
|
|
|
|
|
2
|
my $FILEHANDLE; |
578
|
1
|
|
|
|
|
3
|
my $opened = 0; |
579
|
|
|
|
|
|
|
|
580
|
1
|
50
|
33
|
|
|
10
|
if ( ref($file) and reftype($file) eq 'GLOB' ) { |
581
|
0
|
|
|
|
|
0
|
$FILEHANDLE = $file; |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
else { |
584
|
1
|
50
|
|
|
|
9
|
log_fatal "filename=$file, but Treex::Core::Document->retrieve_storable(\$filename) can be used only for .streex files" |
585
|
|
|
|
|
|
|
unless $file =~ /\.streex$/; |
586
|
1
|
50
|
|
|
|
26
|
open $FILEHANDLE, "<:via(gzip)", $file or log_fatal($!); |
587
|
1
|
|
|
|
|
2235
|
$opened = 1; |
588
|
|
|
|
|
|
|
} |
589
|
|
|
|
|
|
|
|
590
|
1
|
|
|
|
|
2
|
my $serialized; |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# reading it this way is silly, but both slurping the file or |
593
|
|
|
|
|
|
|
# using Storable::retrieve_fd lead to errors when used with via(gzip) |
594
|
1
|
|
|
|
|
9
|
while (<$FILEHANDLE>) { |
595
|
5641
|
|
|
|
|
407497
|
$serialized .= $_; |
596
|
|
|
|
|
|
|
} |
597
|
|
|
|
|
|
|
|
598
|
1
|
50
|
|
|
|
32
|
if ( $opened ) { |
599
|
1
|
|
|
|
|
9
|
close($FILEHANDLE); |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
# my $retrieved_doc = Storable::retrieve_fd(*$FILEHANDLE) or log_fatal($!); |
603
|
1
|
50
|
|
|
|
238
|
my $retrieved_doc = Storable::thaw($serialized) or log_fatal $!; |
604
|
|
|
|
|
|
|
|
605
|
1
|
50
|
|
|
|
6474
|
if ( not ref($file) ) { |
606
|
1
|
|
|
|
|
74
|
$retrieved_doc->set_loaded_from($file); |
607
|
1
|
|
|
|
|
32
|
my ( $volume, $dirs, $file_name ) = File::Spec->splitpath($file); |
608
|
1
|
|
|
|
|
41
|
$retrieved_doc->set_path( $volume . $dirs ); |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
# $retrieved_doc->changeFilename($file); # why this doesn't affect the name displayed in TrEd? |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
# *.streex files saved before r8789 (2012-05-29) have no PML types with nodes, let's fix it |
614
|
|
|
|
|
|
|
# TODO: delete this hack as soon as no such old streex files are needed. |
615
|
1
|
|
|
|
|
7
|
foreach my $bundle ( $retrieved_doc->get_bundles() ) { |
616
|
2
|
|
|
|
|
51
|
foreach my $bundlezone ( $bundle->get_all_zones() ) { |
617
|
1
|
|
|
|
|
13
|
foreach my $node ( map { $_->get_descendants() } $bundlezone->get_all_trees() ) { |
|
1
|
|
|
|
|
16
|
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
# skip this hack if we are dealing with a new streex file |
620
|
|
|
|
|
|
|
#return $retrieved_doc if $node->type; |
621
|
|
|
|
|
|
|
# This shortcut does not work since old files have only *some* nodes without types |
622
|
1
|
|
|
|
|
10
|
$node->fix_pml_type(); |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
} |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
|
627
|
1
|
|
|
|
|
8
|
return $retrieved_doc; |
628
|
|
|
|
|
|
|
} |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
1; |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
__END__ |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=for Pod::Coverage BUILD build_file_number description set_description |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=encoding utf-8 |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=head1 NAME |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
Treex::Core::Document - representation of a text and its linguistic analyses in the Treex framework |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
=head1 VERSION |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
version 2.20210102 |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head1 DESCRIPTION |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
A document consists of a sequence of bundles, mirroring a sequence |
653
|
|
|
|
|
|
|
of natural language sentences (typically, but not necessarily, |
654
|
|
|
|
|
|
|
originating from the same text). Attributes (attribute-value pairs) |
655
|
|
|
|
|
|
|
can be attached to a document as a whole. |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
Note that the references from the bundles to the containing document are weak, |
658
|
|
|
|
|
|
|
so make sure you always keep a reference to the document in scope to prevent |
659
|
|
|
|
|
|
|
the contents of the document from being garbage-collected. |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
C<Treex::Core::Document>'s instances have the following attributes: |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
=over 4 |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=item description |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
Textual description of the file's content that is stored in the file. |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
=item loaded_from |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
=item path |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=item file_stem |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=item file_number |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
=back |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
The attributes can be accessed using semi-affordance accessors: |
682
|
|
|
|
|
|
|
getters have the same names as attributes, while setters start with |
683
|
|
|
|
|
|
|
C<set_>. For example, the attribute C<path> has a getter C<path()> and a setter C<set_path($path)> |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=head1 METHODS |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=head2 Constructor |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
=over 4 |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=item my $new_document = Treex::Core::Document->new; |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
creates a new empty document object. |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
=item my $new_document = Treex::Core::Document->new( { pmldoc => $pmldoc } ); |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
creates a C<Treex::Core::Document> instance from an already existing L<Treex::PML::Document> instance |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
=item my $new_document = Treex::Core::Document->new( { filename => $filename } ); |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
loads a C<Treex::Core::Document> instance from a .treex file |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
=back |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=head2 Access to zones |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
Document zones are instances of L<Treex::Core::DocZone>, parametrized |
711
|
|
|
|
|
|
|
by language code and possibly also by another free label |
712
|
|
|
|
|
|
|
called selector, whose purpose is to distinguish zones for the same language |
713
|
|
|
|
|
|
|
but from a different source. |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
=over 4 |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=item my $zone = $doc->create_zone( $langcode, ?$selector ); |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
=item my $zone = $doc->get_zone( $langcode, ?$selector ); |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
=item my $zone = $doc->get_or_create_zone( $langcode, ?$selector ); |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=back |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
=head2 Access to bundles |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=over 4 |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=item my @bundles = $document->get_bundles(); |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
Returns the array of bundles contained in the document. |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=item my $new_bundle = $document->create_bundle(); |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
Creates a new empty bundle and appends it |
738
|
|
|
|
|
|
|
at the end of the document. |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
=item my $new_bundle = $document->new_bundle_before( $existing_bundle ); |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
Creates a new empty bundle and inserts it |
743
|
|
|
|
|
|
|
in front of the existing bundle. |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
=item my $new_bundle = $document->new_bundle_after( $existing_bundle ); |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
Creates a new empty bundle and inserts it |
748
|
|
|
|
|
|
|
after the existing bundle. |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=back |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=head2 Node indexing |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=over 4 |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
=item $document->index_node_by_id( $id, $node ); |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
The node is added to the document's indexing table C<id2node> (it is done |
760
|
|
|
|
|
|
|
automatically in L<Treex::Core::Node::set_attr()|Treex::Core::Node/set_attr> |
761
|
|
|
|
|
|
|
if the attribute name is 'C<id>'). When using C<undef> in the place of the |
762
|
|
|
|
|
|
|
second argument, the entry for the given id is deleted from the hash. |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item my $node = $document->get_node_by_id( $id ); |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Return the node which has the value C<$id> in its 'C<id>' attribute, |
768
|
|
|
|
|
|
|
no matter to which tree and to which bundle in the given document |
769
|
|
|
|
|
|
|
the node belongs to. |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
It is prohibited in Treex for IDs to point outside of the current document. |
772
|
|
|
|
|
|
|
In rare cases where your data has such links, we recommend you to split the |
773
|
|
|
|
|
|
|
documents differently or hack it by dropping the problematic links. |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=item $document->id_is_indexed( $id ); |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
Return C<true> if the given C<id> is already present in the indexing table. |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=item $document->get_all_node_ids(); |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
Return the array of all node identifiers indexed in the document. |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=item $document->get_references_to_id( $id ); |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
Return all references leading to the given node id in a hash (keys are reference types, e.g. 'alignment', |
786
|
|
|
|
|
|
|
'a/lex.rf' etc., values are arrays of nodes referencing this node). |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
=item $document->remove_refences_to_id( $id ); |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
Remove all references to the given node id (calls remove_reference() on each referencing node). |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=back |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=head2 Serializing |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
=over 4 |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=item my $document = load($filename, \%opts) |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
Loads document from C<$filename> given C<%opts> using L<Treex::PML::Document::load()> |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
=item $document->save($filename) |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
Saves document to C<$filename> using L<Treex::PML::Document::save()>, |
805
|
|
|
|
|
|
|
or by the Storable module if the file's extension is .streex.gz. |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
=item Treex::Core::Document->retrieve_storable($filename) |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
Loading a document from the .streex (Storable) format. |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
=back |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
=head2 Other |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
=over 4 |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
=item my $filename = $doc->full_filename; |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
full filename without the extension |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=back |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
=head1 AUTHOR |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
Zdeněk Žabokrtský <zabokrtsky@ufal.mff.cuni.cz> |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
Martin Popel <popel@ufal.mff.cuni.cz> |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
Ondřej Dušek <odusek@ufal.mff.cuni.cz> |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |