line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Core::Document; |
2
|
|
|
|
|
|
|
$Treex::Core::Document::VERSION = '2.20160630'; |
3
|
24
|
|
|
24
|
|
360604
|
use Moose; |
|
24
|
|
|
|
|
7513134
|
|
|
24
|
|
|
|
|
184
|
|
4
|
24
|
|
|
24
|
|
175954
|
use Treex::Core::Common; |
|
24
|
|
|
|
|
84
|
|
|
24
|
|
|
|
|
155
|
|
5
|
24
|
|
|
24
|
|
143786
|
use Treex::Core::Config; |
|
24
|
|
|
|
|
63
|
|
|
24
|
|
|
|
|
724
|
|
6
|
24
|
|
|
24
|
|
13139
|
use Treex::Core::DocZone; |
|
24
|
|
|
|
|
128
|
|
|
24
|
|
|
|
|
1120
|
|
7
|
24
|
|
|
24
|
|
218329
|
use Treex::Core::Bundle; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Treex::PML; |
10
|
|
|
|
|
|
|
Treex::PML::UseBackends('PMLBackend'); |
11
|
|
|
|
|
|
|
Treex::PML::AddResourcePath( Treex::Core::Config->pml_schema_dir() ); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
with 'Treex::Core::WildAttr'; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use Scalar::Util qw( weaken reftype ); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
use PerlIO::via::gzip; |
18
|
|
|
|
|
|
|
use Storable; |
19
|
|
|
|
|
|
|
use Digest::MD5 qw(md5_hex); |
20
|
|
|
|
|
|
|
use Lingua::Interset::FeatureStructure; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# --- Plain document attributes ------------------------------------------

# Name of the file the document was loaded from (empty string by default).
has loaded_from => (
    is      => 'rw',
    isa     => 'Str',
    default => '',
);

# Directory part used by full_filename().
has path => (
    is  => 'rw',
    isa => 'Str',
);

# Base name used by full_filename().
has file_stem => (
    is      => 'rw',
    isa     => 'Str',
    default => 'noname',
);

# Suffix appended to file_stem by full_filename(); built by build_file_number().
has file_number => (
    is      => 'rw',
    isa     => 'Str',
    builder => 'build_file_number',
);

has compress => (
    is            => 'rw',
    isa           => 'Bool',
    default       => undef,
    documentation => 'compression to .gz',
);

has storable => (
    is            => 'rw',
    isa           => 'Bool',
    default       => undef,
    documentation => 'using Storable with gz compression instead of Treex::PML',
);

# Cached digest of the document; read and written by get_hash()/set_hash().
has _hash => (
    is  => 'rw',
    isa => 'Str',
);
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Return the MD5 hex digest of this document, computing it on first use.
#
# The digest is md5_hex() of the Storable::nfreeze() image of $self and is
# cached through _set_hash(), so later calls return the stored value.
sub get_hash {
    my $self = shift;

    if ( !defined $self->_hash ) {

        # Canonical mode makes Storable emit hash keys in sorted order, so the
        # digest does not depend on Perl's hash ordering.  "local" restores the
        # caller's previous setting on scope exit (also when nfreeze dies)
        # instead of unconditionally forcing the global back to 0.
        local $Storable::canonical = 1;
        $self->_set_hash( md5_hex( Storable::nfreeze($self) ) );
    }

    return $self->_hash;
}
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Store an externally computed digest for this document.
# Forwards the value to _set_hash() and returns nothing.
sub set_hash {
    my $self = shift;
    my ($hash) = @_;

    $self->_set_hash($hash);

    return;
}
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# The wrapped Treex::PML::Document.  It is supplied through the "pml_doc"
# constructor argument or built by _create_empty_pml_doc(); the methods listed
# under "handles" are delegated to it under the same name, plus set_filename
# as an alias for changeFilename.
has _pmldoc => (
    is       => 'rw',
    isa      => 'Treex::PML::Document',
    init_arg => 'pml_doc',
    writer   => '_set_pmldoc',
    builder  => '_create_empty_pml_doc',
    handles  => {
        (
            map { ( $_ => $_ ) }
                qw( clone writeFile writeTo filename URL
                changeFilename changeURL fileFormat changeFileFormat
                backend changeBackend encoding changeEncoding userData
                changeUserData metaData changeMetaData listMetaData
                appData changeAppData listAppData

                documentRootData

                FS changeFS

                hint changeHint pattern_count pattern patterns
                changePatterns tail changeTail

                trees changeTrees treeList tree delete_tree lastTreeNo notSaved
                currentTreeNo currentNode nodes value_line value_line_list
                determine_node_type )
        ),
        set_filename => 'changeFilename',
    },
);
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Node ID => node lookup table, maintained by index_node_by_id().
has _index => (
    is      => 'rw',
    default => sub { +{} },
);

# Reversed references: target ID => { reference type => [ source IDs ] },
# maintained by index_backref() and remove_backref().
has _backref => (
    is      => 'rw',
    default => sub { +{} },
);

# Counter for generating document-unique IDs.
has _latest_node_number => (
    is      => 'rw',
    default => 0,
);
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
use Treex::PML::Factory; |
98
|
|
|
|
|
|
|
my $factory = Treex::PML::Factory->new(); |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
my $highest_file_number = 1; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# the description attribute is stored inside the meta structures of pml documents, |
103
|
|
|
|
|
|
|
# that is why it is not realized as a regular Moose attribute |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# Write the document description into the "meta" structure stored under the
# 'pml_root' metadata entry; returns whatever Treex::PML::Node::set_attr returns.
sub set_description {
    my $self = shift;
    my ($attr_value) = @_;

    my $meta = $self->metaData('pml_root')->{meta};

    return Treex::PML::Node::set_attr( $meta, 'description', $attr_value );
}
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# Read the document description from the "meta" structure stored under the
# 'pml_root' metadata entry.
sub description {
    my ($self) = @_;

    my $meta = $self->metaData('pml_root')->{meta};

    return Treex::PML::Node::attr( $meta, 'description' );
}
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# Builder of the file_number attribute: hands out the current value of the
# file-level counter, zero-padded to three digits, and advances the counter.
sub build_file_number {
    my $assigned = $highest_file_number;
    $highest_file_number = $assigned + 1;
    return sprintf( "%03d", $assigned );
}
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Full filename without the extension: optional path (with exactly one
# separating slash ensured), followed by file_stem and file_number.
# Takes no arguments besides the invocant.
sub full_filename {
    log_fatal('Incorrect number of arguments') if @_ != 1;
    my ($self) = @_;

    my $prefix = '';
    my $dir    = $self->path;
    if ( defined $dir && $dir ne '' ) {

        # append the separator only when the path does not already end with one
        $prefix = ( $dir =~ m{/$} ) ? $dir : $dir . '/';
    }

    return $prefix . $self->file_stem . $self->file_number;
}
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Moose BUILD hook, run after the object has been constructed.
#
# $params_rf is the hashref of constructor arguments.  Two keys are inspected:
#   pmldoc   - an already existing Treex::PML::Document instance to wrap
#   filename - a PML file to load through Treex::PML::Factory
# When a PML document was obtained either way, its zones, bundles and nodes
# are turned into Treex::Core objects and indexed.  Finally the wild
# attributes (and Interset features, where the node supports them) are
# deserialized for the document, its bundles and all nodes.
sub BUILD {
    my $self = shift;
    my ($params_rf) = @_;
    my $pmldoc;

    if ( defined $params_rf ) {

        # creating Treex::Core::Document from an already existing Treex::PML::Document instance
        if ( $params_rf->{pmldoc} ) {
            $pmldoc = $params_rf->{pmldoc};
        }

        # loading Treex::Core::Document from a file
        elsif ( $params_rf->{filename} ) {

            # The dot is escaped so that only a real ".streex" extension is
            # rejected here (consistent with save() and retrieve_storable()).
            if ( $params_rf->{filename} =~ /\.streex$/ ) {
                log_fatal('Storable (.streex) docs must be retrieved by Treex::Core::Document->retrieve_storable($filename)');
            }

            else {

                # If the file contains invalid PML (e.g. unknown afun value)
                # Treex::PML fails with die.
                # TODO: we should rather catch the die message and report it via log_fatal
                $pmldoc = eval {

                    # In r10421, recover => 1 was added here:
                    # $factory->createDocumentFromFile( $params_rf->{filename}, { recover => 1 });
                    # However, if the file contains invalid PML (e.g. unknown afun value), the recover=>1 option
                    # results in returning a $pmldoc which seems to be OK, but it contains no bundles,
                    # so Treex crashes on subsequent blocks which is misleading for users.
                    # If we really want to be fault-tolerant, it seems we would need to set Treex::PML::Instance::Reader::STRICT=0,
                    # but I don't know enough about PML internals and I think it's better to make such errors fatal.
                    # Martin Popel
                    $factory->createDocumentFromFile( $params_rf->{filename} );
                };
                log_fatal( "Error while loading " . $params_rf->{filename} . ( $@ ? "\n$@" : '' ) )
                    if !defined $pmldoc;
            }
        }
    }

    # constructing treex document from an existing file
    if ($pmldoc) {
        $self->_set_pmldoc($pmldoc);

        # ensuring Treex::Core types (partially copied from the factory)
        # $doczone hashref will be reused as the blessed instance variable
        for my $doczone ( $self->get_all_zones() ) {
            Treex::Core::DocZone->new($doczone);
        }

        $self->_rebless_and_index();
    }

    $self->deserialize_wild;
    foreach my $bundle ( $self->get_bundles ) {
        $bundle->deserialize_wild;
        foreach my $bundlezone ( $bundle->get_all_zones ) {
            foreach my $tree ( $bundlezone->get_all_trees ) {

                # only trees whose structure name matches [at]-(root|node) get their ord renumbered
                my $ordered = $tree->type->get_structure_name =~ /[at]-(root|node)/ ? 1 : 0;
                my $correct_ord = 0;
                my @nodes = $tree->get_descendants( { add_self => 1, ( $ordered ? ( ordered => 1 ) : () ) } );
                foreach my $node (@nodes) {

                    # normalize ord, so there are no gaps
                    if ($ordered) {
                        $node->_set_ord($correct_ord);
                        $correct_ord++;
                    }
                    $node->deserialize_wild;
                    if ( $node->DOES('Treex::Core::Node::Interset') ) {
                        $node->deserialize_iset;
                    }
                }
            }
        }
    }

    return;
}
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Turn the raw PML structures of a freshly loaded document into Treex::Core
# objects: bundles are reblessed to Treex::Core::Bundle, zones are passed to
# Treex::Core::BundleZone->new, and every node is reblessed to
# Treex::Core::Node::<LAYER> and registered in the ID index.
sub _rebless_and_index {
    my ($self) = @_;

    BUNDLE:
    foreach my $bundle ( $self->get_bundles ) {
        bless $bundle, 'Treex::Core::Bundle';

        $bundle->_set_document($self);

        next BUNDLE if !defined $bundle->{zones};

        my @zones = map { $_->value() } $bundle->{zones}->elements;
        foreach my $zone (@zones) {

            # $zone hashref will be reused as the blessed instance variable
            Treex::Core::BundleZone->new($zone);
            $zone->_set_bundle($bundle);

            foreach my $tree ( $zone->get_all_trees ) {

                # the layer letter is the character preceding the dash in the
                # PML structure name; the first capture group holds it
                my $structure_name = $tree->type->get_structure_name;
                my ($letter) = $structure_name =~ /(\S)-(root|node|nonterminal|terminal)/;
                if ( !defined $letter ) {
                    log_fatal( "Unexpected member in zone structure: " . $tree->type->get_structure_name );
                }
                my $layer      = uc($letter);
                my $node_class = "Treex::Core::Node::$layer";

                # must still call Treex::PML::Node's API
                foreach my $node ( $tree, $tree->descendants ) {
                    bless $node, $node_class;
                    $self->index_node_by_id( $node->get_id, $node );

                    # a-layer nodes carrying iset data get it wrapped in a feature structure
                    next if $layer ne 'A';
                    next if !$node->{iset};
                    $node->{iset} = Lingua::Interset::FeatureStructure->new( %{ $node->{iset} } );
                }
                $tree->_set_zone($zone);
            }
        }
    }
    return;
}
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# Return the "meta" hash stored under the 'pml_root' metadata entry.
sub _pml_attribute_hash {
    my ($self) = @_;
    my $pml_root = $self->metaData('pml_root');
    return $pml_root->{meta};
}
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# Locate and load the Treex PML schema once, at module load time.
# (Earlier variant, kept for reference:)
#my $_treex_schema_file = Treex::PML::ResolvePath( '.', 'treex_schema.xml', 1 );
my $_treex_schema_file = Treex::Core::Config->pml_schema_dir . "/" . 'treex_schema.xml';

# a missing schema file is fatal: empty documents cannot be created without it
log_fatal("Can't find PML schema $_treex_schema_file")
    unless -f $_treex_schema_file;

my $_treex_schema = Treex::PML::Schema->new(
    {
        filename => $_treex_schema_file,
    }
);
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Builder of the _pmldoc attribute: create an empty Treex::PML::Document that
# uses the PML backend and carries the Treex schema plus an empty 'pml_root'.
sub _create_empty_pml_doc {    ## no critic (ProhibitUnusedPrivateSubroutines)
    my $fs_format = Treex::PML::FSFormat->new(
        {
            'deepord' => ' N'    # ???
        }
    );

    my $empty_doc = Treex::PML::Document->create(
        name     => "x",         #$filename, ???
        FS       => $fs_format,
        trees    => [],
        backend  => 'PMLBackend',
        encoding => "utf-8",
    );

    # metadata entries, applied in this order
    my @meta_entries = (
        [ 'schema-url', 'treex_schema.xml' ],
        [ 'schema',     $_treex_schema ],
        [ 'pml_root',   { meta => {}, bundles => undef, } ],
    );
    foreach my $entry (@meta_entries) {
        $empty_doc->changeMetaData( @{$entry} );
    }

    return $empty_doc;
}
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# --- INDEXING |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# Register $node under $id in the document's ID index, or remove the entry
# when $node is undefined.
#
# The index holds only a weak reference to the node.  For a defined node, the
# IDs it references (as reported by _get_referenced_ids) are also recorded in
# the reversed-reference table via index_backref().
sub index_node_by_id {
    my $self = shift;
    my ( $id, $node ) = pos_validated_list(
        \@_,
        { isa => 'Treex::Type::Id' },
        { isa => 'Maybe[Treex::Core::Node]' },    # original author's note: "does it work like this?"
    );

    my $index = $self->_index;

    # undefined node => un-index the ID
    if ( !defined $node ) {
        delete $index->{$id};
        return;
    }

    $index->{$id} = $node;
    weaken( $index->{$id} );

    my $referenced = $node->_get_referenced_ids;
    foreach my $ref_type ( keys %{$referenced} ) {
        $self->index_backref( $ref_type, $id, $referenced->{$ref_type} );
    }

    return;
}
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# Add references to the reversed references list.
#
# For every defined ID in @$targets, record that $source refers to it with a
# reference of the given $type: $backref->{$target}{$type} is an array of
# source IDs.  Undefined targets are skipped.
sub index_backref {
    my $self = shift;
    my ( $type, $source, $targets ) = @_;

    my $backref = $self->_backref;

    foreach my $target ( @{$targets} ) {
        next if !defined $target;

        # create the per-target table and the per-type list on first use
        my $by_type = ( $backref->{$target} //= {} );
        $by_type->{$type} ||= [];

        push @{ $by_type->{$type} }, $source;
    }

    return;
}
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# Remove references from the reversed references list.
#
# For every defined ID in @$targets that has an entry in the table, drop
# $source from the list of sources of the given $type.  Targets without an
# entry are left untouched.
sub remove_backref {
    my $self = shift;
    my ( $type, $source, $targets ) = @_;

    my $backref = $self->_backref;

    foreach my $target ( @{$targets} ) {
        next if !defined $target;

        my $by_type = $backref->{$target};
        next if !$by_type;

        # rebuild the list without $source (IDs are compared as strings)
        my @kept = grep { $_ ne $source } @{ $by_type->{$type} };
        $by_type->{$type} = \@kept;
    }

    return;
}
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# Return a hash of references ( type->[nodes] ) leading to the node with the given id.
# Returns nothing (empty list / undef) when no references are recorded.
# Note: the stored hashref itself is returned, not a copy.
sub get_references_to_id {
    my $self = shift;
    my ($id) = @_;

    my $incoming = $self->_backref->{$id};
    return unless $incoming;

    return $incoming;    # TODO clone ?
}
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
# Remove all references and backreferences leading to the $node
# (calls remove_reference() on the source nodes).
sub _remove_references_to_node {
    my $self = shift;
    my ($node) = @_;

    my $id      = $node->id;
    my $backref = $self->_backref;

    # First, drop the backreference records this node registered for the IDs
    # it refers to.
    my $outgoing = $node->_get_referenced_ids();
    foreach my $ref_type ( keys %{$outgoing} ) {
        $self->remove_backref( $ref_type, $id, $outgoing->{$ref_type} );
    }

    # Second, ask every node that refers to this one to drop that reference.
    my $incoming = $backref->{$id};
    return if !$incoming;

    foreach my $ref_type ( keys %{$incoming} ) {
        foreach my $source_id ( @{ $incoming->{$ref_type} } ) {
            my $source_node = $self->get_node_by_id($source_id);
            $source_node->remove_reference( $ref_type, $id );
        }
    }

    # Third, forget the backreference table entry of the node itself.
    delete $backref->{$id};
    return;
}
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# Tell whether the given ID currently has a defined entry in the ID index.
sub id_is_indexed {
    my $self = shift;
    my ($id) = pos_validated_list(
        \@_,
        { isa => 'Treex::Type::Id' },
    );

    my $entry = $self->_index->{$id};
    return defined $entry;
}
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# Look up a node by its ID; an ID with no defined index entry is reported
# through log_fatal.
sub get_node_by_id {
    my $self = shift;
    my ($id) = pos_validated_list(
        \@_,
        { isa => 'Treex::Type::Id' },
    );

    my $node = $self->_index->{$id};
    return $node if defined $node;

    # This is something very fatal. Treex assumes every node ID to
    # be valid and pointing to a node *in the given document*.
    # (It is fine to have a node with no a/lex.rf
    # attribute, but if the attribute is there, the value
    # has to be an ID within the document.)
    #
    # If your data violates the requirement and your IDs point to
    # a different document, the only hack we suggest is to drop such
    # references...
    log_fatal("ID not indexed: id=\"$id\"");

    return;
}
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# Return the IDs of all indexed nodes (the keys of the ID index, in hash order).
# Takes no arguments besides the invocant.
sub get_all_node_ids {
    log_fatal('Incorrect number of arguments') unless @_ == 1;
    my ($self) = @_;
    my $index = $self->_index;
    return keys %{$index};
}
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# -------------------------------------- ACCESS TO BUNDLES ------------------- |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# Return the document's bundles, i.e. whatever trees() yields in the caller's
# context.  Takes no arguments besides the invocant.
sub get_bundles {
    log_fatal('Incorrect number of arguments') unless @_ == 1;
    my ($self) = @_;
    return $self->trees;
}
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# Create a new, empty bundle and return it.
#
# $arg_ref is an optional hashref; { after => $bundle } or { before => $bundle }
# selects the position relative to an existing bundle ("after" wins when both
# are given).  Without it the bundle is appended at the end of the document.
sub create_bundle {
    my $self = shift;
    my ($arg_ref) = @_;

    my $pml_doc = $self->_pmldoc();

    my $after  = $arg_ref ? $arg_ref->{after}  : undef;
    my $before = $arg_ref ? $arg_ref->{before} : undef;

    my $insert_at;
    if ($after) {
        $insert_at = $after->get_position + 1;
    }
    elsif ($before) {
        $insert_at = $before->get_position + 0;
    }
    else {

        # default: append at the end of the document
        $insert_at = scalar( $self->get_bundles() );
    }

    my $bundle = $pml_doc->new_tree($insert_at);
    $bundle->set_type_by_name( $pml_doc->metaData('schema'), 'bundle.type' );
    bless $bundle, "Treex::Core::Bundle";    # is this correct/sufficient with Moose ????
    $bundle->_set_document($self);

    $bundle->set_id( "s" . ( $pml_doc->lastTreeNo + 1 ) );

    return $bundle;
}
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# ----------------------- ACCESS TO ZONES ------------------------------------ |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# Create a document zone for the given language (and optional selector),
# put it at the front of the zone sequence in the document's "meta"
# structure, and return it.
sub create_zone {
    my $self = shift;
    my ( $language, $selector ) = pos_validated_list(
        \@_,
        { isa => 'Treex::Type::LangCode' },
        { isa => 'Treex::Type::Selector', default => '' },
    );

    my $zone = Treex::Core::DocZone->new(
        {
            'language' => $language,
            'selector' => $selector
        }
    );
    my $element = Treex::PML::Seq::Element->new( 'zone', $zone );

    my $meta = $self->metaData('pml_root')->{meta};

    # first zone of the document: start a new sequence
    if ( !defined $meta->{zones} ) {
        $meta->{zones} = Treex::PML::Seq->new( [$element] );
        return $zone;
    }

    $meta->{zones}->unshift_element_obj($element);
    return $zone;
}
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# Return all document zones, or nothing when the document has no zone sequence.
sub get_all_zones {
    my ($self) = @_;

    my $meta     = $self->metaData('pml_root')->{meta};
    my $zone_seq = $meta->{zones};
    return unless $zone_seq;

    # Each element is a pair [$name, $value]. We need just the values.
    my @elements = $zone_seq->elements;
    return map { $_->[1] } @elements;
}
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# Return the first document zone whose language and selector both equal the
# given ones, or nothing when there is no such zone.
sub get_zone {
    my $self = shift;
    my ( $language, $selector ) = pos_validated_list(
        \@_,
        { isa => 'Treex::Type::LangCode' },
        { isa => 'Treex::Type::Selector', default => '' },
    );

    foreach my $candidate ( $self->get_all_zones() ) {
        next if $candidate->language ne $language;
        next if $candidate->selector ne $selector;
        return $candidate;
    }

    return;
}
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
# Return the zone for the given language/selector, creating it through
# create_zone() when get_zone() finds none.
sub get_or_create_zone {
    my $self = shift;
    my ( $language, $selector ) = pos_validated_list(
        \@_,
        { isa => 'Treex::Type::LangCode' },
        { isa => 'Treex::Type::Selector', default => '' },
    );

    return $self->get_zone( $language, $selector )
        // $self->create_zone( $language, $selector );
}
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
# -------------------- LOADING AND SAVING ------------------------------------ |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
# Forward load() with all remaining arguments to the wrapped PML document.
# TODO: this is unfinished: should be somehow connected with the code in BUILD
sub load {
    my ( $self, @load_args ) = @_;
    my $pml_doc = $self->_pmldoc;
    return $pml_doc->load(@load_args);
}
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# Save the document.
#
# A $filename ending in ".streex" is written as a gzip-compressed
# Storable::nfreeze image of the whole object; nothing is returned in that
# case.  Any other call serializes the wild attributes first and then
# delegates to the wrapped PML document's save(), returning its result.
sub save {
    my $self = shift;
    my ($filename) = @_;

    # "defined" avoids an uninitialized-value warning when save() is called
    # without a file name (that call still goes to the PML branch).
    if ( defined $filename && $filename =~ /\.streex$/ ) {
        open( my $F, ">:via(gzip)", $filename )
            or log_fatal("Cannot open '$filename' for writing: $!");

        # Write errors on a buffered handle may only surface at close, so
        # both calls are checked; otherwise a truncated file goes unnoticed.
        print {$F} Storable::nfreeze($self)
            or log_fatal("Cannot write to '$filename': $!");
        close $F
            or log_fatal("Cannot close '$filename' after writing: $!");

        # using Storable::nstore_fd($self,*$F) emits 'Inappropriate ioctl for device'
    }

    else {
        $self->_serialize_all_wild();
        return $self->_pmldoc->save(@_);
    }

    return;
}
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
# Serialize the wild attributes of the document, of every bundle and of every
# node (tree roots included); nodes that do the Treex::Core::Node::Interset
# role also get their Interset features serialized.
sub _serialize_all_wild {
    my $self = shift;

    $self->serialize_wild;

    foreach my $bundle ( $self->get_bundles ) {
        $bundle->serialize_wild;

        foreach my $bundlezone ( $bundle->get_all_zones ) {

            # all nodes of all trees in the zone, tree by tree
            my @zone_nodes;
            foreach my $tree ( $bundlezone->get_all_trees ) {
                push @zone_nodes, $tree->get_descendants( { add_self => 1 } );
            }

            foreach my $node (@zone_nodes) {
                $node->serialize_wild;
                next if !$node->DOES('Treex::Core::Node::Interset');
                $node->serialize_iset;
            }
        }
    }

    return;
}
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# Class method: read a document from a gzip-compressed Storable (.streex) file.
#
# $file stands for a file name, but it can be also a file handle (needed by
# the TrEd backend for .streex).  A handle is read as-is and left open; a
# file name must end in ".streex" and is opened through the gzip layer.
# When a file name was given, loaded_from and path are set on the result.
# Returns the retrieved document object.
sub retrieve_storable {
    my ( $class, $file ) = @_;

    # File::Spec is used below; load it here since this file has no "use" for it.
    require File::Spec;

    my $FILEHANDLE;
    my $opened = 0;

    if ( ref($file) and reftype($file) eq 'GLOB' ) {
        $FILEHANDLE = $file;
    }
    else {
        log_fatal("filename=$file, but Treex::Core::Document->retrieve_storable(\$filename) can be used only for .streex files")
            unless $file =~ /\.streex$/;
        open $FILEHANDLE, "<:via(gzip)", $file
            or log_fatal("Cannot open '$file' for reading: $!");
        $opened = 1;
    }

    my $serialized;

    # reading it this way is silly, but both slurping the file or
    # using Storable::retrieve_fd lead to errors when used with via(gzip)
    while (<$FILEHANDLE>) {
        $serialized .= $_;
    }

    # only close what was opened here; a caller-supplied handle stays open
    if ($opened) {
        close($FILEHANDLE);
    }

    # my $retrieved_doc = Storable::retrieve_fd(*$FILEHANDLE) or log_fatal($!);
    # thaw() signals bad input by dying or by returning undef; neither sets
    # $!, so the Storable message (if any) is reported instead.
    my $retrieved_doc = eval { Storable::thaw($serialized) };
    if ( !$retrieved_doc ) {
        my $source = ref($file) ? 'the given file handle' : "'$file'";
        log_fatal( "Cannot deserialize Storable data from $source" . ( $@ ? ": $@" : '' ) );
    }

    if ( not ref($file) ) {
        $retrieved_doc->set_loaded_from($file);
        my ( $volume, $dirs, $file_name ) = File::Spec->splitpath($file);
        $retrieved_doc->set_path( $volume . $dirs );

        # $retrieved_doc->changeFilename($file); # why this doesn't affect the name displayed in TrEd?
    }

    # *.streex files saved before r8789 (2012-05-29) have no PML types with nodes, let's fix it
    # TODO: delete this hack as soon as no such old streex files are needed.
    foreach my $bundle ( $retrieved_doc->get_bundles() ) {
        foreach my $bundlezone ( $bundle->get_all_zones() ) {
            foreach my $node ( map { $_->get_descendants() } $bundlezone->get_all_trees() ) {

                # skip this hack if we are dealing with a new streex file
                #return $retrieved_doc if $node->type;
                # This shortcut does not work since old files have only *some* nodes without types
                $node->fix_pml_type();
            }
        }
    }

    return $retrieved_doc;
}
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
1; |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
__END__ |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
=for Pod::Coverage BUILD build_file_number description set_description |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=encoding utf-8 |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
=head1 NAME |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
Treex::Core::Document - representation of a text and its linguistic analyses in the Treex framework |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
=head1 VERSION |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
version 2.20160630 |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=head1 DESCRIPTION |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
A document consists of a sequence of bundles, mirroring a sequence |
644
|
|
|
|
|
|
|
of natural language sentences (typically, but not necessarily, |
645
|
|
|
|
|
|
|
originating from the same text). Attributes (attribute-value pairs) |
646
|
|
|
|
|
|
|
can be attached to a document as a whole. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
Note that the references from the bundles to the containing document are weak, |
649
|
|
|
|
|
|
|
so make sure you always keep a reference to the document in scope to prevent |
650
|
|
|
|
|
|
|
the contents of the document from being garbage-collected. |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
C<Treex::Core::Document>'s instances have the following attributes: |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
=over 4 |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=item description |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
Textual description of the file's content that is stored in the file. |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=item loaded_from |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=item path |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
=item file_stem |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=item file_number |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=back |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
The attributes can be accessed using semi-affordance accessors: |
673
|
|
|
|
|
|
|
getters have the same names as attributes, while setters start with |
674
|
|
|
|
|
|
|
C<set_>. For example, the attribute C<path> has a getter C<path()> and a setter C<set_path($path)>. |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
=head1 METHODS |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
=head2 Constructor |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=over 4 |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=item my $new_document = Treex::Core::Document->new; |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
creates a new empty document object. |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
=item my $new_document = Treex::Core::Document->new( { pmldoc => $pmldoc } ); |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
creates a C<Treex::Core::Document> instance from an already existing L<Treex::PML::Document> instance |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
=item my $new_document = Treex::Core::Document->new( { filename => $filename } ); |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
loads a C<Treex::Core::Document> instance from a .treex file |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
=back |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
=head2 Access to zones |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
Document zones are instances of L<Treex::Core::DocZone>, parametrized |
702
|
|
|
|
|
|
|
by language code and possibly also by another free label |
703
|
|
|
|
|
|
|
called selector, whose purpose is to distinguish zones for the same language |
704
|
|
|
|
|
|
|
but from a different source. |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=over 4 |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=item my $zone = $doc->create_zone( $langcode, ?$selector ); |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=item my $zone = $doc->get_zone( $langcode, ?$selector ); |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
=item my $zone = $doc->get_or_create_zone( $langcode, ?$selector ); |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
=back |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=head2 Access to bundles |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
=over 4 |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
=item my @bundles = $document->get_bundles(); |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
Returns the array of bundles contained in the document. |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
=item my $new_bundle = $document->create_bundle(); |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
Creates a new empty bundle and appends it |
729
|
|
|
|
|
|
|
at the end of the document. |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item my $new_bundle = $document->new_bundle_before( $existing_bundle ); |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
Creates a new empty bundle and inserts it |
734
|
|
|
|
|
|
|
in front of the existing bundle. |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
=item my $new_bundle = $document->new_bundle_after( $existing_bundle ); |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
Creates a new empty bundle and inserts it |
739
|
|
|
|
|
|
|
after the existing bundle. |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=back |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=head2 Node indexing |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
=over 4 |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
=item $document->index_node_by_id( $id, $node ); |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
The node is added to the document's indexing table C<id2node> (it is done |
751
|
|
|
|
|
|
|
automatically in L<Treex::Core::Node::set_attr()|Treex::Core::Node/set_attr> |
752
|
|
|
|
|
|
|
if the attribute name is 'C<id>'). When using C<undef> in the place of the |
753
|
|
|
|
|
|
|
second argument, the entry for the given id is deleted from the hash. |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
=item my $node = $document->get_node_by_id( $id ); |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
Return the node which has the value C<$id> in its 'C<id>' attribute, |
759
|
|
|
|
|
|
|
no matter to which tree and to which bundle in the given document |
760
|
|
|
|
|
|
|
the node belongs. |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
It is prohibited in Treex for IDs to point outside of the current document. |
763
|
|
|
|
|
|
|
In rare cases where your data has such links, we recommend that you split the |
764
|
|
|
|
|
|
|
documents differently or hack it by dropping the problematic links. |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=item $document->id_is_indexed( $id ); |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
Return C<true> if the given C<id> is already present in the indexing table. |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
=item $document->get_all_node_ids(); |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
Return the array of all node identifiers indexed in the document. |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=item $document->get_references_to_id( $id ); |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
Return all references leading to the given node id in a hash (keys are reference types, e.g. 'alignment', |
777
|
|
|
|
|
|
|
'a/lex.rf' etc., values are arrays of nodes referencing this node). |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=item $document->remove_refences_to_id( $id ); |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
Remove all references to the given node id (calls remove_reference() on each referencing node). |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=back |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
=head2 Serializing |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
=over 4 |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
=item my $document = load($filename, \%opts) |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
Loads document from C<$filename> given C<%opts> using L<Treex::PML::Document::load()> |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
=item $document->save($filename) |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
Saves document to C<$filename> using L<Treex::PML::Document::save()>, |
796
|
|
|
|
|
|
|
or by the Storable module if the file's extension is .streex.gz. |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=item Treex::Core::Document->retrieve_storable($filename) |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
Loading a document from the .streex (Storable) format. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
=back |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=head2 Other |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=over 4 |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=item my $filename = $doc->full_filename; |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
full filename without the extension |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=back |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
=head1 AUTHOR |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
Zdeněk Žabokrtský <zabokrtsky@ufal.mff.cuni.cz> |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
Martin Popel <popel@ufal.mff.cuni.cz> |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Ondřej Dušek <odusek@ufal.mff.cuni.cz> |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |