line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## This is a simple XML backend for TEI files -*-cperl-*- |
2
|
|
|
|
|
|
|
## author: Petr Pajas |
3
|
|
|
|
|
|
|
# $Id: TrXML.pm 3025 2007-04-23 13:55:04Z pajas $ ' |
4
|
|
|
|
|
|
|
############################################################# |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Treex::PML::Backend::TrXML; |
7
|
1
|
|
|
1
|
|
1354
|
use Treex::PML; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use XML::LibXML; |
9
|
|
|
|
|
|
|
use XML::LibXML::SAX; |
10
|
|
|
|
|
|
|
use Treex::PML::IO qw(close_backend); |
11
|
|
|
|
|
|
|
use strict; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use vars qw($VERSION); |
14
|
|
|
|
|
|
|
BEGIN { |
15
|
|
|
|
|
|
|
$VERSION='2.22'; # version template |
16
|
|
|
|
|
|
|
} |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
sub test { |
19
|
|
|
|
|
|
|
my ($f)=@_; |
20
|
|
|
|
|
|
|
if (ref($f)) { |
21
|
|
|
|
|
|
|
return ($f->getline()=~/\s*\<\?xml / && |
22
|
|
|
|
|
|
|
$f->getline()=~/\]|\/i); |
23
|
|
|
|
|
|
|
} else { |
24
|
|
|
|
|
|
|
my $fh = Treex::PML::IO::open_backend($f,"r"); |
25
|
|
|
|
|
|
|
my $test = $fh && test($fh); |
26
|
|
|
|
|
|
|
Treex::PML::IO::close_backend($fh); |
27
|
|
|
|
|
|
|
return $test; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub open_backend { |
32
|
|
|
|
|
|
|
my ($uri,$rw,$encoding)=@_; |
33
|
|
|
|
|
|
|
# discard encoding and pass the rest to the Treex::PML::IO |
34
|
|
|
|
|
|
|
Treex::PML::IO::open_backend($uri,$rw,($rw eq 'w' ? $encoding : undef)); |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub read { |
39
|
|
|
|
|
|
|
my ($input,$target_doc) = @_; |
40
|
|
|
|
|
|
|
#my $handler = XML::SAX::Writer->new(); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
my $handler = Treex::PML::Backend::TrXML::SAXHandler->new(TargetDocument => $target_doc); |
43
|
|
|
|
|
|
|
my $p = XML::LibXML::SAX->new(Handler => $handler); |
44
|
|
|
|
|
|
|
if (ref($input)) { |
45
|
|
|
|
|
|
|
$p->parse(Source => { ByteStream => $input }); |
46
|
|
|
|
|
|
|
} else { |
47
|
|
|
|
|
|
|
$p->parse_uri($input); |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
return 1; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub xml_quote { |
54
|
|
|
|
|
|
|
local $_=$_[0]; |
55
|
|
|
|
|
|
|
s/&/&/g; |
56
|
|
|
|
|
|
|
s/'/'/g; |
57
|
|
|
|
|
|
|
s/"/"/g; |
58
|
|
|
|
|
|
|
s/>/>/g; |
59
|
|
|
|
|
|
|
s/</g; |
60
|
|
|
|
|
|
|
return $_; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub xml_quote_pcdata { |
64
|
|
|
|
|
|
|
local $_=$_[0]; |
65
|
|
|
|
|
|
|
s/&/&/g; |
66
|
|
|
|
|
|
|
s/>/>/g; |
67
|
|
|
|
|
|
|
s/</g; |
68
|
|
|
|
|
|
|
return $_; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub write { |
72
|
|
|
|
|
|
|
my ($output, $src_doc) = @_; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
die "Require GLOB reference\n" unless ref($output); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# xml_decl |
77
|
|
|
|
|
|
|
print $output "
|
78
|
|
|
|
|
|
|
if ($src_doc->metaData('xmldecl_version') ne "") { |
79
|
|
|
|
|
|
|
print $output " version=\"".$src_doc->metaData('xmldecl_version')."\""; |
80
|
|
|
|
|
|
|
} else { |
81
|
|
|
|
|
|
|
print $output " version=\"1.0\""; |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
if ($src_doc->encoding() ne "") { |
84
|
|
|
|
|
|
|
print $output " encoding=\"".$src_doc->encoding()."\""; |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
if ($src_doc->metaData('xmldecl_standalone') ne "") { |
87
|
|
|
|
|
|
|
print $output " standalone=\"".$src_doc->metaData('xmldecl_standalone')."\""; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
print $output "?>\n"; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
print $output ("
|
92
|
|
|
|
|
|
|
" \"http://ufal.mff.cuni.cz/~pajas/tred.dtd\" [\n". |
93
|
|
|
|
|
|
|
"
|
94
|
|
|
|
|
|
|
join("\n",map { " $_ CDATA #IMPLIED" } |
95
|
|
|
|
|
|
|
grep { !/^(?:ORD|HIDE|ID)$/ } $src_doc->FS->attributes). |
96
|
|
|
|
|
|
|
"\">\n]>\n"); |
97
|
|
|
|
|
|
|
print $output "\n"; |
98
|
|
|
|
|
|
|
print $output "\n"; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
my @meta=grep { !/^xmldecl_/ } $src_doc->listMetaData(); |
101
|
|
|
|
|
|
|
if (@meta) { |
102
|
|
|
|
|
|
|
print $output "\n"; |
103
|
|
|
|
|
|
|
foreach (@meta) { |
104
|
|
|
|
|
|
|
print $output " metaData($_))."\"/>\n"; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
print $output "\n"; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
print $output "\n"; |
110
|
|
|
|
|
|
|
foreach my $atr (grep { !/^(?:ORD|HIDE|ID)$/ } $src_doc->FS->attributes) { |
111
|
|
|
|
|
|
|
print $output "
|
112
|
|
|
|
|
|
|
if ($src_doc->FS->isList($atr)) { |
113
|
|
|
|
|
|
|
print $output " v=\"",xml_quote(join("|",$src_doc->FS->listValues($atr))),"\""; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
print $output "/>\n"; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
print $output "\n"; |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
foreach my $tree ($src_doc->trees) { |
120
|
|
|
|
|
|
|
my $node=$tree; |
121
|
|
|
|
|
|
|
NODE: while ($node) { |
122
|
|
|
|
|
|
|
print $output "
|
123
|
|
|
|
|
|
|
print $output |
124
|
|
|
|
|
|
|
map { " $_=\"".xml_quote($node->{$_})."\"" } |
125
|
|
|
|
|
|
|
grep { $node->{$_} ne "" } |
126
|
|
|
|
|
|
|
grep { !/^(?:ORD|HIDE|ID)$/ } $src_doc->FS->attributes; |
127
|
|
|
|
|
|
|
print $output ">\n"; |
128
|
|
|
|
|
|
|
if ($node->firstson) { |
129
|
|
|
|
|
|
|
$node=$node->firstson; |
130
|
|
|
|
|
|
|
next; |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
while ($node) { |
133
|
|
|
|
|
|
|
print $output "\n"; |
134
|
|
|
|
|
|
|
if ($node->rbrother) { |
135
|
|
|
|
|
|
|
$node=$node->rbrother; |
136
|
|
|
|
|
|
|
next NODE; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
$node=$node->parent; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
print $output "\n"; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# SAX TrXML to Treex::PML::Document transducer |
147
|
|
|
|
|
|
|
package Treex::PML::Backend::TrXML::SAXHandler; |
148
|
|
|
|
|
|
|
use strict; |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
use vars qw($VERSION); |
151
|
|
|
|
|
|
|
BEGIN { |
152
|
|
|
|
|
|
|
$VERSION='2.22'; # version template |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
use Treex::PML; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub decode { |
157
|
|
|
|
|
|
|
my ($self, $str)=@_; |
158
|
|
|
|
|
|
|
my $enc=$self->{TargetDocument}->encoding(); |
159
|
|
|
|
|
|
|
if ($]>=5.008 or $enc eq "") { |
160
|
|
|
|
|
|
|
return $str; |
161
|
|
|
|
|
|
|
} else { |
162
|
|
|
|
|
|
|
print "encoding: $enc, $str\n"; |
163
|
|
|
|
|
|
|
eval { |
164
|
|
|
|
|
|
|
$str = XML::LibXML::decodeFromUTF8($enc,$str); |
165
|
|
|
|
|
|
|
}; |
166
|
|
|
|
|
|
|
warn $@ if $@; |
167
|
|
|
|
|
|
|
return $str; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub new { |
172
|
|
|
|
|
|
|
my ($class, %args) = @_; |
173
|
|
|
|
|
|
|
bless \%args, $class; |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub start_document { |
177
|
|
|
|
|
|
|
my ($self,$hash) = @_; |
178
|
|
|
|
|
|
|
$self->{TargetDocument} ||= Treex::PML::Factory->createDocument(); |
179
|
|
|
|
|
|
|
$self->{FSAttrs} ||= []; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub end_document { |
183
|
|
|
|
|
|
|
my ($self) = @_; |
184
|
|
|
|
|
|
|
my $FS = Treex::PML::Factory->createFSFormat([ |
185
|
|
|
|
|
|
|
@{$self->{FSAttrs}}, |
186
|
|
|
|
|
|
|
'@N ORD', '@H HIDE', '@K ID' |
187
|
|
|
|
|
|
|
]); |
188
|
|
|
|
|
|
|
$self->{TargetDocument}->changeFS($FS); |
189
|
|
|
|
|
|
|
$self->{TargetDocument}; |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub xml_decl { |
193
|
|
|
|
|
|
|
my ($self,$data) = @_; |
194
|
|
|
|
|
|
|
my $doc = $self->{TargetDocument}; |
195
|
|
|
|
|
|
|
$doc->changeEncoding($data->{Encoding});# || 'iso-8859-2'); |
196
|
|
|
|
|
|
|
$doc->changeMetaData('xmldecl_version' => $data->{Version}); |
197
|
|
|
|
|
|
|
$doc->changeMetaData('xmldecl_standalone' => $data->{Standalone}); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
sub characters { |
201
|
|
|
|
|
|
|
# nothing to do so far |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub start_element { |
205
|
|
|
|
|
|
|
my ($self, $hash) = @_; |
206
|
|
|
|
|
|
|
my $elem = $hash->{Name}; |
207
|
|
|
|
|
|
|
my $attr = $hash->{Attributes}; |
208
|
|
|
|
|
|
|
my $target_doc = $self->{TargetDocument}; |
209
|
|
|
|
|
|
|
# my %attr = map { $_->{Name} => $_->{Value} } values %$attr; |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# $elem eq 'tree' && do { } # nothing to do |
212
|
|
|
|
|
|
|
# $elem eq 'info' && do { } # nothing to do |
213
|
|
|
|
|
|
|
if ($elem eq 'meta') { |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
$target_doc->changeMetaData($self->decode($attr->{'{}name'}->{Value}) => |
216
|
|
|
|
|
|
|
$self->decode($attr->{'{}content'}->{Value})); |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
} elsif ($elem eq 'types') { |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# $target_doc->changeMetaData('TrXML types/@full' => $self->decode($attr->{'{}full'}->{Value})) |
221
|
|
|
|
|
|
|
# if (exists($attr->{'{}full'})); |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
} elsif ($elem eq 't') { |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
my $atrname = $attr->{'{}n'}->{Value}; |
226
|
|
|
|
|
|
|
my $v = exists($attr->{'{}v'}) ? $self->decode($attr->{'{}v'}->{Value}) : ""; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
push @{$self->{FSAttrs}}, '@P '.$atrname; |
229
|
|
|
|
|
|
|
push @{$self->{FSAttrs}}, '@L '.$atrname.'|'.$v if ($v ne ""); |
230
|
|
|
|
|
|
|
# d and m not implemented |
231
|
|
|
|
|
|
|
} elsif ($elem eq 'nd') { |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
my $parent = $self->{Node}; |
234
|
|
|
|
|
|
|
my $new; |
235
|
|
|
|
|
|
|
if ($parent) { |
236
|
|
|
|
|
|
|
$self->{Node} = $new = Treex::PML::Factory->createNode(); |
237
|
|
|
|
|
|
|
} else { |
238
|
|
|
|
|
|
|
undef $parent; |
239
|
|
|
|
|
|
|
$self->{Tree} = $self->{TargetDocument}->new_tree($self->{TargetDocument}->lastTreeNo+1); |
240
|
|
|
|
|
|
|
$self->{Node} = $new = $self->{Tree}; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
$new->{ORD}=$attr->{'{}n'}->{Value}; |
243
|
|
|
|
|
|
|
$new->{HIDE}='hide'x$attr->{'{}h'}->{Value}; |
244
|
|
|
|
|
|
|
$new->{ID}=$self->decode($attr->{'{}id'}->{Value}); |
245
|
|
|
|
|
|
|
foreach (grep { !/^{}(?:n|h|id)$/ } keys %$attr) { |
246
|
|
|
|
|
|
|
$new->{$self->decode($attr->{$_}->{Name})} = $self->decode($attr->{$_}->{Value}); |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
$new->paste_on($parent,'ORD') if ($parent); |
249
|
|
|
|
|
|
|
} elsif ($elem eq 'trees' or $elem eq 'info') { |
250
|
|
|
|
|
|
|
# do nothing |
251
|
|
|
|
|
|
|
} else { |
252
|
|
|
|
|
|
|
die "Treex::PML::Backend::TrXML: unknown element $elem\n"; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
$self->{attributes}=$attr; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
sub end_element { |
258
|
|
|
|
|
|
|
my ($self,$hash) = @_; |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
if ($hash->{Name} eq 'nd') { |
261
|
|
|
|
|
|
|
$self->{Node}=$self->{Node}->parent; |
262
|
|
|
|
|
|
|
} elsif ($hash->{Name} eq 'trees') { |
263
|
|
|
|
|
|
|
$self->{Node}=undef; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
sub entity_reference { |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
sub start_cdata { # not much use for this |
271
|
|
|
|
|
|
|
my $self = shift; |
272
|
|
|
|
|
|
|
$self->{InCDATA} = 1; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub end_cdata { # not much use for this |
276
|
|
|
|
|
|
|
my $self = shift; |
277
|
|
|
|
|
|
|
$self->{InCDATA} = 0; |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub comment { |
281
|
|
|
|
|
|
|
my $self = $_[0]; |
282
|
|
|
|
|
|
|
my $data = $_[1]; |
283
|
|
|
|
|
|
|
if ($self->{Node}) { |
284
|
|
|
|
|
|
|
$self->{Node}->{xml_comment}.=''; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
sub doctype_decl { # not use for this, so far |
289
|
|
|
|
|
|
|
my ($self,$hash) = @_; |
290
|
|
|
|
|
|
|
foreach (qw(Name SystemId PublicId Internal)) { |
291
|
|
|
|
|
|
|
$self->{"DocType_$_"} = $hash->{$_}; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
1; |
296
|
|
|
|
|
|
|
__END__ |