blib/lib/Treex/PML/Backend/TEIXML.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 29 | 217 | 13.3 |
branch | 0 | 90 | 0.0 |
condition | 0 | 52 | 0.0 |
subroutine | 11 | 32 | 34.3 |
pod | 0 | 6 | 0.0 |
total | 40 | 397 | 10.0 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | ## This is a simple XML backend for TEI files -*-cperl-*- | ||||||
2 | ## author: Petr Pajas | ||||||
3 | # $Id: TEIXML.pm 2762 2006-07-28 13:57:23Z pajas $ ' | ||||||
4 | ############################################################# | ||||||
5 | |||||||
6 | package Treex::PML::Backend::TEIXML; | ||||||
7 | 1 | 1 | 1046 | use Treex::PML; | |||
1 | 3 | ||||||
1 | 84 | ||||||
8 | 1 | 1 | 8 | use XML::LibXML; | |||
1 | 2 | ||||||
1 | 10 | ||||||
9 | 1 | 1 | 688 | use XML::LibXML::SAX::Parser; | |||
1 | 25859 | ||||||
1 | 46 | ||||||
10 | 1 | 1 | 12 | use Treex::PML::IO qw(close_backend); | |||
1 | 2 | ||||||
1 | 74 | ||||||
11 | 1 | 1 | 6 | use strict; | |||
1 | 3 | ||||||
1 | 35 | ||||||
12 | |||||||
13 | 1 | 1 | 6 | use vars qw($VERSION); | |||
1 | 4 | ||||||
1 | 55 | ||||||
14 | BEGIN { | ||||||
15 | 1 | 1 | 1464 | $VERSION='2.24'; # version template | |||
16 | } | ||||||
17 | |||||||
18 | sub open_backend { | ||||||
19 | 0 | 0 | 0 | my ($uri,$rw,$encoding)=@_; | |||
20 | # discard encoding and pass the rest to the Treex::PML::IO | ||||||
21 | 0 | 0 | Treex::PML::IO::open_backend($uri,$rw,($rw eq 'w' ? $encoding : undef)); | ||||
22 | } | ||||||
23 | |||||||
24 | |||||||
25 | sub test { | ||||||
26 | # should be replaced with some better magic-test for TEI XML | ||||||
27 | 0 | 0 | 0 | my ($f)=@_; | |||
28 | |||||||
29 | 0 | 0 | if (ref($f)) { | ||||
30 | 0 | my $line1=$f->getline(); | |||||
31 | 0 | my $line2=$f->getline(); | |||||
32 | 0 | 0 | return ($line1 =~ /^\s*<\?xml / and ($line2 =~ /^\s* ]/ |
||||
33 | or $line2 =~ /^\s* | ||||||
34 | } else { | ||||||
35 | 0 | my $fh = Treex::PML::IO::open_backend($f,"r"); | |||||
36 | 0 | 0 | my $test = $fh && test($fh); | ||||
37 | 0 | Treex::PML::IO::close_backend($fh); | |||||
38 | 0 | return $test; | |||||
39 | } | ||||||
40 | } | ||||||
41 | |||||||
42 | sub read { | ||||||
43 | 0 | 0 | 0 | my ($input,$target_doc) = @_; | |||
44 | 0 | my $handler = Treex::PML::Backend::TEIXML::SAXHandler->new(TargetDocument => $target_doc); | |||||
45 | 0 | my $p = XML::LibXML::SAX::Parser->new(Handler => $handler); | |||||
46 | 0 | 0 | if (ref($input)) { | ||||
47 | 0 | $p->parse(Source => { ByteStream => $input }); | |||||
48 | } else { | ||||||
49 | 0 | $p->parse_uri($input); | |||||
50 | } | ||||||
51 | |||||||
52 | 0 | return 1; | |||||
53 | } | ||||||
54 | |||||||
55 | sub xml_quote { | ||||||
56 | 0 | 0 | 0 | local $_=$_[0]; | |||
57 | 0 | s/&/&/g; | |||||
58 | 0 | s/\'/'/g; | |||||
59 | 0 | s/\"/"/g; | |||||
60 | 0 | s/>/>/g; | |||||
61 | 0 | s/</g; | |||||
62 | 0 | return $_; | |||||
63 | } | ||||||
64 | |||||||
65 | sub xml_quote_pcdata { | ||||||
66 | 0 | 0 | 0 | local $_=$_[0]; | |||
67 | 0 | s/&/&/g; | |||||
68 | 0 | s/>/>/g; | |||||
69 | 0 | s/</g; | |||||
70 | 0 | return $_; | |||||
71 | } | ||||||
72 | |||||||
73 | |||||||
74 | sub write { | ||||||
75 | 0 | 0 | 0 | my ($output, $src_doc) = @_; | |||
76 | |||||||
77 | 0 | 0 | die "Require GLOB reference\n" unless ref($output); | ||||
78 | |||||||
79 | 0 | my $rootdep=''; | |||||
80 | 0 | 0 | 0 | if ($src_doc->FS->exists('dep') && | |||
81 | $src_doc->FS->isList('dep')) { | ||||||
82 | 0 | ($rootdep)=$src_doc->FS->listValues('dep'); | |||||
83 | } | ||||||
84 | # xml_decl | ||||||
85 | 0 | print $output " | |||||
86 | 0 | 0 | if ($src_doc->metaData('xmldecl_version') ne "") { | ||||
87 | 0 | print $output " version=\"".$src_doc->metaData('xmldecl_version')."\""; | |||||
88 | } else { | ||||||
89 | 0 | print $output " version=\"1.0\""; | |||||
90 | } | ||||||
91 | 0 | 0 | if ($src_doc->encoding() ne "") { | ||||
92 | 0 | print $output " encoding=\"".$src_doc->encoding()."\""; | |||||
93 | } | ||||||
94 | 0 | 0 | if ($src_doc->metaData('xmldecl_standalone') ne "") { | ||||
95 | 0 | print $output " standalone=\"".$src_doc->metaData('xmldecl_standalone')."\""; | |||||
96 | } | ||||||
97 | 0 | print $output "?>\n"; | |||||
98 | |||||||
99 | 0 | 0 | if ($src_doc->metaData('xml_doctype')) { | ||||
100 | 0 | my $properties=$src_doc->metaData('xml_doctype'); | |||||
101 | 0 | 0 | unless ($properties->{'Name'}) { | ||||
102 | 0 | my $output = "DOCTYPE ".$properties->{'Name'}; | |||||
103 | 0 | 0 | $output .= ' SYSTEM "'.$properties->{'SystemId'}.'"' if $properties->{'SystemId'}; | ||||
104 | 0 | 0 | $output .= ' PUBLIC "'.$properties->{'PublicId'}.'"' if $properties->{'PublicId'}; | ||||
105 | 0 | 0 | $output .= ' '.$properties->{'Internal'} if $properties->{'Internal'}; | ||||
106 | 0 | print $output ""; | |||||
107 | } | ||||||
108 | } | ||||||
109 | |||||||
110 | 0 | print $output " |
|||||
111 | # declare all list attributes as fLib. If fLib info exists, use it | ||||||
112 | # to get value identifiers | ||||||
113 | 0 | foreach my $attr (grep { $src_doc->FS->isList($_) } $src_doc->FS->attributes) { | |||||
0 | |||||||
114 | 0 | my %valids; | |||||
115 | 0 | 0 | if (ref($src_doc->metaData('fLib'))) { | ||||
116 | 0 | my $flib=$src_doc->metaData('fLib'); | |||||
117 | 0 | 0 | if (exists($flib->{$attr})) { | ||||
118 | 0 | foreach (@{$flib->{$attr}}) { | |||||
0 | |||||||
119 | 0 | $valids{$_->[1]} = $_->[0]; | |||||
120 | } | ||||||
121 | } | ||||||
122 | } | ||||||
123 | 0 | print $output " |
|||||
124 | 0 | foreach ($src_doc->FS->listValues($attr)) { | |||||
125 | 0 | print $output " | |||||
126 | 0 | 0 | 0 | print $output " id=\"$valids{$_}\"" if (exists($valids{$_}) and $valids{$_} ne ""); | |||
127 | 0 | print $output " name=\"$attr\">", | |||||
128 | " |
||||||
129 | } | ||||||
130 | 0 | print $output "\n"; | |||||
131 | } | ||||||
132 | 0 | print $output "\n"; | |||||
133 | 0 | print $output "
| |||||
134 | 0 | 0 | if ($src_doc->tree(0)) { | ||||
135 | 0 | my $tree0=$src_doc->tree(0); | |||||
136 | 0 | foreach ($src_doc->FS->attributes()) { | |||||
137 | print $output " $1=\"".xml_quote($tree0->{$_})."\"" | ||||||
138 | 0 | 0 | 0 | if (/^p_(.*)$/ and $tree0->{$_} ne ""); | |||
139 | } | ||||||
140 | } | ||||||
141 | 0 | print $output ">\n"; | |||||
142 | |||||||
143 | 0 | foreach my $tree ($src_doc->trees) { | |||||
144 | 0 | print $output " | |||||
145 | 0 | foreach ($src_doc->FS->attributes()) { | |||||
146 | print $output " $1=\"".xml_quote($tree->{$_})."\"" | ||||||
147 | 0 | 0 | 0 | if (/^s_(.*)/ and $tree->{$_} ne ""); | |||
148 | } | ||||||
149 | 0 | print $output ">\n"; | |||||
150 | |||||||
151 | 0 | foreach my $node (sort { $a->{ord} <=> $b->{ord} } $tree->descendants) { | |||||
0 | |||||||
152 | 0 | 0 | my $type=$node->{tei_type} || "w"; | ||||
153 | 0 | print $output "<$type"; | |||||
154 | 0 | foreach (grep { exists($node->{$_}) and | |||||
155 | 0 | 0 | 0 | defined($node->{$_}) and | |||
156 | !/^[sp]_|^(?:form|type|ord|dep)$/ } | ||||||
157 | $src_doc->FS->attributes()) { | ||||||
158 | 0 | print $output " $_=\"".xml_quote($node->{$_})."\""; | |||||
159 | } | ||||||
160 | print $output " dep=\"". | ||||||
161 | xml_quote($node->parent->parent ? ($node->parent->{id} | ||||||
162 | || $node->parent->{AID} #grrrrrrrr! | ||||||
163 | 0 | 0 | 0 | ) : $rootdep )."\""; | |||
164 | 0 | print $output ">"; | |||||
165 | 0 | print $output xml_quote_pcdata($node->{form}); | |||||
166 | 0 | print $output "$type>\n"; | |||||
167 | } | ||||||
168 | |||||||
169 | 0 | print $output "\n"; | |||||
170 | } | ||||||
171 | |||||||
172 | 0 | print $output "\n"; | |||||
173 | 0 | print $output "\n"; | |||||
174 | 0 | print $output "\n"; | |||||
175 | } | ||||||
176 | |||||||
177 | |||||||
178 | # SAX TEI-XML to Treex::PML::Document transducer | ||||||
179 | package Treex::PML::Backend::TEIXML::SAXHandler; | ||||||
180 | 1 | 1 | 9 | use strict; | |||
1 | 3 | ||||||
1 | 37 | ||||||
181 | |||||||
182 | 1 | 1 | 8 | use vars qw($VERSION); | |||
1 | 3 | ||||||
1 | 45 | ||||||
183 | BEGIN { | ||||||
184 | 1 | 1 | 30 | $VERSION='2.24'; # version template | |||
185 | } | ||||||
186 | 1 | 1 | 9 | use Treex::PML; | |||
1 | 2 | ||||||
1 | 1818 | ||||||
187 | |||||||
188 | sub new { | ||||||
189 | 0 | 0 | my ($class, %args) = @_; | ||||
190 | 0 | bless \%args, $class; | |||||
191 | } | ||||||
192 | |||||||
193 | sub start_document { | ||||||
194 | 0 | 0 | my ($self,$hash) = @_; | ||||
195 | 0 | 0 | $self->{TargetDocument} ||= Treex::PML::Factory->createDocument(); | ||||
196 | } | ||||||
197 | |||||||
198 | sub end_document { | ||||||
199 | 0 | 0 | my ($self) = @_; | ||||
200 | 0 | my @header = ('@V form','@V form','@N ord'); | |||||
201 | 0 | foreach my $attr (keys(%{$self->{FSAttrs}})) { | |||||
0 | |||||||
202 | 0 | push @header, '@P '.$attr; | |||||
203 | 0 | 0 | 0 | if (exists($self->{FSAttrSyms}->{$attr}) | |||
204 | and ref($self->{FSAttrSyms}->{$attr})) { | ||||||
205 | 0 | my ($list); | |||||
206 | 0 | foreach (@{$self->{FSAttrSyms}->{$attr}}) { | |||||
0 | |||||||
207 | 0 | $list.="|$_->[1]"; | |||||
208 | } | ||||||
209 | 0 | push @header, '@L '.$attr.$list; | |||||
210 | } | ||||||
211 | } | ||||||
212 | 0 | $self->{TargetDocument}->changeFS(Treex::PML::Factory->createFSFormat(\@header)); | |||||
213 | 0 | $self->{TargetDocument}->changeMetaData('fLib' => $self->{FSAttrSyms}); | |||||
214 | 0 | $self->{TargetDocument}; | |||||
215 | } | ||||||
216 | |||||||
217 | sub xml_decl { | ||||||
218 | 0 | 0 | my ($self,$data) = @_; | ||||
219 | 0 | 0 | $self->{TargetDocument}->changeEncoding($data->{Encoding} || 'iso-8859-2'); | ||||
220 | 0 | $self->{TargetDocument}->changeMetaData('xmldecl_version' => $data->{Version}); | |||||
221 | 0 | $self->{TargetDocument}->changeMetaData('xmldecl_standalone' => $data->{Standalone}); | |||||
222 | } | ||||||
223 | |||||||
224 | sub characters { | ||||||
225 | 0 | 0 | my ($self,$hash) = @_; | ||||
226 | 0 | 0 | return unless $self->{Node}; | ||||
227 | 0 | 0 | 0 | if (($self->{Node}{tei_type} eq 'w') or | |||
228 | ($self->{Node}{tei_type} eq 'c')) { | ||||||
229 | 0 | my $str = $hash->{Data}; | |||||
230 | 0 | 0 | if ($]>=5.008) { | ||||
231 | # leave data in the UTF-8 encoding | ||||||
232 | 0 | $self->{Node}->{form}.=$str; | |||||
233 | } else { | ||||||
234 | $self->{Node}->{form}=$self->{Node}->{form}. | ||||||
235 | 0 | XML::LibXML::decodeFromUTF8($self->{TargetDocument}->encoding(),$str); | |||||
236 | } | ||||||
237 | } | ||||||
238 | } | ||||||
239 | |||||||
240 | sub start_element { | ||||||
241 | 0 | 0 | my ($self, $hash) = @_; | ||||
242 | 0 | my $elem = $hash->{Name}; | |||||
243 | 0 | my $attr = $hash->{Attributes}; | |||||
244 | 0 | my $target_doc = $self->{TargetDocument}; | |||||
245 | |||||||
246 | 0 | 0 | 0 | if ($elem eq 'p') { | |||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
247 | 0 | $self->{DocAttributes}=$attr; | |||||
248 | } elsif ($elem eq 'f') { | ||||||
249 | 0 | $self->{CurrentFSAttr}=$attr->{"{}name"}->{Value}; | |||||
250 | 0 | $self->{CurrentFSAttrID}=$attr->{"{}id"}->{Value}; | |||||
251 | 0 | $self->{FSAttrs}->{$attr->{"{}name"}->{Value}}=1; | |||||
252 | } elsif ($elem eq 'sym') { | ||||||
253 | 0 | push @{$self->{FSAttrSyms}->{$self->{CurrentFSAttr}}}, | |||||
254 | 0 | [$self->{CurrentFSAttrID},$attr->{"{}value"}->{Value}]; | |||||
255 | } elsif ($elem eq 's') { | ||||||
256 | 0 | my $node = $self->{Node} = $self->{Tree} = $target_doc->new_tree($target_doc->lastTreeNo+1); | |||||
257 | 0 | $node->{ord} = 0; | |||||
258 | 0 | $self->{LastOrd} = 0; | |||||
259 | 0 | $node->{tei_type}=$elem; | |||||
260 | 0 | $node->{form}='#'.($target_doc->lastTreeNo+1); | |||||
261 | 0 | 0 | if (ref($attr)) { | ||||
262 | 0 | foreach (values %$attr) { | |||||
263 | $node->{'s_'.$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
264 | 0 | 0 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||
265 | 0 | $self->{FSAttrs}->{'s_'.$_->{Name}}=1; | |||||
266 | } | ||||||
267 | $self->{IDs}->{$node->{id}}=$node | ||||||
268 | 0 | 0 | if ($node->{id} ne ''); | ||||
269 | $self->{IDs}->{$node->{AID}}=$node | ||||||
270 | 0 | 0 | if ($node->{AID} ne ''); #grrrrrrrr! | ||||
271 | } | ||||||
272 | 0 | 0 | 0 | if ($target_doc->lastTreeNo == 0 and ref($self->{DocAttributes})) { | |||
273 | 0 | foreach (values %{$self->{DocAttributes}}) { | |||||
0 | |||||||
274 | # leave data in the UTF-8 encoding in Perl 5.8 | ||||||
275 | $node->{'p_'.$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
276 | 0 | 0 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||
277 | 0 | $self->{FSAttrs}->{"p_".$_->{Name}}=1; | |||||
278 | } | ||||||
279 | } | ||||||
280 | } elsif ($elem eq 'w' or $elem eq 'c') { | ||||||
281 | 0 | my $node = $self->{Node} = Treex::PML::Factory->createNode(); | |||||
282 | 0 | $node->{tei_type}=$elem; | |||||
283 | 0 | $node->{ord} = ++($self->{LastOrd}); | |||||
284 | 0 | $node->paste_on($self->{Tree},'ord'); | |||||
285 | 0 | 0 | if (ref($attr)) { | ||||
286 | 0 | foreach (values %$attr) { | |||||
287 | $node->{$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
288 | 0 | 0 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||
289 | 0 | $self->{FSAttrs}->{$_->{Name}}=1; | |||||
290 | } | ||||||
291 | $self->{IDs}->{$node->{id}}=$node | ||||||
292 | 0 | 0 | if ($node->{id} ne ''); | ||||
293 | $self->{IDs}->{$node->{AID}}=$node | ||||||
294 | 0 | 0 | if ($node->{AID} ne ''); #grrrrrrrr! | ||||
295 | } | ||||||
296 | } | ||||||
297 | } | ||||||
298 | |||||||
299 | sub end_element { | ||||||
300 | 0 | 0 | my ($self) = @_; | ||||
301 | 0 | 0 | 0 | if ($self->{Node} and $self->{Node}->{tei_type} eq 's') { | |||
302 | # build the tree (no consistency checks at all) | ||||||
303 | 0 | my @nodes=$self->{Tree}->descendants; | |||||
304 | 0 | foreach my $node (@nodes) { | |||||
305 | 0 | my $dep=$node->{dep}; | |||||
306 | 0 | 0 | 0 | if ($dep ne '' and | |||
307 | ref($self->{IDs}{$dep})) { | ||||||
308 | 0 | $node->cut()->paste_on($self->{IDs}{$dep}, 'ord'); | |||||
309 | } | ||||||
310 | } | ||||||
311 | } | ||||||
312 | 0 | 0 | $self->{Node} = $self->{Node}->parent if ($self->{Node}); | ||||
313 | } | ||||||
314 | |||||||
315 | sub start_entity { | ||||||
316 | # just hoping some parser would support these | ||||||
317 | 0 | 0 | print "START ENTITY: @{$_[1]}\n"; | ||||
0 | |||||||
318 | } | ||||||
319 | |||||||
320 | |||||||
321 | sub end_entity { | ||||||
322 | 0 | 0 | print "END ENTITY: @{$_[1]}\n"; | ||||
0 | |||||||
323 | } | ||||||
324 | |||||||
325 | sub entity_reference { | ||||||
326 | 0 | 0 | my $self = $_[0]; | ||||
327 | 0 | my $name = $_[1]->{Name}; | |||||
328 | 0 | 0 | 0 | if ($self->{Node}->{tei_type} eq 'w' or | |||
329 | $self->{Node}->{tei_type} eq 'c') { | ||||||
330 | 0 | $self->{Node}->{form}.='&'.$name.';'; | |||||
331 | } | ||||||
332 | } | ||||||
333 | |||||||
334 | sub start_cdata { # not much use for this | ||||||
335 | 0 | 0 | my $self = shift; | ||||
336 | 0 | $self->{InCDATA} = 1; | |||||
337 | } | ||||||
338 | |||||||
339 | sub end_cdata { # not much use for this | ||||||
340 | 0 | 0 | my $self = shift; | ||||
341 | 0 | $self->{InCDATA} = 0; | |||||
342 | } | ||||||
343 | |||||||
344 | sub comment { | ||||||
345 | 0 | 0 | my $self = $_[0]; | ||||
346 | 0 | my $data = $_[1]; | |||||
347 | 0 | 0 | if ($self->{Node}) { | ||||
348 | 0 | $self->{Node}->{xml_comment}.=''; | |||||
349 | } | ||||||
350 | } | ||||||
351 | |||||||
352 | sub doctype_decl { # unfortunatelly, not called by the parser, so far | ||||||
353 | 0 | 0 | my ($self,$hash) = @_; | ||||
354 | 0 | $self->{TargetDocument}->changeMetaData("xml_doctype" => $hash); | |||||
355 | } | ||||||
356 | |||||||
357 | # hack to fix LibXML | ||||||
358 | 0 | 0 | sub XML::LibXML::Dtd::type { return $_[0]->nodeType } | ||||
359 | |||||||
360 | |||||||
361 | 1; | ||||||
362 | __END__ |