blib/lib/Treex/PML/Backend/TEIXML.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 1 | 3 | 33.3 |
branch | n/a | ||
condition | n/a | ||
subroutine | 1 | 1 | 100.0 |
pod | n/a | ||
total | 2 | 4 | 50.0 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | ## This is a simple XML backend for TEI files -*-cperl-*- | ||||||
2 | ## author: Petr Pajas | ||||||
3 | # $Id: TEIXML.pm 2762 2006-07-28 13:57:23Z pajas $ ' | ||||||
4 | ############################################################# | ||||||
5 | |||||||
6 | package Treex::PML::Backend::TEIXML; | ||||||
7 | 1 | 1 | 894 | use Treex::PML; | |||
0 | |||||||
0 | |||||||
8 | use XML::LibXML; | ||||||
9 | use XML::LibXML::SAX::Parser; | ||||||
10 | use Treex::PML::IO qw(close_backend); | ||||||
11 | use strict; | ||||||
12 | |||||||
13 | use vars qw($VERSION); | ||||||
14 | BEGIN { | ||||||
15 | $VERSION='2.22'; # version template | ||||||
16 | } | ||||||
17 | |||||||
18 | sub open_backend { | ||||||
19 | my ($uri,$rw,$encoding)=@_; | ||||||
20 | # discard encoding and pass the rest to the Treex::PML::IO | ||||||
21 | Treex::PML::IO::open_backend($uri,$rw,($rw eq 'w' ? $encoding : undef)); | ||||||
22 | } | ||||||
23 | |||||||
24 | |||||||
25 | sub test { | ||||||
26 | # should be replaced with some better magic-test for TEI XML | ||||||
27 | my ($f)=@_; | ||||||
28 | |||||||
29 | if (ref($f)) { | ||||||
30 | my $line1=$f->getline(); | ||||||
31 | my $line2=$f->getline(); | ||||||
32 | return ($line1 =~ /^\s*<\?xml / and ($line2 =~ /^\s* ]/ |
||||||
33 | or $line2 =~ /^\s* | ||||||
34 | } else { | ||||||
35 | my $fh = Treex::PML::IO::open_backend($f,"r"); | ||||||
36 | my $test = $fh && test($fh); | ||||||
37 | Treex::PML::IO::close_backend($fh); | ||||||
38 | return $test; | ||||||
39 | } | ||||||
40 | } | ||||||
41 | |||||||
42 | sub read { | ||||||
43 | my ($input,$target_doc) = @_; | ||||||
44 | my $handler = Treex::PML::Backend::TEIXML::SAXHandler->new(TargetDocument => $target_doc); | ||||||
45 | my $p = XML::LibXML::SAX::Parser->new(Handler => $handler); | ||||||
46 | if (ref($input)) { | ||||||
47 | $p->parse(Source => { ByteStream => $input }); | ||||||
48 | } else { | ||||||
49 | $p->parse_uri($input); | ||||||
50 | } | ||||||
51 | |||||||
52 | return 1; | ||||||
53 | } | ||||||
54 | |||||||
55 | sub xml_quote { | ||||||
56 | local $_=$_[0]; | ||||||
57 | s/&/&/g; | ||||||
58 | s/\'/'/g; | ||||||
59 | s/\"/"/g; | ||||||
60 | s/>/>/g; | ||||||
61 | s/</g; | ||||||
62 | return $_; | ||||||
63 | } | ||||||
64 | |||||||
65 | sub xml_quote_pcdata { | ||||||
66 | local $_=$_[0]; | ||||||
67 | s/&/&/g; | ||||||
68 | s/>/>/g; | ||||||
69 | s/</g; | ||||||
70 | return $_; | ||||||
71 | } | ||||||
72 | |||||||
73 | |||||||
74 | sub write { | ||||||
75 | my ($output, $src_doc) = @_; | ||||||
76 | |||||||
77 | die "Require GLOB reference\n" unless ref($output); | ||||||
78 | |||||||
79 | my $rootdep=''; | ||||||
80 | if ($src_doc->FS->exists('dep') && | ||||||
81 | $src_doc->FS->isList('dep')) { | ||||||
82 | ($rootdep)=$src_doc->FS->listValues('dep'); | ||||||
83 | } | ||||||
84 | # xml_decl | ||||||
85 | print $output " | ||||||
86 | if ($src_doc->metaData('xmldecl_version') ne "") { | ||||||
87 | print $output " version=\"".$src_doc->metaData('xmldecl_version')."\""; | ||||||
88 | } else { | ||||||
89 | print $output " version=\"1.0\""; | ||||||
90 | } | ||||||
91 | if ($src_doc->encoding() ne "") { | ||||||
92 | print $output " encoding=\"".$src_doc->encoding()."\""; | ||||||
93 | } | ||||||
94 | if ($src_doc->metaData('xmldecl_standalone') ne "") { | ||||||
95 | print $output " standalone=\"".$src_doc->metaData('xmldecl_standalone')."\""; | ||||||
96 | } | ||||||
97 | print $output "?>\n"; | ||||||
98 | |||||||
99 | if ($src_doc->metaData('xml_doctype')) { | ||||||
100 | my $properties=$src_doc->metaData('xml_doctype'); | ||||||
101 | unless ($properties->{'Name'}) { | ||||||
102 | my $output = "DOCTYPE ".$properties->{'Name'}; | ||||||
103 | $output .= ' SYSTEM "'.$properties->{'SystemId'}.'"' if $properties->{'SystemId'}; | ||||||
104 | $output .= ' PUBLIC "'.$properties->{'PublicId'}.'"' if $properties->{'PublicId'}; | ||||||
105 | $output .= ' '.$properties->{'Internal'} if $properties->{'Internal'}; | ||||||
106 | print $output ""; | ||||||
107 | } | ||||||
108 | } | ||||||
109 | |||||||
110 | print $output " |
||||||
111 | # declare all list attributes as fLib. If fLib info exists, use it | ||||||
112 | # to get value identifiers | ||||||
113 | foreach my $attr (grep { $src_doc->FS->isList($_) } $src_doc->FS->attributes) { | ||||||
114 | my %valids; | ||||||
115 | if (ref($src_doc->metaData('fLib'))) { | ||||||
116 | my $flib=$src_doc->metaData('fLib'); | ||||||
117 | if (exists($flib->{$attr})) { | ||||||
118 | foreach (@{$flib->{$attr}}) { | ||||||
119 | $valids{$_->[1]} = $_->[0]; | ||||||
120 | } | ||||||
121 | } | ||||||
122 | } | ||||||
123 | print $output " |
||||||
124 | foreach ($src_doc->FS->listValues($attr)) { | ||||||
125 | print $output " | ||||||
126 | print $output " id=\"$valids{$_}\"" if (exists($valids{$_}) and $valids{$_} ne ""); | ||||||
127 | print $output " name=\"$attr\">", | ||||||
128 | " |
||||||
129 | } | ||||||
130 | print $output "\n"; | ||||||
131 | } | ||||||
132 | print $output "\n"; | ||||||
133 | print $output "
| ||||||
134 | if ($src_doc->tree(0)) { | ||||||
135 | my $tree0=$src_doc->tree(0); | ||||||
136 | foreach ($src_doc->FS->attributes()) { | ||||||
137 | print $output " $1=\"".xml_quote($tree0->{$_})."\"" | ||||||
138 | if (/^p_(.*)$/ and $tree0->{$_} ne ""); | ||||||
139 | } | ||||||
140 | } | ||||||
141 | print $output ">\n"; | ||||||
142 | |||||||
143 | foreach my $tree ($src_doc->trees) { | ||||||
144 | print $output " | ||||||
145 | foreach ($src_doc->FS->attributes()) { | ||||||
146 | print $output " $1=\"".xml_quote($tree->{$_})."\"" | ||||||
147 | if (/^s_(.*)/ and $tree->{$_} ne ""); | ||||||
148 | } | ||||||
149 | print $output ">\n"; | ||||||
150 | |||||||
151 | foreach my $node (sort { $a->{ord} <=> $b->{ord} } $tree->descendants) { | ||||||
152 | my $type=$node->{tei_type} || "w"; | ||||||
153 | print $output "<$type"; | ||||||
154 | foreach (grep { exists($node->{$_}) and | ||||||
155 | defined($node->{$_}) and | ||||||
156 | !/^[sp]_|^(?:form|type|ord|dep)$/ } | ||||||
157 | $src_doc->FS->attributes()) { | ||||||
158 | print $output " $_=\"".xml_quote($node->{$_})."\""; | ||||||
159 | } | ||||||
160 | print $output " dep=\"". | ||||||
161 | xml_quote($node->parent->parent ? ($node->parent->{id} | ||||||
162 | || $node->parent->{AID} #grrrrrrrr! | ||||||
163 | ) : $rootdep )."\""; | ||||||
164 | print $output ">"; | ||||||
165 | print $output xml_quote_pcdata($node->{form}); | ||||||
166 | print $output "$type>\n"; | ||||||
167 | } | ||||||
168 | |||||||
169 | print $output "\n"; | ||||||
170 | } | ||||||
171 | |||||||
172 | print $output "\n"; | ||||||
173 | print $output "\n"; | ||||||
174 | print $output "\n"; | ||||||
175 | } | ||||||
176 | |||||||
177 | |||||||
178 | # SAX TEI-XML to Treex::PML::Document transducer | ||||||
179 | package Treex::PML::Backend::TEIXML::SAXHandler; | ||||||
180 | use strict; | ||||||
181 | |||||||
182 | use vars qw($VERSION); | ||||||
183 | BEGIN { | ||||||
184 | $VERSION='2.22'; # version template | ||||||
185 | } | ||||||
186 | use Treex::PML; | ||||||
187 | |||||||
188 | sub new { | ||||||
189 | my ($class, %args) = @_; | ||||||
190 | bless \%args, $class; | ||||||
191 | } | ||||||
192 | |||||||
193 | sub start_document { | ||||||
194 | my ($self,$hash) = @_; | ||||||
195 | $self->{TargetDocument} ||= Treex::PML::Factory->createDocument(); | ||||||
196 | } | ||||||
197 | |||||||
198 | sub end_document { | ||||||
199 | my ($self) = @_; | ||||||
200 | my @header = ('@V form','@V form','@N ord'); | ||||||
201 | foreach my $attr (keys(%{$self->{FSAttrs}})) { | ||||||
202 | push @header, '@P '.$attr; | ||||||
203 | if (exists($self->{FSAttrSyms}->{$attr}) | ||||||
204 | and ref($self->{FSAttrSyms}->{$attr})) { | ||||||
205 | my ($list); | ||||||
206 | foreach (@{$self->{FSAttrSyms}->{$attr}}) { | ||||||
207 | $list.="|$_->[1]"; | ||||||
208 | } | ||||||
209 | push @header, '@L '.$attr.$list; | ||||||
210 | } | ||||||
211 | } | ||||||
212 | $self->{TargetDocument}->changeFS(Treex::PML::Factory->createFSFormat(\@header)); | ||||||
213 | $self->{TargetDocument}->changeMetaData('fLib' => $self->{FSAttrSyms}); | ||||||
214 | $self->{TargetDocument}; | ||||||
215 | } | ||||||
216 | |||||||
217 | sub xml_decl { | ||||||
218 | my ($self,$data) = @_; | ||||||
219 | $self->{TargetDocument}->changeEncoding($data->{Encoding} || 'iso-8859-2'); | ||||||
220 | $self->{TargetDocument}->changeMetaData('xmldecl_version' => $data->{Version}); | ||||||
221 | $self->{TargetDocument}->changeMetaData('xmldecl_standalone' => $data->{Standalone}); | ||||||
222 | } | ||||||
223 | |||||||
224 | sub characters { | ||||||
225 | my ($self,$hash) = @_; | ||||||
226 | return unless $self->{Node}; | ||||||
227 | if (($self->{Node}{tei_type} eq 'w') or | ||||||
228 | ($self->{Node}{tei_type} eq 'c')) { | ||||||
229 | my $str = $hash->{Data}; | ||||||
230 | if ($]>=5.008) { | ||||||
231 | # leave data in the UTF-8 encoding | ||||||
232 | $self->{Node}->{form}.=$str; | ||||||
233 | } else { | ||||||
234 | $self->{Node}->{form}=$self->{Node}->{form}. | ||||||
235 | XML::LibXML::decodeFromUTF8($self->{TargetDocument}->encoding(),$str); | ||||||
236 | } | ||||||
237 | } | ||||||
238 | } | ||||||
239 | |||||||
240 | sub start_element { | ||||||
241 | my ($self, $hash) = @_; | ||||||
242 | my $elem = $hash->{Name}; | ||||||
243 | my $attr = $hash->{Attributes}; | ||||||
244 | my $target_doc = $self->{TargetDocument}; | ||||||
245 | |||||||
246 | if ($elem eq 'p') { | ||||||
247 | $self->{DocAttributes}=$attr; | ||||||
248 | } elsif ($elem eq 'f') { | ||||||
249 | $self->{CurrentFSAttr}=$attr->{"{}name"}->{Value}; | ||||||
250 | $self->{CurrentFSAttrID}=$attr->{"{}id"}->{Value}; | ||||||
251 | $self->{FSAttrs}->{$attr->{"{}name"}->{Value}}=1; | ||||||
252 | } elsif ($elem eq 'sym') { | ||||||
253 | push @{$self->{FSAttrSyms}->{$self->{CurrentFSAttr}}}, | ||||||
254 | [$self->{CurrentFSAttrID},$attr->{"{}value"}->{Value}]; | ||||||
255 | } elsif ($elem eq 's') { | ||||||
256 | my $node = $self->{Node} = $self->{Tree} = $target_doc->new_tree($target_doc->lastTreeNo+1); | ||||||
257 | $node->{ord} = 0; | ||||||
258 | $self->{LastOrd} = 0; | ||||||
259 | $node->{tei_type}=$elem; | ||||||
260 | $node->{form}='#'.($target_doc->lastTreeNo+1); | ||||||
261 | if (ref($attr)) { | ||||||
262 | foreach (values %$attr) { | ||||||
263 | $node->{'s_'.$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
264 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||||
265 | $self->{FSAttrs}->{'s_'.$_->{Name}}=1; | ||||||
266 | } | ||||||
267 | $self->{IDs}->{$node->{id}}=$node | ||||||
268 | if ($node->{id} ne ''); | ||||||
269 | $self->{IDs}->{$node->{AID}}=$node | ||||||
270 | if ($node->{AID} ne ''); #grrrrrrrr! | ||||||
271 | } | ||||||
272 | if ($target_doc->lastTreeNo == 0 and ref($self->{DocAttributes})) { | ||||||
273 | foreach (values %{$self->{DocAttributes}}) { | ||||||
274 | # leave data in the UTF-8 encoding in Perl 5.8 | ||||||
275 | $node->{'p_'.$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
276 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||||
277 | $self->{FSAttrs}->{"p_".$_->{Name}}=1; | ||||||
278 | } | ||||||
279 | } | ||||||
280 | } elsif ($elem eq 'w' or $elem eq 'c') { | ||||||
281 | my $node = $self->{Node} = Treex::PML::Factory->createNode(); | ||||||
282 | $node->{tei_type}=$elem; | ||||||
283 | $node->{ord} = ++($self->{LastOrd}); | ||||||
284 | $node->paste_on($self->{Tree},'ord'); | ||||||
285 | if (ref($attr)) { | ||||||
286 | foreach (values %$attr) { | ||||||
287 | $node->{$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
288 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||||
289 | $self->{FSAttrs}->{$_->{Name}}=1; | ||||||
290 | } | ||||||
291 | $self->{IDs}->{$node->{id}}=$node | ||||||
292 | if ($node->{id} ne ''); | ||||||
293 | $self->{IDs}->{$node->{AID}}=$node | ||||||
294 | if ($node->{AID} ne ''); #grrrrrrrr! | ||||||
295 | } | ||||||
296 | } | ||||||
297 | } | ||||||
298 | |||||||
299 | sub end_element { | ||||||
300 | my ($self) = @_; | ||||||
301 | if ($self->{Node} and $self->{Node}->{tei_type} eq 's') { | ||||||
302 | # build the tree (no consistency checks at all) | ||||||
303 | my @nodes=$self->{Tree}->descendants; | ||||||
304 | foreach my $node (@nodes) { | ||||||
305 | my $dep=$node->{dep}; | ||||||
306 | if ($dep ne '' and | ||||||
307 | ref($self->{IDs}{$dep})) { | ||||||
308 | $node->cut()->paste_on($self->{IDs}{$dep}, 'ord'); | ||||||
309 | } | ||||||
310 | } | ||||||
311 | } | ||||||
312 | $self->{Node} = $self->{Node}->parent if ($self->{Node}); | ||||||
313 | } | ||||||
314 | |||||||
315 | sub start_entity { | ||||||
316 | # just hoping some parser would support these | ||||||
317 | print "START ENTITY: @{$_[1]}\n"; | ||||||
318 | } | ||||||
319 | |||||||
320 | |||||||
321 | sub end_entity { | ||||||
322 | print "END ENTITY: @{$_[1]}\n"; | ||||||
323 | } | ||||||
324 | |||||||
325 | sub entity_reference { | ||||||
326 | my $self = $_[0]; | ||||||
327 | my $name = $_[1]->{Name}; | ||||||
328 | if ($self->{Node}->{tei_type} eq 'w' or | ||||||
329 | $self->{Node}->{tei_type} eq 'c') { | ||||||
330 | $self->{Node}->{form}.='&'.$name.';'; | ||||||
331 | } | ||||||
332 | } | ||||||
333 | |||||||
334 | sub start_cdata { # not much use for this | ||||||
335 | my $self = shift; | ||||||
336 | $self->{InCDATA} = 1; | ||||||
337 | } | ||||||
338 | |||||||
339 | sub end_cdata { # not much use for this | ||||||
340 | my $self = shift; | ||||||
341 | $self->{InCDATA} = 0; | ||||||
342 | } | ||||||
343 | |||||||
344 | sub comment { | ||||||
345 | my $self = $_[0]; | ||||||
346 | my $data = $_[1]; | ||||||
347 | if ($self->{Node}) { | ||||||
348 | $self->{Node}->{xml_comment}.=''; | ||||||
349 | } | ||||||
350 | } | ||||||
351 | |||||||
352 | sub doctype_decl { # unfortunatelly, not called by the parser, so far | ||||||
353 | my ($self,$hash) = @_; | ||||||
354 | $self->{TargetDocument}->changeMetaData("xml_doctype" => $hash); | ||||||
355 | } | ||||||
356 | |||||||
357 | # hack to fix LibXML | ||||||
358 | sub XML::LibXML::Dtd::type { return $_[0]->nodeType } | ||||||
359 | |||||||
360 | |||||||
361 | 1; | ||||||
362 | __END__ |