| blib/lib/Treex/PML/Backend/TEIXML.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 29 | 217 | 13.3 |
| branch | 0 | 90 | 0.0 |
| condition | 0 | 52 | 0.0 |
| subroutine | 11 | 32 | 34.3 |
| pod | 0 | 6 | 0.0 |
| total | 40 | 397 | 10.0 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | ## This is a simple XML backend for TEI files -*-cperl-*- | ||||||
| 2 | ## author: Petr Pajas | ||||||
| 3 | # $Id: TEIXML.pm 2762 2006-07-28 13:57:23Z pajas $ ' | ||||||
| 4 | ############################################################# | ||||||
| 5 | |||||||
| 6 | package Treex::PML::Backend::TEIXML; | ||||||
| 7 | 1 | 1 | 1046 | use Treex::PML; | |||
| 1 | 3 | ||||||
| 1 | 84 | ||||||
| 8 | 1 | 1 | 8 | use XML::LibXML; | |||
| 1 | 2 | ||||||
| 1 | 10 | ||||||
| 9 | 1 | 1 | 688 | use XML::LibXML::SAX::Parser; | |||
| 1 | 25859 | ||||||
| 1 | 46 | ||||||
| 10 | 1 | 1 | 12 | use Treex::PML::IO qw(close_backend); | |||
| 1 | 2 | ||||||
| 1 | 74 | ||||||
| 11 | 1 | 1 | 6 | use strict; | |||
| 1 | 3 | ||||||
| 1 | 35 | ||||||
| 12 | |||||||
| 13 | 1 | 1 | 6 | use vars qw($VERSION); | |||
| 1 | 4 | ||||||
| 1 | 55 | ||||||
| 14 | BEGIN { | ||||||
| 15 | 1 | 1 | 1464 | $VERSION='2.24'; # version template | |||
| 16 | } | ||||||
| 17 | |||||||
| 18 | sub open_backend { | ||||||
| 19 | 0 | 0 | 0 | my ($uri,$rw,$encoding)=@_; | |||
| 20 | # discard encoding and pass the rest to the Treex::PML::IO | ||||||
| 21 | 0 | 0 | Treex::PML::IO::open_backend($uri,$rw,($rw eq 'w' ? $encoding : undef)); | ||||
| 22 | } | ||||||
| 23 | |||||||
| 24 | |||||||
| 25 | sub test { | ||||||
| 26 | # should be replaced with some better magic-test for TEI XML | ||||||
| 27 | 0 | 0 | 0 | my ($f)=@_; | |||
| 28 | |||||||
| 29 | 0 | 0 | if (ref($f)) { | ||||
| 30 | 0 | my $line1=$f->getline(); | |||||
| 31 | 0 | my $line2=$f->getline(); | |||||
| 32 | 0 | 0 | return ($line1 =~ /^\s*<\?xml / and ($line2 =~ /^\s* ]/ |
||||
| 33 | or $line2 =~ /^\s* | ||||||
| 34 | } else { | ||||||
| 35 | 0 | my $fh = Treex::PML::IO::open_backend($f,"r"); | |||||
| 36 | 0 | 0 | my $test = $fh && test($fh); | ||||
| 37 | 0 | Treex::PML::IO::close_backend($fh); | |||||
| 38 | 0 | return $test; | |||||
| 39 | } | ||||||
| 40 | } | ||||||
| 41 | |||||||
| 42 | sub read { | ||||||
| 43 | 0 | 0 | 0 | my ($input,$target_doc) = @_; | |||
| 44 | 0 | my $handler = Treex::PML::Backend::TEIXML::SAXHandler->new(TargetDocument => $target_doc); | |||||
| 45 | 0 | my $p = XML::LibXML::SAX::Parser->new(Handler => $handler); | |||||
| 46 | 0 | 0 | if (ref($input)) { | ||||
| 47 | 0 | $p->parse(Source => { ByteStream => $input }); | |||||
| 48 | } else { | ||||||
| 49 | 0 | $p->parse_uri($input); | |||||
| 50 | } | ||||||
| 51 | |||||||
| 52 | 0 | return 1; | |||||
| 53 | } | ||||||
| 54 | |||||||
| 55 | sub xml_quote { | ||||||
| 56 | 0 | 0 | 0 | local $_=$_[0]; | |||
| 57 | 0 | s/&/&/g; | |||||
| 58 | 0 | s/\'/'/g; | |||||
| 59 | 0 | s/\"/"/g; | |||||
| 60 | 0 | s/>/>/g; | |||||
| 61 | 0 | s/</g; | |||||
| 62 | 0 | return $_; | |||||
| 63 | } | ||||||
| 64 | |||||||
| 65 | sub xml_quote_pcdata { | ||||||
| 66 | 0 | 0 | 0 | local $_=$_[0]; | |||
| 67 | 0 | s/&/&/g; | |||||
| 68 | 0 | s/>/>/g; | |||||
| 69 | 0 | s/</g; | |||||
| 70 | 0 | return $_; | |||||
| 71 | } | ||||||
| 72 | |||||||
| 73 | |||||||
| 74 | sub write { | ||||||
| 75 | 0 | 0 | 0 | my ($output, $src_doc) = @_; | |||
| 76 | |||||||
| 77 | 0 | 0 | die "Require GLOB reference\n" unless ref($output); | ||||
| 78 | |||||||
| 79 | 0 | my $rootdep=''; | |||||
| 80 | 0 | 0 | 0 | if ($src_doc->FS->exists('dep') && | |||
| 81 | $src_doc->FS->isList('dep')) { | ||||||
| 82 | 0 | ($rootdep)=$src_doc->FS->listValues('dep'); | |||||
| 83 | } | ||||||
| 84 | # xml_decl | ||||||
| 85 | 0 | print $output " | |||||
| 86 | 0 | 0 | if ($src_doc->metaData('xmldecl_version') ne "") { | ||||
| 87 | 0 | print $output " version=\"".$src_doc->metaData('xmldecl_version')."\""; | |||||
| 88 | } else { | ||||||
| 89 | 0 | print $output " version=\"1.0\""; | |||||
| 90 | } | ||||||
| 91 | 0 | 0 | if ($src_doc->encoding() ne "") { | ||||
| 92 | 0 | print $output " encoding=\"".$src_doc->encoding()."\""; | |||||
| 93 | } | ||||||
| 94 | 0 | 0 | if ($src_doc->metaData('xmldecl_standalone') ne "") { | ||||
| 95 | 0 | print $output " standalone=\"".$src_doc->metaData('xmldecl_standalone')."\""; | |||||
| 96 | } | ||||||
| 97 | 0 | print $output "?>\n"; | |||||
| 98 | |||||||
| 99 | 0 | 0 | if ($src_doc->metaData('xml_doctype')) { | ||||
| 100 | 0 | my $properties=$src_doc->metaData('xml_doctype'); | |||||
| 101 | 0 | 0 | unless ($properties->{'Name'}) { | ||||
| 102 | 0 | my $output = "DOCTYPE ".$properties->{'Name'}; | |||||
| 103 | 0 | 0 | $output .= ' SYSTEM "'.$properties->{'SystemId'}.'"' if $properties->{'SystemId'}; | ||||
| 104 | 0 | 0 | $output .= ' PUBLIC "'.$properties->{'PublicId'}.'"' if $properties->{'PublicId'}; | ||||
| 105 | 0 | 0 | $output .= ' '.$properties->{'Internal'} if $properties->{'Internal'}; | ||||
| 106 | 0 | print $output ""; | |||||
| 107 | } | ||||||
| 108 | } | ||||||
| 109 | |||||||
| 110 | 0 | print $output " |
|||||
| 111 | # declare all list attributes as fLib. If fLib info exists, use it | ||||||
| 112 | # to get value identifiers | ||||||
| 113 | 0 | foreach my $attr (grep { $src_doc->FS->isList($_) } $src_doc->FS->attributes) { | |||||
| 0 | |||||||
| 114 | 0 | my %valids; | |||||
| 115 | 0 | 0 | if (ref($src_doc->metaData('fLib'))) { | ||||
| 116 | 0 | my $flib=$src_doc->metaData('fLib'); | |||||
| 117 | 0 | 0 | if (exists($flib->{$attr})) { | ||||
| 118 | 0 | foreach (@{$flib->{$attr}}) { | |||||
| 0 | |||||||
| 119 | 0 | $valids{$_->[1]} = $_->[0]; | |||||
| 120 | } | ||||||
| 121 | } | ||||||
| 122 | } | ||||||
| 123 | 0 | print $output " |
|||||
| 124 | 0 | foreach ($src_doc->FS->listValues($attr)) { | |||||
| 125 | 0 | print $output " | |||||
| 126 | 0 | 0 | 0 | print $output " id=\"$valids{$_}\"" if (exists($valids{$_}) and $valids{$_} ne ""); | |||
| 127 | 0 | print $output " name=\"$attr\">", | |||||
| 128 | " |
||||||
| 129 | } | ||||||
| 130 | 0 | print $output "\n"; | |||||
| 131 | } | ||||||
| 132 | 0 | print $output "\n"; | |||||
| 133 | 0 | print $output "
| |||||
| 134 | 0 | 0 | if ($src_doc->tree(0)) { | ||||
| 135 | 0 | my $tree0=$src_doc->tree(0); | |||||
| 136 | 0 | foreach ($src_doc->FS->attributes()) { | |||||
| 137 | print $output " $1=\"".xml_quote($tree0->{$_})."\"" | ||||||
| 138 | 0 | 0 | 0 | if (/^p_(.*)$/ and $tree0->{$_} ne ""); | |||
| 139 | } | ||||||
| 140 | } | ||||||
| 141 | 0 | print $output ">\n"; | |||||
| 142 | |||||||
| 143 | 0 | foreach my $tree ($src_doc->trees) { | |||||
| 144 | 0 | print $output " | |||||
| 145 | 0 | foreach ($src_doc->FS->attributes()) { | |||||
| 146 | print $output " $1=\"".xml_quote($tree->{$_})."\"" | ||||||
| 147 | 0 | 0 | 0 | if (/^s_(.*)/ and $tree->{$_} ne ""); | |||
| 148 | } | ||||||
| 149 | 0 | print $output ">\n"; | |||||
| 150 | |||||||
| 151 | 0 | foreach my $node (sort { $a->{ord} <=> $b->{ord} } $tree->descendants) { | |||||
| 0 | |||||||
| 152 | 0 | 0 | my $type=$node->{tei_type} || "w"; | ||||
| 153 | 0 | print $output "<$type"; | |||||
| 154 | 0 | foreach (grep { exists($node->{$_}) and | |||||
| 155 | 0 | 0 | 0 | defined($node->{$_}) and | |||
| 156 | !/^[sp]_|^(?:form|type|ord|dep)$/ } | ||||||
| 157 | $src_doc->FS->attributes()) { | ||||||
| 158 | 0 | print $output " $_=\"".xml_quote($node->{$_})."\""; | |||||
| 159 | } | ||||||
| 160 | print $output " dep=\"". | ||||||
| 161 | xml_quote($node->parent->parent ? ($node->parent->{id} | ||||||
| 162 | || $node->parent->{AID} #grrrrrrrr! | ||||||
| 163 | 0 | 0 | 0 | ) : $rootdep )."\""; | |||
| 164 | 0 | print $output ">"; | |||||
| 165 | 0 | print $output xml_quote_pcdata($node->{form}); | |||||
| 166 | 0 | print $output "$type>\n"; | |||||
| 167 | } | ||||||
| 168 | |||||||
| 169 | 0 | print $output "\n"; | |||||
| 170 | } | ||||||
| 171 | |||||||
| 172 | 0 | print $output "\n"; | |||||
| 173 | 0 | print $output "\n"; | |||||
| 174 | 0 | print $output "\n"; | |||||
| 175 | } | ||||||
| 176 | |||||||
| 177 | |||||||
| 178 | # SAX TEI-XML to Treex::PML::Document transducer | ||||||
| 179 | package Treex::PML::Backend::TEIXML::SAXHandler; | ||||||
| 180 | 1 | 1 | 9 | use strict; | |||
| 1 | 3 | ||||||
| 1 | 37 | ||||||
| 181 | |||||||
| 182 | 1 | 1 | 8 | use vars qw($VERSION); | |||
| 1 | 3 | ||||||
| 1 | 45 | ||||||
| 183 | BEGIN { | ||||||
| 184 | 1 | 1 | 30 | $VERSION='2.24'; # version template | |||
| 185 | } | ||||||
| 186 | 1 | 1 | 9 | use Treex::PML; | |||
| 1 | 2 | ||||||
| 1 | 1818 | ||||||
| 187 | |||||||
| 188 | sub new { | ||||||
| 189 | 0 | 0 | my ($class, %args) = @_; | ||||
| 190 | 0 | bless \%args, $class; | |||||
| 191 | } | ||||||
| 192 | |||||||
| 193 | sub start_document { | ||||||
| 194 | 0 | 0 | my ($self,$hash) = @_; | ||||
| 195 | 0 | 0 | $self->{TargetDocument} ||= Treex::PML::Factory->createDocument(); | ||||
| 196 | } | ||||||
| 197 | |||||||
| 198 | sub end_document { | ||||||
| 199 | 0 | 0 | my ($self) = @_; | ||||
| 200 | 0 | my @header = ('@V form','@V form','@N ord'); | |||||
| 201 | 0 | foreach my $attr (keys(%{$self->{FSAttrs}})) { | |||||
| 0 | |||||||
| 202 | 0 | push @header, '@P '.$attr; | |||||
| 203 | 0 | 0 | 0 | if (exists($self->{FSAttrSyms}->{$attr}) | |||
| 204 | and ref($self->{FSAttrSyms}->{$attr})) { | ||||||
| 205 | 0 | my ($list); | |||||
| 206 | 0 | foreach (@{$self->{FSAttrSyms}->{$attr}}) { | |||||
| 0 | |||||||
| 207 | 0 | $list.="|$_->[1]"; | |||||
| 208 | } | ||||||
| 209 | 0 | push @header, '@L '.$attr.$list; | |||||
| 210 | } | ||||||
| 211 | } | ||||||
| 212 | 0 | $self->{TargetDocument}->changeFS(Treex::PML::Factory->createFSFormat(\@header)); | |||||
| 213 | 0 | $self->{TargetDocument}->changeMetaData('fLib' => $self->{FSAttrSyms}); | |||||
| 214 | 0 | $self->{TargetDocument}; | |||||
| 215 | } | ||||||
| 216 | |||||||
| 217 | sub xml_decl { | ||||||
| 218 | 0 | 0 | my ($self,$data) = @_; | ||||
| 219 | 0 | 0 | $self->{TargetDocument}->changeEncoding($data->{Encoding} || 'iso-8859-2'); | ||||
| 220 | 0 | $self->{TargetDocument}->changeMetaData('xmldecl_version' => $data->{Version}); | |||||
| 221 | 0 | $self->{TargetDocument}->changeMetaData('xmldecl_standalone' => $data->{Standalone}); | |||||
| 222 | } | ||||||
| 223 | |||||||
| 224 | sub characters { | ||||||
| 225 | 0 | 0 | my ($self,$hash) = @_; | ||||
| 226 | 0 | 0 | return unless $self->{Node}; | ||||
| 227 | 0 | 0 | 0 | if (($self->{Node}{tei_type} eq 'w') or | |||
| 228 | ($self->{Node}{tei_type} eq 'c')) { | ||||||
| 229 | 0 | my $str = $hash->{Data}; | |||||
| 230 | 0 | 0 | if ($]>=5.008) { | ||||
| 231 | # leave data in the UTF-8 encoding | ||||||
| 232 | 0 | $self->{Node}->{form}.=$str; | |||||
| 233 | } else { | ||||||
| 234 | $self->{Node}->{form}=$self->{Node}->{form}. | ||||||
| 235 | 0 | XML::LibXML::decodeFromUTF8($self->{TargetDocument}->encoding(),$str); | |||||
| 236 | } | ||||||
| 237 | } | ||||||
| 238 | } | ||||||
| 239 | |||||||
| 240 | sub start_element { | ||||||
| 241 | 0 | 0 | my ($self, $hash) = @_; | ||||
| 242 | 0 | my $elem = $hash->{Name}; | |||||
| 243 | 0 | my $attr = $hash->{Attributes}; | |||||
| 244 | 0 | my $target_doc = $self->{TargetDocument}; | |||||
| 245 | |||||||
| 246 | 0 | 0 | 0 | if ($elem eq 'p') { | |||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 247 | 0 | $self->{DocAttributes}=$attr; | |||||
| 248 | } elsif ($elem eq 'f') { | ||||||
| 249 | 0 | $self->{CurrentFSAttr}=$attr->{"{}name"}->{Value}; | |||||
| 250 | 0 | $self->{CurrentFSAttrID}=$attr->{"{}id"}->{Value}; | |||||
| 251 | 0 | $self->{FSAttrs}->{$attr->{"{}name"}->{Value}}=1; | |||||
| 252 | } elsif ($elem eq 'sym') { | ||||||
| 253 | 0 | push @{$self->{FSAttrSyms}->{$self->{CurrentFSAttr}}}, | |||||
| 254 | 0 | [$self->{CurrentFSAttrID},$attr->{"{}value"}->{Value}]; | |||||
| 255 | } elsif ($elem eq 's') { | ||||||
| 256 | 0 | my $node = $self->{Node} = $self->{Tree} = $target_doc->new_tree($target_doc->lastTreeNo+1); | |||||
| 257 | 0 | $node->{ord} = 0; | |||||
| 258 | 0 | $self->{LastOrd} = 0; | |||||
| 259 | 0 | $node->{tei_type}=$elem; | |||||
| 260 | 0 | $node->{form}='#'.($target_doc->lastTreeNo+1); | |||||
| 261 | 0 | 0 | if (ref($attr)) { | ||||
| 262 | 0 | foreach (values %$attr) { | |||||
| 263 | $node->{'s_'.$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
| 264 | 0 | 0 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||
| 265 | 0 | $self->{FSAttrs}->{'s_'.$_->{Name}}=1; | |||||
| 266 | } | ||||||
| 267 | $self->{IDs}->{$node->{id}}=$node | ||||||
| 268 | 0 | 0 | if ($node->{id} ne ''); | ||||
| 269 | $self->{IDs}->{$node->{AID}}=$node | ||||||
| 270 | 0 | 0 | if ($node->{AID} ne ''); #grrrrrrrr! | ||||
| 271 | } | ||||||
| 272 | 0 | 0 | 0 | if ($target_doc->lastTreeNo == 0 and ref($self->{DocAttributes})) { | |||
| 273 | 0 | foreach (values %{$self->{DocAttributes}}) { | |||||
| 0 | |||||||
| 274 | # leave data in the UTF-8 encoding in Perl 5.8 | ||||||
| 275 | $node->{'p_'.$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
| 276 | 0 | 0 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||
| 277 | 0 | $self->{FSAttrs}->{"p_".$_->{Name}}=1; | |||||
| 278 | } | ||||||
| 279 | } | ||||||
| 280 | } elsif ($elem eq 'w' or $elem eq 'c') { | ||||||
| 281 | 0 | my $node = $self->{Node} = Treex::PML::Factory->createNode(); | |||||
| 282 | 0 | $node->{tei_type}=$elem; | |||||
| 283 | 0 | $node->{ord} = ++($self->{LastOrd}); | |||||
| 284 | 0 | $node->paste_on($self->{Tree},'ord'); | |||||
| 285 | 0 | 0 | if (ref($attr)) { | ||||
| 286 | 0 | foreach (values %$attr) { | |||||
| 287 | $node->{$_->{Name}} = ($]>=5.008) ? $_->{Value} : | ||||||
| 288 | 0 | 0 | XML::LibXML::decodeFromUTF8($target_doc->encoding(),$_->{Value}); | ||||
| 289 | 0 | $self->{FSAttrs}->{$_->{Name}}=1; | |||||
| 290 | } | ||||||
| 291 | $self->{IDs}->{$node->{id}}=$node | ||||||
| 292 | 0 | 0 | if ($node->{id} ne ''); | ||||
| 293 | $self->{IDs}->{$node->{AID}}=$node | ||||||
| 294 | 0 | 0 | if ($node->{AID} ne ''); #grrrrrrrr! | ||||
| 295 | } | ||||||
| 296 | } | ||||||
| 297 | } | ||||||
| 298 | |||||||
| 299 | sub end_element { | ||||||
| 300 | 0 | 0 | my ($self) = @_; | ||||
| 301 | 0 | 0 | 0 | if ($self->{Node} and $self->{Node}->{tei_type} eq 's') { | |||
| 302 | # build the tree (no consistency checks at all) | ||||||
| 303 | 0 | my @nodes=$self->{Tree}->descendants; | |||||
| 304 | 0 | foreach my $node (@nodes) { | |||||
| 305 | 0 | my $dep=$node->{dep}; | |||||
| 306 | 0 | 0 | 0 | if ($dep ne '' and | |||
| 307 | ref($self->{IDs}{$dep})) { | ||||||
| 308 | 0 | $node->cut()->paste_on($self->{IDs}{$dep}, 'ord'); | |||||
| 309 | } | ||||||
| 310 | } | ||||||
| 311 | } | ||||||
| 312 | 0 | 0 | $self->{Node} = $self->{Node}->parent if ($self->{Node}); | ||||
| 313 | } | ||||||
| 314 | |||||||
| 315 | sub start_entity { | ||||||
| 316 | # just hoping some parser would support these | ||||||
| 317 | 0 | 0 | print "START ENTITY: @{$_[1]}\n"; | ||||
| 0 | |||||||
| 318 | } | ||||||
| 319 | |||||||
| 320 | |||||||
| 321 | sub end_entity { | ||||||
| 322 | 0 | 0 | print "END ENTITY: @{$_[1]}\n"; | ||||
| 0 | |||||||
| 323 | } | ||||||
| 324 | |||||||
| 325 | sub entity_reference { | ||||||
| 326 | 0 | 0 | my $self = $_[0]; | ||||
| 327 | 0 | my $name = $_[1]->{Name}; | |||||
| 328 | 0 | 0 | 0 | if ($self->{Node}->{tei_type} eq 'w' or | |||
| 329 | $self->{Node}->{tei_type} eq 'c') { | ||||||
| 330 | 0 | $self->{Node}->{form}.='&'.$name.';'; | |||||
| 331 | } | ||||||
| 332 | } | ||||||
| 333 | |||||||
| 334 | sub start_cdata { # not much use for this | ||||||
| 335 | 0 | 0 | my $self = shift; | ||||
| 336 | 0 | $self->{InCDATA} = 1; | |||||
| 337 | } | ||||||
| 338 | |||||||
| 339 | sub end_cdata { # not much use for this | ||||||
| 340 | 0 | 0 | my $self = shift; | ||||
| 341 | 0 | $self->{InCDATA} = 0; | |||||
| 342 | } | ||||||
| 343 | |||||||
| 344 | sub comment { | ||||||
| 345 | 0 | 0 | my $self = $_[0]; | ||||
| 346 | 0 | my $data = $_[1]; | |||||
| 347 | 0 | 0 | if ($self->{Node}) { | ||||
| 348 | 0 | $self->{Node}->{xml_comment}.=''; | |||||
| 349 | } | ||||||
| 350 | } | ||||||
| 351 | |||||||
| 352 | sub doctype_decl { # unfortunatelly, not called by the parser, so far | ||||||
| 353 | 0 | 0 | my ($self,$hash) = @_; | ||||
| 354 | 0 | $self->{TargetDocument}->changeMetaData("xml_doctype" => $hash); | |||||
| 355 | } | ||||||
| 356 | |||||||
| 357 | # hack to fix LibXML | ||||||
| 358 | 0 | 0 | sub XML::LibXML::Dtd::type { return $_[0]->nodeType } | ||||
| 359 | |||||||
| 360 | |||||||
| 361 | 1; | ||||||
| 362 | __END__ |