File Coverage

blib/lib/XML/DTDParser.pm

Criterion	Covered	Total	%
statement	123	195	63.0
branch	45	98	45.9
condition	4	9	44.4
subroutine	10	13	76.9
pod	3	6	50.0
total	185	321	57.6

line	stmt	bran	cond	sub	pod	time	code
1							package XML::DTDParser;
2							require Exporter;
3	1			1		9906	use FileHandle;
	1					44769
	1					6
4	1			1		565	use strict;
	1					2
	1					57
5	1			1		6	use File::Spec;
	1					7
	1					36
6	1			1		5	use Cwd;
	1					2
	1					4608
7							our @ISA = qw(Exporter);
8
9							our @EXPORT = qw(ParseDTD FindDTDRoot ParseDTDFile);
10							our @EXPORT_OK = @EXPORT;
11
12							our $VERSION = '2.01';
13
14							my $namechar = '[#\x41-\x5A\x61-\x7A\xC0-\xD6\xD8-\xF6\xF8-\xFF0-9\xB7._:-]';
15							my $name = '[\x41-\x5A\x61-\x7A\xC0-\xD6\xD8-\xF6\xF8-\xFF_:]' . $namechar . '*';
16							my $nameX = $name . '[.?+]';
17
18							my $nmtoken = $namechar . '+';
19
20							my $AttType = '(?:CDATA\b\|IDREFS?\b\|ID\b\|ENTITY\b\|ENTITIES\b\|NMTOKENS?\b\|$[^$]\)\|NOTATION\s+$[^$]\))';
21							my $DefaultDecl = q{(?:#REQUIRED\|#IMPLIED\|(:?#FIXED ?)?(?:".?"\|'.?'))};
22							my $AttDef = '('.$name.') ('.$AttType.')(?: ('.$DefaultDecl.'))?';
23
24
25							sub ParseDTDFile {
26	0			0	1	0	my $file = shift;
27	0	0				0	open my $IN, "< $file"
28							or die "Cannot open the $file : $!\n";
29	0					0	my $xml = do {local $/; <$IN>};
	0					0
	0					0
30	0					0	close $IN;
31	0					0	my ($vol,$dir,$filename) = File::Spec->splitpath( $file);
32	0	0				0	if ($filename eq $file) {
33	0					0	return ParseDTD($xml);
34							} else {
35							# in case there are any includes, they should be relative to the DTD file, not to current dir
36	0					0	my $cwd = cwd();
37	0					0	chdir(File::Spec->catdir($vol,$dir));
38	0					0	my $DTD = ParseDTD($xml);
39	0					0	chdir($cwd);
40	0					0	return $DTD;
41							}
42							}
43
44							sub ParseDTD {
45	1			1	1	106	my $xml = shift;
46	1					2	my (%elements, %definitions);
47
48	1					54	$xml =~ s/\s\s*/ /gs;
49
50	1					169	while ($xml =~ s{}{}io) {
51	0					0	my ($percent, $entity, $include) = ($1,$2,$3);
52	0	0				0	$percent = '&' unless $percent;
53	0					0	my $definition;
54							{
55							# the $include may be a URL, use LWP::Simple to fetch it if it is.
56	0					0	my $IN;
	0					0
57	0	0				0	open $IN, "<$include" or die "Cannot open include file $include : $!\n";
58	0					0	$definition = do {local $/; <$IN>};
	0					0
	0					0
59	0					0	close $IN;
60							}
61	0					0	$definition =~ s/\s\s*/ /gs;
62	0					0	$xml =~ s{\Q$percent$entity;\E}{$definition}g;
63							}
64
65	1					2	my (%elementinfo, %attribinfo);
66	1					14	while ($xml =~ s{}{}s) {
67	8					21	my $info = $1;$info =~ s/\s+$//s;
	8					21
68	8					8	my %info;
69	8					49	while ($info =~ s{^([\w-]+)\s=\s((?:'[^']')+\|(?:"[^"]")+\|[^\s'"]\S)\s}{}s) {
70	19					46	my ($name, $value) = ($1, $2);
71	19	100				51	if ($value =~ /^'/) {
		50
72	18					43	($value = substr $value, 1, length($value)-2) =~ s/''/'/g;
73							} elsif ($value =~ /^"/) {
74	0					0	($value = substr $value, 1, length($value)-2) =~ s/""/"/g;
75							}
76	19					225	$info{$name} = $value;
77							}
78	8	50				17	die "Malformed section!\n\t\n"
79							if ($info ne '');
80	8	50				19	die "The section doesn't contain the 'element' parameter!\n"
81							unless exists $info{'element'};
82	8					11	my $element = $info{'element'};
83	8					14	delete $info{'element'};
84	8	100				15	if (exists $info{'attribute'}) {
85	2					3	my $attribute = $info{'attribute'};
86	2					3	delete $info{'attribute'};
87	2					19	$attribinfo{$element}->{$attribute} = \%info;
88							} else {
89	6					46	$elementinfo{$element} = \%info;
90							}
91							}
92
93	1					7	$xml =~ s{}{}gs;
94	1					4	$xml =~ s{<\?.*?\?>}{}gs;
95
96	1					138	while ($xml =~ s{}{}io) {
97	0					0	my ($percent, $entity, $definition) = ($1,$2,$3);
98	0	0				0	$percent = '&' unless $percent;
99	0					0	$definitions{"$percent$entity"} = $definition;
100							}
101
102							{
103	1					4	my $replacements = 0;
	1					3
104	1	0	33			48	1 while $replacements++ < 1000 and $xml =~ s{([&%]$name);}{(exists $definitions{$1} ? $definitions{$1} : "$1\x01;")}geo;
	0					0
105	1	50				31	die <<'END' if $xml =~ m{([&%]$name);}o;
106							Recursive definitions or too many entities! Only up to 1000 entity replacements allowed.
107							(An entity is something like &foo; or %foo;. They are defined by tag.)
108							END
109							}
110	1					3	undef %definitions;
111	1					4	$xml =~ tr/\x01//d;
112
113	1					160	while ($xml =~ s{}{}io) {
114	10					33	my ($element, $children, $option) = ($1,$2,$3);
115	10					58	$elements{$element}->{childrenSTR} = $children . $option;
116	10					20	$children =~ s/\s//g;
117	10	100	33			61	if ($children eq '(#PCDATA)') {
		50
118	7					12	$children = '#PCDATA';
119							} elsif ($children =~ s/^$(#PCDATA(?:\\|$name)+)$$/$1/o and $option eq '*') {
120	0					0	$children =~ s/\\|/*,/g;
121	0					0	$children .= '*';
122							} else {
123	3					12	$children = simplify_children( $children, $option);
124							}
125
126	10	50				131	die " is not valid!\n"
127							unless $children =~ m{^#?$nameX(?:,$nameX)*$}o;
128
129
130	10					25	$elements{$element}->{childrenARR} = [];
131	10					30	foreach my $child (split ',', $children) {
132	17	100	66			71	$child =~ s/([?*+])$//
133							and $option = $1
134							or $option = '!';
135	17	100				45	if (exists $elements{$element}->{children}->{$child}) {
136	1					5	$elements{$element}->{children}->{$child} = _merge_options( $elements{$element}->{children}->{$child}, $option);
137	1	50				6	$elements{$element}->{childrenX}->{$child} = _merge_counts( $elements{$element}->{childrenX}->{$child}, _char2count($option))
138							unless $child eq '#PCDATA';
139							} else {
140	16					43	$elements{$element}->{children}->{$child} = $option;
141	16	100				44	$elements{$element}->{childrenX}->{$child} = _char2count($option)
142							unless $child eq '#PCDATA';
143							}
144	17	100				47	push @{$elements{$element}->{childrenARR}}, $child
	10					34
145							unless $child eq '#PCDATA';
146							}
147	10					109	delete $elements{$element}->{childrenARR}
148	10	100				15	if @{$elements{$element}->{childrenARR}} == 0
149							}
150
151	1					151	while ($xml =~ s{}{}io) {
152	0					0	my ($element, $param) = ($1,$2);
153	0	0				0	if (uc $param eq 'ANY') {
154	0					0	$elements{$element}->{any} = 1;
155							} else {
156	0					0	$elements{$element} = {};
157							}
158							}
159							#=for comment
160	1					182	while ($xml =~ s{}{}io) {
161	2					7	my ($element, $attributes) = ($1,$2);
162	2	50				7	die " referenced by an not found!\n"
163							unless exists $elements{$element};
164	2					344	while ($attributes =~ s/^\s*$AttDef//io) {
165	4					26	my ($name,$type,$option,$default) = ($1,$2,$3);
166	4	50				19	if ($option =~ /^#FIXED\s+["'](.*)["']$/i){
		50
167	0					0	$option = '#FIXED';
168	0					0	$default = $1;
169							} elsif ($option =~ /^["'](.*)["']$/i){
170	0					0	$option = '';
171	0					0	$default = $1;
172							}
173	4					18	$elements{$element}->{attributes}->{$name} = [$type,$option,$default,undef];
174	4	50				47	if ($type =~ /^(?:NOTATION\s)?$\s(.*?)$$/) {
175	0					0	$elements{$element}->{attributes}->{$name}->[3] = parse_enum($1);
176							}
177							}
178							}
179							#=cut
180							#$xml = '';
181
182	1					4	$xml =~ s/\s\s*/ /g;
183
184	1	50				4	die "UNPARSED DATA:\n$xml\n\n"
185							if $xml =~ /\S/;
186
187	1					7	foreach my $element (keys %elements) {
188	10					10	foreach my $child (keys %{$elements{$element}->{children}}) {
	10					26
189	16	100				27	if ($child eq '#PCDATA') {
190	7					12	delete $elements{$element}->{children}->{'#PCDATA'};
191	7					20	$elements{$element}->{content} = 1;
192							} else {
193	9	50				19	die "Element $child referenced by $element was not found!\n"
194							unless exists $elements{$child};
195	9	50				15	if (exists $elements{$child}->{parent}) {
196	0					0	push @{$elements{$child}->{parent}}, $element;
	0					0
197							} else {
198	9					19	$elements{$child}->{parent} = [$element];
199							}
200	9					29	$elements{$child}->{option} = $elements{$element}->{children}->{$child};
201							}
202							}
203	10	100				13	if (scalar(keys %{$elements{$element}->{children}}) == 0) {
	10					25
204	7					12	delete $elements{$element}->{children};
205							}
206	10	100				30	if (exists $elementinfo{$element}) {
207	6					7	foreach my $info (keys %{$elementinfo{$element}}) {
	6					15
208	7					19	$elements{$element}->{$info} = $elementinfo{$element}->{$info};
209							}
210							}
211	10	100				22	if (exists $attribinfo{$element}) {
212	2					3	foreach my $attribute (keys %{$attribinfo{$element}}) {
	2					6
213	2					9	$elements{$element}->{'attributes'}->{$attribute}->[4] = $attribinfo{$element}->{$attribute};
214							}
215							}
216							}
217
218	1					10	return \%elements;
219							}
220
221							sub flatten_children {
222	3			3	0	8	my ( $children, $option ) = @_;
223
224	3	50				12	if ($children =~ /\\|/) {
225	0					0	$children =~ s{[\|,]}{?,}g;
226	0					0	$children .= '?'
227							}
228
229	3	50				8	if ($option) {
230	0					0	$children =~ s/,/$option,/g;
231	0					0	$children .= $option;
232							}
233
234	3					45	return $children;
235							}
236
237							sub simplify_children {
238	3			3	0	5	my ( $children, $option ) = @_;
239
240	3					88	1 while $children =~ s{$($nameX(?:[,\|]$nameX))$([?+]*)}{flatten_children($1, $2)}geo;
	3					8
241
242	3	50				8	if ($option) {
243	0					0	$children =~ s/,/$option,/g;
244	0					0	$children .= $option;
245							}
246
247	3					6	foreach ($children) {
248	3					6	s{\?\?}{?}g;
249	3					5	s{\?\+}{*}g;
250	3					5	s{\?\}{}g;
251	3					5	s{\+\?}{*}g;
252	3					4	s{\+\+}{+}g;
253	3					4	s{\+\}{}g;
254	3					6	s{\\?}{}g;
255	3					4	s{\\+}{}g;
256	3					8	s{\\}{*}g;
257							}
258
259	3					9	return $children;
260							}
261
262							sub parse_enum {
263	0			0	0	0	my $enum = shift;
264	0					0	$enum =~ tr/\x20\x09\x0D\x0A//d; # get rid of whitespace
265	0					0	return [split /\\|/, $enum];
266							}
267
268							my %merge_options = (
269							'!!' => '+',
270							'!*' => '+' ,
271							'!+' => '+',
272							'!?' => '+',
273							'*' => '',
274							'*+' => '+',
275							'?' => '',
276							'++' => '+',
277							'+?' => '+',
278							'??' => '?',
279							);
280							sub _merge_options {
281	1			1		7	my ($o1, $o2) = sort @_;
282	1					6	return $merge_options{$o1.$o2};
283							}
284
285							my %char2count = (
286							'!' => '1',
287							'?' => '0..1',
288							'+' => '1..',
289							'*' => '0..',
290							);
291							sub _char2count{
292	10			10		42	return $char2count{$_[0]}
293							}
294
295							sub _merge_counts {
296	1			1		3	my ($c1, $c2) = @_;
297	1	50				584	if ($c1 =~ /^\d+$/) {
		0
		0
298	1	50				16	if ($c2 =~ /^\d+$/) {
		50
		50
299	0					0	return $c1+$c2
300							} elsif ($c2 =~ /^(\d+)..(\d+)$/) {
301	0					0	return ($c1+$1) . ".." . ($c1+$2);
302							} elsif ($c2 =~ /^(\d+)..$/) {
303	1					10	return ($c1+$1) . "..";
304							}
305							} elsif ($c1 =~ /^(\d+)..(\d+)$/) {
306	0						my ($c1l,$c1u) = ($1,$2);
307	0	0					if ($c2 =~ /^\d+$/) {
		0
		0
308	0						return ($c1l+$c2) . ".." . ($c1u+$c2);
309							} elsif ($c2 =~ /^(\d+)..(\d+)$/) {
310	0						return ($c1l+$1) . ".." . ($c1u+$2);
311							} elsif ($c2 =~ /^(\d+)..$/) {
312	0						return ($c1l+$1) . "..";
313							}
314							} elsif ($c1 =~ /^(\d+)..$/) {
315	0						$c1=$1;
316	0	0					if ($c2 =~ /^\d+$/) {
		0
		0
317	0						return ($c1+$c2) . "..";
318							} elsif ($c2 =~ /^(\d+)..(\d+)$/) {
319	0						return ($c1+$1) . "..";
320							} elsif ($c2 =~ /^(\d+)..$/) {
321	0						return ($c1+$1) . "..";
322							}
323							}
324							}
325
326							sub FindDTDRoot {
327	0			0	1		my $elements = shift;
328	0						my @roots;
329	0						foreach my $element (keys %$elements) {
330	0	0					if (!exists $elements->{$element}->{parent}) {
331	0						push @roots, $element;
332	0						$elements->{$element}->{option} = '!';
333							}
334							}
335	0						return @roots;
336							}
337
338							=head1 NAME
339
340							XML::DTDParser - quick and dirty DTD parser
341
342							Version 2.01
343
344							=head1 SYNOPSIS
345
346							use XML::DTDParser qw(ParseDTD ParseDTDFile);
347
348							$DTD = ParseDTD $DTDtext;
349							#or
350							$DTD = ParseDTDFile( $dtdfile)
351
352							=head1 DESCRIPTION
353
354							This module parses a DTD file and creates a data structure containing info about
355							all tags, their allowed parameters, children, parents, optionality etc. etc. etc.
356
357							Since I'm too lazy to document the structure, parse a DTD you need and print
358							the result to a file using Data::Dumper. The datastructure should be selfevident.
359
360							Note: The module should be able to parse just about anything, but it intentionaly looses some information.
361							Eg. if the DTD specifies that a tag should contain either CHILD1 or CHILD2 you only get that
362							CHILD1 and CHILD2 are optional. That is is the DTD contains
363
364							the result will be the same is if it contained
365
366
367							You get the original unparsed parameter list as well so if you need this
368							information you may parse it yourself.
369
370							Since version 1.6 this module supports my "extensions" to DTDs.
371							If the DTD contains a comment in form
372
373
374
375							and there is an element XXX in the DTD, the resulting hash for the XXX will contain
376
377							'foo' => 'bar',
378							'person' => 'd\'Artagnan',
379							'greeting => 'Hello World!'
380
381							If the DTD contains
382
383
384
385							the
386
387							$DTD->{XXX}->{attributes}->{YYY}->[4]
388
389							will be set to
390
391							{ break => 'no' }
392
393							I use this parser to import the DTD into the database so that I could map some fields
394							to certain tags for output and I want to be able to specify the mapping inside the file:
395
396
397
398							=head2 EXPORT
399
400							By default the module exports all (both) it's functions. If you only want one, or none
401							use
402
403							use XML::DTDParser qw(ParseDTD);
404							or
405							use XML::DTDParser qw();
406
407							=over 4
408
409							=item ParseDTD
410
411							$DTD = ParseDTD $DTDtext;
412
413							Parses the $DTDtext and creates a data structure. If the $DTDtext contains some
414							declarations those are read and parsed as needed.
415							The paths are relative to current directory.
416
417							The module currently doesn't support URLs here yet.
418
419							=item ParseDTDFile
420
421							$DTD = ParseDTDFile $DTDfile;
422
423							Parses the contents of $DTDfile and creates a data structure. If the $DTDfile contains some
424							declarations those are read and parsed as needed.
425							The paths are relative to the $DTDfile.
426
427							The module currently doesn't support URLs here yet.
428
429							=item FindDTDRoot
430
431							$DTD = ParseDTD $DTDtext;
432							@roots = FindDTDRoot $DTD;
433
434							Returns all tags that have no parent. There could be several such tags defined by the DTD.
435							Especialy if it used some common includes.
436
437							=back
438
439							=head1 AUTHOR
440
441							Jenda@Krynicky.cz
442							http://Jenda.Krynicky.cz
443
444							=head1 COPYRIGHT
445
446							Copyright (c) 2002 Jan Krynicky . All rights reserved.
447
448							This program is free software; you can redistribute it and/or
449							modify it under the same terms as Perl itself.
450
451							=cut
452