File Coverage

blib/lib/HTML/HTML5/Sanity.pm

Criterion	Covered	Total	%
statement	15	17	88.2
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	21	23	91.3

line	stmt	sub	time	code
1	2	2	53953	use 5.010;
	2		6
	2		61
2	2	2	10	use strict;
	2		2
	2		57
3	2	2	8	use warnings;
	2		5
	2		134
4
5				package HTML::HTML5::Sanity;
6
7				BEGIN {
8	2	2	6	$HTML::HTML5::Sanity::AUTHORITY = 'cpan:TOBYINK';
9	2		265	$HTML::HTML5::Sanity::VERSION = '0.105';
10				}
11
12				require Exporter;
13				our @ISA = qw(Exporter);
14				our %EXPORT_TAGS = (
15				'all' => [ qw(fix_document) ],
16				'standard' => [ qw(fix_document) ],
17				);
18				our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
19				our @EXPORT = ( @{ $EXPORT_TAGS{'standard'} } );
20
21				our $FIX_LANG_ATTRIBUTES = 1;
22
23	2	2	8188	use Locale::Country qw(country_code2code LOCALE_CODE_ALPHA_2 LOCALE_CODE_NUMERIC);
	2		78065
	2		176
24	2	2	1091	use XML::LibXML qw(:ns :libxml);
	0
	0
25
26				our $lang_3to2 = {
27				'aar' => 'aa' ,
28				'abk' => 'ab' ,
29				'ave' => 'ae' ,
30				'afr' => 'af' ,
31				'aka' => 'ak' ,
32				'amh' => 'am' ,
33				'arg' => 'an' ,
34				'ara' => 'ar' ,
35				'asm' => 'as' ,
36				'ava' => 'av' ,
37				'aym' => 'ay' ,
38				'aze' => 'az' ,
39				'bak' => 'ba' ,
40				'bel' => 'be' ,
41				'bul' => 'bg' ,
42				'bih' => 'bh' ,
43				'bis' => 'bi' ,
44				'bam' => 'bm' ,
45				'ben' => 'bn' ,
46				'tib' => 'bo' ,
47				'bod' => 'bo' ,
48				'bre' => 'br' ,
49				'bos' => 'bs' ,
50				'cat' => 'ca' ,
51				'che' => 'ce' ,
52				'cha' => 'ch' ,
53				'cos' => 'co' ,
54				'cre' => 'cr' ,
55				'cze' => 'cs' ,
56				'ces' => 'cs' ,
57				'chu' => 'cu' ,
58				'chv' => 'cv' ,
59				'wel' => 'cy' ,
60				'cym' => 'cy' ,
61				'dan' => 'da' ,
62				'ger' => 'de' ,
63				'deu' => 'de' ,
64				'div' => 'dv' ,
65				'dzo' => 'dz' ,
66				'ewe' => 'ee' ,
67				'gre' => 'el' ,
68				'ell' => 'el' ,
69				'eng' => 'en' ,
70				'epo' => 'eo' ,
71				'spa' => 'es' ,
72				'est' => 'et' ,
73				'baq' => 'eu' ,
74				'eus' => 'eu' ,
75				'per' => 'fa' ,
76				'fas' => 'fa' ,
77				'ful' => 'ff' ,
78				'fin' => 'fi' ,
79				'fij' => 'fj' ,
80				'fao' => 'fo' ,
81				'fre' => 'fr' ,
82				'fra' => 'fr' ,
83				'fry' => 'fy' ,
84				'gle' => 'ga' ,
85				'gla' => 'gd' ,
86				'glg' => 'gl' ,
87				'grn' => 'gn' ,
88				'guj' => 'gu' ,
89				'glv' => 'gv' ,
90				'hau' => 'ha' ,
91				'heb' => 'he' ,
92				'hin' => 'hi' ,
93				'hmo' => 'ho' ,
94				'hrv' => 'hr' ,
95				'hat' => 'ht' ,
96				'hat' => 'ht' ,
97				'hun' => 'hu' ,
98				'arm' => 'hy' ,
99				'hye' => 'hy' ,
100				'her' => 'hz' ,
101				'ina' => 'ia' ,
102				'ind' => 'id' ,
103				'ile' => 'ie' ,
104				'ibo' => 'ig' ,
105				'iii' => 'ii' ,
106				'ipk' => 'ik' ,
107				'ido' => 'io' ,
108				'ice' => 'is' ,
109				'isl' => 'is' ,
110				'ita' => 'it' ,
111				'iku' => 'iu' ,
112				'jpn' => 'ja' ,
113				'jav' => 'jv' ,
114				'geo' => 'ka' ,
115				'kat' => 'ka' ,
116				'kon' => 'kg' ,
117				'kik' => 'ki' ,
118				'kik' => 'ki' ,
119				'kua' => 'kj' ,
120				'kaz' => 'kk' ,
121				'kal' => 'kl' ,
122				'khm' => 'km' ,
123				'kan' => 'kn' ,
124				'kor' => 'ko' ,
125				'kau' => 'kr' ,
126				'kas' => 'ks' ,
127				'kur' => 'ku' ,
128				'kom' => 'kv' ,
129				'cor' => 'kw' ,
130				'kir' => 'ky' ,
131				'lat' => 'la' ,
132				'ltz' => 'lb' ,
133				'ltz' => 'lb' ,
134				'lug' => 'lg' ,
135				'lim' => 'li' ,
136				'lin' => 'ln' ,
137				'lao' => 'lo' ,
138				'lit' => 'lt' ,
139				'lub' => 'lu' ,
140				'lav' => 'lv' ,
141				'mlg' => 'mg' ,
142				'mah' => 'mh' ,
143				'mao' => 'mi' ,
144				'mri' => 'mi' ,
145				'mac' => 'mk' ,
146				'mkd' => 'mk' ,
147				'mal' => 'ml' ,
148				'mon' => 'mn' ,
149				'mar' => 'mr' ,
150				'may' => 'ms' ,
151				'msa' => 'ms' ,
152				'mlt' => 'mt' ,
153				'bur' => 'my' ,
154				'mya' => 'my' ,
155				'nau' => 'na' ,
156				'nob' => 'nb' ,
157				'nde' => 'nd' ,
158				'nep' => 'ne' ,
159				'ndo' => 'ng' ,
160				'dut' => 'nl' ,
161				'nld' => 'nl' ,
162				'nno' => 'nn' ,
163				'nor' => 'no' ,
164				'nbl' => 'nr' ,
165				'nav' => 'nv' ,
166				'nya' => 'ny' ,
167				'oci' => 'oc' ,
168				'oji' => 'oj' ,
169				'orm' => 'om' ,
170				'ori' => 'or' ,
171				'oss' => 'os' ,
172				'pan' => 'pa' ,
173				'pli' => 'pi' ,
174				'pol' => 'pl' ,
175				'pus' => 'ps' ,
176				'por' => 'pt' ,
177				'que' => 'qu' ,
178				'roh' => 'rm' ,
179				'run' => 'rn' ,
180				'rum' => 'ro' ,
181				'ron' => 'ro' ,
182				'rus' => 'ru' ,
183				'kin' => 'rw' ,
184				'san' => 'sa' ,
185				'srd' => 'sc' ,
186				'snd' => 'sd' ,
187				'sme' => 'se' ,
188				'sag' => 'sg' ,
189				'sin' => 'si' ,
190				'slo' => 'sk' ,
191				'slk' => 'sk' ,
192				'slv' => 'sl' ,
193				'smo' => 'sm' ,
194				'sna' => 'sn' ,
195				'som' => 'so' ,
196				'alb' => 'sq' ,
197				'sqi' => 'sq' ,
198				'srp' => 'sr' ,
199				'ssw' => 'ss' ,
200				'sot' => 'st' ,
201				'sun' => 'su' ,
202				'swe' => 'sv' ,
203				'swa' => 'sw' ,
204				'tam' => 'ta' ,
205				'tel' => 'te' ,
206				'tgk' => 'tg' ,
207				'tha' => 'th' ,
208				'tir' => 'ti' ,
209				'tuk' => 'tk' ,
210				'tgl' => 'tl' ,
211				'tsn' => 'tn' ,
212				'ton' => 'to' ,
213				'tur' => 'tr' ,
214				'tso' => 'ts' ,
215				'tat' => 'tt' ,
216				'twi' => 'tw' ,
217				'tah' => 'ty' ,
218				'uig' => 'ug' ,
219				'ukr' => 'uk' ,
220				'urd' => 'ur' ,
221				'uzb' => 'uz' ,
222				'ven' => 've' ,
223				'vie' => 'vi' ,
224				'vol' => 'vo' ,
225				'wln' => 'wa' ,
226				'wol' => 'wo' ,
227				'xho' => 'xh' ,
228				'yid' => 'yi' ,
229				'yor' => 'yo' ,
230				'zha' => 'za' ,
231				'chi' => 'zh' ,
232				'zho' => 'zh' ,
233				'zul' => 'zu' ,
234				};
235
236				our $lang_grandfather = {
237				'art-lojban' => 'jbo',
238				'i-ami' => 'ami',
239				'i-bnn' => 'bnn',
240				'i-hak' => 'hak',
241				'i-klingon' => 'tlh',
242				'i-lux' => 'lb',
243				'i-navajo' => 'nv',
244				'i-pwn' => 'pwn', #haha
245				'i-tao' => 'tao',
246				'i-tay' => 'tay',
247				'i-tsu' => 'tsu',
248				'no-bok' => 'nb',
249				'no-nyn' => 'nn',
250				'sgn-be-fr' => 'sfb',
251				'sgn-be-nl' => 'vgt',
252				'sgn-ch-de' => 'sgg',
253				'zh-guoyu' => 'cmn',
254				'zh-hakka' => 'hak',
255				'zh-min-nan' => 'nan',
256				'zh-xiang' => 'hsn',
257				};
258
259				our $obsolete_iso3166 = {
260				'UK' => 'GB', # Exceptionally reserved
261				'FX' => 'FR', # Exceptionally reserved
262				'ZR' => 'CD', # Zaire => Congo
263				'HV' => 'BF', # Upper Volta => Burkina Faso
264				'DY' => 'BJ', # Dahomey => Benin
265				'BU' => 'MM', # Burma => Myanmar
266				'TP' => 'TL', # East Timor => Timor-Leste
267				'NH' => 'VU', # New Hebrides => Vanuatu
268				'RH' => 'ZW', # Rhodesia => Zimbabwe
269				};
270
271				our $canon_lang = {};
272
273				sub fix_document
274				{
275				my $old_document = shift;
276				my $attribute_behaviour = shift \|\| 0;
277
278				my $new_document = XML::LibXML::Document->new;
279
280				my $new_root = fix_element(
281				$old_document->documentElement,
282				$new_document,
283				{ ':' => 'http://www.w3.org/1999/xhtml', 'xml' => XML_XML_NS }
284				);
285
286				$new_document->setDocumentElement($new_root);
287
288				return $new_document;
289				}
290
291				sub fix_element
292				{
293				my $old_element = shift;
294				my $new_document = shift;
295				my $parent_declarations = shift;
296
297				my $declared_namespaces = {};
298				foreach my $k (keys %{$parent_declarations})
299				{
300				$declared_namespaces->{$k} = $parent_declarations->{$k};
301				}
302
303				# Process namespace declarations on this element.
304				foreach my $attr ($old_element->attributes)
305				{
306				next if $attr->nodeType == XML_NAMESPACE_DECL;
307
308				if ($attr->nodeName =~ /^xmlns:(.*)$/)
309				{
310				my $prefix = $1;
311
312				if ($prefix eq 'xml' && $attr->getData eq XML_XML_NS)
313				{
314				# that's OK.
315				}
316				elsif ($prefix eq 'xml' \|\| $attr->getData eq XML_XML_NS)
317				{
318				next;
319				}
320				elsif ($prefix eq 'xmlns' \|\| $attr->getData eq XML_XMLNS_NS)
321				{
322				next;
323				}
324
325				$declared_namespaces->{ $prefix } = $attr->getData;
326				}
327				}
328
329				# Process any default XML Namespace
330				my $hasExplicit = 0;
331				if ($old_element->hasAttributeNS(undef, 'xmlns'))
332				{
333				$hasExplicit = 1;
334				$declared_namespaces->{ ':' } = $old_element->getAttributeNS(undef, 'xmlns');
335				}
336
337				# Create a new element.
338				my $new_element;
339				if ($hasExplicit)
340				{
341				$new_element = $new_document->createElementNS(
342				$declared_namespaces->{ ':' },
343				$old_element->nodeName,
344				);
345				}
346				else
347				{
348				my $tag = $old_element->nodeName;
349				if ($tag =~ /^([^:]+)\:([^:]+)$/)
350				{
351				my $ns_prefix = $1;
352				my $localname = $2;
353
354				if (defined $declared_namespaces->{$ns_prefix})
355				{
356				$new_element = $new_document->createElementNS(
357				$declared_namespaces->{$ns_prefix}, $tag);
358				}
359				}
360				unless ($new_element)
361				{
362				$new_element = $new_document->createElementNS(
363				$declared_namespaces->{ ':' }, $tag);
364				}
365				}
366
367				# Add attributes to new element.
368				foreach my $old_attr ($old_element->attributes)
369				{
370				next if $old_attr->nodeType == XML_NAMESPACE_DECL;
371				# next if $old_attr->nodeName =~ /^xmlns(:.*)?$/;
372
373				fix_attribute($old_attr, $new_element, $declared_namespaces);
374				}
375
376				# Process child nodes.
377				foreach my $old_kid ($old_element->childNodes)
378				{
379				if ($old_kid->nodeType == XML_TEXT_NODE
380				\|\| $old_kid->nodeType == XML_CDATA_SECTION_NODE)
381				{
382				$new_element->appendTextNode($old_kid->nodeValue);
383				}
384				elsif ($old_kid->nodeType == XML_COMMENT_NODE)
385				{
386				$new_element->appendChild(
387				$new_document->createComment($old_kid->nodeValue)
388				);
389				}
390				elsif ($old_kid->nodeType == XML_ELEMENT_NODE)
391				{
392				$new_element->appendChild(
393				fix_element($old_kid, $new_document, $declared_namespaces)
394				);
395				}
396				}
397
398				return $new_element;
399				}
400
401				sub fix_attribute
402				{
403				my $old_attribute = shift;
404				my $new_element = shift;
405				my $declared_namespaces = shift;
406
407				my $name = $old_attribute->nodeName;
408				my @new_attribute;
409
410				if ($name =~ /^([^:]+)\:([^:]+)$/)
411				{
412				my $ns_prefix = $1;
413				my $localname = $2;
414
415				if (defined $declared_namespaces->{$ns_prefix})
416				{
417				@new_attribute = (
418				$declared_namespaces->{$ns_prefix},
419				sprintf("%s:%s", $ns_prefix, $localname),
420				);
421				}
422				}
423
424				my $node_value = $old_attribute->nodeValue;
425
426				if ($FIX_LANG_ATTRIBUTES && $name =~ /^(xml:)?lang$/i)
427				{
428				return undef
429				unless _valid_lang($node_value);
430
431				if ($FIX_LANG_ATTRIBUTES == 2)
432				{
433				$node_value = _canon_lang($node_value);
434				}
435				}
436
437				if (@new_attribute)
438				{
439				$new_element->setAttributeNS(@new_attribute, $node_value);
440				}
441				else
442				{
443				$new_element->setAttribute($name, $node_value);
444				}
445
446				return undef;
447				}
448
449				sub _valid_lang
450				{
451				my $value_to_test = shift;
452
453				return 1 if (defined $value_to_test) && ($value_to_test eq '');
454				return 0 unless defined $value_to_test;
455
456				# Regex for recognizing RFC 4646 well-formed tags
457				# http://www.rfc-editor.org/rfc/rfc4646.txt
458				# http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21
459
460				# The structure requires no forward references, so it reverses the order.
461				# It uses Java/Perl syntax instead of the old ABNF
462				# The uppercase comments are fragments copied from RFC 4646
463
464				# Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.
465
466				my $alpha = '[a-z]'; # ALPHA
467				my $digit = '[0-9]'; # DIGIT
468				my $alphanum = '[a-z0-9]'; # ALPHA / DIGIT
469				my $x = 'x'; # private use singleton
470				my $singleton = '[a-wyz]'; # other singleton
471				my $s = '[_-]'; # separator -- lenient parsers will use [_-] -- strict will use [-]
472
473				# Now do the components. The structure is slightly different to allow for capturing the right components.
474				# The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.
475
476				my $language = '([a-z]{2,8}) \| ([a-z]{2,3} [_-] [a-z]{3})';
477
478				# ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA --- note: because of how \| works in regex, don't use $alpha{2,3} \| $alpha{4,8}
479				# We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).
480
481				# Note: extlang invalid in Unicode language tags
482
483				my $script = '[a-z]{4}' ; # 4ALPHA
484
485				my $region = '(?: [a-z]{2}\|[0-9]{3})' ; # 2ALPHA / 3DIGIT
486
487				my $variant = '(?: [a-z0-9]{5,8} \| [0-9] [a-z0-9]{3} )' ; # 5*8alphanum / (DIGIT 3alphanum)
488
489				my $extension = '(?: [a-wyz] (?: [_-] [a-z0-9]{2,8} )+ )' ; # singleton 1*("-" (2*8alphanum))
490
491				my $privateUse = '(?: x (?: [_-] [a-z0-9]{1,8} )+ )' ; # "x" 1*("-" (1*8alphanum))
492
493				# Define certain grandfathered codes, since otherwise the regex is pretty useless.
494				# Since these are limited, this is safe even with later changes to the registry --
495				# the only oddity is that it might change the type of the tag, and thus
496				# the results from the capturing groups.
497				# http://www.iana.org/assignments/language-subtag-registry
498				# Note that these have to be compared case insensitively, requiring (?i) below.
499
500				my $grandfathered = '(?:
501				(en [_-] GB [_-] oed)
502				\| (i [_-] (?: ami \| bnn \| default \| enochian \| hak \| klingon \| lux \| mingo \| navajo \| pwn \| tao \| tay \| tsu ))
503				\| (no [_-] (?: bok \| nyn ))
504				\| (sgn [_-] (?: BE [_-] (?: fr \| nl) \| CH [_-] de ))
505				\| (zh [_-] min [_-] nan)
506				)';
507
508				# old: \| zh $s (?: cmn (?: $s Hans \| $s Hant )? \| gan \| min (?: $s nan)? \| wuu \| yue );
509				# For well-formedness, we don't need the ones that would otherwise pass.
510				# For validity, they need to be checked.
511
512				# $grandfatheredWellFormed = (?:
513				# art $s lojban
514				# \| cel $s gaulish
515				# \| zh $s (?: guoyu \| hakka \| xiang )
516				# );
517
518				# Unicode locales: but we are shifting to a compatible form
519				# $keyvalue = (?: $alphanum+ \= $alphanum+);
520				# $keywords = ($keyvalue (?: \; $keyvalue)*);
521
522				# We separate items that we want to capture as a single group
523
524				my $variantList = $variant . '(?:' . $s . $variant . ')*' ; # special for multiples
525				my $extensionList = $extension . '(?:' . $s . $extension . ')*' ; # special for multiples
526
527				my $langtag = "
528				($language)
529				($s ( $script ) )?
530				($s ( $region ) )?
531				($s ( $variantList ) )?
532				($s ( $extensionList ) )?
533				($s ( $privateUse ) )?
534				";
535
536				# Here is the final breakdown, with capturing groups for each of these components
537				# The variants, extensions, grandfathered, and private-use may have interior '-'
538
539				my $r = ($value_to_test =~
540				/^(
541				($langtag)
542				\| ($privateUse)
543				\| ($grandfathered)
544				)$/xi);
545				return $r;
546				}
547
548				# If people use a non-canon lang once, they're likely to do it twice, so a little caching.
549				sub _canon_lang
550				{
551				my $lang = shift;
552				unless (defined $canon_lang->{$lang})
553				{
554				$canon_lang->{$lang} = __canon_lang($lang);
555				}
556				return $canon_lang->{$lang};
557				}
558
559				sub __canon_lang
560				{
561				# Use lower case only
562				my $lang = lc shift;
563
564				# If a three letter code has been used where a two letter code exists, replace it with the two letter code.
565				if ($lang =~ /^([a-z]{3})/)
566				{
567				substr($lang, 0, 3) = $lang_3to2->{$1}
568				if defined $lang_3to2->{$1};
569				}
570
571				return $lang_grandfather->{$lang}
572				if defined $lang_grandfather->{$lang};
573
574				# Simple case.
575				return $lang if length $lang < 4;
576
577				# Replace '_' with '-'.
578				$lang =~ s/_/-/g;
579
580				# upper case the country component of lang-country pairs.
581				return sprintf('%s-%s', $1, _canon_country($2))
582				if $lang =~ /^([a-z]{2,3})-([a-z]{2}\|\d{3})$/;
583
584				# title case the script component of lang-script pairs.
585				return sprintf('%s-%s', $1, _canon_script($2))
586				if $lang =~ /^([a-z]{2,3})-([a-z]{4})$/;
587
588				# title case the script component, upper case country component of lang-script-country triplets.
589				return sprintf('%s-%s-%s', $1, _canon_script($2), _canon_country($3))
590				if $lang =~ /^([a-z]{2,3})-([a-z]{4})-([a-z]{2}\|\d{3})$/;
591
592				# Too complicated - give up and return lower case.
593				return $lang;
594				}
595
596				sub _canon_country
597				{
598				my $c = uc shift;
599
600				if ($c =~ /^\d\d\d$/)
601				{
602				my $c1 = country_code2code($c, LOCALE_CODE_NUMERIC, LOCALE_CODE_ALPHA_2);
603				$c = uc $c1
604				if defined $c1 && length $c1;
605				}
606
607				return $obsolete_iso3166->{$c}
608				if defined $obsolete_iso3166->{$c};
609
610				return $c;
611				}
612
613				sub _canon_script
614				{
615				my $s = ucfirst lc shift;
616				return $s;
617				}
618
619				1;
620
621				__END__