| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 2 |  |  |  |  |  |  | # File:         HTML.pm | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | # Description:  Read HTML meta information | 
| 5 |  |  |  |  |  |  | # | 
| 6 |  |  |  |  |  |  | # Revisions:    01/30/2007 - P. Harvey Created | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # References:   1) http://www.w3.org/TR/html4/ | 
| 9 |  |  |  |  |  |  | #               2) http://www.daisy.org/publications/specifications/daisy_202.html | 
| 10 |  |  |  |  |  |  | #               3) http://vancouver-webpages.com/META/metatags.detail.html | 
| 11 |  |  |  |  |  |  | #               4) http://www.html-reference.com/META.htm | 
| 12 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | package Image::ExifTool::HTML; | 
| 15 |  |  |  |  |  |  |  | 
| 16 | 4 |  |  | 4 |  | 4473 | use strict; | 
|  | 4 |  |  |  |  | 10 |  | 
|  | 4 |  |  |  |  | 156 |  | 
| 17 | 4 |  |  | 4 |  | 24 | use vars qw($VERSION @ISA @EXPORT_OK); | 
|  | 4 |  |  |  |  | 8 |  | 
|  | 4 |  |  |  |  | 274 |  | 
| 18 | 4 |  |  | 4 |  | 30 | use Image::ExifTool qw(:DataAccess :Utils); | 
|  | 4 |  |  |  |  | 7 |  | 
|  | 4 |  |  |  |  | 960 |  | 
| 19 | 4 |  |  | 4 |  | 1121 | use Image::ExifTool::PostScript; | 
|  | 4 |  |  |  |  | 31 |  | 
|  | 4 |  |  |  |  | 197 |  | 
| 20 | 4 |  |  | 4 |  | 2406 | use Image::ExifTool::XMP qw(EscapeXML UnescapeXML); | 
|  | 4 |  |  |  |  | 122 |  | 
|  | 4 |  |  |  |  | 12208 |  | 
| 21 |  |  |  |  |  |  | require Exporter; | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | $VERSION = '1.16'; | 
| 24 |  |  |  |  |  |  | @ISA = qw(Exporter); | 
| 25 |  |  |  |  |  |  | @EXPORT_OK = qw(EscapeHTML UnescapeHTML); | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | sub SetHTMLCharset($$); | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | # convert HTML charset (lower case) to ExifTool Charset name | 
| 30 |  |  |  |  |  |  | my %htmlCharset = ( | 
| 31 |  |  |  |  |  |  | macintosh     => 'MacRoman', | 
| 32 |  |  |  |  |  |  | 'iso-8859-1'   => 'Latin', | 
| 33 |  |  |  |  |  |  | 'utf-8'        => 'UTF8', | 
| 34 |  |  |  |  |  |  | 'windows-1252' => 'Latin', | 
| 35 |  |  |  |  |  |  | ); | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | # HTML 4 character entity references | 
| 38 |  |  |  |  |  |  | my %entityNum = ( | 
| 39 |  |  |  |  |  |  | 'quot'   => 34,   'eth'    => 240,  'lsquo'  => 8216, | 
| 40 |  |  |  |  |  |  | 'amp'    => 38,   'ntilde' => 241,  'rsquo'  => 8217, | 
| 41 |  |  |  |  |  |  | 'apos'   => 39,   'ograve' => 242,  'sbquo'  => 8218, | 
| 42 |  |  |  |  |  |  | 'lt'     => 60,   'oacute' => 243,  'ldquo'  => 8220, | 
| 43 |  |  |  |  |  |  | 'gt'     => 62,   'ocirc'  => 244,  'rdquo'  => 8221, | 
| 44 |  |  |  |  |  |  | 'nbsp'   => 160,  'otilde' => 245,  'bdquo'  => 8222, | 
| 45 |  |  |  |  |  |  | 'iexcl'  => 161,  'ouml'   => 246,  'dagger' => 8224, | 
| 46 |  |  |  |  |  |  | 'cent'   => 162,  'divide' => 247,  'Dagger' => 8225, | 
| 47 |  |  |  |  |  |  | 'pound'  => 163,  'oslash' => 248,  'bull'   => 8226, | 
| 48 |  |  |  |  |  |  | 'curren' => 164,  'ugrave' => 249,  'hellip' => 8230, | 
| 49 |  |  |  |  |  |  | 'yen'    => 165,  'uacute' => 250,  'permil' => 8240, | 
| 50 |  |  |  |  |  |  | 'brvbar' => 166,  'ucirc'  => 251,  'prime'  => 8242, | 
| 51 |  |  |  |  |  |  | 'sect'   => 167,  'uuml'   => 252,  'Prime'  => 8243, | 
| 52 |  |  |  |  |  |  | 'uml'    => 168,  'yacute' => 253,  'lsaquo' => 8249, | 
| 53 |  |  |  |  |  |  | 'copy'   => 169,  'thorn'  => 254,  'rsaquo' => 8250, | 
| 54 |  |  |  |  |  |  | 'ordf'   => 170,  'yuml'   => 255,  'oline'  => 8254, | 
| 55 |  |  |  |  |  |  | 'laquo'  => 171,  'OElig'  => 338,  'frasl'  => 8260, | 
| 56 |  |  |  |  |  |  | 'not'    => 172,  'oelig'  => 339,  'euro'   => 8364, | 
| 57 |  |  |  |  |  |  | 'shy'    => 173,  'Scaron' => 352,  'image'  => 8465, | 
| 58 |  |  |  |  |  |  | 'reg'    => 174,  'scaron' => 353,  'weierp' => 8472, | 
| 59 |  |  |  |  |  |  | 'macr'   => 175,  'Yuml'   => 376,  'real'   => 8476, | 
| 60 |  |  |  |  |  |  | 'deg'    => 176,  'fnof'   => 402,  'trade'  => 8482, | 
| 61 |  |  |  |  |  |  | 'plusmn' => 177,  'circ'   => 710,  'alefsym'=> 8501, | 
| 62 |  |  |  |  |  |  | 'sup2'   => 178,  'tilde'  => 732,  'larr'   => 8592, | 
| 63 |  |  |  |  |  |  | 'sup3'   => 179,  'Alpha'  => 913,  'uarr'   => 8593, | 
| 64 |  |  |  |  |  |  | 'acute'  => 180,  'Beta'   => 914,  'rarr'   => 8594, | 
| 65 |  |  |  |  |  |  | 'micro'  => 181,  'Gamma'  => 915,  'darr'   => 8595, | 
| 66 |  |  |  |  |  |  | 'para'   => 182,  'Delta'  => 916,  'harr'   => 8596, | 
| 67 |  |  |  |  |  |  | 'middot' => 183,  'Epsilon'=> 917,  'crarr'  => 8629, | 
| 68 |  |  |  |  |  |  | 'cedil'  => 184,  'Zeta'   => 918,  'lArr'   => 8656, | 
| 69 |  |  |  |  |  |  | 'sup1'   => 185,  'Eta'    => 919,  'uArr'   => 8657, | 
| 70 |  |  |  |  |  |  | 'ordm'   => 186,  'Theta'  => 920,  'rArr'   => 8658, | 
| 71 |  |  |  |  |  |  | 'raquo'  => 187,  'Iota'   => 921,  'dArr'   => 8659, | 
| 72 |  |  |  |  |  |  | 'frac14' => 188,  'Kappa'  => 922,  'hArr'   => 8660, | 
| 73 |  |  |  |  |  |  | 'frac12' => 189,  'Lambda' => 923,  'forall' => 8704, | 
| 74 |  |  |  |  |  |  | 'frac34' => 190,  'Mu'     => 924,  'part'   => 8706, | 
| 75 |  |  |  |  |  |  | 'iquest' => 191,  'Nu'     => 925,  'exist'  => 8707, | 
| 76 |  |  |  |  |  |  | 'Agrave' => 192,  'Xi'     => 926,  'empty'  => 8709, | 
| 77 |  |  |  |  |  |  | 'Aacute' => 193,  'Omicron'=> 927,  'nabla'  => 8711, | 
| 78 |  |  |  |  |  |  | 'Acirc'  => 194,  'Pi'     => 928,  'isin'   => 8712, | 
| 79 |  |  |  |  |  |  | 'Atilde' => 195,  'Rho'    => 929,  'notin'  => 8713, | 
| 80 |  |  |  |  |  |  | 'Auml'   => 196,  'Sigma'  => 931,  'ni'     => 8715, | 
| 81 |  |  |  |  |  |  | 'Aring'  => 197,  'Tau'    => 932,  'prod'   => 8719, | 
| 82 |  |  |  |  |  |  | 'AElig'  => 198,  'Upsilon'=> 933,  'sum'    => 8721, | 
| 83 |  |  |  |  |  |  | 'Ccedil' => 199,  'Phi'    => 934,  'minus'  => 8722, | 
| 84 |  |  |  |  |  |  | 'Egrave' => 200,  'Chi'    => 935,  'lowast' => 8727, | 
| 85 |  |  |  |  |  |  | 'Eacute' => 201,  'Psi'    => 936,  'radic'  => 8730, | 
| 86 |  |  |  |  |  |  | 'Ecirc'  => 202,  'Omega'  => 937,  'prop'   => 8733, | 
| 87 |  |  |  |  |  |  | 'Euml'   => 203,  'alpha'  => 945,  'infin'  => 8734, | 
| 88 |  |  |  |  |  |  | 'Igrave' => 204,  'beta'   => 946,  'ang'    => 8736, | 
| 89 |  |  |  |  |  |  | 'Iacute' => 205,  'gamma'  => 947,  'and'    => 8743, | 
| 90 |  |  |  |  |  |  | 'Icirc'  => 206,  'delta'  => 948,  'or'     => 8744, | 
| 91 |  |  |  |  |  |  | 'Iuml'   => 207,  'epsilon'=> 949,  'cap'    => 8745, | 
| 92 |  |  |  |  |  |  | 'ETH'    => 208,  'zeta'   => 950,  'cup'    => 8746, | 
| 93 |  |  |  |  |  |  | 'Ntilde' => 209,  'eta'    => 951,  'int'    => 8747, | 
| 94 |  |  |  |  |  |  | 'Ograve' => 210,  'theta'  => 952,  'there4' => 8756, | 
| 95 |  |  |  |  |  |  | 'Oacute' => 211,  'iota'   => 953,  'sim'    => 8764, | 
| 96 |  |  |  |  |  |  | 'Ocirc'  => 212,  'kappa'  => 954,  'cong'   => 8773, | 
| 97 |  |  |  |  |  |  | 'Otilde' => 213,  'lambda' => 955,  'asymp'  => 8776, | 
| 98 |  |  |  |  |  |  | 'Ouml'   => 214,  'mu'     => 956,  'ne'     => 8800, | 
| 99 |  |  |  |  |  |  | 'times'  => 215,  'nu'     => 957,  'equiv'  => 8801, | 
| 100 |  |  |  |  |  |  | 'Oslash' => 216,  'xi'     => 958,  'le'     => 8804, | 
| 101 |  |  |  |  |  |  | 'Ugrave' => 217,  'omicron'=> 959,  'ge'     => 8805, | 
| 102 |  |  |  |  |  |  | 'Uacute' => 218,  'pi'     => 960,  'sub'    => 8834, | 
| 103 |  |  |  |  |  |  | 'Ucirc'  => 219,  'rho'    => 961,  'sup'    => 8835, | 
| 104 |  |  |  |  |  |  | 'Uuml'   => 220,  'sigmaf' => 962,  'nsub'   => 8836, | 
| 105 |  |  |  |  |  |  | 'Yacute' => 221,  'sigma'  => 963,  'sube'   => 8838, | 
| 106 |  |  |  |  |  |  | 'THORN'  => 222,  'tau'    => 964,  'supe'   => 8839, | 
| 107 |  |  |  |  |  |  | 'szlig'  => 223,  'upsilon'=> 965,  'oplus'  => 8853, | 
| 108 |  |  |  |  |  |  | 'agrave' => 224,  'phi'    => 966,  'otimes' => 8855, | 
| 109 |  |  |  |  |  |  | 'aacute' => 225,  'chi'    => 967,  'perp'   => 8869, | 
| 110 |  |  |  |  |  |  | 'acirc'  => 226,  'psi'    => 968,  'sdot'   => 8901, | 
| 111 |  |  |  |  |  |  | 'atilde' => 227,  'omega'  => 969,  'lceil'  => 8968, | 
| 112 |  |  |  |  |  |  | 'auml'   => 228,  'thetasym'=>977,  'rceil'  => 8969, | 
| 113 |  |  |  |  |  |  | 'aring'  => 229,  'upsih'  => 978,  'lfloor' => 8970, | 
| 114 |  |  |  |  |  |  | 'aelig'  => 230,  'piv'    => 982,  'rfloor' => 8971, | 
| 115 |  |  |  |  |  |  | 'ccedil' => 231,  'ensp'   => 8194, 'lang'   => 9001, | 
| 116 |  |  |  |  |  |  | 'egrave' => 232,  'emsp'   => 8195, 'rang'   => 9002, | 
| 117 |  |  |  |  |  |  | 'eacute' => 233,  'thinsp' => 8201, 'loz'    => 9674, | 
| 118 |  |  |  |  |  |  | 'ecirc'  => 234,  'zwnj'   => 8204, 'spades' => 9824, | 
| 119 |  |  |  |  |  |  | 'euml'   => 235,  'zwj'    => 8205, 'clubs'  => 9827, | 
| 120 |  |  |  |  |  |  | 'igrave' => 236,  'lrm'    => 8206, 'hearts' => 9829, | 
| 121 |  |  |  |  |  |  | 'iacute' => 237,  'rlm'    => 8207, 'diams'  => 9830, | 
| 122 |  |  |  |  |  |  | 'icirc'  => 238,  'ndash'  => 8211, | 
| 123 |  |  |  |  |  |  | 'iuml'   => 239,  'mdash'  => 8212, | 
| 124 |  |  |  |  |  |  | ); | 
| 125 |  |  |  |  |  |  | my %entityName; # look up entity names by number (built as necessary) | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | # HTML info | 
| 128 |  |  |  |  |  |  | # (tag ID's are case insensitive and must be all lower case in tables) | 
| 129 |  |  |  |  |  |  | %Image::ExifTool::HTML::Main = ( | 
| 130 |  |  |  |  |  |  | GROUPS => { 2 => 'Document' }, | 
| 131 |  |  |  |  |  |  | NOTES => q{ | 
| 132 |  |  |  |  |  |  | Meta information extracted from the header of HTML and XHTML files.  This is | 
| 133 |  |  |  |  |  |  | a mix of information found in the C<META> elements, C<XML> element, and the | 
| 134 |  |  |  |  |  |  | C<TITLE> element. | 
| 135 |  |  |  |  |  |  | }, | 
| 136 |  |  |  |  |  |  | dc => { | 
| 137 |  |  |  |  |  |  | Name => 'DC', | 
| 138 |  |  |  |  |  |  | SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' }, | 
| 139 |  |  |  |  |  |  | }, | 
| 140 |  |  |  |  |  |  | ncc => { | 
| 141 |  |  |  |  |  |  | Name => 'NCC', | 
| 142 |  |  |  |  |  |  | SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' }, | 
| 143 |  |  |  |  |  |  | }, | 
| 144 |  |  |  |  |  |  | prod => { | 
| 145 |  |  |  |  |  |  | Name => 'Prod', | 
| 146 |  |  |  |  |  |  | SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' }, | 
| 147 |  |  |  |  |  |  | }, | 
| 148 |  |  |  |  |  |  | vw96 => { | 
| 149 |  |  |  |  |  |  | Name => 'VW96', | 
| 150 |  |  |  |  |  |  | SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' }, | 
| 151 |  |  |  |  |  |  | }, | 
| 152 |  |  |  |  |  |  | 'http-equiv' => { | 
| 153 |  |  |  |  |  |  | Name => 'HTTP-equiv', | 
| 154 |  |  |  |  |  |  | SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' }, | 
| 155 |  |  |  |  |  |  | }, | 
| 156 |  |  |  |  |  |  | o => { | 
| 157 |  |  |  |  |  |  | Name => 'Office', | 
| 158 |  |  |  |  |  |  | SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' }, | 
| 159 |  |  |  |  |  |  | }, | 
| 160 |  |  |  |  |  |  | abstract        => { }, | 
| 161 |  |  |  |  |  |  | author          => { }, | 
| 162 |  |  |  |  |  |  | classification  => { }, | 
| 163 |  |  |  |  |  |  | 'content-language'=>{ Name => 'ContentLanguage' }, | 
| 164 |  |  |  |  |  |  | copyright       => { }, | 
| 165 |  |  |  |  |  |  | description     => { }, | 
| 166 |  |  |  |  |  |  | distribution    => { }, | 
| 167 |  |  |  |  |  |  | 'doc-class'      => { Name => 'DocClass' }, | 
| 168 |  |  |  |  |  |  | 'doc-rights'     => { Name => 'DocRights' }, | 
| 169 |  |  |  |  |  |  | 'doc-type'       => { Name => 'DocType' }, | 
| 170 |  |  |  |  |  |  | formatter       => { }, | 
| 171 |  |  |  |  |  |  | generator       => { }, | 
| 172 |  |  |  |  |  |  | generatorversion=> { Name => 'GeneratorVersion' }, | 
| 173 |  |  |  |  |  |  | googlebot       => { Name => 'GoogleBot' }, | 
| 174 |  |  |  |  |  |  | keywords        => { List => 1 }, | 
| 175 |  |  |  |  |  |  | mssmarttagspreventparsing => { Name => 'NoMSSmartTags' }, | 
| 176 |  |  |  |  |  |  | originator      => { }, | 
| 177 |  |  |  |  |  |  | owner           => { }, | 
| 178 |  |  |  |  |  |  | progid          => { Name => 'ProgID' }, | 
| 179 |  |  |  |  |  |  | rating          => { }, | 
| 180 |  |  |  |  |  |  | refresh         => { }, | 
| 181 |  |  |  |  |  |  | 'resource-type'  => { Name => 'ResourceType' }, | 
| 182 |  |  |  |  |  |  | 'revisit-after'  => { Name => 'RevisitAfter' }, | 
| 183 |  |  |  |  |  |  | robots          => { List => 1 }, | 
| 184 |  |  |  |  |  |  | title           => { Notes => "the only extracted tag which isn't from an HTML META element" }, | 
| 185 |  |  |  |  |  |  | ); | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | # ref 2 | 
| 188 |  |  |  |  |  |  | %Image::ExifTool::HTML::dc = ( | 
| 189 |  |  |  |  |  |  | GROUPS => { 1 => 'HTML-dc', 2 => 'Document' }, | 
| 190 |  |  |  |  |  |  | NOTES => 'Dublin Core schema tags (also used in XMP).', | 
| 191 |  |  |  |  |  |  | contributor => { Groups => { 2 => 'Author' }, List => 'Bag' }, | 
| 192 |  |  |  |  |  |  | coverage    => { }, | 
| 193 |  |  |  |  |  |  | creator     => { Groups => { 2 => 'Author' }, List => 'Seq' }, | 
| 194 |  |  |  |  |  |  | date        => { | 
| 195 |  |  |  |  |  |  | Groups => { 2 => 'Time'   }, | 
| 196 |  |  |  |  |  |  | List => 'Seq', | 
| 197 |  |  |  |  |  |  | PrintConv => '$self->ConvertDateTime($val)', | 
| 198 |  |  |  |  |  |  | }, | 
| 199 |  |  |  |  |  |  | description => { }, | 
| 200 |  |  |  |  |  |  | 'format'     => { }, | 
| 201 |  |  |  |  |  |  | identifier  => { }, | 
| 202 |  |  |  |  |  |  | language    => { List => 'Bag' }, | 
| 203 |  |  |  |  |  |  | publisher   => { Groups => { 2 => 'Author' }, List => 'Bag' }, | 
| 204 |  |  |  |  |  |  | relation    => { List => 'Bag' }, | 
| 205 |  |  |  |  |  |  | rights      => { Groups => { 2 => 'Author' } }, | 
| 206 |  |  |  |  |  |  | source      => { Groups => { 2 => 'Author' } }, | 
| 207 |  |  |  |  |  |  | subject     => { List => 'Bag' }, | 
| 208 |  |  |  |  |  |  | title       => { }, | 
| 209 |  |  |  |  |  |  | type        => { List => 'Bag' }, | 
| 210 |  |  |  |  |  |  | ); | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | # ref 2 | 
| 213 |  |  |  |  |  |  | %Image::ExifTool::HTML::ncc = ( | 
| 214 |  |  |  |  |  |  | GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' }, | 
| 215 |  |  |  |  |  |  | charset         => { Name => 'CharacterSet' }, # name changed to avoid conflict with -charset option | 
| 216 |  |  |  |  |  |  | depth           => { }, | 
| 217 |  |  |  |  |  |  | files           => { }, | 
| 218 |  |  |  |  |  |  | footnotes       => { }, | 
| 219 |  |  |  |  |  |  | generator       => { }, | 
| 220 |  |  |  |  |  |  | kbytesize       => { Name => 'KByteSize' }, | 
| 221 |  |  |  |  |  |  | maxpagenormal   => { Name => 'MaxPageNormal' }, | 
| 222 |  |  |  |  |  |  | multimediatype  => { Name => 'MultimediaType' }, | 
| 223 |  |  |  |  |  |  | narrator        => { }, | 
| 224 |  |  |  |  |  |  | pagefront       => { Name => 'PageFront' }, | 
| 225 |  |  |  |  |  |  | pagenormal      => { Name => 'PageNormal' }, | 
| 226 |  |  |  |  |  |  | pagespecial     => { Name => 'PageSpecial' }, | 
| 227 |  |  |  |  |  |  | prodnotes       => { Name => 'ProdNotes' }, | 
| 228 |  |  |  |  |  |  | producer        => { }, | 
| 229 |  |  |  |  |  |  | produceddate    => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # YYYY-mm-dd | 
| 230 |  |  |  |  |  |  | revision        => { }, | 
| 231 |  |  |  |  |  |  | revisiondate    => { Name => 'RevisionDate', Groups => { 2 => 'Time' } }, | 
| 232 |  |  |  |  |  |  | setinfo         => { Name => 'SetInfo' }, | 
| 233 |  |  |  |  |  |  | sidebars        => { }, | 
| 234 |  |  |  |  |  |  | sourcedate      => { Name => 'SourceDate', Groups => { 2 => 'Time' } }, | 
| 235 |  |  |  |  |  |  | sourceedition   => { Name => 'SourceEdition' }, | 
| 236 |  |  |  |  |  |  | sourcepublisher => { Name => 'SourcePublisher' }, | 
| 237 |  |  |  |  |  |  | sourcerights    => { Name => 'SourceRights' }, | 
| 238 |  |  |  |  |  |  | sourcetitle     => { Name => 'SourceTitle' }, | 
| 239 |  |  |  |  |  |  | tocitems        => { Name => 'TOCItems' }, | 
| 240 |  |  |  |  |  |  | totaltime       => { Name => 'Duration' }, # HH:MM:SS | 
| 241 |  |  |  |  |  |  | ); | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | # ref 3 | 
| 244 |  |  |  |  |  |  | %Image::ExifTool::HTML::vw96 = ( | 
| 245 |  |  |  |  |  |  | GROUPS => { 1 => 'HTML-vw96', 2 => 'Document' }, | 
| 246 |  |  |  |  |  |  | objecttype      => { Name => 'ObjectType' }, | 
| 247 |  |  |  |  |  |  | ); | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | # ref 2 | 
| 250 |  |  |  |  |  |  | %Image::ExifTool::HTML::prod = ( | 
| 251 |  |  |  |  |  |  | GROUPS => { 1 => 'HTML-prod', 2 => 'Document' }, | 
| 252 |  |  |  |  |  |  | reclocation     => { Name => 'RecLocation' }, | 
| 253 |  |  |  |  |  |  | recengineer     => { Name => 'RecEngineer' }, | 
| 254 |  |  |  |  |  |  | ); | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | # ref 3/4 | 
| 257 |  |  |  |  |  |  | %Image::ExifTool::HTML::equiv = ( | 
| 258 |  |  |  |  |  |  | GROUPS => { 1 => 'HTTP-equiv', 2 => 'Document' }, | 
| 259 |  |  |  |  |  |  | NOTES => 'These tags have a family 1 group name of "HTTP-equiv".', | 
| 260 |  |  |  |  |  |  | 'cache-control'       => { Name => 'CacheControl' }, | 
| 261 |  |  |  |  |  |  | 'content-disposition' => { Name => 'ContentDisposition' }, | 
| 262 |  |  |  |  |  |  | 'content-language'    => { Name => 'ContentLanguage' }, | 
| 263 |  |  |  |  |  |  | 'content-script-type' => { Name => 'ContentScriptType' }, | 
| 264 |  |  |  |  |  |  | 'content-style-type'  => { Name => 'ContentStyleType' }, | 
| 265 |  |  |  |  |  |  | # note: setting the HTMLCharset like this will miss any tags which come earlier | 
| 266 |  |  |  |  |  |  | 'content-type'        => { Name => 'ContentType', RawConv => \&SetHTMLCharset }, | 
| 267 |  |  |  |  |  |  | 'default-style'       => { Name => 'DefaultStyle' }, | 
| 268 |  |  |  |  |  |  | expires              => { }, | 
| 269 |  |  |  |  |  |  | 'ext-cache'           => { Name => 'ExtCache' }, | 
| 270 |  |  |  |  |  |  | imagetoolbar         => { Name => 'ImageToolbar' }, | 
| 271 |  |  |  |  |  |  | lotus                => { }, | 
| 272 |  |  |  |  |  |  | 'page-enter'          => { Name => 'PageEnter' }, | 
| 273 |  |  |  |  |  |  | 'page-exit'           => { Name => 'PageExit' }, | 
| 274 |  |  |  |  |  |  | 'pics-label'          => { Name => 'PicsLabel' }, | 
| 275 |  |  |  |  |  |  | pragma               => { }, | 
| 276 |  |  |  |  |  |  | refresh              => { }, | 
| 277 |  |  |  |  |  |  | 'reply-to'            => { Name => 'ReplyTo' }, | 
| 278 |  |  |  |  |  |  | 'set-cookie'          => { Name => 'SetCookie' }, | 
| 279 |  |  |  |  |  |  | 'site-enter'          => { Name => 'SiteEnter' }, | 
| 280 |  |  |  |  |  |  | 'site-exit'           => { Name => 'SiteExit' }, | 
| 281 |  |  |  |  |  |  | vary                 => { }, | 
| 282 |  |  |  |  |  |  | 'window-target'       => { Name => 'WindowTarget' }, | 
| 283 |  |  |  |  |  |  | ); | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | # MS Office namespace (ref PH) | 
| 286 |  |  |  |  |  |  | %Image::ExifTool::HTML::Office = ( | 
| 287 |  |  |  |  |  |  | GROUPS => { 1 => 'HTML-office', 2 => 'Document' }, | 
| 288 |  |  |  |  |  |  | NOTES => 'Tags written by Microsoft Office applications.', | 
| 289 |  |  |  |  |  |  | Subject     => { }, | 
| 290 |  |  |  |  |  |  | Author      => { Groups => { 2 => 'Author' } }, | 
| 291 |  |  |  |  |  |  | Keywords    => { }, | 
| 292 |  |  |  |  |  |  | Description => { }, | 
| 293 |  |  |  |  |  |  | Template    => { }, | 
| 294 |  |  |  |  |  |  | LastAuthor  => { Groups => { 2 => 'Author' } }, | 
| 295 |  |  |  |  |  |  | Revision    => { Name => 'RevisionNumber' }, | 
| 296 |  |  |  |  |  |  | TotalTime   => { Name => 'TotalEditTime',   PrintConv => 'ConvertTimeSpan($val, 60)' }, | 
| 297 |  |  |  |  |  |  | Created     => { | 
| 298 |  |  |  |  |  |  | Name => 'CreateDate', | 
| 299 |  |  |  |  |  |  | Groups => { 2 => 'Time' }, | 
| 300 |  |  |  |  |  |  | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', | 
| 301 |  |  |  |  |  |  | PrintConv => '$self->ConvertDateTime($val)', | 
| 302 |  |  |  |  |  |  | }, | 
| 303 |  |  |  |  |  |  | LastSaved   => { | 
| 304 |  |  |  |  |  |  | Name => 'ModifyDate', | 
| 305 |  |  |  |  |  |  | Groups => { 2 => 'Time' }, | 
| 306 |  |  |  |  |  |  | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', | 
| 307 |  |  |  |  |  |  | PrintConv => '$self->ConvertDateTime($val)', | 
| 308 |  |  |  |  |  |  | }, | 
| 309 |  |  |  |  |  |  | LastSaved   => { | 
| 310 |  |  |  |  |  |  | Name => 'ModifyDate', | 
| 311 |  |  |  |  |  |  | Groups => { 2 => 'Time' }, | 
| 312 |  |  |  |  |  |  | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', | 
| 313 |  |  |  |  |  |  | PrintConv => '$self->ConvertDateTime($val)', | 
| 314 |  |  |  |  |  |  | }, | 
| 315 |  |  |  |  |  |  | LastPrinted => { | 
| 316 |  |  |  |  |  |  | Name => 'LastPrinted', | 
| 317 |  |  |  |  |  |  | Groups => { 2 => 'Time' }, | 
| 318 |  |  |  |  |  |  | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', | 
| 319 |  |  |  |  |  |  | PrintConv => '$self->ConvertDateTime($val)', | 
| 320 |  |  |  |  |  |  | }, | 
| 321 |  |  |  |  |  |  | Pages       => { }, | 
| 322 |  |  |  |  |  |  | Words       => { }, | 
| 323 |  |  |  |  |  |  | Characters  => { }, | 
| 324 |  |  |  |  |  |  | Category    => { }, | 
| 325 |  |  |  |  |  |  | Manager     => { }, | 
| 326 |  |  |  |  |  |  | Company     => { }, | 
| 327 |  |  |  |  |  |  | Lines       => { }, | 
| 328 |  |  |  |  |  |  | Paragraphs  => { }, | 
| 329 |  |  |  |  |  |  | CharactersWithSpaces => { }, | 
| 330 |  |  |  |  |  |  | Version     => { Name => 'RevisionNumber' }, | 
| 331 |  |  |  |  |  |  | ); | 
| 332 |  |  |  |  |  |  |  | 
| 333 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 334 |  |  |  |  |  |  | # Set HTMLCharset member based on content type | 
| 335 |  |  |  |  |  |  | # Inputs: 0) content type string, 1) ExifTool ref | 
| 336 |  |  |  |  |  |  | # Returns: original string | 
| 337 |  |  |  |  |  |  | sub SetHTMLCharset($$) | 
| 338 |  |  |  |  |  |  | { | 
| 339 | 1 |  |  | 1 | 0 | 5 | my ($val, $et) = @_; | 
| 340 | 1 | 50 |  |  |  | 12 | $$et{HTMLCharset} = $htmlCharset{lc $1} if $val =~ /charset=['"]?([-\w]+)/; | 
| 341 | 1 |  |  |  |  | 3 | return $val; | 
| 342 |  |  |  |  |  |  | } | 
| 343 |  |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 345 |  |  |  |  |  |  | # Convert single UTF-8 character to HTML character reference | 
| 346 |  |  |  |  |  |  | # Inputs: 0) UTF-8 character sequence | 
| 347 |  |  |  |  |  |  | # Returns: HTML character reference (eg. "&quot;"); | 
| 348 |  |  |  |  |  |  | # Note: Must be called via EscapeHTML to load name lookup | 
| 349 |  |  |  |  |  |  | sub EscapeChar($) | 
| 350 |  |  |  |  |  |  | { | 
| 351 | 157 |  |  | 157 | 0 | 274 | my $ch = shift; | 
| 352 | 157 |  |  |  |  | 190 | my $val; | 
| 353 | 157 | 50 |  |  |  | 244 | if ($] < 5.006001) { | 
| 354 | 0 |  |  |  |  | 0 | ($val) = Image::ExifTool::UnpackUTF8($ch); | 
| 355 |  |  |  |  |  |  | } else { | 
| 356 |  |  |  |  |  |  | # the meaning of "U0" is reversed as of Perl 5.10.0! | 
| 357 | 157 | 50 |  |  |  | 386 | ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch); | 
| 358 |  |  |  |  |  |  | } | 
| 359 | 157 | 50 |  |  |  | 294 | return '?' unless defined $val; | 
| 360 | 157 | 100 |  |  |  | 467 | return "&$entityName{$val};" if $entityName{$val}; | 
| 361 | 112 |  |  |  |  | 433 | return sprintf('&#x%x;',$val); | 
| 362 |  |  |  |  |  |  | } | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 365 |  |  |  |  |  |  | # Escape any special characters for HTML | 
| 366 |  |  |  |  |  |  | # Inputs: 0) string to be escaped, 1) optional string encoding (default 'UTF8') | 
| 367 |  |  |  |  |  |  | # Returns: escaped string | 
| 368 |  |  |  |  |  |  | sub EscapeHTML($;$) | 
| 369 |  |  |  |  |  |  | { | 
| 370 | 167 |  |  | 167 | 0 | 312 | my ($str, $enc) = @_; | 
| 371 |  |  |  |  |  |  | # escape XML characters | 
| 372 | 167 |  |  |  |  | 394 | $str = EscapeXML($str); | 
| 373 |  |  |  |  |  |  | # escape other special characters if they exist | 
| 374 | 167 | 100 |  |  |  | 433 | if ($str =~ /[\x80-\xff]/) { | 
| 375 |  |  |  |  |  |  | # generate entity name lookup if necessary | 
| 376 | 16 | 100 |  |  |  | 53 | unless (%entityName) { | 
| 377 | 2 |  |  |  |  | 8 | local $_; | 
| 378 | 2 |  |  |  |  | 106 | foreach (keys %entityNum) { | 
| 379 | 506 |  |  |  |  | 1291 | $entityName{$entityNum{$_}} = $_; | 
| 380 |  |  |  |  |  |  | } | 
| 381 | 2 |  |  |  |  | 25 | delete $entityName{39};  # 'apos' is not valid HTML | 
| 382 |  |  |  |  |  |  | } | 
| 383 |  |  |  |  |  |  | # suppress warnings | 
| 384 | 16 |  |  | 0 |  | 114 | local $SIG{'__WARN__'} = sub { 1 }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 385 | 16 | 50 | 33 |  |  | 58 | if ($enc and $enc ne 'UTF8') { | 
| 386 | 0 |  |  |  |  | 0 | $str = Image::ExifTool::Decode(undef, $str, $enc, undef, 'UTF8'); | 
| 387 | 0 |  |  |  |  | 0 | $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge; | 
|  | 0 |  |  |  |  | 0 |  | 
| 388 | 0 |  |  |  |  | 0 | $str = Image::ExifTool::Decode(undef, $str, 'UTF8', undef, $enc); | 
| 389 |  |  |  |  |  |  | } else { | 
| 390 |  |  |  |  |  |  | # escape any non-ascii characters for HTML | 
| 391 | 16 |  |  |  |  | 76 | $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge; | 
|  | 157 |  |  |  |  | 347 |  | 
| 392 |  |  |  |  |  |  | } | 
| 393 |  |  |  |  |  |  | } | 
| 394 | 167 |  |  |  |  | 421 | return $str; | 
| 395 |  |  |  |  |  |  | } | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 398 |  |  |  |  |  |  | # Unescape all HTML character references | 
| 399 |  |  |  |  |  |  | # Inputs: 0) string to be unescaped, 1) optional string encoding (default 'UTF8') | 
| 400 |  |  |  |  |  |  | # Returns: unescaped string | 
| 401 |  |  |  |  |  |  | sub UnescapeHTML($;$) | 
| 402 |  |  |  |  |  |  | { | 
| 403 | 154 |  |  | 154 | 0 | 346 | my ($str, $enc) = @_; | 
| 404 | 154 |  |  |  |  | 490 | return UnescapeXML($str, \%entityNum, $enc); | 
| 405 |  |  |  |  |  |  | } | 
| 406 |  |  |  |  |  |  |  | 
| 407 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 408 |  |  |  |  |  |  | # Extract information from a HTML file | 
| 409 |  |  |  |  |  |  | # Inputs: 0) ExifTool object reference, 1) DirInfo reference | 
| 410 |  |  |  |  |  |  | # Returns: 1 on success, 0 if this wasn't a valid HTML file | 
| 411 |  |  |  |  |  |  | sub ProcessHTML($$) | 
| 412 |  |  |  |  |  |  | { | 
| 413 | 1 |  |  | 1 | 0 | 4 | my ($et, $dirInfo) = @_; | 
| 414 | 1 |  |  |  |  | 4 | my $raf = $$dirInfo{RAF}; | 
| 415 | 1 |  |  |  |  | 4 | my $buff; | 
| 416 |  |  |  |  |  |  |  | 
| 417 |  |  |  |  |  |  | # validate HTML or XHTML file | 
| 418 | 1 | 50 |  |  |  | 6 | $raf->Read($buff, 256) or return 0; | 
| 419 | 1 | 50 |  |  |  | 15 | $buff =~ /^(\xef\xbb\xbf)?\s*<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0; | 
| 420 | 1 | 50 | 50 |  |  | 14 | $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $2 eq '?xml'; | 
| 421 | 1 |  |  |  |  | 9 | $et->SetFileType(); | 
| 422 |  |  |  |  |  |  |  | 
| 423 | 1 | 50 |  |  |  | 11 | $raf->Seek(0,0) or $et->Warn('Seek error'), return 1; | 
| 424 |  |  |  |  |  |  |  | 
| 425 | 1 |  |  |  |  | 14 | local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf); | 
| 426 | 1 | 50 |  |  |  | 11 | $/ or $et->Warn('Invalid HTML data'), return 1; | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | # extract header information | 
| 429 | 1 |  |  |  |  | 6 | my $doc; | 
| 430 | 1 |  |  |  |  | 7 | while ($raf->ReadLine($buff)) { | 
| 431 | 76 | 100 |  |  |  | 130 | if (not defined $doc) { | 
| 432 |  |  |  |  |  |  | # look for 'head' element | 
| 433 | 5 | 100 |  |  |  | 25 | next unless $buff =~ /<head\b/ig; | 
| 434 | 1 |  |  |  |  | 7 | $doc = substr($buff, pos($buff)); | 
| 435 | 1 |  |  |  |  | 3 | next; | 
| 436 |  |  |  |  |  |  | } | 
| 437 | 71 |  |  |  |  | 120 | $doc .= $buff; | 
| 438 | 71 | 100 |  |  |  | 196 | last if $buff =~ m{</head>}i; | 
| 439 |  |  |  |  |  |  | } | 
| 440 | 1 | 50 |  |  |  | 14 | return 1 unless defined $doc; | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  | # process all elements in header | 
| 443 | 1 |  |  |  |  | 30 | my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main'); | 
| 444 | 1 |  |  |  |  | 4 | for (;;) { | 
| 445 | 36 | 100 |  |  |  | 211 | last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg; | 
| 446 | 35 |  |  |  |  | 137 | my ($tagName, $attrs) = ($1, $2); | 
| 447 | 35 |  |  |  |  | 67 | my $tag = lc($tagName); | 
| 448 | 35 |  |  |  |  | 66 | my ($val, $grp); | 
| 449 | 35 | 100 |  |  |  | 113 | if ($attrs =~ m{/$}) {  # self-contained XHTML tags end in '/>' | 
| 450 | 33 |  |  |  |  | 50 | $val = ''; | 
| 451 |  |  |  |  |  |  | } else { | 
| 452 |  |  |  |  |  |  | # look for element close | 
| 453 | 2 |  |  |  |  | 6 | my $pos = pos($doc); | 
| 454 | 2 |  |  |  |  | 12 | my $close = "</$tagName>"; | 
| 455 |  |  |  |  |  |  | # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug: | 
| 456 |  |  |  |  |  |  | # if ($doc =~ m{(.*?)</$tagName>}sg) { | 
| 457 |  |  |  |  |  |  | #     $val = $1; | 
| 458 | 2 | 50 |  |  |  | 45 | if ($doc =~ m{$close}sg) { | 
| 459 | 2 |  |  |  |  | 29 | $val = substr($doc, $pos, pos($doc)-$pos-length($close)); | 
| 460 |  |  |  |  |  |  | } else { | 
| 461 | 0 |  |  |  |  | 0 | pos($doc) = $pos; | 
| 462 | 0 | 0 |  |  |  | 0 | next unless $tag eq 'meta'; # META tags don't need to be closed | 
| 463 | 0 |  |  |  |  | 0 | $val = ''; | 
| 464 |  |  |  |  |  |  | } | 
| 465 |  |  |  |  |  |  | } | 
| 466 | 35 |  |  |  |  | 58 | my $table = $tagTablePtr; | 
| 467 | 35 | 100 |  |  |  | 75 | if ($tag eq 'meta') { | 
|  |  | 100 |  |  |  |  |  | 
| 468 |  |  |  |  |  |  | # parse HTML META element | 
| 469 | 33 |  |  |  |  | 212 | undef $tag; | 
| 470 |  |  |  |  |  |  | # tag name is in NAME or HTTP-EQUIV attribute | 
| 471 | 33 | 100 |  |  |  | 137 | if ($attrs =~ /\bname\s*=\s*['"]?([\w:.-]+)/si) { | 
|  |  | 50 |  |  |  |  |  | 
| 472 | 32 |  |  |  |  | 65 | $tagName = $1; | 
| 473 |  |  |  |  |  |  | } elsif ($attrs =~ /\bhttp-equiv\s*=\s*['"]?([\w:.-]+)/si) { | 
| 474 | 1 |  |  |  |  | 4 | $tagName = "HTTP-equiv.$1"; | 
| 475 |  |  |  |  |  |  | } else { | 
| 476 | 0 |  |  |  |  | 0 | next;   # no name | 
| 477 |  |  |  |  |  |  | } | 
| 478 | 33 | 50 |  |  |  | 78 | $tag = lc($tagName) or next; | 
| 479 |  |  |  |  |  |  | # tag value is in CONTENT attribute | 
| 480 | 33 | 50 | 33 |  |  | 163 | if ($attrs =~ /\bcontent\s*=\s*(['"])(.*?)\1/si or | 
| 481 |  |  |  |  |  |  | $attrs =~ /\bcontent\s*=\s*(['"]?)([\w:.-]+)/si) | 
| 482 |  |  |  |  |  |  | { | 
| 483 | 33 |  |  |  |  | 78 | $val = $2; | 
| 484 |  |  |  |  |  |  | } else { | 
| 485 | 0 | 0 |  |  |  | 0 | next unless length $val; | 
| 486 |  |  |  |  |  |  | } | 
| 487 |  |  |  |  |  |  | # isolate group name (separator is '.' in HTML, but ':' in ref 2) | 
| 488 | 33 | 50 |  |  |  | 105 | if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) { | 
| 489 | 33 |  |  |  |  | 77 | ($grp, $tag) = ($1, $2); | 
| 490 | 33 |  |  |  |  | 82 | my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp); | 
| 491 | 33 | 50 | 33 |  |  | 123 | if ($tagInfo and $$tagInfo{SubDirectory}) { | 
| 492 | 33 |  |  |  |  | 89 | $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable}); | 
| 493 |  |  |  |  |  |  | } else { | 
| 494 | 0 |  |  |  |  | 0 | $tag = "$grp.$tag"; | 
| 495 |  |  |  |  |  |  | } | 
| 496 |  |  |  |  |  |  | } | 
| 497 |  |  |  |  |  |  | } elsif ($tag eq 'xml') { | 
| 498 | 1 |  |  |  |  | 10 | $et->VPrint(0, "Parsing XML\n"); | 
| 499 |  |  |  |  |  |  | # parse XML tags (quick-and-dirty) | 
| 500 | 1 |  |  |  |  | 10 | my $xml = $val; | 
| 501 | 1 |  |  |  |  | 19 | while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) { | 
| 502 | 25 |  |  |  |  | 93 | ($grp, $tag, $val) = ($1, $2, $4); | 
| 503 | 25 |  |  |  |  | 58 | my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp); | 
| 504 | 25 | 50 | 33 |  |  | 88 | next unless $tagInfo and $$tagInfo{SubDirectory}; | 
| 505 | 25 |  |  |  |  | 62 | $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable}); | 
| 506 | 25 | 100 |  |  |  | 57 | unless ($$table{$tag}) { | 
| 507 | 5 |  |  |  |  | 14 | my $name = ucfirst $tag; | 
| 508 | 5 |  |  |  |  | 15 | $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes | 
|  | 1 |  |  |  |  | 9 |  | 
| 509 | 5 |  |  |  |  | 13 | $name =~ s/\s(.)/\U$1/g;     # capitalize all words in tag name | 
| 510 | 5 |  |  |  |  | 8 | $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters (also hex code wide chars) | 
| 511 | 5 |  |  |  |  | 22 | AddTagToTable($table, $tag, { Name => $name }); | 
| 512 | 5 |  |  |  |  | 23 | $et->VPrint(0, "  [adding $tag '${name}']\n"); | 
| 513 |  |  |  |  |  |  | } | 
| 514 | 25 | 50 |  |  |  | 84 | $val = $et->Decode($val, $$et{HTMLCharset}) if $$et{HTMLCharset}; | 
| 515 | 25 |  |  |  |  | 82 | $et->HandleTag($table, $tag, UnescapeXML($val)); | 
| 516 |  |  |  |  |  |  | } | 
| 517 | 1 |  |  |  |  | 4 | next; | 
| 518 |  |  |  |  |  |  | } else { | 
| 519 |  |  |  |  |  |  | # the only other element we process is TITLE | 
| 520 | 1 | 50 |  |  |  | 12 | next unless $tag eq 'title'; | 
| 521 |  |  |  |  |  |  | } | 
| 522 | 34 | 50 |  |  |  | 83 | unless ($$table{$tag}) { | 
| 523 | 0 |  |  |  |  | 0 | my $name = $tagName; | 
| 524 | 0 |  |  |  |  | 0 | $name =~ s/\W+(\w)/\u$1/sg; | 
| 525 | 0 |  |  |  |  | 0 | my $info = { Name => $name, Groups => { 0 => 'HTML' } }; | 
| 526 | 0 | 0 |  |  |  | 0 | $info->{Groups}->{1} = ($grp eq 'http-equiv' ? 'HTTP-equiv' : "HTML-$grp") if $grp; | 
|  |  | 0 |  |  |  |  |  | 
| 527 | 0 |  |  |  |  | 0 | AddTagToTable($table, $tag, $info); | 
| 528 | 0 |  |  |  |  | 0 | $et->VPrint(0, "  [adding $tag '${tagName}']\n"); | 
| 529 |  |  |  |  |  |  | } | 
| 530 |  |  |  |  |  |  | # recode if necessary | 
| 531 | 34 | 100 |  |  |  | 131 | $val = $et->Decode($val, $$et{HTMLCharset}) if $$et{HTMLCharset}; | 
| 532 | 34 |  |  |  |  | 150 | $val =~ s{\s*$/\s*}{ }sg;   # replace linefeeds and indenting spaces | 
| 533 | 34 |  |  |  |  | 79 | $val = UnescapeHTML($val);  # unescape HTML character references | 
| 534 | 34 |  |  |  |  | 99 | $et->HandleTag($table, $tag, $val); | 
| 535 |  |  |  |  |  |  | } | 
| 536 | 1 |  |  |  |  | 7 | return 1; | 
| 537 |  |  |  |  |  |  | } | 
| 538 |  |  |  |  |  |  |  | 
| 539 |  |  |  |  |  |  | 1;  # end | 
| 540 |  |  |  |  |  |  |  | 
| 541 |  |  |  |  |  |  | __END__ |