| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
2
|
|
|
|
|
|
|
# File: HTML.pm |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Description: Read HTML meta information |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Revisions: 01/30/2007 - P. Harvey Created |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# References: 1) http://www.w3.org/TR/html4/ |
|
9
|
|
|
|
|
|
|
# 2) http://www.daisy.org/publications/specifications/daisy_202.html |
|
10
|
|
|
|
|
|
|
# 3) http://vancouver-webpages.com/META/metatags.detail.html |
|
11
|
|
|
|
|
|
|
# 4) http://www.html-reference.com/META.htm |
|
12
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
package Image::ExifTool::HTML; |
|
15
|
|
|
|
|
|
|
|
|
16
|
4
|
|
|
4
|
|
3525
|
use strict; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
123
|
|
|
17
|
4
|
|
|
4
|
|
20
|
use vars qw($VERSION @ISA @EXPORT_OK); |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
190
|
|
|
18
|
4
|
|
|
4
|
|
19
|
use Image::ExifTool qw(:DataAccess :Utils); |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
717
|
|
|
19
|
4
|
|
|
4
|
|
830
|
use Image::ExifTool::PostScript; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
138
|
|
|
20
|
4
|
|
|
4
|
|
1686
|
use Image::ExifTool::XMP qw(EscapeXML UnescapeXML); |
|
|
4
|
|
|
|
|
118
|
|
|
|
4
|
|
|
|
|
9898
|
|
|
21
|
|
|
|
|
|
|
require Exporter; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
$VERSION = '1.16'; |
|
24
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
|
25
|
|
|
|
|
|
|
@EXPORT_OK = qw(EscapeHTML UnescapeHTML); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub SetHTMLCharset($$); |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# convert HTML charset (lower case) to ExifTool Charset name
# (keys must be lower case because SetHTMLCharset() lower-cases the charset
#  name it parses from the Content-Type value before looking it up here;
#  a charset missing from this table looks up as undef)
my %htmlCharset = (
    macintosh => 'MacRoman',
    'iso-8859-1' => 'Latin',
    'utf-8' => 'UTF8',
    'windows-1252' => 'Latin',
);
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# HTML 4 character entity references
# (entity name => decimal character number; entity names are case sensitive,
#  eg. 'dagger' and 'Dagger' are different characters.  This table is passed
#  to UnescapeXML() by UnescapeHTML(), and is inverted by EscapeHTML() to
#  build the %entityName lookup)
my %entityNum = (
    'quot' => 34, 'eth' => 240, 'lsquo' => 8216,
    'amp' => 38, 'ntilde' => 241, 'rsquo' => 8217,
    'apos' => 39, 'ograve' => 242, 'sbquo' => 8218,
    'lt' => 60, 'oacute' => 243, 'ldquo' => 8220,
    'gt' => 62, 'ocirc' => 244, 'rdquo' => 8221,
    'nbsp' => 160, 'otilde' => 245, 'bdquo' => 8222,
    'iexcl' => 161, 'ouml' => 246, 'dagger' => 8224,
    'cent' => 162, 'divide' => 247, 'Dagger' => 8225,
    'pound' => 163, 'oslash' => 248, 'bull' => 8226,
    'curren' => 164, 'ugrave' => 249, 'hellip' => 8230,
    'yen' => 165, 'uacute' => 250, 'permil' => 8240,
    'brvbar' => 166, 'ucirc' => 251, 'prime' => 8242,
    'sect' => 167, 'uuml' => 252, 'Prime' => 8243,
    'uml' => 168, 'yacute' => 253, 'lsaquo' => 8249,
    'copy' => 169, 'thorn' => 254, 'rsaquo' => 8250,
    'ordf' => 170, 'yuml' => 255, 'oline' => 8254,
    'laquo' => 171, 'OElig' => 338, 'frasl' => 8260,
    'not' => 172, 'oelig' => 339, 'euro' => 8364,
    'shy' => 173, 'Scaron' => 352, 'image' => 8465,
    'reg' => 174, 'scaron' => 353, 'weierp' => 8472,
    'macr' => 175, 'Yuml' => 376, 'real' => 8476,
    'deg' => 176, 'fnof' => 402, 'trade' => 8482,
    'plusmn' => 177, 'circ' => 710, 'alefsym'=> 8501,
    'sup2' => 178, 'tilde' => 732, 'larr' => 8592,
    'sup3' => 179, 'Alpha' => 913, 'uarr' => 8593,
    'acute' => 180, 'Beta' => 914, 'rarr' => 8594,
    'micro' => 181, 'Gamma' => 915, 'darr' => 8595,
    'para' => 182, 'Delta' => 916, 'harr' => 8596,
    'middot' => 183, 'Epsilon'=> 917, 'crarr' => 8629,
    'cedil' => 184, 'Zeta' => 918, 'lArr' => 8656,
    'sup1' => 185, 'Eta' => 919, 'uArr' => 8657,
    'ordm' => 186, 'Theta' => 920, 'rArr' => 8658,
    'raquo' => 187, 'Iota' => 921, 'dArr' => 8659,
    'frac14' => 188, 'Kappa' => 922, 'hArr' => 8660,
    'frac12' => 189, 'Lambda' => 923, 'forall' => 8704,
    'frac34' => 190, 'Mu' => 924, 'part' => 8706,
    'iquest' => 191, 'Nu' => 925, 'exist' => 8707,
    'Agrave' => 192, 'Xi' => 926, 'empty' => 8709,
    'Aacute' => 193, 'Omicron'=> 927, 'nabla' => 8711,
    'Acirc' => 194, 'Pi' => 928, 'isin' => 8712,
    'Atilde' => 195, 'Rho' => 929, 'notin' => 8713,
    'Auml' => 196, 'Sigma' => 931, 'ni' => 8715,
    'Aring' => 197, 'Tau' => 932, 'prod' => 8719,
    'AElig' => 198, 'Upsilon'=> 933, 'sum' => 8721,
    'Ccedil' => 199, 'Phi' => 934, 'minus' => 8722,
    'Egrave' => 200, 'Chi' => 935, 'lowast' => 8727,
    'Eacute' => 201, 'Psi' => 936, 'radic' => 8730,
    'Ecirc' => 202, 'Omega' => 937, 'prop' => 8733,
    'Euml' => 203, 'alpha' => 945, 'infin' => 8734,
    'Igrave' => 204, 'beta' => 946, 'ang' => 8736,
    'Iacute' => 205, 'gamma' => 947, 'and' => 8743,
    'Icirc' => 206, 'delta' => 948, 'or' => 8744,
    'Iuml' => 207, 'epsilon'=> 949, 'cap' => 8745,
    'ETH' => 208, 'zeta' => 950, 'cup' => 8746,
    'Ntilde' => 209, 'eta' => 951, 'int' => 8747,
    'Ograve' => 210, 'theta' => 952, 'there4' => 8756,
    'Oacute' => 211, 'iota' => 953, 'sim' => 8764,
    'Ocirc' => 212, 'kappa' => 954, 'cong' => 8773,
    'Otilde' => 213, 'lambda' => 955, 'asymp' => 8776,
    'Ouml' => 214, 'mu' => 956, 'ne' => 8800,
    'times' => 215, 'nu' => 957, 'equiv' => 8801,
    'Oslash' => 216, 'xi' => 958, 'le' => 8804,
    'Ugrave' => 217, 'omicron'=> 959, 'ge' => 8805,
    'Uacute' => 218, 'pi' => 960, 'sub' => 8834,
    'Ucirc' => 219, 'rho' => 961, 'sup' => 8835,
    'Uuml' => 220, 'sigmaf' => 962, 'nsub' => 8836,
    'Yacute' => 221, 'sigma' => 963, 'sube' => 8838,
    'THORN' => 222, 'tau' => 964, 'supe' => 8839,
    'szlig' => 223, 'upsilon'=> 965, 'oplus' => 8853,
    'agrave' => 224, 'phi' => 966, 'otimes' => 8855,
    'aacute' => 225, 'chi' => 967, 'perp' => 8869,
    'acirc' => 226, 'psi' => 968, 'sdot' => 8901,
    'atilde' => 227, 'omega' => 969, 'lceil' => 8968,
    'auml' => 228, 'thetasym'=>977, 'rceil' => 8969,
    'aring' => 229, 'upsih' => 978, 'lfloor' => 8970,
    'aelig' => 230, 'piv' => 982, 'rfloor' => 8971,
    'ccedil' => 231, 'ensp' => 8194, 'lang' => 9001,
    'egrave' => 232, 'emsp' => 8195, 'rang' => 9002,
    'eacute' => 233, 'thinsp' => 8201, 'loz' => 9674,
    'ecirc' => 234, 'zwnj' => 8204, 'spades' => 9824,
    'euml' => 235, 'zwj' => 8205, 'clubs' => 9827,
    'igrave' => 236, 'lrm' => 8206, 'hearts' => 9829,
    'iacute' => 237, 'rlm' => 8207, 'diams' => 9830,
    'icirc' => 238, 'ndash' => 8211,
    'iuml' => 239, 'mdash' => 8212,
);
# (filled in by EscapeHTML the first time a string with high-bit bytes is
#  escaped; 'apos' is removed from it there since it is not valid HTML)
my %entityName; # look up entity names by number (built as necessary)
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# HTML info
# (tag ID's are case insensitive and must be all lower case in tables)
# - entries with a SubDirectory are group prefixes: ProcessHTML splits a META
#   name like "dc.title" (or "dc:title") into group and tag, and uses the
#   group's SubDirectory TagTable to look up the tag
# - tags missing from a table are added on the fly by ProcessHTML
%Image::ExifTool::HTML::Main = (
    GROUPS => { 2 => 'Document' },
    NOTES => q{
        Meta information extracted from the header of HTML and XHTML files.  This is
        a mix of information found in the C<META> elements, C<XML> element, and the
        C<TITLE> element.
    },
    dc => {
        Name => 'DC',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' },
    },
    ncc => {
        Name => 'NCC',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' },
    },
    prod => {
        Name => 'Prod',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' },
    },
    vw96 => {
        Name => 'VW96',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' },
    },
    'http-equiv' => {
        Name => 'HTTP-equiv',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' },
    },
    o => {
        Name => 'Office',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' },
    },
    abstract        => { },
    author          => { },
    classification  => { },
    'content-language'=>{ Name => 'ContentLanguage' },
    copyright       => { },
    description     => { },
    distribution    => { },
    'doc-class'     => { Name => 'DocClass' },
    'doc-rights'    => { Name => 'DocRights' },
    'doc-type'      => { Name => 'DocType' },
    formatter       => { },
    generator       => { },
    generatorversion=> { Name => 'GeneratorVersion' },
    googlebot       => { Name => 'GoogleBot' },
    keywords        => { List => 1 },
    mssmarttagspreventparsing => { Name => 'NoMSSmartTags' },
    originator      => { },
    owner           => { },
    progid          => { Name => 'ProgID' },
    rating          => { },
    refresh         => { },
    'resource-type' => { Name => 'ResourceType' },
    'revisit-after' => { Name => 'RevisitAfter' },
    robots          => { List => 1 },
    title           => { Notes => "the only extracted tag which isn't from an HTML META element" },
);
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# ref 2
# (tags for META names with a "dc" group prefix; this table is selected
#  through the 'dc' SubDirectory entry of the Main table)
%Image::ExifTool::HTML::dc = (
    GROUPS => { 1 => 'HTML-dc', 2 => 'Document' },
    NOTES => 'Dublin Core schema tags (also used in XMP).',
    contributor => { Groups => { 2 => 'Author' }, List => 'Bag' },
    coverage => { },
    creator => { Groups => { 2 => 'Author' }, List => 'Seq' },
    date => {
        Groups => { 2 => 'Time' },
        List => 'Seq',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    description => { },
    'format' => { },
    identifier => { },
    language => { List => 'Bag' },
    publisher => { Groups => { 2 => 'Author' }, List => 'Bag' },
    relation => { List => 'Bag' },
    rights => { Groups => { 2 => 'Author' } },
    source => { Groups => { 2 => 'Author' } },
    subject => { List => 'Bag' },
    title => { },
    type => { List => 'Bag' },
);
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# ref 2
# (tags for META names with an "ncc" group prefix; this table is selected
#  through the 'ncc' SubDirectory entry of the Main table)
%Image::ExifTool::HTML::ncc = (
    GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' },
    charset => { Name => 'CharacterSet' }, # name changed to avoid conflict with -charset option
    depth => { },
    files => { },
    footnotes => { },
    generator => { },
    kbytesize => { Name => 'KByteSize' },
    maxpagenormal => { Name => 'MaxPageNormal' },
    multimediatype => { Name => 'MultimediaType' },
    narrator => { },
    pagefront => { Name => 'PageFront' },
    pagenormal => { Name => 'PageNormal' },
    pagespecial => { Name => 'PageSpecial' },
    prodnotes => { Name => 'ProdNotes' },
    producer => { },
    produceddate => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # YYYY-mm-dd
    revision => { },
    revisiondate => { Name => 'RevisionDate', Groups => { 2 => 'Time' } },
    setinfo => { Name => 'SetInfo' },
    sidebars => { },
    sourcedate => { Name => 'SourceDate', Groups => { 2 => 'Time' } },
    sourceedition => { Name => 'SourceEdition' },
    sourcepublisher => { Name => 'SourcePublisher' },
    sourcerights => { Name => 'SourceRights' },
    sourcetitle => { Name => 'SourceTitle' },
    tocitems => { Name => 'TOCItems' },
    totaltime => { Name => 'Duration' }, # HH:MM:SS
);
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# ref 3
# (tags for META names with a "vw96" group prefix; this table is selected
#  through the 'vw96' SubDirectory entry of the Main table)
%Image::ExifTool::HTML::vw96 = (
    GROUPS => { 1 => 'HTML-vw96', 2 => 'Document' },
    objecttype => { Name => 'ObjectType' },
);
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# ref 2
# (tags for META names with a "prod" group prefix; this table is selected
#  through the 'prod' SubDirectory entry of the Main table)
%Image::ExifTool::HTML::prod = (
    GROUPS => { 1 => 'HTML-prod', 2 => 'Document' },
    reclocation => { Name => 'RecLocation' },
    recengineer => { Name => 'RecEngineer' },
);
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# ref 3/4
# (tags taken from the HTTP-EQUIV attribute of META elements: ProcessHTML
#  prefixes these names with "HTTP-equiv." so that they are looked up here
#  through the 'http-equiv' SubDirectory entry of the Main table)
%Image::ExifTool::HTML::equiv = (
    GROUPS => { 1 => 'HTTP-equiv', 2 => 'Document' },
    NOTES => 'These tags have a family 1 group name of "HTTP-equiv".',
    'cache-control' => { Name => 'CacheControl' },
    'content-disposition' => { Name => 'ContentDisposition' },
    'content-language' => { Name => 'ContentLanguage' },
    'content-script-type' => { Name => 'ContentScriptType' },
    'content-style-type' => { Name => 'ContentStyleType' },
    # note: setting the HTMLCharset like this will miss any tags which come earlier
    # (SetHTMLCharset returns the value unchanged; it is used here only for
    #  its side effect of storing the charset in the ExifTool object)
    'content-type' => { Name => 'ContentType', RawConv => \&SetHTMLCharset },
    'default-style' => { Name => 'DefaultStyle' },
    expires => { },
    'ext-cache' => { Name => 'ExtCache' },
    imagetoolbar => { Name => 'ImageToolbar' },
    lotus => { },
    'page-enter' => { Name => 'PageEnter' },
    'page-exit' => { Name => 'PageExit' },
    'pics-label' => { Name => 'PicsLabel' },
    pragma => { },
    refresh => { },
    'reply-to' => { Name => 'ReplyTo' },
    'set-cookie' => { Name => 'SetCookie' },
    'site-enter' => { Name => 'SiteEnter' },
    'site-exit' => { Name => 'SiteExit' },
    vary => { },
    'window-target' => { Name => 'WindowTarget' },
);
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# MS Office namespace (ref PH)
# (tags from the "o:" namespace of the XML element in the HTML header; this
#  table is selected through the 'o' SubDirectory entry of the Main table.
#  Unlike the META tables, the tag ID's here are not lower case because
#  ProcessHTML uses the XML element names as they appear in the file)
%Image::ExifTool::HTML::Office = (
    GROUPS => { 1 => 'HTML-office', 2 => 'Document' },
    NOTES => 'Tags written by Microsoft Office applications.',
    Subject     => { },
    Author      => { Groups => { 2 => 'Author' } },
    Keywords    => { },
    Description => { },
    Template    => { },
    LastAuthor  => { Groups => { 2 => 'Author' } },
    Revision    => { Name => 'RevisionNumber' },
    TotalTime   => { Name => 'TotalEditTime', PrintConv => 'ConvertTimeSpan($val, 60)' },
    Created     => {
        Name => 'CreateDate',
        Groups => { 2 => 'Time' },
        ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    # (this entry used to be listed twice with identical content; a repeated
    #  hash key silently replaces the earlier one, so only one is kept)
    LastSaved   => {
        Name => 'ModifyDate',
        Groups => { 2 => 'Time' },
        ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    LastPrinted => {
        Name => 'LastPrinted',
        Groups => { 2 => 'Time' },
        ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    Pages       => { },
    Words       => { },
    Characters  => { },
    Category    => { },
    Manager     => { },
    Company     => { },
    Lines       => { },
    Paragraphs  => { },
    CharactersWithSpaces => { },
    # NOTE(review): shares the name 'RevisionNumber' with Revision above --
    # confirm that this is intended
    Version     => { Name => 'RevisionNumber' },
);
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Set HTMLCharset member based on content type
# Inputs: 0) content type string, 1) ExifTool ref
# Returns: original string
# Notes: - the "charset" parameter name is matched without regard to case
#          (eg. "Charset=" and "CHARSET=" are also recognized)
#        - the charset name itself is converted to lower case for the lookup,
#          and a name missing from %htmlCharset stores undef in HTMLCharset
#        - HTMLCharset is left untouched if there is no charset parameter
sub SetHTMLCharset($$)
{
    my ($val, $et) = @_;
    if ($val =~ /charset=['"]?([-\w]+)/i) {
        $$et{HTMLCharset} = $htmlCharset{lc $1};
    }
    return $val;
}
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Convert single UTF-8 character to HTML character reference
# Inputs: 0) UTF-8 character sequence
# Returns: HTML character reference (eg. "&quot;"), or '?' if the sequence
#          could not be unpacked
# Notes: - Must be called via EscapeHTML to load name lookup
#        - a named reference is returned if the character is in the name
#          lookup, otherwise a hexadecimal numeric reference (eg. "&#xe9;")
sub EscapeChar($)
{
    my $ch = shift;
    my $val;
    if ($] < 5.006001) {
        ($val) = Image::ExifTool::UnpackUTF8($ch);
    } else {
        # the meaning of "U0" is reversed as of Perl 5.10.0!
        ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch);
    }
    return '?' unless defined $val;
    return "&$entityName{$val};" if $entityName{$val};
    # no entity name for this character, so use a numeric reference
    return sprintf('&#x%x;',$val);
}
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Escape any special characters for HTML
# Inputs: 0) string to be escaped, 1) optional string encoding (default 'UTF8')
# Returns: escaped string
# Notes: XML special characters are escaped first, then each multi-byte UTF-8
#        sequence is replaced by a character reference via EscapeChar
sub EscapeHTML($;$)
{
    my ($str, $enc) = @_;
    # escape XML characters
    $str = EscapeXML($str);
    # all done unless there are other special characters
    return $str unless $str =~ /[\x80-\xff]/;

    # generate entity name lookup if necessary
    unless (%entityName) {
        # (keys and values are returned in the same order)
        @entityName{values %entityNum} = keys %entityNum;
        delete $entityName{39}; # 'apos' is not valid HTML
    }
    # suppress warnings
    local $SIG{'__WARN__'} = sub { 1 };
    # a string in another encoding is converted to UTF-8 for escaping,
    # then converted back again afterwards
    my $recode = ($enc and $enc ne 'UTF8');
    $str = Image::ExifTool::Decode(undef, $str, $enc, undef, 'UTF8') if $recode;
    # escape any non-ascii characters for HTML
    $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
    $str = Image::ExifTool::Decode(undef, $str, 'UTF8', undef, $enc) if $recode;
    return $str;
}
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Unescape all HTML character references
# Inputs: 0) string to be unescaped, 1) optional string encoding (default 'UTF8')
# Returns: unescaped string
# Notes: the work is done by UnescapeXML, which is given the HTML entity table
#        so that the HTML entity names are recognized
sub UnescapeHTML($;$)
{
    my $text = shift;
    my $encoding = shift;
    return UnescapeXML($text, \%entityNum, $encoding);
}
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Extract information from a HTML file
# Inputs: 0) ExifTool object reference, 1) DirInfo reference
# Returns: 1 on success, 0 if this wasn't a valid HTML file
# Notes: - only the document header (from the opening HEAD tag up to the line
#          containing the closing HEAD tag) is read and parsed
#        - tags are extracted from META elements, from namespace-prefixed
#          elements inside an XML element, and from the TITLE element
#        - tags missing from the tag tables are added as they are found
sub ProcessHTML($$)
{
    my ($et, $dirInfo) = @_;
    my $raf = $$dirInfo{RAF};
    my $buff;

    # validate HTML or XHTML file
    $raf->Read($buff, 256) or return 0;
    $buff =~ /^(\xef\xbb\xbf)?\s*<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0;
    # (a file starting with an XML declaration must also contain an HTML tag
    #  in the first 256 bytes; $2 is from the match on the previous line)
    $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $2 eq '?xml';
    $et->SetFileType();

    $raf->Seek(0,0) or $et->Warn('Seek error'), return 1;

    local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf);
    $/ or $et->Warn('Invalid HTML data'), return 1;

    # extract header information
    my $doc;
    while ($raf->ReadLine($buff)) {
        if (not defined $doc) {
            # look for 'head' element
            # (the /g flag is needed to set pos() for the substr below)
            next unless $buff =~ /<head\b/ig;
            $doc = substr($buff, pos($buff));
            next;
        }
        $doc .= $buff;
        last if $buff =~ m{</head>}i;
    }
    return 1 unless defined $doc;

    # process all elements in header
    my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main');
    for (;;) {
        last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg;
        my ($tagName, $attrs) = ($1, $2);
        my $tag = lc($tagName);
        my ($val, $grp);
        if ($attrs =~ m{/$}) {  # self-contained XHTML tags end in '/>'
            $val = '';
        } else {
            # look for element close
            my $pos = pos($doc);
            my $close = "</$tagName>";
            # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug:
            # if ($doc =~ m{(.*?)</$tagName>}sg) {
            #     $val = $1;
            # (the tag name is quoted because it may contain '.', which would
            #  otherwise match any character)
            if ($doc =~ m{\Q$close\E}sg) {
                $val = substr($doc, $pos, pos($doc)-$pos-length($close));
            } else {
                # restore the search position, which was reset by the failed match
                pos($doc) = $pos;
                next unless $tag eq 'meta'; # META tags don't need to be closed
                $val = '';
            }
        }
        my $table = $tagTablePtr;
        if ($tag eq 'meta') {
            # parse HTML META element
            undef $tag;
            # tag name is in NAME or HTTP-EQUIV attribute
            if ($attrs =~ /\bname\s*=\s*['"]?([\w:.-]+)/si) {
                $tagName = $1;
            } elsif ($attrs =~ /\bhttp-equiv\s*=\s*['"]?([\w:.-]+)/si) {
                $tagName = "HTTP-equiv.$1";
            } else {
                next;   # no name
            }
            $tag = lc($tagName) or next;
            # tag value is in CONTENT attribute
            if ($attrs =~ /\bcontent\s*=\s*(['"])(.*?)\1/si or
                $attrs =~ /\bcontent\s*=\s*(['"]?)([\w:.-]+)/si)
            {
                $val = $2;
            } else {
                next unless length $val;
            }
            # isolate group name (separator is '.' in HTML, but ':' in ref 2)
            if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) {
                ($grp, $tag) = ($1, $2);
                my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp);
                if ($tagInfo and $$tagInfo{SubDirectory}) {
                    $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
                } else {
                    # unknown group, so keep the full name as the tag ID
                    $tag = "$grp.$tag";
                }
            }
        } elsif ($tag eq 'xml') {
            $et->VPrint(0, "Parsing XML\n");
            # parse XML tags (quick-and-dirty)
            my $xml = $val;
            while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) {
                ($grp, $tag, $val) = ($1, $2, $4);
                my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp);
                # (elements in unknown namespaces are ignored)
                next unless $tagInfo and $$tagInfo{SubDirectory};
                $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
                unless ($$table{$tag}) {
                    my $name = ucfirst $tag;
                    $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes
                    $name =~ s/\s(.)/\U$1/g;     # capitalize all words in tag name
                    $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters (also hex code wide chars)
                    AddTagToTable($table, $tag, { Name => $name });
                    $et->VPrint(0, " [adding $tag '${name}']\n");
                }
                $val = $et->Decode($val, $$et{HTMLCharset}) if $$et{HTMLCharset};
                $et->HandleTag($table, $tag, UnescapeXML($val));
            }
            next;
        } else {
            # the only other element we process is TITLE
            next unless $tag eq 'title';
        }
        unless ($$table{$tag}) {
            # generate a tag name from the name in the file
            my $name = $tagName;
            $name =~ s/\W+(\w)/\u$1/sg;
            my $info = { Name => $name, Groups => { 0 => 'HTML' } };
            $info->{Groups}->{1} = ($grp eq 'http-equiv' ? 'HTTP-equiv' : "HTML-$grp") if $grp;
            AddTagToTable($table, $tag, $info);
            $et->VPrint(0, " [adding $tag '${tagName}']\n");
        }
        # recode if necessary
        $val = $et->Decode($val, $$et{HTMLCharset}) if $$et{HTMLCharset};
        $val =~ s{\s*$/\s*}{ }sg;   # replace linefeeds and indenting spaces
        $val = UnescapeHTML($val);  # unescape HTML character references
        $et->HandleTag($table, $tag, $val);
    }
    return 1;
}
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
1; # end |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
__END__ |