line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
2
|
|
|
|
|
|
|
# File: HTML.pm |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Description: Read HTML meta information |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Revisions: 01/30/2007 - P. Harvey Created |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# References: 1) http://www.w3.org/TR/html4/ |
9
|
|
|
|
|
|
|
# 2) http://www.daisy.org/publications/specifications/daisy_202.html |
10
|
|
|
|
|
|
|
# 3) http://vancouver-webpages.com/META/metatags.detail.html |
11
|
|
|
|
|
|
|
# 4) http://www.html-reference.com/META.htm |
12
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
package Image::ExifTool::HTML; |
15
|
|
|
|
|
|
|
|
16
|
4
|
|
|
4
|
|
3525
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
123
|
|
17
|
4
|
|
|
4
|
|
20
|
use vars qw($VERSION @ISA @EXPORT_OK); |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
190
|
|
18
|
4
|
|
|
4
|
|
19
|
use Image::ExifTool qw(:DataAccess :Utils); |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
717
|
|
19
|
4
|
|
|
4
|
|
830
|
use Image::ExifTool::PostScript; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
138
|
|
20
|
4
|
|
|
4
|
|
1686
|
use Image::ExifTool::XMP qw(EscapeXML UnescapeXML); |
|
4
|
|
|
|
|
118
|
|
|
4
|
|
|
|
|
9898
|
|
21
|
|
|
|
|
|
|
require Exporter; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
$VERSION = '1.16'; |
24
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
25
|
|
|
|
|
|
|
@EXPORT_OK = qw(EscapeHTML UnescapeHTML); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub SetHTMLCharset($$); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# convert HTML charset (lower case) to ExifTool Charset name
# (keys must be lower case because SetHTMLCharset looks them up with lc())
my %htmlCharset = (
    macintosh      => 'MacRoman',
    'iso-8859-1'   => 'Latin',
    'utf-8'        => 'UTF8',
    'windows-1252' => 'Latin',
);
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# HTML 4 character entity references
# (maps entity name to Unicode code point; used directly by UnescapeHTML and
#  inverted on demand by EscapeHTML to build the %entityName lookup)
my %entityNum = (
    'quot'   => 34,     'eth'     => 240,   'lsquo'  => 8216,
    'amp'    => 38,     'ntilde'  => 241,   'rsquo'  => 8217,
    'apos'   => 39,     'ograve'  => 242,   'sbquo'  => 8218,
    'lt'     => 60,     'oacute'  => 243,   'ldquo'  => 8220,
    'gt'     => 62,     'ocirc'   => 244,   'rdquo'  => 8221,
    'nbsp'   => 160,    'otilde'  => 245,   'bdquo'  => 8222,
    'iexcl'  => 161,    'ouml'    => 246,   'dagger' => 8224,
    'cent'   => 162,    'divide'  => 247,   'Dagger' => 8225,
    'pound'  => 163,    'oslash'  => 248,   'bull'   => 8226,
    'curren' => 164,    'ugrave'  => 249,   'hellip' => 8230,
    'yen'    => 165,    'uacute'  => 250,   'permil' => 8240,
    'brvbar' => 166,    'ucirc'   => 251,   'prime'  => 8242,
    'sect'   => 167,    'uuml'    => 252,   'Prime'  => 8243,
    'uml'    => 168,    'yacute'  => 253,   'lsaquo' => 8249,
    'copy'   => 169,    'thorn'   => 254,   'rsaquo' => 8250,
    'ordf'   => 170,    'yuml'    => 255,   'oline'  => 8254,
    'laquo'  => 171,    'OElig'   => 338,   'frasl'  => 8260,
    'not'    => 172,    'oelig'   => 339,   'euro'   => 8364,
    'shy'    => 173,    'Scaron'  => 352,   'image'  => 8465,
    'reg'    => 174,    'scaron'  => 353,   'weierp' => 8472,
    'macr'   => 175,    'Yuml'    => 376,   'real'   => 8476,
    'deg'    => 176,    'fnof'    => 402,   'trade'  => 8482,
    'plusmn' => 177,    'circ'    => 710,   'alefsym'=> 8501,
    'sup2'   => 178,    'tilde'   => 732,   'larr'   => 8592,
    'sup3'   => 179,    'Alpha'   => 913,   'uarr'   => 8593,
    'acute'  => 180,    'Beta'    => 914,   'rarr'   => 8594,
    'micro'  => 181,    'Gamma'   => 915,   'darr'   => 8595,
    'para'   => 182,    'Delta'   => 916,   'harr'   => 8596,
    'middot' => 183,    'Epsilon' => 917,   'crarr'  => 8629,
    'cedil'  => 184,    'Zeta'    => 918,   'lArr'   => 8656,
    'sup1'   => 185,    'Eta'     => 919,   'uArr'   => 8657,
    'ordm'   => 186,    'Theta'   => 920,   'rArr'   => 8658,
    'raquo'  => 187,    'Iota'    => 921,   'dArr'   => 8659,
    'frac14' => 188,    'Kappa'   => 922,   'hArr'   => 8660,
    'frac12' => 189,    'Lambda'  => 923,   'forall' => 8704,
    'frac34' => 190,    'Mu'      => 924,   'part'   => 8706,
    'iquest' => 191,    'Nu'      => 925,   'exist'  => 8707,
    'Agrave' => 192,    'Xi'      => 926,   'empty'  => 8709,
    'Aacute' => 193,    'Omicron' => 927,   'nabla'  => 8711,
    'Acirc'  => 194,    'Pi'      => 928,   'isin'   => 8712,
    'Atilde' => 195,    'Rho'     => 929,   'notin'  => 8713,
    'Auml'   => 196,    'Sigma'   => 931,   'ni'     => 8715,
    'Aring'  => 197,    'Tau'     => 932,   'prod'   => 8719,
    'AElig'  => 198,    'Upsilon' => 933,   'sum'    => 8721,
    'Ccedil' => 199,    'Phi'     => 934,   'minus'  => 8722,
    'Egrave' => 200,    'Chi'     => 935,   'lowast' => 8727,
    'Eacute' => 201,    'Psi'     => 936,   'radic'  => 8730,
    'Ecirc'  => 202,    'Omega'   => 937,   'prop'   => 8733,
    'Euml'   => 203,    'alpha'   => 945,   'infin'  => 8734,
    'Igrave' => 204,    'beta'    => 946,   'ang'    => 8736,
    'Iacute' => 205,    'gamma'   => 947,   'and'    => 8743,
    'Icirc'  => 206,    'delta'   => 948,   'or'     => 8744,
    'Iuml'   => 207,    'epsilon' => 949,   'cap'    => 8745,
    'ETH'    => 208,    'zeta'    => 950,   'cup'    => 8746,
    'Ntilde' => 209,    'eta'     => 951,   'int'    => 8747,
    'Ograve' => 210,    'theta'   => 952,   'there4' => 8756,
    'Oacute' => 211,    'iota'    => 953,   'sim'    => 8764,
    'Ocirc'  => 212,    'kappa'   => 954,   'cong'   => 8773,
    'Otilde' => 213,    'lambda'  => 955,   'asymp'  => 8776,
    'Ouml'   => 214,    'mu'      => 956,   'ne'     => 8800,
    'times'  => 215,    'nu'      => 957,   'equiv'  => 8801,
    'Oslash' => 216,    'xi'      => 958,   'le'     => 8804,
    'Ugrave' => 217,    'omicron' => 959,   'ge'     => 8805,
    'Uacute' => 218,    'pi'      => 960,   'sub'    => 8834,
    'Ucirc'  => 219,    'rho'     => 961,   'sup'    => 8835,
    'Uuml'   => 220,    'sigmaf'  => 962,   'nsub'   => 8836,
    'Yacute' => 221,    'sigma'   => 963,   'sube'   => 8838,
    'THORN'  => 222,    'tau'     => 964,   'supe'   => 8839,
    'szlig'  => 223,    'upsilon' => 965,   'oplus'  => 8853,
    'agrave' => 224,    'phi'     => 966,   'otimes' => 8855,
    'aacute' => 225,    'chi'     => 967,   'perp'   => 8869,
    'acirc'  => 226,    'psi'     => 968,   'sdot'   => 8901,
    'atilde' => 227,    'omega'   => 969,   'lceil'  => 8968,
    'auml'   => 228,    'thetasym'=> 977,   'rceil'  => 8969,
    'aring'  => 229,    'upsih'   => 978,   'lfloor' => 8970,
    'aelig'  => 230,    'piv'     => 982,   'rfloor' => 8971,
    'ccedil' => 231,    'ensp'    => 8194,  'lang'   => 9001,
    'egrave' => 232,    'emsp'    => 8195,  'rang'   => 9002,
    'eacute' => 233,    'thinsp'  => 8201,  'loz'    => 9674,
    'ecirc'  => 234,    'zwnj'    => 8204,  'spades' => 9824,
    'euml'   => 235,    'zwj'     => 8205,  'clubs'  => 9827,
    'igrave' => 236,    'lrm'     => 8206,  'hearts' => 9829,
    'iacute' => 237,    'rlm'     => 8207,  'diams'  => 9830,
    'icirc'  => 238,    'ndash'   => 8211,
    'iuml'   => 239,    'mdash'   => 8212,
);
125
|
|
|
|
|
|
|
my %entityName; # look up entity names by number (built as necessary) |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# HTML info
# (tag ID's are case insensitive and must be all lower case in tables)
# Entries with a SubDirectory are name prefixes ("dc.title", "ncc:depth", ...):
# ProcessHTML splits the META name at the first '.' or ':' and uses the prefix
# to select the table for the remainder of the name.
%Image::ExifTool::HTML::Main = (
    GROUPS => { 2 => 'Document' },
    NOTES => q{
        Meta information extracted from the header of HTML and XHTML files. This is
        a mix of information found in the C<META> elements, C<XML> element, and the
        C<TITLE> element.
    },
    dc => {
        Name => 'DC',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' },
    },
    ncc => {
        Name => 'NCC',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' },
    },
    prod => {
        Name => 'Prod',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' },
    },
    vw96 => {
        Name => 'VW96',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' },
    },
    'http-equiv' => {
        Name => 'HTTP-equiv',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' },
    },
    o => {
        Name => 'Office',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' },
    },
    abstract        => { },
    author          => { },
    classification  => { },
    'content-language'=>{ Name => 'ContentLanguage' },
    copyright       => { },
    description     => { },
    distribution    => { },
    'doc-class'     => { Name => 'DocClass' },
    'doc-rights'    => { Name => 'DocRights' },
    'doc-type'      => { Name => 'DocType' },
    formatter       => { },
    generator       => { },
    generatorversion=> { Name => 'GeneratorVersion' },
    googlebot       => { Name => 'GoogleBot' },
    keywords        => { List => 1 },
    mssmarttagspreventparsing => { Name => 'NoMSSmartTags' },
    originator      => { },
    owner           => { },
    progid          => { Name => 'ProgID' },
    rating          => { },
    refresh         => { },
    'resource-type' => { Name => 'ResourceType' },
    'revisit-after' => { Name => 'RevisitAfter' },
    robots          => { List => 1 },
    title           => { Notes => "the only extracted tag which isn't from an HTML META element" },
);
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# ref 2
# Tags selected by the "dc" name prefix (see the 'dc' entry of the Main table)
%Image::ExifTool::HTML::dc = (
    GROUPS => { 1 => 'HTML-dc', 2 => 'Document' },
    NOTES => 'Dublin Core schema tags (also used in XMP).',
    contributor => { Groups => { 2 => 'Author' }, List => 'Bag' },
    coverage    => { },
    creator     => { Groups => { 2 => 'Author' }, List => 'Seq' },
    date => {
        Groups => { 2 => 'Time' },
        List => 'Seq',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    description => { },
    'format'    => { },
    identifier  => { },
    language    => { List => 'Bag' },
    publisher   => { Groups => { 2 => 'Author' }, List => 'Bag' },
    relation    => { List => 'Bag' },
    rights      => { Groups => { 2 => 'Author' } },
    source      => { Groups => { 2 => 'Author' } },
    subject     => { List => 'Bag' },
    title       => { },
    type        => { List => 'Bag' },
);
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# ref 2
# Tags selected by the "ncc" name prefix (see the 'ncc' entry of the Main table)
%Image::ExifTool::HTML::ncc = (
    GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' },
    charset         => { Name => 'CharacterSet' }, # name changed to avoid conflict with -charset option
    depth           => { },
    files           => { },
    footnotes       => { },
    generator       => { },
    kbytesize       => { Name => 'KByteSize' },
    maxpagenormal   => { Name => 'MaxPageNormal' },
    multimediatype  => { Name => 'MultimediaType' },
    narrator        => { },
    pagefront       => { Name => 'PageFront' },
    pagenormal      => { Name => 'PageNormal' },
    pagespecial     => { Name => 'PageSpecial' },
    prodnotes       => { Name => 'ProdNotes' },
    producer        => { },
    produceddate    => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # YYYY-mm-dd
    revision        => { },
    revisiondate    => { Name => 'RevisionDate', Groups => { 2 => 'Time' } },
    setinfo         => { Name => 'SetInfo' },
    sidebars        => { },
    sourcedate      => { Name => 'SourceDate', Groups => { 2 => 'Time' } },
    sourceedition   => { Name => 'SourceEdition' },
    sourcepublisher => { Name => 'SourcePublisher' },
    sourcerights    => { Name => 'SourceRights' },
    sourcetitle     => { Name => 'SourceTitle' },
    tocitems        => { Name => 'TOCItems' },
    totaltime       => { Name => 'Duration' }, # HH:MM:SS
);
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# ref 3
# Tags selected by the "vw96" name prefix (see the 'vw96' entry of the Main table)
%Image::ExifTool::HTML::vw96 = (
    GROUPS => { 1 => 'HTML-vw96', 2 => 'Document' },
    objecttype  => { Name => 'ObjectType' },
);
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# ref 2
# Tags selected by the "prod" name prefix (see the 'prod' entry of the Main table)
%Image::ExifTool::HTML::prod = (
    GROUPS => { 1 => 'HTML-prod', 2 => 'Document' },
    reclocation => { Name => 'RecLocation' },
    recengineer => { Name => 'RecEngineer' },
);
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# ref 3/4
# Tags taken from META elements that carry an HTTP-EQUIV attribute
# (ProcessHTML prefixes these names with "HTTP-equiv." to select this table)
%Image::ExifTool::HTML::equiv = (
    GROUPS => { 1 => 'HTTP-equiv', 2 => 'Document' },
    NOTES => 'These tags have a family 1 group name of "HTTP-equiv".',
    'cache-control'       => { Name => 'CacheControl' },
    'content-disposition' => { Name => 'ContentDisposition' },
    'content-language'    => { Name => 'ContentLanguage' },
    'content-script-type' => { Name => 'ContentScriptType' },
    'content-style-type'  => { Name => 'ContentStyleType' },
    # note: setting the HTMLCharset like this will miss any tags which come earlier
    'content-type'        => { Name => 'ContentType', RawConv => \&SetHTMLCharset },
    'default-style'       => { Name => 'DefaultStyle' },
    expires               => { },
    'ext-cache'           => { Name => 'ExtCache' },
    imagetoolbar          => { Name => 'ImageToolbar' },
    lotus                 => { },
    'page-enter'          => { Name => 'PageEnter' },
    'page-exit'           => { Name => 'PageExit' },
    'pics-label'          => { Name => 'PicsLabel' },
    pragma                => { },
    refresh               => { },
    'reply-to'            => { Name => 'ReplyTo' },
    'set-cookie'          => { Name => 'SetCookie' },
    'site-enter'          => { Name => 'SiteEnter' },
    'site-exit'           => { Name => 'SiteExit' },
    vary                  => { },
    'window-target'       => { Name => 'WindowTarget' },
);
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# MS Office namespace (ref PH)
# Tags selected by the "o" prefix (see the 'o' entry of the Main table).
# Unlike the other tables the tag ID's here are mixed case: they are looked
# up using the element names as they appear inside the XML element.
# Note: each tag ID must appear only once (a repeated hash key silently
# replaces the earlier definition).
%Image::ExifTool::HTML::Office = (
    GROUPS => { 1 => 'HTML-office', 2 => 'Document' },
    NOTES => 'Tags written by Microsoft Office applications.',
    Subject     => { },
    Author      => { Groups => { 2 => 'Author' } },
    Keywords    => { },
    Description => { },
    Template    => { },
    LastAuthor  => { Groups => { 2 => 'Author' } },
    Revision    => { Name => 'RevisionNumber' },
    TotalTime   => { Name => 'TotalEditTime', PrintConv => 'ConvertTimeSpan($val, 60)' },
    Created => {
        Name => 'CreateDate',
        Groups => { 2 => 'Time' },
        ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    LastSaved => {
        Name => 'ModifyDate',
        Groups => { 2 => 'Time' },
        ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    LastPrinted => {
        Name => 'LastPrinted',
        Groups => { 2 => 'Time' },
        ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    Pages       => { },
    Words       => { },
    Characters  => { },
    Category    => { },
    Manager     => { },
    Company     => { },
    Lines       => { },
    Paragraphs  => { },
    CharactersWithSpaces => { },
    Version     => { Name => 'RevisionNumber' },
);
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Set HTMLCharset member based on content type
# Inputs: 0) content type string, 1) ExifTool ref
# Returns: original string
# Notes: - used as the RawConv for the HTTP-equiv content-type tag
#        - HTMLCharset is set to undef if the charset name isn't in %htmlCharset
#        - the "charset" parameter name is matched without regard to case
sub SetHTMLCharset($$)
{
    my ($val, $et) = @_;
    # charset value may optionally be quoted; table keys are lower case
    $$et{HTMLCharset} = $htmlCharset{lc $1} if $val =~ /charset=['"]?([-\w]+)/i;
    return $val;
}
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Convert single UTF-8 character to HTML character reference
# Inputs: 0) UTF-8 character sequence
# Returns: HTML character reference (eg. "&quot;");
#          - named reference if the code point is in the %entityName lookup,
#            otherwise a hexadecimal numeric reference (eg. "&#x3b1;")
#          - '?' if the sequence can't be decoded
# Note: Must be called via EscapeHTML to load name lookup
sub EscapeChar($)
{
    my $ch = shift;
    my $val;
    if ($] < 5.006001) {
        # unpack 'U' isn't usable here, so decode manually
        ($val) = Image::ExifTool::UnpackUTF8($ch);
    } else {
        # the meaning of "U0" is reversed as of Perl 5.10.0!
        ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch);
    }
    return '?' unless defined $val;
    return "&$entityName{$val};" if $entityName{$val};
    # no named entity for this code point, so use a numeric character reference
    return sprintf('&#x%x;',$val);
}
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Escape any special characters for HTML
# Inputs: 0) string to be escaped, 1) optional string encoding (default 'UTF8')
# Returns: escaped string
# Notes: XML special characters are escaped first (via EscapeXML), then any
#        multi-byte UTF-8 sequences are converted to character references
sub EscapeHTML($;$)
{
    my ($str, $enc) = @_;
    # escape XML characters
    $str = EscapeXML($str);
    # escape other special characters if they exist
    if ($str =~ /[\x80-\xff]/) {
        # generate entity name lookup if necessary
        unless (%entityName) {
            local $_;
            foreach (keys %entityNum) {
                $entityName{$entityNum{$_}} = $_;
            }
            delete $entityName{39};  # 'apos' is not valid HTML
        }
        # suppress warnings
        local $SIG{'__WARN__'} = sub { 1 };
        if ($enc and $enc ne 'UTF8') {
            # convert to UTF-8, escape, then convert back to the original encoding
            $str = Image::ExifTool::Decode(undef, $str, $enc, undef, 'UTF8');
            $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
            $str = Image::ExifTool::Decode(undef, $str, 'UTF8', undef, $enc);
        } else {
            # escape any non-ascii characters for HTML
            $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
        }
    }
    return $str;
}
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Unescape all HTML character references
# Inputs: 0) string to be unescaped, 1) optional string encoding (default 'UTF8')
# Returns: unescaped string
# Notes: thin wrapper around UnescapeXML which supplies the HTML entity table
sub UnescapeHTML($;$)
{
    my ($str, $enc) = @_;
    return UnescapeXML($str, \%entityNum, $enc);
}
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
# Extract information from a HTML file
# Inputs: 0) ExifTool object reference, 1) DirInfo reference
# Returns: 1 on success, 0 if this wasn't a valid HTML file
# Notes: - only the contents of the HEAD element are parsed
#        - information is taken from META elements (NAME or HTTP-EQUIV plus
#          CONTENT attributes), namespaced tags inside an XML element, and
#          the TITLE element
#        - tags not found in the tables are added dynamically
sub ProcessHTML($$)
{
    my ($et, $dirInfo) = @_;
    my $raf = $$dirInfo{RAF};
    my $buff;

    # validate HTML or XHTML file
    $raf->Read($buff, 256) or return 0;
    $buff =~ /^(\xef\xbb\xbf)?\s*<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0;
    # (an XML declaration must be followed by an HTML element within the first 256 bytes)
    $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $2 eq '?xml';
    $et->SetFileType();

    $raf->Seek(0,0) or $et->Warn('Seek error'), return 1;

    local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf);
    $/ or $et->Warn('Invalid HTML data'), return 1;

    # extract header information
    my $doc;
    while ($raf->ReadLine($buff)) {
        if (not defined $doc) {
            # look for 'head' element
            next unless $buff =~ /<head\b/ig;
            $doc = substr($buff, pos($buff));
            # stop now if the header is closed on the same line
            last if $doc =~ m{</head>}i;
            next;
        }
        $doc .= $buff;
        last if $buff =~ m{</head>}i;
    }
    return 1 unless defined $doc;

    # process all elements in header
    my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main');
    for (;;) {
        last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg;
        my ($tagName, $attrs) = ($1, $2);
        my $tag = lc($tagName);
        my ($val, $grp);
        if ($attrs =~ m{/$}) {  # self-contained XHTML tags end in '/>'
            $val = '';
        } else {
            # look for element close
            my $pos = pos($doc);
            my $close = "</$tagName>";
            # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug:
            # if ($doc =~ m{(.*?)</$tagName>}sg) {
            #     $val = $1;
            # (closing tag is quoted since the name may contain '.', and this
            #  doesn't change the length of the matched text)
            if ($doc =~ m{\Q$close\E}sg) {
                $val = substr($doc, $pos, pos($doc)-$pos-length($close));
            } else {
                # a failed //g match resets pos, so restore the search position
                pos($doc) = $pos;
                next unless $tag eq 'meta'; # META tags don't need to be closed
                $val = '';
            }
        }
        my $table = $tagTablePtr;
        if ($tag eq 'meta') {
            # parse HTML META element
            undef $tag;
            # tag name is in NAME or HTTP-EQUIV attribute
            if ($attrs =~ /\bname\s*=\s*['"]?([\w:.-]+)/si) {
                $tagName = $1;
            } elsif ($attrs =~ /\bhttp-equiv\s*=\s*['"]?([\w:.-]+)/si) {
                $tagName = "HTTP-equiv.$1";
            } else {
                next;   # no name
            }
            $tag = lc($tagName) or next;
            # tag value is in CONTENT attribute
            if ($attrs =~ /\bcontent\s*=\s*(['"])(.*?)\1/si or
                $attrs =~ /\bcontent\s*=\s*(['"]?)([\w:.-]+)/si)
            {
                $val = $2;
            } else {
                next unless length $val;
            }
            # isolate group name (separator is '.' in HTML, but ':' in ref 2)
            if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) {
                ($grp, $tag) = ($1, $2);
                my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp);
                if ($tagInfo and $$tagInfo{SubDirectory}) {
                    $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
                } else {
                    # unknown group, so keep the full name as a Main table tag
                    $tag = "$grp.$tag";
                }
            }
        } elsif ($tag eq 'xml') {
            $et->VPrint(0, "Parsing XML\n");
            # parse XML tags (quick-and-dirty)
            my $xml = $val;
            while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) {
                ($grp, $tag, $val) = ($1, $2, $4);
                my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp);
                next unless $tagInfo and $$tagInfo{SubDirectory};
                $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
                unless ($$table{$tag}) {
                    my $name = ucfirst $tag;
                    $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes
                    $name =~ s/\s(.)/\U$1/g;     # capitalize all words in tag name
                    $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters (also hex code wide chars)
                    AddTagToTable($table, $tag, { Name => $name });
                    $et->VPrint(0, " [adding $tag '${name}']\n");
                }
                $val = $et->Decode($val, $$et{HTMLCharset}) if $$et{HTMLCharset};
                $et->HandleTag($table, $tag, UnescapeXML($val));
            }
            next;
        } else {
            # the only other element we process is TITLE
            next unless $tag eq 'title';
        }
        unless ($$table{$tag}) {
            # generate a tag name from the original (mixed-case) tag ID
            my $name = $tagName;
            $name =~ s/\W+(\w)/\u$1/sg;
            my $info = { Name => $name, Groups => { 0 => 'HTML' } };
            $info->{Groups}->{1} = ($grp eq 'http-equiv' ? 'HTTP-equiv' : "HTML-$grp") if $grp;
            AddTagToTable($table, $tag, $info);
            $et->VPrint(0, " [adding $tag '${tagName}']\n");
        }
        # recode if necessary
        $val = $et->Decode($val, $$et{HTMLCharset}) if $$et{HTMLCharset};
        $val =~ s{\s*$/\s*}{ }sg;   # replace linefeeds and indenting spaces
        $val = UnescapeHTML($val);  # unescape HTML character references
        $et->HandleTag($table, $tag, $val);
    }
    return 1;
}
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
1; # end |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
__END__ |