line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* |
2
|
|
|
|
|
|
|
* $Id: _deHTMLxs.c,v 1.5 2006/02/16 19:16:00 rsoderberg Exp $ |
3
|
|
|
|
|
|
|
*/ |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
#include |
6
|
|
|
|
|
|
|
#include |
7
|
|
|
|
|
|
|
#include |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#include "deHTMLxs.h" |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
/* Read-only structure, so it's thread-safe */ |
13
|
|
|
|
|
|
|
/*
 * One named character entity: the entity name as it appears between '&'
 * and ';' (case-sensitive) and the single byte it is replaced with.
 */
typedef struct {
    const char *name;   /* entity name; NULL marks the end of the table */
    char        chr;    /* replacement byte */
} CM_PREPROC_html_tags_t;

/*
 * Entity lookup table, terminated by a { NULL, 0 } sentinel.
 *
 * The first five entries are the ASCII entities; note that "nbsp" maps to
 * a plain space.  The remaining entries carry the byte values 161..255 in
 * ascending order (the HTML Latin-1 entity set).  Values above 127 are
 * stored through a (char) cast, so on platforms where char is signed they
 * are held as negative values with the same bit pattern.
 *
 * Lookups are done by prefix comparison in table order, so no entry name
 * may be a prefix of an entry that follows it.
 */
CM_PREPROC_html_tags_t CM_PREPROC_html_tags[] = {
    { "lt"    , '<'       }, { "gt"    , '>'       }, { "amp"   , '&'       },
    { "quot"  , '"'       }, { "nbsp"  , ' '       }, { "iexcl" , (char)161 },
    { "cent"  , (char)162 }, { "pound" , (char)163 }, { "curren", (char)164 },
    { "yen"   , (char)165 }, { "brvbar", (char)166 }, { "sect"  , (char)167 },
    { "uml"   , (char)168 }, { "copy"  , (char)169 }, { "ordf"  , (char)170 },
    { "laquo" , (char)171 }, { "not"   , (char)172 }, { "shy"   , (char)173 },
    { "reg"   , (char)174 }, { "macr"  , (char)175 }, { "deg"   , (char)176 },
    { "plusmn", (char)177 }, { "sup2"  , (char)178 }, { "sup3"  , (char)179 },
    { "acute" , (char)180 }, { "micro" , (char)181 }, { "para"  , (char)182 },
    { "middot", (char)183 }, { "cedil" , (char)184 }, { "sup1"  , (char)185 },
    { "ordm"  , (char)186 }, { "raquo" , (char)187 }, { "frac14", (char)188 },
    { "frac12", (char)189 }, { "frac34", (char)190 }, { "iquest", (char)191 },
    { "Agrave", (char)192 }, { "Aacute", (char)193 }, { "Acirc" , (char)194 },
    { "Atilde", (char)195 }, { "Auml"  , (char)196 }, { "Aring" , (char)197 },
    { "AElig" , (char)198 }, { "Ccedil", (char)199 }, { "Egrave", (char)200 },
    { "Eacute", (char)201 }, { "Ecirc" , (char)202 }, { "Euml"  , (char)203 },
    { "Igrave", (char)204 }, { "Iacute", (char)205 }, { "Icirc" , (char)206 },
    { "Iuml"  , (char)207 }, { "ETH"   , (char)208 }, { "Ntilde", (char)209 },
    { "Ograve", (char)210 }, { "Oacute", (char)211 }, { "Ocirc" , (char)212 },
    { "Otilde", (char)213 }, { "Ouml"  , (char)214 }, { "times" , (char)215 },
    { "Oslash", (char)216 }, { "Ugrave", (char)217 }, { "Uacute", (char)218 },
    { "Ucirc" , (char)219 }, { "Uuml"  , (char)220 }, { "Yacute", (char)221 },
    { "THORN" , (char)222 }, { "szlig" , (char)223 }, { "agrave", (char)224 },
    { "aacute", (char)225 }, { "acirc" , (char)226 }, { "atilde", (char)227 },
    { "auml"  , (char)228 }, { "aring" , (char)229 }, { "aelig" , (char)230 },
    { "ccedil", (char)231 }, { "egrave", (char)232 }, { "eacute", (char)233 },
    { "ecirc" , (char)234 }, { "euml"  , (char)235 }, { "igrave", (char)236 },
    { "iacute", (char)237 }, { "icirc" , (char)238 }, { "iuml"  , (char)239 },
    { "eth"   , (char)240 }, { "ntilde", (char)241 }, { "ograve", (char)242 },
    { "oacute", (char)243 }, { "ocirc" , (char)244 }, { "otilde", (char)245 },
    { "ouml"  , (char)246 }, { "divide", (char)247 }, { "oslash", (char)248 },
    { "ugrave", (char)249 }, { "uacute", (char)250 }, { "ucirc" , (char)251 },
    { "uuml"  , (char)252 }, { "yacute", (char)253 }, { "thorn" , (char)254 },
    { "yuml"  , (char)255 }, { 0, (char)0 }
};
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
8
|
|
|
|
|
|
/*
 * Parse one HTML tag starting at body and copy its lower-cased name.
 *
 * body       - must point at the '<' that opens the tag.
 * tagname    - output buffer for the NUL-terminated, lower-cased tag name.
 * tagnamelen - capacity of tagname in bytes, including the terminator.
 *
 * An optional '!' or '/' directly after '<' and any whitespace that
 * follows are skipped; the name is the run of alphabetic characters after
 * that.  At most tagnamelen - 1 characters are stored; a longer name is
 * truncated.
 *
 * Returns a pointer to the '>' that closes the tag, or NULL when body or
 * tagname is NULL, tagnamelen is 0, body does not start with '<', no name
 * character could be stored, or the string ends before a '>' is found.
 * tagname is left untouched when the arguments are rejected or body does
 * not start with '<'.
 */
const char *CM_PREPROC_parse_html_tag_tolower(const char *body, char *tagname, unsigned int tagnamelen) {
    unsigned int cch = 0;

    /* A zero capacity leaves no room even for the terminator. */
    if (body == NULL || tagname == NULL || tagnamelen == 0)
        return NULL;

    if (*body != '<')
        return NULL;
    body++;

    if (*body == '!' || *body == '/')
        body++;

    while (isspace((unsigned char) *body))
        body++;

    /* Always keep one byte in reserve for the terminating NUL. */
    while (isalpha((unsigned char) *body) && cch + 1 < tagnamelen)
        tagname[cch++] = (char) tolower((unsigned char) *body++);

    tagname[cch] = '\0';

    if (cch == 0)
        return NULL;

    /* Skip the rest of the tag (attributes, or the truncated name tail). */
    while (*body != '\0' && *body != '>')
        body++;

    if (*body != '>')
        return NULL;

    /* Return pointer to the ending '>' */
    return body;
}
92
|
|
|
|
|
|
|
|
93
|
8
|
|
|
|
|
|
/*
 * Decide whether body looks like HTML.
 *
 * Every '<' in body is handed to CM_PREPROC_parse_html_tag_tolower(); as
 * soon as one of them parses as a tag whose lower-cased name is in the
 * list below, 1 is returned.  Returns 0 for a NULL or empty body and when
 * no listed tag is found.
 */
int CM_PREPROC_is_html(const char *body) {
    /* Tag names that count as evidence of HTML; all lower case. */
    static const char *const known_tags[] = {
        "html", "body", "a", "font", "table", "head", "base", "meta", "td", "tr", "style", "img", "object", "br",
        "b", "i", "span", "div", "form", "input", "button", "frame", "iframe", "tbody", "col", "th", "hr",
        "xml", "script", "pre", "param", "applet", "center", "area", "map", "em", "embed", "xmp", "sub", "sup",
        NULL
    };
    char tagname[100] = {0};
    const char *lt;

    if (body == NULL || *body == '\0')
        return 0;

    /* Visit each '<' in turn and try to parse a recognizable tag there. */
    for (lt = strchr(body, '<'); lt != NULL; lt = strchr(lt + 1, '<')) {
        const char *const *known;
        const char *tag_end = CM_PREPROC_parse_html_tag_tolower(lt, tagname, sizeof(tagname));

        /* Not a parsable tag: move on to the next '<'. */
        if (tag_end == NULL)
            continue;

        for (known = known_tags; *known != NULL; known++) {
            if (strcmp(tagname, *known) == 0)
                return 1;
        }

        /* Unlisted tag: resume the search after its closing '>'. */
        lt = tag_end;
    }

    return 0;
}
127
|
|
|
|
|
|
|
|
128
|
62
|
|
|
|
|
|
static char CM_PREPROC_html_tagxlat(char **ref) { |
129
|
62
|
|
|
|
|
|
char c = 0, *s = *ref; |
130
|
|
|
|
|
|
|
|
131
|
62
|
|
|
|
|
|
unsigned int len = (unsigned int) strlen(s); |
132
|
62
|
|
|
|
|
|
unsigned int offset = ( len > 10 ? 10 : len ); |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
CM_PREPROC_html_tags_t *tags; |
135
|
|
|
|
|
|
|
unsigned int tlen; |
136
|
|
|
|
|
|
|
|
137
|
62
|
100
|
|
|
|
|
if (!isalpha(*s)) |
138
|
29
|
|
|
|
|
|
return '&'; |
139
|
|
|
|
|
|
|
|
140
|
194
|
50
|
|
|
|
|
for (tags = (CM_PREPROC_html_tags_t*)&CM_PREPROC_html_tags; tags->name && !c; tags++) { |
|
|
100
|
|
|
|
|
|
141
|
161
|
|
|
|
|
|
tlen = (unsigned int) strlen(tags->name); |
142
|
|
|
|
|
|
|
|
143
|
161
|
50
|
|
|
|
|
if (tlen > offset) |
144
|
0
|
|
|
|
|
|
continue; |
145
|
|
|
|
|
|
|
|
146
|
161
|
100
|
|
|
|
|
if (!strncmp(s, tags->name, tlen)) { |
147
|
33
|
|
|
|
|
|
c = tags->chr; |
148
|
33
|
|
|
|
|
|
s += tlen; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
33
|
50
|
|
|
|
|
if (!c) |
153
|
0
|
|
|
|
|
|
c = '&'; |
154
|
33
|
50
|
|
|
|
|
else if (*s == ';') |
155
|
33
|
|
|
|
|
|
s++; |
156
|
|
|
|
|
|
|
|
157
|
33
|
|
|
|
|
|
*ref = s; |
158
|
|
|
|
|
|
|
|
159
|
33
|
|
|
|
|
|
return c; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
8
|
|
|
|
|
|
/*
 * Copy s into text with HTML markup removed and character entities
 * translated.
 *
 * s    - NUL-terminated input.
 * text - output buffer; its first strlen(s) + 1 bytes are zeroed before
 *        anything is written, so it must be at least that large.  Each
 *        step consumes at least one input byte and emits at most one
 *        output byte, so the result is always NUL-terminated.
 *
 * Returns text, or NULL (leaving text untouched) when text is NULL or s
 * is NULL or empty.
 *
 * Scanner state:
 *   in_tag  - between a '<' and the next unquoted '>'
 *   in_sgml - the current tag was opened with "<!"
 *   quote   - '"' or '\'' inside a quoted attribute value, '-' inside an
 *             SGML comment opened by "--", '\0' otherwise
 *   prev    - the previous input character examined by the loop
 */
char *CM_PREPROC_html_strip(char *s, char *text) {
    int in_sgml = 0, in_tag = 0;
    char cur, prev = '\0', quote = '\0';
    char *out = text;

    if (text == NULL)
        return NULL;
    if (s == NULL || *s == '\0')
        return NULL;

    memset(text, 0, strlen(s) + 1);

    while ((cur = *s++) != '\0') {
        if (quote != '\0') {
            /*
             * Quoted text is dropped.  The quote ends on the matching
             * character, except that a '-' only ends a comment when the
             * character before it was also '-'.
             */
            if (cur == quote && !(cur == '-' && prev != '-'))
                quote = '\0';
        } else if (cur == '<') {
            in_tag = 1;
            if (*s == '!')
                in_sgml = 1;
            /* The character right after '<' is consumed here as well. */
            if (*s != '\0')
                s++;
        } else if (cur == '>') {
            if (in_tag)
                in_sgml = in_tag = 0;
        } else if (cur == '&') {
            /* NOTE: entities are emitted even while inside a tag. */
            *out++ = CM_PREPROC_html_tagxlat(&s);
        } else if (cur == '-' && in_sgml && prev == '-') {
            /* "--" inside "<! ... >" opens a comment. */
            quote = '-';
        } else if ((cur == '"' || cur == '\'') && in_tag) {
            /* Opening quote of an attribute value. */
            quote = cur;
        } else if (!in_tag) {
            /* Ordinary text outside any tag is copied through. */
            *out++ = cur;
        }

        prev = cur;
    }

    return text;
}