| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* $Id: _deHTMLxs.c,v 1.5 2006/02/16 19:16:00 rsoderberg Exp $ |
|
3
|
|
|
|
|
|
|
*/ |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
#include |
|
6
|
|
|
|
|
|
|
#include |
|
7
|
|
|
|
|
|
|
#include |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#include "deHTMLxs.h" |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
/*
 * Named character entity table used when decoding "&name;" sequences.
 *
 * Each entry maps an entity name (without the leading '&' and trailing ';')
 * to a single replacement byte.  Values above 127 are written as (char)NNN,
 * i.e. one byte per entity.  The table ends with an entry whose name is
 * NULL.
 *
 * Both the name pointers and the table itself are const: the data is never
 * modified, so it can be shared between threads without locking.
 */
typedef struct {
    const char *name;   /* entity name, e.g. "amp"; NULL in the terminator */
    char chr;           /* replacement byte for the entity */
} CM_PREPROC_html_tags_t;

const CM_PREPROC_html_tags_t CM_PREPROC_html_tags[] = {
    { "lt"    , '<'       }, { "gt"    , '>'       }, { "amp"   , '&'       },
    { "quot"  , '"'       }, { "nbsp"  , ' '       }, { "iexcl" , (char)161 },
    { "cent"  , (char)162 }, { "pound" , (char)163 }, { "curren", (char)164 },
    { "yen"   , (char)165 }, { "brvbar", (char)166 }, { "sect"  , (char)167 },
    { "uml"   , (char)168 }, { "copy"  , (char)169 }, { "ordf"  , (char)170 },
    { "laquo" , (char)171 }, { "not"   , (char)172 }, { "shy"   , (char)173 },
    { "reg"   , (char)174 }, { "macr"  , (char)175 }, { "deg"   , (char)176 },
    { "plusmn", (char)177 }, { "sup2"  , (char)178 }, { "sup3"  , (char)179 },
    { "acute" , (char)180 }, { "micro" , (char)181 }, { "para"  , (char)182 },
    { "middot", (char)183 }, { "cedil" , (char)184 }, { "sup1"  , (char)185 },
    { "ordm"  , (char)186 }, { "raquo" , (char)187 }, { "frac14", (char)188 },
    { "frac12", (char)189 }, { "frac34", (char)190 }, { "iquest", (char)191 },
    { "Agrave", (char)192 }, { "Aacute", (char)193 }, { "Acirc" , (char)194 },
    { "Atilde", (char)195 }, { "Auml"  , (char)196 }, { "Aring" , (char)197 },
    { "AElig" , (char)198 }, { "Ccedil", (char)199 }, { "Egrave", (char)200 },
    { "Eacute", (char)201 }, { "Ecirc" , (char)202 }, { "Euml"  , (char)203 },
    { "Igrave", (char)204 }, { "Iacute", (char)205 }, { "Icirc" , (char)206 },
    { "Iuml"  , (char)207 }, { "ETH"   , (char)208 }, { "Ntilde", (char)209 },
    { "Ograve", (char)210 }, { "Oacute", (char)211 }, { "Ocirc" , (char)212 },
    { "Otilde", (char)213 }, { "Ouml"  , (char)214 }, { "times" , (char)215 },
    { "Oslash", (char)216 }, { "Ugrave", (char)217 }, { "Uacute", (char)218 },
    { "Ucirc" , (char)219 }, { "Uuml"  , (char)220 }, { "Yacute", (char)221 },
    { "THORN" , (char)222 }, { "szlig" , (char)223 }, { "agrave", (char)224 },
    { "aacute", (char)225 }, { "acirc" , (char)226 }, { "atilde", (char)227 },
    { "auml"  , (char)228 }, { "aring" , (char)229 }, { "aelig" , (char)230 },
    { "ccedil", (char)231 }, { "egrave", (char)232 }, { "eacute", (char)233 },
    { "ecirc" , (char)234 }, { "euml"  , (char)235 }, { "igrave", (char)236 },
    { "iacute", (char)237 }, { "icirc" , (char)238 }, { "iuml"  , (char)239 },
    { "eth"   , (char)240 }, { "ntilde", (char)241 }, { "ograve", (char)242 },
    { "oacute", (char)243 }, { "ocirc" , (char)244 }, { "otilde", (char)245 },
    { "ouml"  , (char)246 }, { "divide", (char)247 }, { "oslash", (char)248 },
    { "ugrave", (char)249 }, { "uacute", (char)250 }, { "ucirc" , (char)251 },
    { "uuml"  , (char)252 }, { "yacute", (char)253 }, { "thorn" , (char)254 },
    { "yuml"  , (char)255 }, { 0, (char)0 }
};
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
|
56
|
8
|
|
|
|
|
|
/*
 * Parse a single HTML tag and extract its name in lower case.
 *
 * body       - NUL-terminated text; must point at the tag's opening '<'.
 * tagname    - output buffer that receives the NUL-terminated tag name.
 * tagnamelen - size of `tagname` in bytes, including room for the NUL.
 *
 * After '<', one optional '!' or '/' and any whitespace are skipped, then
 * the run of alphabetic characters that follows is taken as the tag name.
 * At most tagnamelen - 1 characters are stored; the remainder of a longer
 * name is skipped together with the rest of the tag.
 *
 * Returns a pointer to the tag's closing '>' on success.  Returns NULL when
 *   - body or tagname is NULL, or tagnamelen is 0 (tagname is not touched),
 *   - body does not start with '<' (tagname is not touched),
 *   - no alphabetic name could be stored (tagname is set to ""),
 *   - the string ends before a '>' is found (tagname holds the parsed name).
 */
const char *CM_PREPROC_parse_html_tag_tolower(const char *body, char *tagname, unsigned int tagnamelen) {
    unsigned int cch = 0;

    /* A zero-sized buffer cannot even hold the terminator; refusing it also
     * avoids the unsigned wrap-around a "--tagnamelen" countdown would hit. */
    if (body == NULL || tagname == NULL || tagnamelen == 0)
        return NULL;

    if (*body != '<')
        return NULL;
    body++;

    /* Closing tags and declarations are treated like opening tags. */
    if (*body == '!' || *body == '/')
        body++;

    while (isspace((unsigned char) *body))
        body++;

    /* Keep one byte free for the terminator.  <ctype.h> functions require a
     * value representable as unsigned char, hence the casts. */
    while (isalpha((unsigned char) *body) && cch + 1 < tagnamelen) {
        *tagname++ = (char) tolower((unsigned char) *body);
        body++;
        cch++;
    }
    *tagname = '\0';

    if (cch == 0)
        return NULL;

    /* Skip attributes (and any truncated part of the name) up to '>'. */
    while (*body != '\0' && *body != '>')
        body++;

    if (*body != '>')
        return NULL;

    /* Return pointer to the ending '>' */
    return body;
}
|
92
|
|
|
|
|
|
|
|
|
93
|
8
|
|
|
|
|
|
/*
 * Decide whether `body` looks like HTML.
 *
 * Every '<' in the text is handed to CM_PREPROC_parse_html_tag_tolower();
 * as soon as one of them parses into a tag whose lower-cased name appears
 * in the list below, the text is considered HTML.
 *
 * Returns 1 if such a tag is found, 0 otherwise (including for a NULL or
 * empty `body`).
 */
int CM_PREPROC_is_html(const char *body) {
    const char *known_tags[] = {
        "html", "body", "a", "font", "table", "head", "base", "meta", "td", "tr", "style", "img", "object", "br",
        "b", "i", "span", "div", "form", "input", "button", "frame", "iframe", "tbody", "col", "th", "hr",
        "xml", "script", "pre", "param", "applet", "center", "area", "map", "em", "embed", "xmp", "sub", "sup",
        NULL
    };
    char tagname[100] = {0};
    const char *lt;

    if (body == NULL)
        return 0;
    if (*body == '\0')
        return 0;

    lt = strchr(body, '<');
    while (lt != NULL) {
        /* Attempt to parse the tag that starts at this '<'. */
        const char *tag_end = CM_PREPROC_parse_html_tag_tolower(lt, tagname, sizeof(tagname));

        if (tag_end != NULL) {
            size_t i;

            /* The parsed name is already lower case, as is the list. */
            for (i = 0; known_tags[i] != NULL; i++) {
                if (strcmp(tagname, known_tags[i]) == 0)
                    return 1;
            }

            /* Unknown tag: resume the search after its closing '>'. */
            lt = tag_end;
        }

        lt = strchr(lt + 1, '<');
    }

    return 0;
}
|
127
|
|
|
|
|
|
|
|
|
128
|
62
|
|
|
|
|
|
static char CM_PREPROC_html_tagxlat(char **ref) { |
|
129
|
62
|
|
|
|
|
|
char c = 0, *s = *ref; |
|
130
|
|
|
|
|
|
|
|
|
131
|
62
|
|
|
|
|
|
unsigned int len = (unsigned int) strlen(s); |
|
132
|
62
|
|
|
|
|
|
unsigned int offset = ( len > 10 ? 10 : len ); |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
CM_PREPROC_html_tags_t *tags; |
|
135
|
|
|
|
|
|
|
unsigned int tlen; |
|
136
|
|
|
|
|
|
|
|
|
137
|
62
|
100
|
|
|
|
|
if (!isalpha(*s)) |
|
138
|
29
|
|
|
|
|
|
return '&'; |
|
139
|
|
|
|
|
|
|
|
|
140
|
194
|
50
|
|
|
|
|
for (tags = (CM_PREPROC_html_tags_t*)&CM_PREPROC_html_tags; tags->name && !c; tags++) { |
|
|
|
100
|
|
|
|
|
|
|
141
|
161
|
|
|
|
|
|
tlen = (unsigned int) strlen(tags->name); |
|
142
|
|
|
|
|
|
|
|
|
143
|
161
|
50
|
|
|
|
|
if (tlen > offset) |
|
144
|
0
|
|
|
|
|
|
continue; |
|
145
|
|
|
|
|
|
|
|
|
146
|
161
|
100
|
|
|
|
|
if (!strncmp(s, tags->name, tlen)) { |
|
147
|
33
|
|
|
|
|
|
c = tags->chr; |
|
148
|
33
|
|
|
|
|
|
s += tlen; |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
|
|
152
|
33
|
50
|
|
|
|
|
if (!c) |
|
153
|
0
|
|
|
|
|
|
c = '&'; |
|
154
|
33
|
50
|
|
|
|
|
else if (*s == ';') |
|
155
|
33
|
|
|
|
|
|
s++; |
|
156
|
|
|
|
|
|
|
|
|
157
|
33
|
|
|
|
|
|
*ref = s; |
|
158
|
|
|
|
|
|
|
|
|
159
|
33
|
|
|
|
|
|
return c; |
|
160
|
|
|
|
|
|
|
} |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
|
163
|
8
|
|
|
|
|
|
/*
 * Copy `s` into `text` with HTML markup removed and entities decoded.
 *
 * s    - NUL-terminated input.
 * text - output buffer; strlen(s) + 1 bytes of it are zeroed before any
 *        output is written, so it must be at least that large.
 *
 * Returns `text`, or NULL if `text` is NULL or `s` is NULL or empty.
 *
 * Scanner state:
 *   in_tag  - set by '<', cleared by a '>' seen outside a quoted section;
 *             ordinary characters are copied only while it is clear.
 *   in_sgml - set when the '<' is immediately followed by '!'.
 *   quote   - the character that ends the current quoted section, or '\0'.
 *             '"' and '\'' open one inside a tag; "--" opens one inside a
 *             "<!" construct and only a second consecutive '-' closes it.
 *             Everything inside a quoted section is dropped.
 *
 * Notes on behaviour that is easy to miss:
 *   - the character right after '<' is consumed without being examined
 *     further (apart from the '!' check);
 *   - a '>' is never copied to the output, even outside a tag;
 *   - '&' is passed to CM_PREPROC_html_tagxlat() and the result is written
 *     whenever no quoted section is open, including inside a tag.
 */
char *CM_PREPROC_html_strip(char *s, char *text) {
    char *out = text;
    char ch;
    char prev = '\0';
    char quote = '\0';
    int in_tag = 0;
    int in_sgml = 0;

    if (text == NULL)
        return NULL;
    if (s == NULL || *s == '\0')
        return NULL;

    /* Pre-zeroing the buffer is what NUL-terminates the result. */
    memset(text, 0, strlen(s) + 1);

    while ((ch = *s++) != '\0') {
        if (ch == quote) {
            /* A lone '-' inside a "--" section does not close it; any other
             * match of the quote character does. */
            if (!(ch == '-' && prev != '-'))
                quote = '\0';
        } else if (quote == '\0') {
            int literal = 0;    /* set when ch is a candidate for output */

            if (ch == '<') {
                in_tag = 1;
                if (*s == '!') {
                    in_sgml = 1;
                    s++;
                } else if (*s != '\0') {
                    s++;
                }
            } else if (ch == '>') {
                if (in_tag) {
                    in_tag = 0;
                    in_sgml = 0;
                }
            } else if (ch == '&') {
                *out++ = CM_PREPROC_html_tagxlat(&s);
            } else if (ch == '-') {
                if (in_sgml && prev == '-')
                    quote = '-';
                else
                    literal = 1;
            } else if (ch == '"' || ch == '\'') {
                if (in_tag)
                    quote = ch;
                else
                    literal = 1;
            } else {
                literal = 1;
            }

            if (literal && !in_tag)
                *out++ = ch;
        }

        prev = ch;
    }

    return text;
}