#ifdef DEBUG_PRINT
#include "stdafx.h" // remove me
#include "defines.h" // remove me
#include "logger.h"
#endif
#include "deHTMLxs.h"
#include <string.h>
#include <ctype.h>
struct _tags {
char *name;
char chr;
} preproc_html_tags[] = {
{ "lt" , '<' }, { "gt" , '>' }, { "amp" , '&' },
{ "quot" , '"' }, { "nbsp" , ' ' }, { "iexcl" , (char)161 },
{ "cent" , (char)162 }, { "pound" , (char)163 }, { "curren", (char)164 },
{ "yen" , (char)165 }, { "brvbar", (char)166 }, { "sect" , (char)167 },
{ "uml" , (char)168 }, { "copy" , (char)169 }, { "ordf" , (char)170 },
{ "laquo" , (char)171 }, { "not" , (char)172 }, { "shy" , (char)173 },
{ "reg" , (char)174 }, { "macr" , (char)175 }, { "deg" , (char)176 },
{ "plusmn", (char)177 }, { "sup2" , (char)178 }, { "sup3" , (char)179 },
{ "acute" , (char)180 }, { "micro" , (char)181 }, { "para" , (char)182 },
{ "middot", (char)183 }, { "cedil" , (char)184 }, { "sup1" , (char)185 },
{ "ordm" , (char)186 }, { "raquo" , (char)187 }, { "frac14", (char)188 },
{ "frac12", (char)189 }, { "frac34", (char)190 }, { "iquest", (char)191 },
{ "Agrave", (char)192 }, { "Aacute", (char)193 }, { "Acirc" , (char)194 },
{ "Atilde", (char)195 }, { "Auml" , (char)196 }, { "Aring" , (char)197 },
{ "AElig" , (char)198 }, { "Ccedil", (char)199 }, { "Egrave", (char)200 },
{ "Eacute", (char)201 }, { "Ecirc" , (char)202 }, { "Euml" , (char)203 },
{ "Igrave", (char)204 }, { "Iacute", (char)205 }, { "Icirc" , (char)206 },
{ "Iuml" , (char)207 }, { "ETH" , (char)208 }, { "Ntilde", (char)209 },
{ "Ograve", (char)210 }, { "Oacute", (char)211 }, { "Ocirc" , (char)212 },
{ "Otilde", (char)213 }, { "Ouml" , (char)214 }, { "times" , (char)215 },
{ "Oslash", (char)216 }, { "Ugrave", (char)217 }, { "Uacute", (char)218 },
{ "Ucirc" , (char)219 }, { "Uuml" , (char)220 }, { "Yacute", (char)221 },
{ "THORN" , (char)222 }, { "szlig" , (char)223 }, { "agrave", (char)224 },
{ "aacute", (char)225 }, { "acirc" , (char)226 }, { "atilde", (char)227 },
{ "auml" , (char)228 }, { "aring" , (char)229 }, { "aelig" , (char)230 },
{ "ccedil", (char)231 }, { "egrave", (char)232 }, { "eacute", (char)233 },
{ "ecirc" , (char)234 }, { "euml" , (char)235 }, { "igrave", (char)236 },
{ "iacute", (char)237 }, { "icirc" , (char)238 }, { "iuml" , (char)239 },
{ "eth" , (char)240 }, { "ntilde", (char)241 }, { "ograve", (char)242 },
{ "oacute", (char)243 }, { "ocirc" , (char)244 }, { "otilde", (char)245 },
{ "ouml" , (char)246 }, { "divide", (char)247 }, { "oslash", (char)248 },
{ "ugrave", (char)249 }, { "uacute", (char)250 }, { "ucirc" , (char)251 },
{ "uuml" , (char)252 }, { "yacute", (char)253 }, { "thorn" , (char)254 },
{ "yuml" , (char)255 }, { 0, (char)0 }
};
int is_html(char *body) {
if ((strstr(body, "<HTML>") != NULL) || (strstr(body, "<html") != NULL) ||
(strstr(body, "<BODY") != NULL) || (strstr(body, "<body") != NULL) ||
(strstr(body, "<A HREF") != NULL) || (strstr(body, "<a href") != NULL) ||
(strstr(body, "<A href") != NULL) ||
(strstr(body, "<FONT") != NULL) || (strstr(body, "<font") != NULL))
return 1;
return 0;
}
char html_tagxlat(char **ref) {
char c = 0, *s = *ref;
unsigned int tlen;
struct _tags *tags;
unsigned int len, offset;
if (!isalpha(*s))
return '&';
len = strlen(s);
offset = ( len > 10 ? 10 : len );
for (tags = (struct _tags*)&preproc_html_tags; tags->name && !c; tags++) {
tlen = strlen(tags->name);
if (tlen > offset) continue;
if (!strncmp(s, tags->name, tlen)) {
c = tags->chr;
s += tlen;
}
}
if (!c)
c = '&';
else if (*s == ';')
s++;
*ref = s;
return c;
}
char *html_strip(char *s, char *text) {
int sgml = 0, tag = 0;
char c, last, quote = '\0', *t;
if ((t = text) == NULL)
return NULL;
if (!s)
return NULL;
memset( text, 0, strlen(s)+1);
while ((c = *s++)) {
if (c == quote) {
if (c == '-' && last != '-')
goto next;
else
last = '\0';
quote = '\0';
} else if (!quote) {
switch (c) {
case '<':
tag = 1;
if (*s++ == '!')
sgml = 1;
break;
case '>':
if (tag)
sgml = tag = 0;
break;
case '-':
if (sgml && last == '-')
quote = '-';
else
goto valid;
break;
case '"':
case '\'':
if (tag)
quote = c;
else
goto valid;
break;
case '&':
*t++ = html_tagxlat(&s);
break;
default:
valid:
if (!tag)
*t++ = c;
break;
}
}
next:
last = c;
}
return text;
}