entities.c   [plain text]


/* entities.c -- recognize HTML ISO entities

  (c) 1998-2004 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  CVS Info :

    $Author: rbraun $ 
    $Date: 2004/05/04 20:05:14 $ 
    $Revision: 1.1.1.1 $ 

  Entity handling can be static because there are no config or
  document-specific values.  Lookup table is 100% defined at 
  compile time.

*/

#include <stdio.h>
#include "entities.h"
#include "tidy-int.h"
#include "tmbstr.h"

/* One row of the entity table below. */
typedef struct _entity
{
    ctmbstr name;      /* entity name as written after the '&'            */
    uint    versions;  /* VERS_* bit mask tested against caller's mask    */
    uint    code;      /* numeric value handed back for this entity       */
} entity;


/* Table of the named character entities this file recognizes.
**
** Each row holds the entity name as it appears after the '&' (lookup()
** is handed name+1), a bit mask built from VERS_* flags that is ANDed
** with the caller's versions mask by EntityCode() and EntityName(),
** and the numeric code returned for the entity.
**
** The rows are scanned linearly from the top by lookup() and
** EntityName(); the final { NULL, 0, 0 } row is the sentinel at which
** both scans stop, so it must remain the last row.
*/
static const entity entities[] =
{
    /*
    ** Markup pre-defined character entities
    ** (these rows also carry VERS_XML; "apos" is the only one of them
    ** that is not flagged VERS_ALL)
    */
    { "quot",    VERS_ALL|VERS_XML,    34 },
    { "amp",     VERS_ALL|VERS_XML,    38 },
    { "apos",    VERS_FROM40|VERS_XML, 39 },
    { "lt",      VERS_ALL|VERS_XML,    60 },
    { "gt",      VERS_ALL|VERS_XML,    62 },

    /*
    ** Latin-1 character entities (codes 160 through 255, in order)
    */
    { "nbsp",     VERS_ALL,      160 },
    { "iexcl",    VERS_ALL,      161 },
    { "cent",     VERS_ALL,      162 },
    { "pound",    VERS_ALL,      163 },
    { "curren",   VERS_ALL,      164 },
    { "yen",      VERS_ALL,      165 },
    { "brvbar",   VERS_ALL,      166 },
    { "sect",     VERS_ALL,      167 },
    { "uml",      VERS_ALL,      168 },
    { "copy",     VERS_ALL,      169 },
    { "ordf",     VERS_ALL,      170 },
    { "laquo",    VERS_ALL,      171 },
    { "not",      VERS_ALL,      172 },
    { "shy",      VERS_ALL,      173 },
    { "reg",      VERS_ALL,      174 },
    { "macr",     VERS_ALL,      175 },
    { "deg",      VERS_ALL,      176 },
    { "plusmn",   VERS_ALL,      177 },
    { "sup2",     VERS_ALL,      178 },
    { "sup3",     VERS_ALL,      179 },
    { "acute",    VERS_ALL,      180 },
    { "micro",    VERS_ALL,      181 },
    { "para",     VERS_ALL,      182 },
    { "middot",   VERS_ALL,      183 },
    { "cedil",    VERS_ALL,      184 },
    { "sup1",     VERS_ALL,      185 },
    { "ordm",     VERS_ALL,      186 },
    { "raquo",    VERS_ALL,      187 },
    { "frac14",   VERS_ALL,      188 },
    { "frac12",   VERS_ALL,      189 },
    { "frac34",   VERS_ALL,      190 },
    { "iquest",   VERS_ALL,      191 },
    { "Agrave",   VERS_ALL,      192 },
    { "Aacute",   VERS_ALL,      193 },
    { "Acirc",    VERS_ALL,      194 },
    { "Atilde",   VERS_ALL,      195 },
    { "Auml",     VERS_ALL,      196 },
    { "Aring",    VERS_ALL,      197 },
    { "AElig",    VERS_ALL,      198 },
    { "Ccedil",   VERS_ALL,      199 },
    { "Egrave",   VERS_ALL,      200 },
    { "Eacute",   VERS_ALL,      201 },
    { "Ecirc",    VERS_ALL,      202 },
    { "Euml",     VERS_ALL,      203 },
    { "Igrave",   VERS_ALL,      204 },
    { "Iacute",   VERS_ALL,      205 },
    { "Icirc",    VERS_ALL,      206 },
    { "Iuml",     VERS_ALL,      207 },
    { "ETH",      VERS_ALL,      208 },
    { "Ntilde",   VERS_ALL,      209 },
    { "Ograve",   VERS_ALL,      210 },
    { "Oacute",   VERS_ALL,      211 },
    { "Ocirc",    VERS_ALL,      212 },
    { "Otilde",   VERS_ALL,      213 },
    { "Ouml",     VERS_ALL,      214 },
    { "times",    VERS_ALL,      215 },
    { "Oslash",   VERS_ALL,      216 },
    { "Ugrave",   VERS_ALL,      217 },
    { "Uacute",   VERS_ALL,      218 },
    { "Ucirc",    VERS_ALL,      219 },
    { "Uuml",     VERS_ALL,      220 },
    { "Yacute",   VERS_ALL,      221 },
    { "THORN",    VERS_ALL,      222 },
    { "szlig",    VERS_ALL,      223 },
    { "agrave",   VERS_ALL,      224 },
    { "aacute",   VERS_ALL,      225 },
    { "acirc",    VERS_ALL,      226 },
    { "atilde",   VERS_ALL,      227 },
    { "auml",     VERS_ALL,      228 },
    { "aring",    VERS_ALL,      229 },
    { "aelig",    VERS_ALL,      230 },
    { "ccedil",   VERS_ALL,      231 },
    { "egrave",   VERS_ALL,      232 },
    { "eacute",   VERS_ALL,      233 },
    { "ecirc",    VERS_ALL,      234 },
    { "euml",     VERS_ALL,      235 },
    { "igrave",   VERS_ALL,      236 },
    { "iacute",   VERS_ALL,      237 },
    { "icirc",    VERS_ALL,      238 },
    { "iuml",     VERS_ALL,      239 },
    { "eth",      VERS_ALL,      240 },
    { "ntilde",   VERS_ALL,      241 },
    { "ograve",   VERS_ALL,      242 },
    { "oacute",   VERS_ALL,      243 },
    { "ocirc",    VERS_ALL,      244 },
    { "otilde",   VERS_ALL,      245 },
    { "ouml",     VERS_ALL,      246 },
    { "divide",   VERS_ALL,      247 },
    { "oslash",   VERS_ALL,      248 },
    { "ugrave",   VERS_ALL,      249 },
    { "uacute",   VERS_ALL,      250 },
    { "ucirc",    VERS_ALL,      251 },
    { "uuml",     VERS_ALL,      252 },
    { "yacute",   VERS_ALL,      253 },
    { "thorn",    VERS_ALL,      254 },
    { "yuml",     VERS_ALL,      255 },

    /*
    ** Extended Entities defined in HTML 4: Symbols
    ** (every row from here to the sentinel is flagged VERS_FROM40 only)
    */
    { "fnof",     VERS_FROM40,   402 },
    { "Alpha",    VERS_FROM40,   913 },
    { "Beta",     VERS_FROM40,   914 },
    { "Gamma",    VERS_FROM40,   915 },
    { "Delta",    VERS_FROM40,   916 },
    { "Epsilon",  VERS_FROM40,   917 },
    { "Zeta",     VERS_FROM40,   918 },
    { "Eta",      VERS_FROM40,   919 },
    { "Theta",    VERS_FROM40,   920 },
    { "Iota",     VERS_FROM40,   921 },
    { "Kappa",    VERS_FROM40,   922 },
    { "Lambda",   VERS_FROM40,   923 },
    { "Mu",       VERS_FROM40,   924 },
    { "Nu",       VERS_FROM40,   925 },
    { "Xi",       VERS_FROM40,   926 },
    { "Omicron",  VERS_FROM40,   927 },
    { "Pi",       VERS_FROM40,   928 },
    { "Rho",      VERS_FROM40,   929 },
    { "Sigma",    VERS_FROM40,   931 },
    { "Tau",      VERS_FROM40,   932 },
    { "Upsilon",  VERS_FROM40,   933 },
    { "Phi",      VERS_FROM40,   934 },
    { "Chi",      VERS_FROM40,   935 },
    { "Psi",      VERS_FROM40,   936 },
    { "Omega",    VERS_FROM40,   937 },
    { "alpha",    VERS_FROM40,   945 },
    { "beta",     VERS_FROM40,   946 },
    { "gamma",    VERS_FROM40,   947 },
    { "delta",    VERS_FROM40,   948 },
    { "epsilon",  VERS_FROM40,   949 },
    { "zeta",     VERS_FROM40,   950 },
    { "eta",      VERS_FROM40,   951 },
    { "theta",    VERS_FROM40,   952 },
    { "iota",     VERS_FROM40,   953 },
    { "kappa",    VERS_FROM40,   954 },
    { "lambda",   VERS_FROM40,   955 },
    { "mu",       VERS_FROM40,   956 },
    { "nu",       VERS_FROM40,   957 },
    { "xi",       VERS_FROM40,   958 },
    { "omicron",  VERS_FROM40,   959 },
    { "pi",       VERS_FROM40,   960 },
    { "rho",      VERS_FROM40,   961 },
    { "sigmaf",   VERS_FROM40,   962 },
    { "sigma",    VERS_FROM40,   963 },
    { "tau",      VERS_FROM40,   964 },
    { "upsilon",  VERS_FROM40,   965 },
    { "phi",      VERS_FROM40,   966 },
    { "chi",      VERS_FROM40,   967 },
    { "psi",      VERS_FROM40,   968 },
    { "omega",    VERS_FROM40,   969 },
    { "thetasym", VERS_FROM40,   977 },
    { "upsih",    VERS_FROM40,   978 },
    { "piv",      VERS_FROM40,   982 },
    { "bull",     VERS_FROM40,  8226 },
    { "hellip",   VERS_FROM40,  8230 },
    { "prime",    VERS_FROM40,  8242 },
    { "Prime",    VERS_FROM40,  8243 },
    { "oline",    VERS_FROM40,  8254 },
    { "frasl",    VERS_FROM40,  8260 },
    { "weierp",   VERS_FROM40,  8472 },
    { "image",    VERS_FROM40,  8465 },
    { "real",     VERS_FROM40,  8476 },
    { "trade",    VERS_FROM40,  8482 },
    { "alefsym",  VERS_FROM40,  8501 },
    { "larr",     VERS_FROM40,  8592 },
    { "uarr",     VERS_FROM40,  8593 },
    { "rarr",     VERS_FROM40,  8594 },
    { "darr",     VERS_FROM40,  8595 },
    { "harr",     VERS_FROM40,  8596 },
    { "crarr",    VERS_FROM40,  8629 },
    { "lArr",     VERS_FROM40,  8656 },
    { "uArr",     VERS_FROM40,  8657 },
    { "rArr",     VERS_FROM40,  8658 },
    { "dArr",     VERS_FROM40,  8659 },
    { "hArr",     VERS_FROM40,  8660 },
    { "forall",   VERS_FROM40,  8704 },
    { "part",     VERS_FROM40,  8706 },
    { "exist",    VERS_FROM40,  8707 },
    { "empty",    VERS_FROM40,  8709 },
    { "nabla",    VERS_FROM40,  8711 },
    { "isin",     VERS_FROM40,  8712 },
    { "notin",    VERS_FROM40,  8713 },
    { "ni",       VERS_FROM40,  8715 },
    { "prod",     VERS_FROM40,  8719 },
    { "sum",      VERS_FROM40,  8721 },
    { "minus",    VERS_FROM40,  8722 },
    { "lowast",   VERS_FROM40,  8727 },
    { "radic",    VERS_FROM40,  8730 },
    { "prop",     VERS_FROM40,  8733 },
    { "infin",    VERS_FROM40,  8734 },
    { "ang",      VERS_FROM40,  8736 },
    { "and",      VERS_FROM40,  8743 },
    { "or",       VERS_FROM40,  8744 },
    { "cap",      VERS_FROM40,  8745 },
    { "cup",      VERS_FROM40,  8746 },
    { "int",      VERS_FROM40,  8747 },
    { "there4",   VERS_FROM40,  8756 },
    { "sim",      VERS_FROM40,  8764 },
    { "cong",     VERS_FROM40,  8773 },
    { "asymp",    VERS_FROM40,  8776 },
    { "ne",       VERS_FROM40,  8800 },
    { "equiv",    VERS_FROM40,  8801 },
    { "le",       VERS_FROM40,  8804 },
    { "ge",       VERS_FROM40,  8805 },
    { "sub",      VERS_FROM40,  8834 },
    { "sup",      VERS_FROM40,  8835 },
    { "nsub",     VERS_FROM40,  8836 },
    { "sube",     VERS_FROM40,  8838 },
    { "supe",     VERS_FROM40,  8839 },
    { "oplus",    VERS_FROM40,  8853 },
    { "otimes",   VERS_FROM40,  8855 },
    { "perp",     VERS_FROM40,  8869 },
    { "sdot",     VERS_FROM40,  8901 },
    { "lceil",    VERS_FROM40,  8968 },
    { "rceil",    VERS_FROM40,  8969 },
    { "lfloor",   VERS_FROM40,  8970 },
    { "rfloor",   VERS_FROM40,  8971 },
    { "lang",     VERS_FROM40,  9001 },
    { "rang",     VERS_FROM40,  9002 },
    { "loz",      VERS_FROM40,  9674 },
    { "spades",   VERS_FROM40,  9824 },
    { "clubs",    VERS_FROM40,  9827 },
    { "hearts",   VERS_FROM40,  9829 },
    { "diams",    VERS_FROM40,  9830 },

    /*
    ** Extended Entities defined in HTML 4: Special (less Markup at top)
    */
    { "OElig",    VERS_FROM40,   338 },
    { "oelig",    VERS_FROM40,   339 },
    { "Scaron",   VERS_FROM40,   352 },
    { "scaron",   VERS_FROM40,   353 },
    { "Yuml",     VERS_FROM40,   376 },
    { "circ",     VERS_FROM40,   710 },
    { "tilde",    VERS_FROM40,   732 },
    { "ensp",     VERS_FROM40,  8194 },
    { "emsp",     VERS_FROM40,  8195 },
    { "thinsp",   VERS_FROM40,  8201 },
    { "zwnj",     VERS_FROM40,  8204 },
    { "zwj",      VERS_FROM40,  8205 },
    { "lrm",      VERS_FROM40,  8206 },
    { "rlm",      VERS_FROM40,  8207 },
    { "ndash",    VERS_FROM40,  8211 },
    { "mdash",    VERS_FROM40,  8212 },
    { "lsquo",    VERS_FROM40,  8216 },
    { "rsquo",    VERS_FROM40,  8217 },
    { "sbquo",    VERS_FROM40,  8218 },
    { "ldquo",    VERS_FROM40,  8220 },
    { "rdquo",    VERS_FROM40,  8221 },
    { "bdquo",    VERS_FROM40,  8222 },
    { "dagger",   VERS_FROM40,  8224 },
    { "Dagger",   VERS_FROM40,  8225 },
    { "permil",   VERS_FROM40,  8240 },
    { "lsaquo",   VERS_FROM40,  8249 },
    { "rsaquo",   VERS_FROM40,  8250 },
    { "euro",     VERS_FROM40,  8364 },
    /* Sentinel: a NULL name marks the end of the table. */
    { NULL,       0,               0 }
};


/* Locate the table row for an entity name.
**
** s - name to search for, without the leading '&'; may be NULL.
**
** Returns the first row of entities[] for which tmbstrcmp() reports
** equality with s, or NULL when s is NULL, empty, or not in the table.
**
** Purely static implementation: it trades lookup speed for faster
** setup time (none at all, in fact).  Comparing the first character
** before calling tmbstrcmp() buys enough speed that a hash does not
** improve things without > 500 items in the list.
*/
static const entity* lookup( ctmbstr s )
{
    const entity *row;
    tmbchar first;

    if ( s == NULL )
        return NULL;

    first = (tmbchar) *s;
    if ( first == 0 )
        return NULL;   /* an empty name never matches */

    for ( row = entities; row->name != NULL; ++row )
    {
        /* cheap reject on the first character before the full compare */
        if ( *row->name != first )
            continue;
        if ( tmbstrcmp(s, row->name) == 0 )
            return row;
    }
    return NULL;
}

/* Translate an entity reference into its numeric code.
**
** name     - reference text; must be non-NULL and begin with '&'
**            (enforced by assert).  Either "&#" followed by a number
**            or "&" followed by an entity name.
** versions - bit mask of VERS_* flags.  A named entity is only
**            recognized when its table row shares a bit with this
**            mask.  When all VERS_XML bits are set, the upper-case
**            "&#X" hexadecimal prefix is not accepted.
**
** Returns the code, or zero on error: a missing/bad number, an unknown
** name, or a name not available in the requested versions.
*/
uint EntityCode( ctmbstr name, uint versions )
{
    const entity* np;
    assert( name && name[0] == '&' );

    /* numeric entity: name = "&#" followed by number */
    if ( name[1] == '#' )
    {
        uint c = 0;  /* stays zero on missing/bad number */
        Bool isXml = ( (versions & VERS_XML) == VERS_XML );

        /* 'x' prefix denotes hexadecimal number format; 'X' is only
        ** honoured outside XML mode, otherwise it falls through to the
        ** decimal scan, which fails and leaves c at zero.
        */
        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
            sscanf( name+3, "%x", &c );
        else
            /* %u, not %d: c is unsigned (as the %x scan above already
            ** assumes), and %d requires a pointer to signed int.
            */
            sscanf( name+2, "%u", &c );

        return c;
    }

    /* Named entity: name = "&" followed by a name */
    if ( NULL != (np = lookup(name+1)) )
    {
        /* Only recognize entity name if version supports it.  */
        if ( np->versions & versions )
            return np->code;
    }

    return 0;   /* zero signifies unknown entity name */
}

/* Report the code and versions mask for an entity reference.
**
** name     - reference text; must be non-NULL and begin with '&'
**            (enforced by assert).  Either "&#" followed by a number
**            or "&" followed by an entity name.
** isXml    - when true, the upper-case "&#X" hexadecimal prefix is not
**            accepted, and an unknown name is reported with VERS_XML
**            instead of VERS_PROPRIETARY.
** code     - out, must be non-NULL: receives the numeric code.
** versions - out, must be non-NULL: receives the VERS_* mask.
**
** Returns yes for any numeric reference (*versions = VERS_ALL; *code is
** zero when the number is missing or bad) and for a name found in the
** table (values copied from its row).  Returns no for an unknown name,
** with *code set to zero.  Unlike EntityCode(), no version filtering
** is applied to named entities here.
*/
Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions )
{
    const entity* np;
    assert( name && name[0] == '&' );
    assert( code != NULL );
    assert( versions != NULL );

    /* numeric entity: name = "&#" followed by number */
    if ( name[1] == '#' )
    {
        uint c = 0;  /* stays zero on missing/bad number */

        /* 'x' prefix denotes hexadecimal number format; 'X' is only
        ** honoured outside XML mode, otherwise it falls through to the
        ** decimal scan, which fails and leaves c at zero.
        */
        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
            sscanf( name+3, "%x", &c );
        else
            /* %u, not %d: c is unsigned (as the %x scan above already
            ** assumes), and %d requires a pointer to signed int.
            */
            sscanf( name+2, "%u", &c );

        *code = c;
        *versions = VERS_ALL;
        return yes;
    }

    /* Named entity: name = "&" followed by a name */
    if ( NULL != (np = lookup(name+1)) )
    {
        *code = np->code;
        *versions = np->versions;
        return yes;
    }

    /* Unknown name: report no code and a mask chosen by mode. */
    *code = 0;
    *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
    return no;
}


/* Reverse lookup: find the entity name for a numeric code.
**
** ch       - code to search for.
** versions - bit mask of VERS_* flags the caller accepts.
**
** Only the first table row whose code equals ch is considered.  Its
** name is returned when the row's versions mask shares a bit with
** versions; otherwise, or when no row has that code, NULL is returned.
*/
ctmbstr EntityName( uint ch, uint versions )
{
    const entity *row = entities;

    /* Advance to the first row carrying this code, or to the sentinel. */
    while ( row->name != NULL && row->code != ch )
        ++row;

    if ( row->name == NULL )
        return NULL;   /* code not in the table */

    /* Found the code; the search stops here whether or not it is usable. */
    if ( row->versions & versions )
        return row->name;
    return NULL;
}