lc.c   [plain text]


/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1985-2007 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
#pragma prototyped

/*
 * locale state implementation
 */

#include "lclib.h"
#include "lclang.h"

#include <ctype.h>

/*
 * numeric data attached to the builtin default locale
 * NOTE(review): the two initializers look like the decimal point and
 * the thousands separator, with -1 meaning "none" -- confirm against
 * the Lc_numeric_t declaration, which is not visible in this file
 */

static Lc_numeric_t	default_numeric = { '.', -1 };

/*
 * the builtin default locale, matched by lcmake() under the names
 * "C" and "POSIX" and returned by lcmake() for a null or empty name
 *
 * every info[] entry points back at default_lc itself, the same back
 * link lcmake() installs in the locales it allocates
 * only the sixth info[] entry carries data (default_numeric);
 * presumably that is the numeric category slot -- TODO confirm
 */

static Lc_t		default_lc =
{
	"C",
	"POSIX",
	&lc_languages[0],
	&lc_territories[0],
	&lc_charsets[0],
	0, 
	LC_default|LC_checked|LC_local,
	0,
	{
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, (void*)&default_numeric },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 },
		{ &default_lc, 0, 0 }
	}
};

/*
 * numeric data attached to the builtin debug locale
 * NOTE(review): presumably decimal point ',' and thousands
 * separator '.' -- confirm against the Lc_numeric_t declaration
 */

static Lc_numeric_t	debug_numeric = { ',', '.' };

/*
 * the builtin debug locale, matched by lcmake() under the name "debug"
 *
 * laid out like default_lc: every info[] entry points back at debug_lc
 * and only the sixth entry carries data (debug_numeric)
 * the trailing &default_lc is presumably the next link, which would
 * chain the two builtin locales on the list headed by lcs -- TODO confirm
 */

static Lc_t		debug_lc =
{
	"debug",
	"debug",
	&lc_languages[1],
	&lc_territories[1],
	&lc_charsets[0],
	0, 
	LC_debug|LC_checked|LC_local,
	0,
	{
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, (void*)&debug_numeric },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 },
		{ &debug_lc, 0, 0 }
	},
	&default_lc
};

/*
 * head of the list of known locales
 * lcmake() searches this list by name and pushes each locale it
 * allocates onto the front
 */

static Lc_t*		lcs = &debug_lc;

/*
 * the locale currently associated with each category slot
 * all slots start out as the builtin default locale
 * NOTE(review): presumably indexed by the AST_LC_* values that
 * lcindex() returns -- confirm against the LCINFO() definition
 */

Lc_t*			locales[] =
{
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc,
	&default_lc
};

/*
 * map an LC_* category constant to its internal AST_LC_* index
 * LC_ALL is only accepted when min==0
 * -1 is returned for LC_ALL with min!=0 and for unknown categories
 */

int
lcindex(int category, int min)
{
	int	index = -1;

	switch (category)
	{
	case LC_ALL:
		if (!min)
			index = AST_LC_ALL;
		break;
	case LC_ADDRESS:
		index = AST_LC_ADDRESS;
		break;
	case LC_COLLATE:
		index = AST_LC_COLLATE;
		break;
	case LC_CTYPE:
		index = AST_LC_CTYPE;
		break;
	case LC_IDENTIFICATION:
		index = AST_LC_IDENTIFICATION;
		break;
	case LC_MEASUREMENT:
		index = AST_LC_MEASUREMENT;
		break;
	case LC_MESSAGES:
		index = AST_LC_MESSAGES;
		break;
	case LC_MONETARY:
		index = AST_LC_MONETARY;
		break;
	case LC_NAME:
		index = AST_LC_NAME;
		break;
	case LC_NUMERIC:
		index = AST_LC_NUMERIC;
		break;
	case LC_PAPER:
		index = AST_LC_PAPER;
		break;
	case LC_TELEPHONE:
		index = AST_LC_TELEPHONE;
		break;
	case LC_TIME:
		index = AST_LC_TIME;
		break;
	case LC_XLITERATE:
		index = AST_LC_XLITERATE;
		break;
	}
	return index;
}

/*
 * return a pointer to the first entry of the category table
 */

Lc_category_t*
lccategories(void)
{
	Lc_category_t*	first;

	first = (Lc_category_t*)&lc_categories[0];
	return first;
}

/*
 * return the current info for category
 * 0 is returned if lcindex() does not recognize category
 */

Lc_info_t*
lcinfo(register int category)
{
	int	index;

	index = lcindex(category, 0);
	if (index >= 0)
		return LCINFO(index);
	return 0;
}

/*
 * match s against the alternation pattern p (alternatives separated by |)
 *
 * s ends at its terminating 0 or at the first ','
 * a '-' on one side may stand for a run of characters up to the next
 * '-' on the other side
 * if standard!=0 then a leading non-digit prefix is skipped on
 * whichever side does not start with a digit, and a '-' next to a
 * digit on the other side is ignored
 *
 * on a full match the number of pattern chars consumed in the matching
 * alternative is returned (non-zero for any non-empty alternative)
 * otherwise, if minimum!=0, the longest partial match of at least
 * minimum pattern chars is returned
 * otherwise 0 is returned
 */

static int
match(const char* s, register const char* p, int minimum, int standard)
{
	register const char*	t;
	const char*		x;
	int			w;
	int			z;

	z = 0;
	do
	{
		t = s;
		if (standard)
		{
			/*
			 * the <ctype.h> classifiers are only defined for
			 * unsigned char values and EOF, so plain char is
			 * cast before every call
			 */

			if (isdigit((unsigned char)*t))
				while (*p && !isdigit((unsigned char)*p))
					p++;
			else if (isdigit((unsigned char)*p))
				while (*t && !isdigit((unsigned char)*t))
					t++;
		}
		if (*p)
		{
			/* w is set once a '-' in p has absorbed a run of s */
			w = 0;
			x = p;
			while (*p && *p != '|')
			{
				if (!*t || *t == ',')
					break;
				else if (*t == *p)
					/*ok*/;
				else if (*t == '-')
				{
					if (standard && isdigit((unsigned char)*p))
					{
						t++;
						continue;
					}
					while (*p && *p != '-')
						p++;
					if (!*p)
						break;
				}
				else if (*p == '-')
				{
					if (standard && isdigit((unsigned char)*t))
					{
						p++;
						continue;
					}
					w = 1;
					while (*t && *t != '-')
						t++;
					if (!*t)
						break;
				}
				else
					break;
				t++;
				p++;
			}

			/* s exhausted and the alternative exhausted (or w) */
			if ((!*t || *t == ',') && (!*p || *p == '|' || w))
				return p - x;

			/* remember the longest acceptable partial match */
			if (minimum && z < (p - x) && (p - x) >= minimum)
				z = p - x;
		}

		/* advance to the next alternative */
		while (*p && *p != '|')
			p++;
	} while (*p++);
	return z;
}

/*
 * return 1 if s matches one of the charset names in cp, 0 otherwise
 * the code, the alternates (partial matches of 3 or more chars are
 * accepted) and, when present, the ms name are tried in that order
 */

static int
match_charset(register const char* s, register const Lc_charset_t* cp)
{
	if (match(s, cp->code, 0, 1))
		return 1;
	if (match(s, cp->alternates, 3, 1))
		return 1;
	if (cp->ms && match(s, cp->ms, 0, 1))
		return 1;
	return 0;
}

/*
 * low level for lccanon
 *
 * format the name for language lp, territory tp, charset cp and
 * attribute list ap into buf, which holds siz bytes
 * any of lp, tp, cp and ap may be 0
 *
 * if none of LC_abbreviated, LC_default, LC_local, LC_qualified and
 * LC_verbose is set in flags then LC_abbreviated is assumed
 * a debug or default language yields just its code
 * LC_verbose yields capitalized language/territory names instead of codes
 * the territory is omitted if it is lc_territories[0], or if the form is
 * abbreviated/default and its code equals the language code
 * the charset is omitted in the abbreviated/default forms when it is
 * the language's own charset
 * default attributes are omitted in the abbreviated/default/verbose forms
 *
 * the number of bytes stored in buf, including the terminating 0, is
 * returned; output that does not fit is silently truncated
 */

static size_t
canonical(const Lc_language_t* lp, const Lc_territory_t* tp, const Lc_charset_t* cp, const Lc_attribute_list_t* ap, unsigned long flags, char* buf, size_t siz)
{
	register int		c;
	register int		u;
	register char*		s;
	register char*		e;
	register const char*	t;

	/*
	 * the end mark below is computed as siz - 3, which would wrap
	 * around for tiny buffers and disable every bounds check
	 */

	if (siz < 3)
	{
		if (!siz)
			return 0;
		*buf = 0;
		return 1;
	}
	if (!(flags & (LC_abbreviated|LC_default|LC_local|LC_qualified|LC_verbose)))
		flags |= LC_abbreviated;
	s = buf;

	/* leave room for one separator, one char and the terminating 0 */
	e = &buf[siz - 3];
	if (lp)
	{
		if (lp->flags & (LC_debug|LC_default))
		{
			for (t = lp->code; s < e && (*s = *t++); s++);
			*s++ = 0;
			return s - buf;
		}
		if (flags & LC_verbose)
		{
			/* u!=0 means the next char starts a word */
			u = 1;
			t = lp->name;
			while (s < e && (c = *t++))
			{
				if (u)
				{
					u = 0;
					c = toupper((unsigned char)c);
				}
				else if (!isalnum((unsigned char)c))
					u = 1;
				*s++ = c;
			}
		}
		else
			for (t = lp->code; s < e && (*s = *t++); s++);
	}
	if (s < e)
	{
		if (tp && tp != &lc_territories[0] && (!(flags & (LC_abbreviated|LC_default)) || !lp || !streq(lp->code, tp->code)))
		{
			if (lp)
				*s++ = '_';
			if (flags & LC_verbose)
			{
				/* only the first alternative of the name is used */
				u = 1;
				t = tp->name;
				while (s < e && (c = *t++) && c != '|')
				{
					if (u)
					{
						u = 0;
						c = toupper((unsigned char)c);
					}
					else if (!isalnum((unsigned char)c))
						u = 1;
					*s++ = c;
				}
			}
			else
				for (t = tp->code; s < e && (*s = toupper((unsigned char)*t++)); s++);
		}

		/*
		 * lcmake() may pass cp==0 when neither the name nor the
		 * language supplies a charset, so cp is checked before use
		 */

		if (lp && cp && (!(flags & (LC_abbreviated|LC_default)) || cp != lp->charset) && s < e)
		{
			*s++ = '.';
			for (t = cp->code; s < e && (c = *t++); s++)
			{
				if (islower((unsigned char)c))
					c = toupper((unsigned char)c);
				*s = c;
			}
		}

		/* the first attribute is introduced by @, the rest by , */
		for (c = '@'; ap && s < e; ap = ap->next)
			if (!(flags & (LC_abbreviated|LC_default|LC_verbose)) || !(ap->attribute->flags & LC_default))
			{
				*s++ = c;
				c = ',';
				for (t = ap->attribute->name; s < e && (*s = *t++); s++);
			}
	}
	*s++ = 0;
	return s - buf;
}

/*
 * generate a canonical locale name for lc in buf, which holds siz bytes
 *
 * unless LC_local is set in flags, or when the language is a debug or
 * default language, the name is produced by canonical() and its
 * return value is passed back
 * with LC_local the native name is generated where supported (_WINIX);
 * if no native name is available buf is set to "-" and 0 is returned
 */

size_t
lccanon(Lc_t* lc, unsigned long flags, char* buf, size_t siz)
{
	if (!(flags & LC_local) || (lc->language && (lc->language->flags & (LC_debug|LC_default))))
		return canonical(lc->language, lc->territory, lc->charset, lc->attributes, flags, buf, siz);
#if _WINIX
	{
		char	lang[64];
		char	code[64];
		char	ctry[64];

		if (lc->index &&
		    GetLocaleInfo(lc->index, LOCALE_SENGLANGUAGE, lang, sizeof(lang)) &&
		    GetLocaleInfo(lc->index, LOCALE_SENGCOUNTRY, ctry, sizeof(ctry)))
		{
			/* a missing code page is treated as an empty one */
			if (!GetLocaleInfo(lc->index, LOCALE_IDEFAULTANSICODEPAGE, code, sizeof(code)))
				code[0] = 0;
			if (!lc->charset || !lc->charset->ms)
				return sfsprintf(buf, siz, "%s_%s", lang, ctry);
			if (streq(lc->charset->ms, code))
				return sfsprintf(buf, siz, "%s_%s.%s", lang, ctry, code);
			return sfsprintf(buf, siz, "%s_%s.%s,%s", lang, ctry, code, lc->charset->ms);
		}
	}
#endif

	/* no native name available */
	buf[0] = '-';
	buf[1] = 0;
	return 0;
}

/*
 * make an Lc_t from a locale name
 *
 * a null or empty name yields the builtin default locale
 * a name that matches (ignoring case) the code or name of a locale on
 * the lcs list yields that locale
 * otherwise the name is looked up in lc_maps, or split into
 * language[_territory][.charset][@attributes] parts that are looked up
 * in the language, territory, charset and attribute tables
 * text introduced by white space, ( or -- is collected as attributes
 * a name with no table match gets a private language entry so that the
 * caller can do the validity checks
 *
 * the new Lc_t is pushed on the lcs list and returned
 * 0 is returned if the Lc_t (or the private language entry) cannot be
 * allocated
 */

Lc_t*
lcmake(const char* name)
{
	register int			c;
	register char*			s;
	register char*			e;
	register const char*		t;
	const char*			a;
	char*				w;
	char*				language_name;
	char*				territory_name;
	char*				charset_name;
	char*				attributes_name;
	Lc_t*				lc;
	const Lc_map_t*			mp;
	const Lc_language_t*		lp;
	const Lc_territory_t*		tp;
	const Lc_territory_t*		tpb;
	const Lc_territory_t*		primary;
	const Lc_charset_t*		cp;
	const Lc_charset_t*		ppa;
	const Lc_attribute_t*		ap;
	Lc_attribute_list_t*		ai;
	Lc_attribute_list_t*		al;
	int				i;
	int				n;
	int				z;
	char				buf[PATH_MAX / 2];
	char				tmp[PATH_MAX / 2];

	if (!(t = name) || !*t)
		return &default_lc;

	/*
	 * already known?
	 */

	for (lc = lcs; lc; lc = lc->next)
		if (!strcasecmp(t, lc->code) || !strcasecmp(t, lc->name))
			return lc;

	/*
	 * explicitly mapped?
	 */

	for (mp = lc_maps; mp->code; mp++)
		if (streq(t, mp->code))
		{
			lp = mp->language;
			tp = mp->territory;
			cp = mp->charset;
			if (!mp->attribute)
				al = 0;
			else if (al = newof(0, Lc_attribute_list_t, 1, 0))
				al->attribute = mp->attribute;
			goto mapped;
		}

	/*
	 * split a lower case copy of name in buf into 0 terminated
	 * language, territory, charset and attributes parts
	 * a marks the first free form attribute text in name, which is
	 * skipped here and appended to the attributes part below
	 * n counts the ( ) - delimiters seen in the current separator run
	 *
	 * the <ctype.h> calls take (unsigned char)c because they are
	 * only defined for unsigned char values and EOF
	 */

	language_name = buf;
	territory_name = charset_name = attributes_name = 0;
	s = buf;
	e = &buf[sizeof(buf)-2];
	a = 0;
	n = 0;
	while (s < e && (c = *t++))
	{
		if (isspace((unsigned char)c) || (c == '(' || c == '-' && *t == '-') && ++n)
		{
			/* skip the rest of the separator run */
			while ((c = *t++) && (isspace((unsigned char)c) || (c == '-' || c == '(' || c == ')') && ++n))
				;
			if (isalnum((unsigned char)c) && !n)
				*s++ = '-';
			else
			{
				n = 0;
				if (!a)
				{
					a = t - 1;
					while (c && c != '_' && c != '.' && c != '@')
						c = *t++;
				}

				/*
				 * the terminating 0 has been consumed and t
				 * is already past it, so stop here instead
				 * of reading beyond the end of name
				 */

				if (!c)
					break;
			}
		}
		if (c == '_' && !territory_name)
		{
			*s++ = 0;
			territory_name = s;
		}
		else if (c == '.' && !charset_name)
		{
			*s++ = 0;
			charset_name = s;
		}
		else if (c == '@' && !attributes_name)
		{
			*s++ = 0;
			attributes_name = s;
		}
		else
		{
			if (isupper((unsigned char)c))
				c = tolower((unsigned char)c);
			*s++ = c;
		}
	}

	/*
	 * append the free form attribute text to the attributes part
	 */

	if ((t = a) && s < e)
	{
		if (attributes_name)
			*s++ = ',';
		else
		{
			*s++ = 0;
			attributes_name = s;
		}
		while (s < e && (c = *t++))
		{
			if (isspace((unsigned char)c) || (c == '(' || c == ')' || c == '-' && *t == '-') && ++n)
			{
				while ((c = *t++) && (isspace((unsigned char)c) || (c == '-' || c == '(' || c == ')') && ++n))
					;

				/* end of name: do not read past the terminating 0 */
				if (!c)
					break;
				if (isalnum((unsigned char)c) && !n)
					*s++ = '-';
				else
					n = 0;
			}
			if (c == '_' || c == '.' || c == '@')
				break;
			if (isupper((unsigned char)c))
				c = tolower((unsigned char)c);
			*s++ = c;
		}
	}
	*s = 0;
	tp = 0;
	cp = ppa = 0;
	al = 0;

	/*
	 * language
	 */

	n = strlen(s = language_name);
	if (n == 2)
		for (lp = lc_languages; lp->code && !streq(s, lp->code); lp++);
	else if (n == 3)
	{
		for (lp = lc_languages; lp->code && (!lp->alternates || !match(s, lp->alternates, n, 0)); lp++);
		if (!lp->code)
		{
			/* retry with only the first 2 chars as a language code */
			c = s[2];
			s[2] = 0;
			for (lp = lc_languages; lp->code && !streq(s, lp->code); lp++);
			s[2] = c;
			if (lp->code)
				n = 1;
		}
	}
	else
		lp = 0;
	if (!lp || !lp->code)
	{
		for (lp = lc_languages; lp->code && !match(s, lp->name, 0, 0); lp++);
		if (!lp || !lp->code)
		{
			/*
			 * not a language: with no explicit territory the
			 * first part may name a territory, whose first
			 * language is then used
			 */

			if (!territory_name)
			{
				if (n == 2)
					for (tp = lc_territories; tp->code && !streq(s, tp->code); tp++);
				else
				{
					/* keep the longest territory name match */
					z = 0;
					tpb = 0;
					for (tp = lc_territories; tp->name; tp++)
						if ((i = match(s, tp->name, 3, 0)) > z)
						{
							tpb = tp;
							if ((z = i) == n)
								break;
						}
					if (tpb)
						tp = tpb;
				}
				if (tp->code)
					lp = tp->languages[0];
			}
			if (!lp || !lp->code)
			{
				/*
				 * name not in the tables so let
				 * _ast_setlocale() and/or setlocale()
				 * handle the validity checks
				 */

				s = (char*)name;
				z = strlen(s) + 1;
				if (!(lp = newof(0, Lc_language_t, 1, z)))
					return 0;
				name = ((Lc_language_t*)lp)->code = ((Lc_language_t*)lp)->name = (const char*)(lp + 1);
				memcpy((char*)lp->code, s, z - 1);
				tp = &lc_territories[0];
				cp = ((Lc_language_t*)lp)->charset = &lc_charsets[0];
				al = 0;
				goto override;
			}
		}
	}

	/*
	 * territory
	 */

	if (!tp || !tp->code)
	{
		if (!(s = territory_name))
		{
			/*
			 * no explicit territory: use the primary territory
			 * of the language, or the only territory that lists
			 * the language first, else try the language code
			 * as a territory code
			 */

			n = 0;
			primary = 0;
			for (tp = lc_territories; tp->code; tp++)
				if (tp->languages[0] == lp)
				{
					if (tp->flags & LC_primary)
					{
						n = 1;
						primary = tp;
						break;
					}
					n++;
					primary = tp;
				}
			if (n == 1)
				tp = primary;
			s = (char*)lp->code;
		}
		if (!tp || !tp->code)
		{
			n = strlen(s);
			if (n == 2)
			{
				/* the territory must list the language */
				for (tp = lc_territories; tp->code; tp++)
					if (streq(s, tp->code))
					{
						for (i = 0; i < elementsof(tp->languages) && lp != tp->languages[i]; i++);
						if (i >= elementsof(tp->languages))
							tp = 0;
						break;
					}
			}
			else
			{
				for (tp = lc_territories; tp->code; tp++)
					if (match(s, tp->name, 3, 0))
					{
						for (i = 0; i < elementsof(tp->languages) && lp != tp->languages[i]; i++);
						if (i < elementsof(tp->languages))
							break;
					}
			}
			if (tp && !tp->code)
				tp = 0;
		}
	}

	/*
	 * attributes -- done here to catch misplaced charset references
	 */

	if (s = attributes_name)
	{
		do
		{
			/* isolate the next , separated attribute in w */
			for (w = s; *s && *s != ','; s++);
			c = *s;
			*s = 0;
			if (!(cp = lp->charset) || !match_charset(w, cp))
				for (cp = lc_charsets; cp->code; cp++)
					if (match_charset(w, cp))
					{
						ppa = cp;
						break;
					}
			if (!cp->code)
			{
				/* not a charset: try the language attributes */
				for (i = 0; i < elementsof(lp->attributes) && (ap = lp->attributes[i]); i++)
					if (match(w, ap->name, 5, 0))
					{
						if (ai = newof(0, Lc_attribute_list_t, 1, 0))
						{
							ai->attribute = ap;
							ai->next = al;
							al = ai;
						}
						break;
					}

				/*
				 * attribute, list node and name copy share
				 * one allocation
				 */

				if (i >= elementsof(lp->attributes) && (ap = newof(0, Lc_attribute_t, 1, sizeof(Lc_attribute_list_t) + s - w + 1)))
				{
					ai = (Lc_attribute_list_t*)(ap + 1);
					strcpy((char*)(((Lc_attribute_t*)ap)->name = (const char*)(ai + 1)), w);
					ai->attribute = ap;
					ai->next = al;
					al = ai;
				}
			}
			*s = c;
		} while (*s++);
	}

	/*
	 * charset
	 */

	if (s = charset_name)
		for (cp = lc_charsets; cp->code; cp++)
			if (match_charset(s, cp))
				break;
	if (!cp || !cp->code)
		cp = ppa ? ppa : lp->charset;
 mapped:
	z = canonical(lp, tp, cp, al, 0, s = tmp, sizeof(tmp));

	/*
	 * add to the list of possibly active locales
	 */

 override:

	/* the name and code strings are stored right after the Lc_t */
	n = strlen(name) + 1;
	if (!(lc = newof(0, Lc_t, 1, n + z)))
		return 0;
	strcpy((char*)(lc->name = (const char*)(lc + 1)), name);
	strcpy((char*)(lc->code = lc->name + n), s);
	lc->language = lp ? lp : &lc_languages[0];
	lc->territory = tp ? tp : &lc_territories[0];
	lc->charset = cp ? cp : &lc_charsets[0];  
	lc->attributes = al;
	for (i = 0; i < elementsof(lc->info); i++)
		lc->info[i].lc = lc;
#if _WINIX
	n = SUBLANG_DEFAULT;
	if (tp)
		for (i = 0; i < elementsof(tp->languages); i++)
			if (lp == tp->languages[i])
			{
				n = tp->indices[i];
				break;
			}
	lc->index = MAKELCID(MAKELANGID(lp->index, n), SORT_DEFAULT);
#endif
	lc->next = lcs;
	lcs = lc;
	return lc;
}

/*
 * return an Lc_t* for each locale in the tables
 * one Lc_t is allocated on the first call with lc==0
 * it is freed when 0 is returned
 * the return value is not part of the lcmake() cache
 */

/*
 * lcscan() iteration state
 * lc must stay the first member: lcscan() casts between Lc_t* and
 * Lc_scan_t* so that the caller only sees the Lc_t
 */

typedef struct Lc_scan_s
{
	Lc_t			lc;		/* the locale handed to the caller	*/
	Lc_attribute_list_t	list;		/* one element list for lc.attributes	*/
	int			territory;	/* current lc_territories[] index	*/
	int			language;	/* current territory languages[] index	*/
	int			attribute;	/* current language attributes[] index	*/
	char			buf[256];	/* storage for lc.name and lc.code	*/
} Lc_scan_t;

Lc_t*
lcscan(Lc_t* lc)
{
	register Lc_scan_t*	ls;

	/*
	 * lc==0 starts a new scan, otherwise lc is the value returned
	 * by the previous call
	 */

	if (!(ls = (Lc_scan_t*)lc))
	{
		if (!(ls = newof(0, Lc_scan_t, 1, 0)))
			return 0;
		ls->lc.code = ls->lc.name = ls->buf;

		/*
		 * prime the cursors so that the pre-increments below fall
		 * through to the first territory
		 * elementsof() is presumably sizeof based, so the pointers
		 * that are not set yet would not be dereferenced here
		 */

		ls->territory = -1;
		ls->language = elementsof(ls->lc.territory->languages);
		ls->attribute = elementsof(ls->lc.language->attributes);
	}

	/*
	 * advance to the next attribute of the current language
	 * when those are exhausted advance to the next language of the
	 * current territory, and when those are exhausted to the next
	 * territory
	 * NOTE(review): ls->lc.language can be 0 here after a territory
	 * with no languages (see the else branch below); the dereference
	 * is only skipped while ls->attribute is already out of range --
	 * confirm against the size of attributes[]
	 */

	if (++ls->attribute >= elementsof(ls->lc.language->attributes) || !(ls->list.attribute = ls->lc.language->attributes[ls->attribute]))
	{
		if (++ls->language >= elementsof(ls->lc.territory->languages) || !(ls->lc.language = ls->lc.territory->languages[ls->language]))
		{
			/* the territory table ends with a 0 code entry */
			if (!lc_territories[++ls->territory].code)
			{
				free(ls);
				return 0;
			}
			ls->lc.territory = &lc_territories[ls->territory];
			ls->lc.language = ls->lc.territory->languages[ls->language = 0];
		}
		if (ls->lc.language)
		{
			/* restart with the first attribute of the new language */
			ls->lc.charset = ls->lc.language->charset ? ls->lc.language->charset : &lc_charsets[0];
			ls->list.attribute = ls->lc.language->attributes[ls->attribute = 0];
		}
		else
		{
			/* territory with no language */
			ls->lc.charset = &lc_charsets[0];
			ls->list.attribute = 0;
		}
	}
	ls->lc.attributes = ls->list.attribute ? &ls->list : (Lc_attribute_list_t*)0;
#if _WINIX
	if (!ls->lc.language || !ls->lc.language->index)
		ls->lc.index = 0;
	else
	{
		/*
		 * the sublanguage comes from the attribute if it has one,
		 * else from the territory, else the default is used
		 */

		if ((!ls->list.attribute || !(ls->lc.index = ls->list.attribute->index)) &&
		    (!ls->lc.territory || !(ls->lc.index = ls->lc.territory->indices[ls->language])))
			ls->lc.index = SUBLANG_DEFAULT;
		ls->lc.index = MAKELCID(MAKELANGID(ls->lc.language->index, ls->lc.index), SORT_DEFAULT);
	}
#endif

	/* lc.name and lc.code both point at buf, so this names the locale */
	canonical(ls->lc.language, ls->lc.territory, ls->lc.charset, ls->lc.attributes, 0, ls->buf, sizeof(ls->buf));
	return (Lc_t*)ls;
}