#include <sys/param.h>
#include <sys/utfconv.h>
#include <sys/errno.h>
#include <architecture/byte_order.h>
/*
 * Number of UTF-8 bytes needed for the 16-bit character value c:
 * one byte below 0x0080, two bytes below 0x0800, three bytes otherwise.
 * The argument may be evaluated more than once.
 */
#define UCS_TO_UTF_LEN(c) ((c) >= 0x0080 ? ((c) >= 0x0800 ? 3 : 2) : 1)
/*
 * Forward declaration of the file-local decomposition helper.  It takes a
 * 16-bit character and a pointer through which a second 16-bit value is
 * returned, and is called by utf8_decodestr() before its definition.
 */
static u_int16_t ucs_decompose __P((u_int16_t, u_int16_t *));
/*
 * utf8_encodelen - compute the number of UTF-8 bytes needed to encode a
 * string of 16-bit Unicode characters.
 *
 * ucsp      source characters
 * ucslen    source length in BYTES (two bytes per character)
 * altslash  when non-zero, each '/' in the source is replaced by this
 *           character before its length is counted
 * flags     UTF_REVERSE_ENDIAN: byte-swap each source character first
 *
 * A NUL character is counted as two bytes, matching the 0xC0 0x80
 * sequence that utf8_encodestr() emits for it.
 *
 * Returns the encoded length in bytes (no terminator is counted).
 */
size_t
utf8_encodelen(const u_int16_t *ucsp, size_t ucslen, u_int16_t altslash,
    int flags)
{
	u_int16_t ucs_ch;
	size_t charcnt = ucslen / 2;	/* size_t: no truncation of long inputs */
	size_t len = 0;
	int swapbytes = (flags & UTF_REVERSE_ENDIAN);

	while (charcnt-- > 0) {
		ucs_ch = *ucsp++;

		if (swapbytes)
			ucs_ch = NXSwapShort(ucs_ch);
		if (altslash && ucs_ch == '/')
			ucs_ch = altslash;

		if (ucs_ch == '\0')
			len += 2;	/* encoded as 0xC0 0x80, not a raw zero byte */
		else
			len += UCS_TO_UTF_LEN(ucs_ch);
	}

	return (len);
}
/*
 * utf8_encodestr - convert a string of 16-bit Unicode characters to UTF-8.
 *
 * ucsp      source characters
 * ucslen    source length in BYTES (two bytes per character)
 * utf8p     destination buffer
 * utf8len   out: number of bytes stored, not counting any terminator
 * buflen    size of the destination buffer in bytes
 * altslash  when non-zero, each '/' in the source is replaced by this
 *           character before encoding
 * flags     UTF_REVERSE_ENDIAN: byte-swap each source character first
 *           UTF_NO_NULL_TERM:   do not append a terminating NUL byte
 *
 * A NUL source character is written as the two-byte sequence 0xC0 0x80 so
 * the output never contains an embedded zero byte.
 *
 * Returns 0 on success, or ENAMETOOLONG when the output does not fit; in
 * that case the characters that did fit are kept and *utf8len reflects
 * them.  When a terminator is requested, one byte is always reserved for
 * it; if buflen is 0 nothing is written at all.
 */
int
utf8_encodestr(const u_int16_t *ucsp, size_t ucslen, u_int8_t *utf8p,
    size_t *utf8len, size_t buflen, u_int16_t altslash, int flags)
{
	u_int8_t *bufstart = utf8p;
	u_int8_t *bufend;
	u_int16_t ucs_ch;
	size_t charcnt = ucslen / 2;
	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
	int result = 0;

	if (nullterm) {
		/* No room even for the terminator: do not touch the buffer. */
		if (buflen == 0) {
			*utf8len = 0;
			return (ENAMETOOLONG);
		}
		--buflen;	/* reserve the final byte for the NUL */
	}
	bufend = bufstart + buflen;

	while (charcnt-- > 0) {
		ucs_ch = *ucsp++;

		if (swapbytes)
			ucs_ch = NXSwapShort(ucs_ch);
		if (altslash && ucs_ch == '/')
			ucs_ch = altslash;

		if ((ucs_ch < 0x0080) && (ucs_ch != '\0')) {
			/* one byte: 0xxxxxxx */
			if ((size_t)(bufend - utf8p) < 1) {
				result = ENAMETOOLONG;
				break;
			}
			*utf8p++ = ucs_ch;
		} else if (ucs_ch < 0x800) {
			/* two bytes: 110xxxxx 10xxxxxx (also used for NUL) */
			if ((size_t)(bufend - utf8p) < 2) {
				result = ENAMETOOLONG;
				break;
			}
			*utf8p++ = (ucs_ch >> 6) | 0xc0;
			*utf8p++ = (ucs_ch & 0x3f) | 0x80;
		} else {
			/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
			if ((size_t)(bufend - utf8p) < 3) {
				result = ENAMETOOLONG;
				break;
			}
			*utf8p++ = (ucs_ch >> 12) | 0xe0;
			*utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
			*utf8p++ = (ucs_ch & 0x3f) | 0x80;
		}
	}

	*utf8len = utf8p - bufstart;
	if (nullterm)
		*utf8p++ = '\0';	/* fits: one byte was reserved above */

	return (result);
}
/*
 * utf8_decodestr - convert a UTF-8 string to 16-bit Unicode characters.
 *
 * utf8p     source bytes
 * utf8len   source length in bytes; decoding also stops at the first
 *           zero byte
 * ucsp      destination buffer
 * ucslen    out: number of BYTES stored in the destination
 * buflen    size of the destination buffer in bytes (a trailing odd byte
 *           is never used)
 * altslash  when non-zero, each decoded character equal to this value is
 *           replaced by '/'
 * flags     UTF_DECOMPOSED:     split multi-byte characters that
 *                               ucs_decompose() can decompose into a base
 *                               character followed by a combining mark
 *           UTF_REVERSE_ENDIAN: byte-swap each stored character
 *
 * Only one-, two- and three-byte sequences are accepted.  The two-byte
 * sequence 0xC0 0x80 decodes to NUL; every other overlong form is
 * rejected.
 *
 * Returns 0 on success, ENAMETOOLONG when the destination is full, or
 * EINVAL for a malformed or truncated sequence.  On error *ucslen covers
 * the characters stored before the failure.
 */
int
utf8_decodestr(const u_int8_t *utf8p, size_t utf8len, u_int16_t *ucsp,
    size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
{
	u_int16_t *bufstart = ucsp;
	/* Count whole 16-bit slots so an odd buflen cannot be overrun. */
	u_int16_t *bufend = ucsp + (buflen / sizeof(u_int16_t));
	u_int16_t ucs_ch;
	u_int8_t byte;
	int result = 0;
	int decompose = (flags & UTF_DECOMPOSED);
	int swapbytes = (flags & UTF_REVERSE_ENDIAN);

	while (utf8len > 0) {
		byte = *utf8p++;
		--utf8len;
		if (byte == '\0')
			break;

		if (ucsp >= bufend) {
			result = ENAMETOOLONG;
			goto stop;
		}

		if (byte < 0x80) {
			/* plain ASCII */
			ucs_ch = byte;
		} else {
			switch (byte & 0xf0) {
			case 0xc0:
			case 0xd0:
				/* two-byte sequence: one continuation byte follows */
				if (utf8len < 1) {
					result = EINVAL;
					goto stop;
				}
				ucs_ch = (byte & 0x1F) << 6;
				/*
				 * A value below 0x80 is an overlong form; the
				 * only one allowed is 0xC0 0x80 (encoded NUL).
				 */
				if ((ucs_ch < 0x0080) &&
				    (byte != 0xc0 || *utf8p != 0x80)) {
					result = EINVAL;
					goto stop;
				}
				break;
			case 0xe0:
				/* three-byte sequence: two continuation bytes follow */
				if (utf8len < 2) {
					result = EINVAL;
					goto stop;
				}
				ucs_ch = (byte & 0x0F) << 6;
				byte = *utf8p++;
				--utf8len;
				if ((byte & 0xc0) != 0x80) {
					result = EINVAL;
					goto stop;
				}
				ucs_ch += (byte & 0x3F);
				ucs_ch <<= 6;
				/* reject overlong three-byte forms */
				if (ucs_ch < 0x0800) {
					result = EINVAL;
					goto stop;
				}
				break;
			default:
				/* stray continuation byte or a longer sequence */
				result = EINVAL;
				goto stop;
			}

			/* final continuation byte (presence checked above) */
			byte = *utf8p++;
			--utf8len;
			if ((byte & 0xc0) != 0x80) {
				result = EINVAL;
				goto stop;
			}
			ucs_ch += (byte & 0x3F);

			if (decompose) {
				u_int16_t comb_ch;

				ucs_ch = ucs_decompose(ucs_ch, &comb_ch);
				if (comb_ch) {
					/* store the base now, the mark below */
					if (swapbytes)
						*ucsp++ = NXSwapShort(ucs_ch);
					else
						*ucsp++ = ucs_ch;

					if (ucsp >= bufend) {
						result = ENAMETOOLONG;
						goto stop;
					}
					ucs_ch = comb_ch;
				}
			}
		}

		/* altslash == 0 means "no substitution", as in the encoder */
		if (altslash && ucs_ch == altslash)
			ucs_ch = '/';
		if (swapbytes)
			ucs_ch = NXSwapShort(ucs_ch);

		*ucsp++ = ucs_ch;
	}
stop:
	*ucslen = (u_int8_t *)ucsp - (u_int8_t *)bufstart;

	return (result);
}
/*
 * Base letter for each character in U+00C0..U+00FF, indexed by
 * (character - 0x00C0).  ucs_decompose() treats an entry as decomposable
 * only when it is <= 'z'; entries above that are the character itself
 * and mark it as not decomposable.  The table is never written, so it is
 * const.
 */
static const unsigned char primary_char[64] = {
	0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,
	0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
	0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,
	0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,
	0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,
	0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
	0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,
	0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
};
/*
 * Combining mark for each character in U+00C0..U+00FF, indexed by
 * (character - 0x00C0).  ucs_decompose() adds the entry to 0x0300 to form
 * the combining character.  The 0xFF entries line up with the
 * non-decomposable slots of primary_char and are not read by
 * ucs_decompose().  The table is never written, so it is const.
 */
static const unsigned char combining_char[64] = {
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
};
/*
 * Bitmap of the code points in U+3000..U+30FF that ucs_decompose() splits
 * into a base character plus a combining mark.  Each word covers 32
 * consecutive code points, with the most significant of the low 32 bits
 * standing for the first code point of the range (see IS_DECOMPOSABLE).
 */
static const unsigned long __CJKDecompBitmap[] = {
	0x00000000,	/* U+3000..U+301F */
	0x00000000,	/* U+3020..U+303F */
	0x000AAAAA,	/* U+3040..U+305F */
	0xA540DB6C,	/* U+3060..U+307F */
	0x00000802,	/* U+3080..U+309F */
	0x000AAAAA,	/* U+30A0..U+30BF */
	0xA540DB6C,	/* U+30C0..U+30DF */
	0x000009E2,	/* U+30E0..U+30FF */
};
/*
 * Non-zero when the bit for unicodeVal is set in table, a bitmap made of
 * 32-bit groups in which the most significant bit of each group stands
 * for the lowest value.  The mask is built from an unsigned long constant
 * because shifting a signed 1 into bit 31 is undefined behaviour.
 * unicodeVal is evaluated twice.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	(table[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
/*
 * ucs_decompose - split a precomposed character into a base character and
 * a combining mark.
 *
 * ch   the character to examine
 * cmb  out: the combining mark, or 0 when ch is not decomposed
 *
 * Returns the base character; this is ch itself when no decomposition
 * applies.  Two ranges are handled: U+00C0..U+00FF through the
 * primary_char / combining_char tables, and the code points between
 * U+3000 and U+3100 that are flagged in __CJKDecompBitmap.
 */
static u_int16_t
ucs_decompose(register u_int16_t ch, u_int16_t *cmb)
{
	*cmb = 0;

	/* U+00C0..U+00FF: table lookup. */
	if ((ch >= 0x00C0) && (ch <= 0x00FF)) {
		u_int16_t slot = ch - 0x00C0;
		u_int16_t letter = (u_int16_t) primary_char[slot];

		/* Entries above 'z' are characters without a decomposition. */
		if (letter <= 'z')
			*cmb = (u_int16_t)0x0300 + (u_int16_t)combining_char[slot];
		return (letter);
	}

	/* Anything not flagged in the U+3000 bitmap is returned unchanged. */
	if ((ch <= 0x3000) || (ch >= 0x3100) ||
	    !IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000))
		return (ch);

	switch (ch) {
	/* Base is two code points lower; mark is U+309A. */
	case 0x3071:
	case 0x3074:
	case 0x3077:
	case 0x307A:
	case 0x307D:
	case 0x30D1:
	case 0x30D4:
	case 0x30D7:
	case 0x30DA:
	case 0x30DD:
		*cmb = 0x309A;
		return (ch - 2);

	/* 0x3094 -> 0x3046 and 0x30F4 -> 0x30A6; mark is U+3099. */
	case 0x3094:
	case 0x30F4:
		*cmb = 0x3099;
		return (ch - 0x004E);

	/* 0x30F7..0x30FA -> 0x30EF..0x30F2; mark is U+3099. */
	case 0x30F7:
	case 0x30F8:
	case 0x30F9:
	case 0x30FA:
		*cmb = 0x3099;
		return (ch - 8);

	/* Every other flagged code point: preceding character + U+3099. */
	default:
		*cmb = 0x3099;
		return (ch - 1);
	}
}