/* CFBuiltinConverters.c */
#include "CFStringEncodingConverterExt.h"
#include "CFUniChar.h"
#include "CFUnicodeDecomposition.h"
#include "CFUnicodePrecomposition.h"
#include "CFStringEncodingConverterPriv.h"
#include "CFInternal.h"
/* U+2029 PARAGRAPH SEPARATOR; the ASCII and ISO Latin-1 encoders below emit ASCIINewLine for it. */
#define ParagraphSeparator 0x2029
/* Byte 0x0a (line feed), written in place of ParagraphSeparator by those encoders. */
#define ASCIINewLine 0x0a
/*
 * Bitmap consulted by CFStringEncodingIsValidCombiningCharacterForLatin1().
 * Its three 32-bit words hold one bit per code point in U+0300..U+035F;
 * within each word the most significant bit belongs to the lowest code point.
 */
static const UInt32 __CFLatin1CombiningCharBitmap[] = { 0xFBB94010, 0x01800000, 0x0000000,
};
/*
 * Reports whether `character` is flagged in __CFLatin1CombiningCharBitmap.
 *
 * Only code points in U+0300..U+035F are looked up; every other value
 * yields false.
 */
Boolean CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character) {
    UInt32 offset;
    UInt32 mask;

    if ((character < 0x300) || (character >= 0x360)) return false;

    offset = (UInt32)(character - 0x300);
    /*
     * Build the mask from an unsigned 1: offsets that are multiples of 32
     * select bit 31, and shifting a signed int into its sign bit is undefined.
     */
    mask = (UInt32)1 << (31 - (offset % 32));

    return ((__CFLatin1CombiningCharBitmap[offset / 32] & mask) ? true : false);
}
/*
 * Folds a base character and the combining characters that follow it into a
 * single UTF-16 code unit.
 *
 * character - input UTF-16 code units; character[0] is the base.
 * numChars  - number of code units available at `character`.
 * usedChars - optional; receives the number of input units consumed.
 *
 * Returns the composed code unit. A leading surrogate is returned untouched
 * with one unit consumed. For empty input the result is 0xFFFD and zero units
 * are reported as consumed.
 */
UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, UInt32 numChars, UInt32 *usedChars) {
    UTF32Char ch, nextCh, composedChar;
    UInt32 usedCharLen = 1;

    if (numChars == 0) {
        /* Always define *usedChars so a caller never reads an indeterminate count. */
        if (usedChars) (*usedChars) = 0;
        return 0xFFFD;
    }

    ch = *(character++);

    if (!(CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch))) {
        while (usedCharLen < numChars) {
            nextCh = *(character++);

            /* Surrogates are never combined. */
            if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break;

            /* Only members of the non-base character set can attach to the base. */
            if (!CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet)) break;

            /* Stop when the pair yields 0xFFFD or a value that does not fit in one UTF-16 unit. */
            composedChar = CFUniCharPrecomposeCharacter(ch, nextCh);
            if ((composedChar == 0xFFFD) || (composedChar > 0xFFFF)) break;

            ch = composedChar;
            ++usedCharLen;
        }
    }

    if (usedChars) (*usedChars) = usedCharLen;
    return (UniChar)ch;
}
/*
 * Encodes one UTF-16 unit as ASCII.
 *
 * Values below 0x80 are stored unchanged and ParagraphSeparator is stored as
 * ASCIINewLine; for anything else false is returned and *byte is left alone.
 * `flags` is not consulted.
 */
static Boolean __CFToASCII(UInt32 flags, UniChar character, uint8_t *byte) {
    uint8_t encoded;

    if (character == ParagraphSeparator) {
        encoded = ASCIINewLine;
    } else if (character < 0x80) {
        encoded = (uint8_t)character;
    } else {
        return false;
    }

    *byte = encoded;
    return true;
}
/*
 * Decodes one ASCII byte.
 *
 * Bytes of 0x80 and above are rejected (false, *character untouched); every
 * other byte is widened to a UniChar. `flags` is not consulted.
 */
static Boolean __CFFromASCII(UInt32 flags, uint8_t byte, UniChar *character) {
    if (byte >= 0x80) return false;

    *character = (UniChar)byte;
    return true;
}
/*
 * Converter record for ASCII. The first two members are the single-character
 * encode/decode callbacks defined above.
 * NOTE(review): the meaning of the remaining positional values (1, 1, the
 * converter kind and the six NULL slots) comes from the
 * CFStringEncodingConverter declaration, which is not visible in this file.
 */
__private_extern__ CFStringEncodingConverter __CFConverterASCII = {
__CFToASCII, __CFFromASCII, 1, 1, kCFStringEncodingConverterCheapEightBit,
NULL, NULL, NULL, NULL, NULL, NULL,
};
/*
 * Encodes one UTF-16 unit as ISO Latin-1.
 *
 * Values up to 0xFF are stored unchanged and ParagraphSeparator is stored as
 * ASCIINewLine; for anything else false is returned and *byte is left alone.
 * `flags` is not consulted.
 */
static Boolean __CFToISOLatin1(UInt32 flags, UniChar character, uint8_t *byte) {
    uint8_t encoded;

    if (character == ParagraphSeparator) {
        encoded = ASCIINewLine;
    } else if (character <= 0xFF) {
        encoded = (uint8_t)character;
    } else {
        return false;
    }

    *byte = encoded;
    return true;
}
/*
 * Decodes one ISO Latin-1 byte: the byte value is widened to a UniChar as-is.
 * Always succeeds; `flags` is not consulted.
 */
static Boolean __CFFromISOLatin1(UInt32 flags, uint8_t byte, UniChar *character) {
    const UniChar widened = (UniChar)byte;

    *character = widened;
    return true;
}
/*
 * Tries to encode a base character together with the combining characters
 * that follow it as a single ISO Latin-1 byte.
 *
 * The sequence is first folded by CFStringEncodingPrecomposeLatinCharacter().
 * The result is accepted only if it encodes to a non-zero byte and more than
 * one input unit was folded. On success the byte is stored (when maxByteLen
 * is non-zero), *usedByteLen is set to 1 and the number of input units
 * consumed is returned; otherwise 0 is returned.
 */
static UInt32 __CFToISOLatin1Precompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
    UInt32 consumed;
    uint8_t encoded;
    UniChar composed = CFStringEncodingPrecomposeLatinCharacter(character, numChars, &consumed);

    if (!__CFToISOLatin1(flags, composed, &encoded)) return 0;
    if ((encoded == 0) || (consumed <= 1)) return 0;

    if (maxByteLen) *bytes = encoded;
    *usedByteLen = 1;
    return consumed;
}
/*
 * Converter record for ISO Latin-1. It carries the single-character
 * encode/decode callbacks, and in its last two slots the precompose routine
 * and the Latin-1 combining-character predicate.
 * NOTE(review): the meaning of each positional slot comes from the
 * CFStringEncodingConverter declaration, which is not visible in this file.
 */
__private_extern__ CFStringEncodingConverter __CFConverterISOLatin1 = {
__CFToISOLatin1, __CFFromISOLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
NULL, NULL, NULL, NULL, __CFToISOLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
};
/*
 * Unicode -> MacRoman lookup table: { code point, MacRoman byte } pairs for
 * code points of 0x80 and above, handed to
 * CFStringEncodingUnicodeTo8BitEncoding() by __CFToMacRoman().
 * NUM_MACROMAN_FROM_UNI must equal the number of entries.
 * Entries are listed in ascending code-point order.
 * NOTE(review): the lookup helper presumably depends on that ordering —
 * confirm before inserting or reordering entries.
 */
#define NUM_MACROMAN_FROM_UNI 129
static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni[NUM_MACROMAN_FROM_UNI] = {
{ 0x00A0, 0xCA },
{ 0x00A1, 0xC1 },
{ 0x00A2, 0xA2 },
{ 0x00A3, 0xA3 },
{ 0x00A5, 0xB4 },
{ 0x00A7, 0xA4 },
{ 0x00A8, 0xAC },
{ 0x00A9, 0xA9 },
{ 0x00AA, 0xBB },
{ 0x00AB, 0xC7 },
{ 0x00AC, 0xC2 },
{ 0x00AE, 0xA8 },
{ 0x00AF, 0xF8 },
{ 0x00B0, 0xA1 },
{ 0x00B1, 0xB1 },
{ 0x00B4, 0xAB },
{ 0x00B5, 0xB5 },
{ 0x00B6, 0xA6 },
{ 0x00B7, 0xE1 },
{ 0x00B8, 0xFC },
{ 0x00BA, 0xBC },
{ 0x00BB, 0xC8 },
{ 0x00BF, 0xC0 },
{ 0x00C0, 0xCB },
{ 0x00C1, 0xE7 },
{ 0x00C2, 0xE5 },
{ 0x00C3, 0xCC },
{ 0x00C4, 0x80 },
{ 0x00C5, 0x81 },
{ 0x00C6, 0xAE },
{ 0x00C7, 0x82 },
{ 0x00C8, 0xE9 },
{ 0x00C9, 0x83 },
{ 0x00CA, 0xE6 },
{ 0x00CB, 0xE8 },
{ 0x00CC, 0xED },
{ 0x00CD, 0xEA },
{ 0x00CE, 0xEB },
{ 0x00CF, 0xEC },
{ 0x00D1, 0x84 },
{ 0x00D2, 0xF1 },
{ 0x00D3, 0xEE },
{ 0x00D4, 0xEF },
{ 0x00D5, 0xCD },
{ 0x00D6, 0x85 },
{ 0x00D8, 0xAF },
{ 0x00D9, 0xF4 },
{ 0x00DA, 0xF2 },
{ 0x00DB, 0xF3 },
{ 0x00DC, 0x86 },
{ 0x00DF, 0xA7 },
{ 0x00E0, 0x88 },
{ 0x00E1, 0x87 },
{ 0x00E2, 0x89 },
{ 0x00E3, 0x8B },
{ 0x00E4, 0x8A },
{ 0x00E5, 0x8C },
{ 0x00E6, 0xBE },
{ 0x00E7, 0x8D },
{ 0x00E8, 0x8F },
{ 0x00E9, 0x8E },
{ 0x00EA, 0x90 },
{ 0x00EB, 0x91 },
{ 0x00EC, 0x93 },
{ 0x00ED, 0x92 },
{ 0x00EE, 0x94 },
{ 0x00EF, 0x95 },
{ 0x00F1, 0x96 },
{ 0x00F2, 0x98 },
{ 0x00F3, 0x97 },
{ 0x00F4, 0x99 },
{ 0x00F5, 0x9B },
{ 0x00F6, 0x9A },
{ 0x00F7, 0xD6 },
{ 0x00F8, 0xBF },
{ 0x00F9, 0x9D },
{ 0x00FA, 0x9C },
{ 0x00FB, 0x9E },
{ 0x00FC, 0x9F },
{ 0x00FF, 0xD8 },
{ 0x0131, 0xF5 },
{ 0x0152, 0xCE },
{ 0x0153, 0xCF },
{ 0x0178, 0xD9 },
{ 0x0192, 0xC4 },
{ 0x02C6, 0xF6 },
{ 0x02C7, 0xFF },
{ 0x02D8, 0xF9 },
{ 0x02D9, 0xFA },
{ 0x02DA, 0xFB },
{ 0x02DB, 0xFE },
{ 0x02DC, 0xF7 },
{ 0x02DD, 0xFD },
{ 0x03A9, 0xBD },
{ 0x03C0, 0xB9 },
{ 0x2013, 0xD0 },
{ 0x2014, 0xD1 },
{ 0x2018, 0xD4 },
{ 0x2019, 0xD5 },
{ 0x201A, 0xE2 },
{ 0x201C, 0xD2 },
{ 0x201D, 0xD3 },
{ 0x201E, 0xE3 },
{ 0x2020, 0xA0 },
{ 0x2021, 0xE0 },
{ 0x2022, 0xA5 },
{ 0x2026, 0xC9 },
{ 0x2030, 0xE4 },
{ 0x2039, 0xDC },
{ 0x203A, 0xDD },
{ 0x2044, 0xDA },
{ 0x20AC, 0xDB },
{ 0x2122, 0xAA },
{ 0x2126, 0xBD },
{ 0x2202, 0xB6 },
{ 0x2206, 0xC6 },
{ 0x220F, 0xB8 },
{ 0x2211, 0xB7 },
{ 0x221A, 0xC3 },
{ 0x221E, 0xB0 },
{ 0x222B, 0xBA },
{ 0x2248, 0xC5 },
{ 0x2260, 0xAD },
{ 0x2264, 0xB2 },
{ 0x2265, 0xB3 },
{ 0x25CA, 0xD7 },
{ 0xF8FF, 0xF0 },
{ 0xFB01, 0xDE },
{ 0xFB02, 0xDF },
};
/*
 * Encodes one UTF-16 unit as MacRoman.
 *
 * Values below 0x80 are stored unchanged; everything else is looked up in
 * macRoman_from_uni and the lookup's result is returned. `flags` is not
 * consulted.
 */
static Boolean __CFToMacRoman(UInt32 flags, UniChar character, uint8_t *byte) {
    if (character >= 0x80) {
        return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni, NUM_MACROMAN_FROM_UNI, character, byte);
    }

    *byte = (uint8_t)character;
    return true;
}
/*
 * MacRoman -> Unicode table for the upper half of the byte range:
 * entry [b - 0x80] is the UTF-16 value for MacRoman byte b (0x80..0xFF).
 * Indexed by __CFFromMacRoman().
 */
static const UniChar macRoman_to_uni[128] = {
0x00C4,
0x00C5,
0x00C7,
0x00C9,
0x00D1,
0x00D6,
0x00DC,
0x00E1,
0x00E0,
0x00E2,
0x00E4,
0x00E3,
0x00E5,
0x00E7,
0x00E9,
0x00E8,
0x00EA,
0x00EB,
0x00ED,
0x00EC,
0x00EE,
0x00EF,
0x00F1,
0x00F3,
0x00F2,
0x00F4,
0x00F6,
0x00F5,
0x00FA,
0x00F9,
0x00FB,
0x00FC,
0x2020,
0x00B0,
0x00A2,
0x00A3,
0x00A7,
0x2022,
0x00B6,
0x00DF,
0x00AE,
0x00A9,
0x2122,
0x00B4,
0x00A8,
0x2260,
0x00C6,
0x00D8,
0x221E,
0x00B1,
0x2264,
0x2265,
0x00A5,
0x00B5,
0x2202,
0x2211,
0x220F,
0x03C0,
0x222B,
0x00AA,
0x00BA,
0x03A9,
0x00E6,
0x00F8,
0x00BF,
0x00A1,
0x00AC,
0x221A,
0x0192,
0x2248,
0x2206,
0x00AB,
0x00BB,
0x2026,
0x00A0,
0x00C0,
0x00C3,
0x00D5,
0x0152,
0x0153,
0x2013,
0x2014,
0x201C,
0x201D,
0x2018,
0x2019,
0x00F7,
0x25CA,
0x00FF,
0x0178,
0x2044,
0x20AC,
0x2039,
0x203A,
0xFB01,
0xFB02,
0x2021,
0x00B7,
0x201A,
0x201E,
0x2030,
0x00C2,
0x00CA,
0x00C1,
0x00CB,
0x00C8,
0x00CD,
0x00CE,
0x00CF,
0x00CC,
0x00D3,
0x00D4,
0xF8FF,
0x00D2,
0x00DA,
0x00DB,
0x00D9,
0x0131,
0x02C6,
0x02DC,
0x00AF,
0x02D8,
0x02D9,
0x02DA,
0x00B8,
0x02DD,
0x02DB,
0x02C7,
};
/*
 * Decodes one MacRoman byte. Bytes below 0x80 are widened as-is; bytes from
 * 0x80 upward are translated through macRoman_to_uni. Always succeeds;
 * `flags` is not consulted.
 */
static Boolean __CFFromMacRoman(UInt32 flags, uint8_t byte, UniChar *character) {
    if (byte >= 0x80) {
        *character = macRoman_to_uni[byte - 0x80];
    } else {
        *character = (UniChar)byte;
    }
    return true;
}
/*
 * Tries to encode a base character together with the combining characters
 * that follow it as a single MacRoman byte.
 *
 * The sequence is first folded by CFStringEncodingPrecomposeLatinCharacter().
 * The result is accepted only if it encodes to a non-zero byte and more than
 * one input unit was folded. On success the byte is stored (when maxByteLen
 * is non-zero), *usedByteLen is set to 1 and the number of input units
 * consumed is returned; otherwise 0 is returned.
 */
static UInt32 __CFToMacRomanPrecompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
    UInt32 consumed;
    uint8_t encoded;
    UniChar composed = CFStringEncodingPrecomposeLatinCharacter(character, numChars, &consumed);

    if (!__CFToMacRoman(flags, composed, &encoded)) return 0;
    if ((encoded == 0) || (consumed <= 1)) return 0;

    if (maxByteLen) *bytes = encoded;
    *usedByteLen = 1;
    return consumed;
}
/*
 * Converter record for MacRoman. It carries the single-character
 * encode/decode callbacks, and in its last two slots the precompose routine
 * and the Latin-1 combining-character predicate.
 * NOTE(review): the meaning of each positional slot comes from the
 * CFStringEncodingConverter declaration, which is not visible in this file.
 */
__private_extern__ CFStringEncodingConverter __CFConverterMacRoman = {
__CFToMacRoman, __CFFromMacRoman, 1, 1, kCFStringEncodingConverterCheapEightBit,
NULL, NULL, NULL, NULL, __CFToMacRomanPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
};
/*
 * Unicode -> Windows code page 1252 lookup table: { code point, byte } pairs
 * for the characters CP1252 places in 0x80..0x9F. __CFToWinLatin1() falls back
 * to this table for everything it does not copy through directly.
 * NUM_1252_FROM_UNI must equal the number of entries.
 * Entries are listed in ascending code-point order.
 * NOTE(review): the lookup helper presumably depends on that ordering —
 * confirm before inserting or reordering entries.
 */
#define NUM_1252_FROM_UNI 27
static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni[NUM_1252_FROM_UNI] = {
{0x0152, 0x8C}, {0x0153, 0x9C}, {0x0160, 0x8A}, {0x0161, 0x9A}, {0x0178, 0x9F}, {0x017D, 0x8E}, {0x017E, 0x9E}, {0x0192, 0x83}, {0x02C6, 0x88}, {0x02DC, 0x98}, {0x2013, 0x96}, {0x2014, 0x97}, {0x2018, 0x91}, {0x2019, 0x92}, {0x201A, 0x82}, {0x201C, 0x93}, {0x201D, 0x94}, {0x201E, 0x84}, {0x2020, 0x86}, {0x2021, 0x87}, {0x2022, 0x95}, {0x2026, 0x85}, {0x2030, 0x89}, {0x2039, 0x8B}, {0x203A, 0x9B}, {0x20AC, 0x80}, {0x2122, 0x99}, };
/*
 * Encodes one UTF-16 unit as Windows code page 1252.
 *
 * Values below 0x80 and values in 0xA0..0xFF are stored unchanged; everything
 * else is looked up in cp1252_from_uni and the lookup's result is returned.
 * `flags` is not consulted.
 */
static Boolean __CFToWinLatin1(UInt32 flags, UniChar character, uint8_t *byte) {
    const int isAscii = (character < 0x80);
    const int isUpperLatin1 = ((character > 0x9F) && (character <= 0x00FF));

    if (!(isAscii || isUpperLatin1)) {
        return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni, NUM_1252_FROM_UNI, character, byte);
    }

    *byte = (uint8_t)character;
    return true;
}
/*
 * Windows code page 1252 -> Unicode table for bytes 0x80..0x9F:
 * entry [b - 0x80] is the UTF-16 value for byte b. Entries of 0xFFFD make
 * __CFFromWinLatin1() report failure for that byte.
 */
static const unsigned short cp1252_to_uni[32] = {
0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, };
/*
 * Decodes one Windows code page 1252 byte.
 *
 * Bytes in 0x80..0x9F are translated through cp1252_to_uni; all others are
 * widened as-is. *character is always written; the return value is false
 * exactly when the stored value is 0xFFFD. `flags` is not consulted.
 */
static Boolean __CFFromWinLatin1(UInt32 flags, uint8_t byte, UniChar *character) {
    UniChar decoded;

    if ((byte >= 0x80) && (byte <= 0x9F)) {
        decoded = cp1252_to_uni[byte - 0x80];
    } else {
        decoded = (UniChar)byte;
    }

    *character = decoded;
    return (decoded != 0xFFFD);
}
/*
 * Tries to encode a base character together with the combining characters
 * that follow it as a single Windows code page 1252 byte.
 *
 * The sequence is first folded by CFStringEncodingPrecomposeLatinCharacter().
 * The result is accepted only if it encodes to a non-zero byte and more than
 * one input unit was folded. On success the byte is stored (when maxByteLen
 * is non-zero), *usedByteLen is set to 1 and the number of input units
 * consumed is returned; otherwise 0 is returned.
 */
static UInt32 __CFToWinLatin1Precompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
    UInt32 consumed;
    uint8_t encoded;
    UniChar composed = CFStringEncodingPrecomposeLatinCharacter(character, numChars, &consumed);

    if (!__CFToWinLatin1(flags, composed, &encoded)) return 0;
    if ((encoded == 0) || (consumed <= 1)) return 0;

    if (maxByteLen) *bytes = encoded;
    *usedByteLen = 1;
    return consumed;
}
/*
 * Converter record for Windows code page 1252. It carries the
 * single-character encode/decode callbacks, and in its last two slots the
 * precompose routine and the Latin-1 combining-character predicate.
 * NOTE(review): the meaning of each positional slot comes from the
 * CFStringEncodingConverter declaration, which is not visible in this file.
 */
__private_extern__ CFStringEncodingConverter __CFConverterWinLatin1 = {
__CFToWinLatin1, __CFFromWinLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
NULL, NULL, NULL, NULL, __CFToWinLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
};
/*
 * Unicode -> NextStep Latin lookup table: { code point, byte } pairs handed
 * to CFStringEncodingUnicodeTo8BitEncoding() by __CFToNextStepLatin() for
 * code points of 0x80 and above. NUM_NEXTSTEP_FROM_UNI must equal the number
 * of entries. Besides ordinary characters the table maps U+2029 to 0x0a and
 * U+FFFD to 0xff.
 * Entries are listed in ascending code-point order.
 * NOTE(review): the lookup helper presumably depends on that ordering —
 * confirm before inserting or reordering entries.
 */
#define NUM_NEXTSTEP_FROM_UNI 128
static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab[NUM_NEXTSTEP_FROM_UNI] = {
{ 0x00a0, 0x80 },
{ 0x00a1, 0xa1 },
{ 0x00a2, 0xa2 },
{ 0x00a3, 0xa3 },
{ 0x00a4, 0xa8 },
{ 0x00a5, 0xa5 },
{ 0x00a6, 0xb5 },
{ 0x00a7, 0xa7 },
{ 0x00a8, 0xc8 },
{ 0x00a9, 0xa0 },
{ 0x00aa, 0xe3 },
{ 0x00ab, 0xab },
{ 0x00ac, 0xbe },
{ 0x00ae, 0xb0 },
{ 0x00af, 0xc5 },
{ 0x00b1, 0xd1 },
{ 0x00b2, 0xc9 },
{ 0x00b3, 0xcc },
{ 0x00b4, 0xc2 },
{ 0x00b5, 0x9d },
{ 0x00b6, 0xb6 },
{ 0x00b7, 0xb4 },
{ 0x00b8, 0xcb },
{ 0x00b9, 0xc0 },
{ 0x00ba, 0xeb },
{ 0x00bb, 0xbb },
{ 0x00bc, 0xd2 },
{ 0x00bd, 0xd3 },
{ 0x00be, 0xd4 },
{ 0x00bf, 0xbf },
{ 0x00c0, 0x81 },
{ 0x00c1, 0x82 },
{ 0x00c2, 0x83 },
{ 0x00c3, 0x84 },
{ 0x00c4, 0x85 },
{ 0x00c5, 0x86 },
{ 0x00c6, 0xe1 },
{ 0x00c7, 0x87 },
{ 0x00c8, 0x88 },
{ 0x00c9, 0x89 },
{ 0x00ca, 0x8a },
{ 0x00cb, 0x8b },
{ 0x00cc, 0x8c },
{ 0x00cd, 0x8d },
{ 0x00ce, 0x8e },
{ 0x00cf, 0x8f },
{ 0x00d0, 0x90 },
{ 0x00d1, 0x91 },
{ 0x00d2, 0x92 },
{ 0x00d3, 0x93 },
{ 0x00d4, 0x94 },
{ 0x00d5, 0x95 },
{ 0x00d6, 0x96 },
{ 0x00d7, 0x9e },
{ 0x00d8, 0xe9 },
{ 0x00d9, 0x97 },
{ 0x00da, 0x98 },
{ 0x00db, 0x99 },
{ 0x00dc, 0x9a },
{ 0x00dd, 0x9b },
{ 0x00de, 0x9c },
{ 0x00df, 0xfb },
{ 0x00e0, 0xd5 },
{ 0x00e1, 0xd6 },
{ 0x00e2, 0xd7 },
{ 0x00e3, 0xd8 },
{ 0x00e4, 0xd9 },
{ 0x00e5, 0xda },
{ 0x00e6, 0xf1 },
{ 0x00e7, 0xdb },
{ 0x00e8, 0xdc },
{ 0x00e9, 0xdd },
{ 0x00ea, 0xde },
{ 0x00eb, 0xdf },
{ 0x00ec, 0xe0 },
{ 0x00ed, 0xe2 },
{ 0x00ee, 0xe4 },
{ 0x00ef, 0xe5 },
{ 0x00f0, 0xe6 },
{ 0x00f1, 0xe7 },
{ 0x00f2, 0xec },
{ 0x00f3, 0xed },
{ 0x00f4, 0xee },
{ 0x00f5, 0xef },
{ 0x00f6, 0xf0 },
{ 0x00f7, 0x9f },
{ 0x00f8, 0xf9 },
{ 0x00f9, 0xf2 },
{ 0x00fa, 0xf3 },
{ 0x00fb, 0xf4 },
{ 0x00fc, 0xf6 },
{ 0x00fd, 0xf7 },
{ 0x00fe, 0xfc },
{ 0x00ff, 0xfd },
{ 0x0131, 0xf5 },
{ 0x0141, 0xe8 },
{ 0x0142, 0xf8 },
{ 0x0152, 0xea },
{ 0x0153, 0xfa },
{ 0x0192, 0xa6 },
{ 0x02c6, 0xc3 },
{ 0x02c7, 0xcf },
{ 0x02cb, 0xc1 },
{ 0x02d8, 0xc6 },
{ 0x02d9, 0xc7 },
{ 0x02da, 0xca },
{ 0x02db, 0xce },
{ 0x02dc, 0xc4 },
{ 0x02dd, 0xcd },
{ 0x2013, 0xb1 },
{ 0x2014, 0xd0 },
{ 0x2019, 0xa9 },
{ 0x201a, 0xb8 },
{ 0x201c, 0xaa },
{ 0x201d, 0xba },
{ 0x201e, 0xb9 },
{ 0x2020, 0xb2 },
{ 0x2021, 0xb3 },
{ 0x2022, 0xb7 },
{ 0x2026, 0xbc },
{ 0x2029, 0x0a },
{ 0x2030, 0xbd },
{ 0x2039, 0xac },
{ 0x203a, 0xad },
{ 0x2044, 0xa4 },
{ 0xfb01, 0xae },
{ 0xfb02, 0xaf },
{ 0xfffd, 0xff },
};
/*
 * Encodes one UTF-16 unit as NextStep Latin.
 *
 * Values below 0x80 are stored unchanged; everything else is looked up in
 * nextstep_from_tab and the lookup's result is returned. `flags` is not
 * consulted.
 */
static Boolean __CFToNextStepLatin(UInt32 flags, UniChar character, uint8_t *byte) {
    if (character < 0x80) {
        *byte = (uint8_t)character;
        return true;
    } else {
        return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab, NUM_NEXTSTEP_FROM_UNI, character, byte);
    }
}
/*
 * NextStep Latin -> Unicode table for the upper half of the byte range:
 * entry [b - 0x80] is the UTF-16 value for byte b (0x80..0xFF). The last two
 * entries are 0xFFFD, which makes __CFFromNextStepLatin() report failure for
 * bytes 0xFE and 0xFF.
 */
static const UniChar NSToPrecompUnicodeTable[128] = {
0x00a0,
0x00c0,
0x00c1,
0x00c2,
0x00c3,
0x00c4,
0x00c5,
0x00c7,
0x00c8,
0x00c9,
0x00ca,
0x00cb,
0x00cc,
0x00cd,
0x00ce,
0x00cf,
0x00d0,
0x00d1,
0x00d2,
0x00d3,
0x00d4,
0x00d5,
0x00d6,
0x00d9,
0x00da,
0x00db,
0x00dc,
0x00dd,
0x00de,
0x00b5,
0x00d7,
0x00f7,
0x00a9,
0x00a1,
0x00a2,
0x00a3,
0x2044,
0x00a5,
0x0192,
0x00a7,
0x00a4,
0x2019,
0x201c,
0x00ab,
0x2039,
0x203a,
0xFB01,
0xFB02,
0x00ae,
0x2013,
0x2020,
0x2021,
0x00b7,
0x00a6,
0x00b6,
0x2022,
0x201a,
0x201e,
0x201d,
0x00bb,
0x2026,
0x2030,
0x00ac,
0x00bf,
0x00b9,
0x02cb,
0x00b4,
0x02c6,
0x02dc,
0x00af,
0x02d8,
0x02d9,
0x00a8,
0x00b2,
0x02da,
0x00b8,
0x00b3,
0x02dd,
0x02db,
0x02c7,
0x2014,
0x00b1,
0x00bc,
0x00bd,
0x00be,
0x00e0,
0x00e1,
0x00e2,
0x00e3,
0x00e4,
0x00e5,
0x00e7,
0x00e8,
0x00e9,
0x00ea,
0x00eb,
0x00ec,
0x00c6,
0x00ed,
0x00aa,
0x00ee,
0x00ef,
0x00f0,
0x00f1,
0x0141,
0x00d8,
0x0152,
0x00ba,
0x00f2,
0x00f3,
0x00f4,
0x00f5,
0x00f6,
0x00e6,
0x00f9,
0x00fa,
0x00fb,
0x0131,
0x00fc,
0x00fd,
0x0142,
0x00f8,
0x0153,
0x00df,
0x00fe,
0x00ff,
0xFFFD,
0xFFFD
};
/*
 * Decodes one NextStep Latin byte.
 *
 * Bytes below 0x80 are widened as-is; bytes from 0x80 upward are translated
 * through NSToPrecompUnicodeTable. *character is always written; the return
 * value is false exactly when the stored value is 0xFFFD. `flags` is not
 * consulted.
 */
static Boolean __CFFromNextStepLatin(UInt32 flags, uint8_t byte, UniChar *character) {
    UniChar decoded = (UniChar)byte;

    if (byte >= 0x80) decoded = NSToPrecompUnicodeTable[byte - 0x80];

    *character = decoded;
    return (decoded != 0xFFFD);
}
/*
 * Tries to encode a base character together with the combining characters
 * that follow it as a single NextStep Latin byte.
 *
 * The sequence is first folded by CFStringEncodingPrecomposeLatinCharacter().
 * The result is accepted only if it encodes to a non-zero byte and more than
 * one input unit was folded. On success the byte is stored (when maxByteLen
 * is non-zero), *usedByteLen is set to 1 and the number of input units
 * consumed is returned; otherwise 0 is returned.
 */
static UInt32 __CFToNextStepLatinPrecompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
    uint8_t byte;
    /*
     * Start from 0 so the `> 1` test below is well defined even if the helper
     * stores no count. The 0xFFFD it returns for empty input is encodable in
     * this encoding (nextstep_from_tab maps it to 0xff), so the test is reached.
     */
    UInt32 usedCharLen = 0;

    if (__CFToNextStepLatin(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
        if (maxByteLen) *bytes = byte;
        *usedByteLen = 1;
        return usedCharLen;
    } else {
        return 0;
    }
}
/*
 * Converter record for NextStep Latin. It carries the single-character
 * encode/decode callbacks, and in its last two slots the precompose routine
 * and the Latin-1 combining-character predicate.
 * NOTE(review): the meaning of each positional slot comes from the
 * CFStringEncodingConverter declaration, which is not visible in this file.
 */
__private_extern__ CFStringEncodingConverter __CFConverterNextStepLatin = {
__CFToNextStepLatin, __CFFromNextStepLatin, 1, 1, kCFStringEncodingConverterCheapEightBit,
NULL, NULL, NULL, NULL, __CFToNextStepLatinPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
};
/* Constants shared by the UTF-8 <-> UTF-16 routines below. */
/* U+FFFD, substituted for input that cannot be converted. */
static const UInt32 kReplacementCharacter = 0x0000FFFDUL;
/* Largest value that fits in a single UTF-16 code unit. */
static const UInt32 kMaximumUCS2 = 0x0000FFFFUL;
/* Largest value representable as a UTF-16 surrogate pair. */
static const UInt32 kMaximumUTF16 = 0x0010FFFFUL;
/* Largest value __CFUTF8BytesToWriteForCharacter() assigns a length to. */
static const UInt32 kMaximumUCS4 = 0x7FFFFFFFUL;
/* Surrogate-pair arithmetic: each half carries halfShift bits (mask halfMask); halfBase is the offset added after combining the halves. */
static const int halfShift = 10;
static const UInt32 halfBase = 0x0010000UL;
static const UInt32 halfMask = 0x3FFUL;
/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
static const UInt32 kSurrogateHighStart = 0xD800UL;
static const UInt32 kSurrogateHighEnd = 0xDBFFUL;
static const UInt32 kSurrogateLowStart = 0xDC00UL;
static const UInt32 kSurrogateLowEnd = 0xDFFFUL;
/*
 * Indexed by the first byte of a UTF-8 sequence: the number of additional
 * bytes the decoders below expect to follow it. Bytes 0x00..0xBF map to 0.
 */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
 * Indexed by the trailing-byte count: the value the decoders subtract after
 * summing and shifting the raw bytes of a sequence, which removes the
 * accumulated lead/continuation marker bits.
 */
static const UTF32Char offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
 * Indexed by the total length of a UTF-8 sequence (1..6): the marker bits
 * OR-ed into its first byte by __CFToUTF8Core(). Index 0 is unused.
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/*
 * Returns how many bytes (1..6) are needed to write `ch` in UTF-8, or 0 when
 * `ch` is larger than kMaximumUCS4.
 */
CF_INLINE uint16_t __CFUTF8BytesToWriteForCharacter(UInt32 ch) {
    if (ch > kMaximumUCS4) return 0;
    if (ch >= 0x4000000) return 6;
    if (ch >= 0x200000) return 5;
    if (ch >= 0x10000) return 4;
    if (ch >= 0x800) return 3;
    if (ch >= 0x80) return 2;
    return 1;
}
/*
 * Encodes one code point as UTF-8 into `bytes`.
 *
 * ch         - code point to encode.
 * bytes      - destination buffer.
 * maxByteLen - space available at `bytes`.
 *
 * Returns the number of bytes stored, or 0 (storing nothing) when the
 * sequence does not fit in maxByteLen. A value for which
 * __CFUTF8BytesToWriteForCharacter() returns 0 is encoded as
 * kReplacementCharacter instead.
 */
CF_INLINE uint16_t __CFToUTF8Core(UInt32 ch, uint8_t *bytes, UInt32 maxByteLen) {
    uint16_t bytesToWrite = __CFUTF8BytesToWriteForCharacter(ch);
    const UInt32 byteMask = 0xBF;
    const UInt32 byteMark = 0x80;

    if (!bytesToWrite) {
        /*
         * U+FFFD occupies three bytes (EF BF BD). Writing it with a two-byte
         * length would drop its top bits and emit an invalid lead byte.
         */
        bytesToWrite = 3;
        ch = kReplacementCharacter;
    }

    if (maxByteLen < bytesToWrite) return 0;

    /* Fill from the last byte backwards; every case intentionally falls through. */
    switch (bytesToWrite) {
        case 6: bytes[5] = (ch | byteMark) & byteMask; ch >>= 6; /* fallthrough */
        case 5: bytes[4] = (ch | byteMark) & byteMask; ch >>= 6; /* fallthrough */
        case 4: bytes[3] = (ch | byteMark) & byteMask; ch >>= 6; /* fallthrough */
        case 3: bytes[2] = (ch | byteMark) & byteMask; ch >>= 6; /* fallthrough */
        case 2: bytes[1] = (ch | byteMark) & byteMask; ch >>= 6; /* fallthrough */
        case 1: bytes[0] = ch | firstByteMark[bytesToWrite];
    }

    return bytesToWrite;
}
/*
 * Converts UTF-16 to UTF-8.
 *
 * flags       - kCFStringEncodingUseHFSPlusCanonical switches off strict
 *               surrogate checking; no other flag is consulted here.
 * characters  - input UTF-16 code units.
 * numChars    - number of input code units.
 * bytes       - output buffer; not written when maxByteLen is 0.
 * maxByteLen  - capacity of `bytes`; 0 means "only count the bytes needed".
 * usedByteLen - optional; receives the number of bytes produced (or counted).
 *
 * Returns the number of input code units consumed. In strict mode conversion
 * stops in front of an unpaired surrogate. Conversion also stops in front of
 * a character whose encoding does not fit in the remaining output space; such
 * a character is never reported as partly consumed.
 */
static UInt32 __CFToUTF8(UInt32 flags, const UniChar *characters, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
    uint16_t bytesWritten;
    UInt32 ch;
    const UniChar *beginCharacter = characters;
    const UniChar *endCharacter = characters + numChars;
    const uint8_t *beginBytes = bytes;
    const uint8_t *endBytes = bytes + maxByteLen;
    bool isStrict = (flags & kCFStringEncodingUseHFSPlusCanonical ? false : true);

    while ((characters < endCharacter) && (!maxByteLen || (bytes < endBytes))) {
        /* Where the current character starts, so a failed step can be undone in full. */
        const UniChar *unitStart = characters;

        ch = *(characters++);

        if (ch < 0x80) {
            /* ASCII: one byte, always fits because bytes < endBytes was just checked. */
            if (maxByteLen) *bytes = ch;
            ++bytes;
        } else {
            if (ch >= kSurrogateHighStart) {
                if (ch <= kSurrogateHighEnd) {
                    if ((characters < endCharacter) && ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd))) {
                        /* Valid pair: combine both halves into one code point. */
                        ch = ((ch - kSurrogateHighStart) << halfShift) + (*(characters++) - kSurrogateLowStart) + halfBase;
                    } else if (isStrict) {
                        /* Unpaired high surrogate. */
                        characters = unitStart;
                        break;
                    }
                } else if (isStrict && (ch <= kSurrogateLowEnd)) {
                    /* Low surrogate with no preceding high surrogate. */
                    characters = unitStart;
                    break;
                }
            }

            if (!(bytesWritten = (maxByteLen ? __CFToUTF8Core(ch, bytes, endBytes - bytes) : __CFUTF8BytesToWriteForCharacter(ch)))) {
                /*
                 * Not enough room. Rewind to the start of the character: for a
                 * surrogate pair that is two units, and stepping back only one
                 * would leave the caller resuming on a lone low surrogate.
                 */
                characters = unitStart;
                break;
            }

            bytes += bytesWritten;
        }
    }

    if (usedByteLen) *usedByteLen = bytes - beginBytes;

    return characters - beginCharacter;
}
/*
 * Checks whether the `length` bytes starting at `source` have the byte-value
 * pattern this converter accepts for a single UTF-8 sequence.
 *
 * source - first byte of the candidate sequence.
 * length - total sequence length (lead byte included).
 *
 * Returns false for any length outside 1..4 and for any byte outside the
 * ranges tested below; true otherwise.
 */
CF_INLINE bool __CFIsLegalUTF8(const uint8_t *source, int length) {
uint8_t a;
const uint8_t *srcptr = source+length;
/* Bytes are examined from the last one backwards; each case falls through to the next. */
switch (length) {
default: return false;
/* Fourth and third bytes must lie in 0x80..0xBF. */
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
/* Second byte: upper bound 0xBF, lower bound depends on the lead byte. */
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
switch (*source) {
/* Lead 0xE0 needs a second byte >= 0xA0, lead 0xF0 needs >= 0x90, lead 0xF4 needs <= 0x8F. */
case 0xE0: if (a < 0xA0) return false; break;
case 0xF0: if (a < 0x90) return false; break;
case 0xF4: if (a > 0x8F) return false; break;
default: if (a < 0x80) return false;
}
/* Lead byte: 0x80..0xC1 and anything above 0xF4 are rejected. */
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
if (*source > 0xF4) return false;
}
return true;
}
/*
 * Converts UTF-8 to UTF-16.
 *
 * flags       - kCFStringEncodingUseHFSPlusCanonical: decompose, and switch
 *               off the strict surrogate / out-of-range checks.
 *               kCFStringEncodingUseCanonical: decompose.
 *               kCFStringEncodingLenientUTF8Conversion: skip the
 *               __CFIsLegalUTF8() check.
 *               kCFStringEncodingAllowLossyConversion: replace a rejected
 *               lead byte with U+FFFD instead of stopping.
 * bytes       - input UTF-8.
 * numBytes    - number of input bytes.
 * characters  - output buffer; not written when maxCharLen is 0.
 * maxCharLen  - capacity of `characters` in UTF-16 units; 0 means "only
 *               count the units needed".
 * usedCharLen - optional; receives the number of UTF-16 units produced (or
 *               counted).
 *
 * Returns the number of input bytes consumed. Conversion stops at a truncated
 * sequence, at a rejected sequence (unless it is replaced, see above), at an
 * encoded surrogate or out-of-range value in strict mode, or when the output
 * buffer is full.
 */
static UInt32 __CFFromUTF8(UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) {
    const uint8_t *source = bytes;
    uint16_t extraBytesToRead;
    UInt32 theUsedCharLen = 0;
    UInt32 ch;
    Boolean isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
    Boolean needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
    Boolean strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
    int32_t decompLength;
    bool isStrict = !isHFSPlus;

    while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
        extraBytesToRead = trailingBytesForUTF8[*source];

        /* Truncated sequence at the end of the input. */
        if (extraBytesToRead > --numBytes) break;
        numBytes -= extraBytesToRead;

        /* Sequences longer than four bytes are always rejected; others only if strict checking fails. */
        if ((strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1)) || (extraBytesToRead > 3)) {
            /* A lead byte of 0xA9 is replaced even when lossy conversion was not requested. */
            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
                /* Skip just the offending byte and hand the trailing bytes back to the loop. */
                numBytes += extraBytesToRead;
                ++source;
                if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
                ++theUsedCharLen;
                continue;
            } else {
                break;
            }
        }

        ch = 0;
        /* Accumulate the raw bytes; every case intentionally falls through. */
        switch (extraBytesToRead) {
            case 3: ch += *source++; ch <<= 6; /* fallthrough */
            case 2: ch += *source++; ch <<= 6; /* fallthrough */
            case 1: ch += *source++; ch <<= 6; /* fallthrough */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= kMaximumUCS2) {
            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
                /* Encoded surrogate: un-consume it and stop. */
                source -= (extraBytesToRead + 1);
                break;
            }
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);

                if (maxCharLen) {
                    /* NOTE(review): on failure `source` stays past this sequence; whether a rewind is needed depends on what CFUniCharFillDestinationBuffer leaves behind — confirm. */
                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, (uint32_t *)&theUsedCharLen, kCFUniCharUTF16Format)) break;
                } else {
                    theUsedCharLen += decompLength;
                }
            } else {
                if (maxCharLen) *(characters++) = (UTF16Char)ch;
                ++theUsedCharLen;
            }
        } else if (ch > kMaximumUTF16) {
            if (isStrict) {
                /* Beyond the UTF-16 range: un-consume it and stop. */
                source -= (extraBytesToRead + 1);
                break;
            }
            if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
            ++theUsedCharLen;
        } else {
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);

                if (maxCharLen) {
                    /* NOTE(review): same open question as the fill call above. */
                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, (uint32_t *)&theUsedCharLen, kCFUniCharUTF16Format)) break;
                } else {
                    while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
                }
            } else {
                if (maxCharLen) {
                    if ((theUsedCharLen + 2) > maxCharLen) {
                        /*
                         * No room for the surrogate pair. Un-consume the
                         * sequence so the returned byte count matches what was
                         * actually written and the caller can resume here.
                         */
                        source -= (extraBytesToRead + 1);
                        break;
                    }
                    ch -= halfBase;
                    *(characters++) = (ch >> halfShift) + kSurrogateHighStart;
                    *(characters++) = (ch & halfMask) + kSurrogateLowStart;
                }
                theUsedCharLen += 2;
            }
        }
    }

    if (usedCharLen) *usedCharLen = theUsedCharLen;

    return source - bytes;
}
/*
 * Computes how many UTF-8 bytes the given UTF-16 input needs.
 *
 * A high surrogate immediately followed by a low surrogate is measured as one
 * combined code point; every other unit is measured on its own. `flags` is
 * not consulted.
 */
static UInt32 __CFToUTF8Len(UInt32 flags, const UniChar *characters, UInt32 numChars) {
    UInt32 remaining = numChars;
    UInt32 total = 0;

    while (remaining > 0) {
        UInt32 codePoint = *characters++;
        --remaining;

        if ((codePoint >= kSurrogateHighStart) && (codePoint <= kSurrogateHighEnd) && (remaining > 0)) {
            if ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd)) {
                codePoint = ((codePoint - kSurrogateHighStart) << halfShift) + (*characters++ - kSurrogateLowStart) + halfBase;
                --remaining;
            }
        }

        total += __CFUTF8BytesToWriteForCharacter(codePoint);
    }

    return total;
}
/*
 * Counts how many UTF-16 units the given UTF-8 input converts to, following
 * the same steps as __CFFromUTF8() without writing any output.
 *
 * flags    - same flags as __CFFromUTF8().
 * source   - input UTF-8.
 * numBytes - number of input bytes.
 *
 * Returns the number of UTF-16 units counted up to the point where scanning
 * stops (end of input, truncated or rejected sequence, or an encoded
 * surrogate in strict mode).
 */
static UInt32 __CFFromUTF8Len(UInt32 flags, const uint8_t *source, UInt32 numBytes) {
uint16_t extraBytesToRead;
UInt32 theUsedCharLen = 0;
UInt32 ch;
Boolean isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
Boolean needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
Boolean strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
int32_t decompLength;
bool isStrict = !isHFSPlus;
while (numBytes) {
extraBytesToRead = trailingBytesForUTF8[*source];
/* Truncated sequence at the end of the input. */
if (extraBytesToRead > --numBytes) break;
numBytes -= extraBytesToRead;
/* Sequences longer than four bytes are always rejected; others only if strict checking fails. */
if ((strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1)) || (extraBytesToRead > 3)) {
/* A lead byte of 0xA9 is counted as a replacement even when lossy conversion was not requested. */
if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
/* Skip just the offending byte, count one unit for it, and rescan the rest. */
numBytes += extraBytesToRead;
++source;
++theUsedCharLen;
continue;
} else {
break;
}
}
ch = 0;
/* Accumulate the raw bytes; every case falls through to the next. */
switch (extraBytesToRead) {
case 3: ch += *source++; ch <<= 6;
case 2: ch += *source++; ch <<= 6;
case 1: ch += *source++; ch <<= 6;
case 0: ch += *source++;
}
ch -= offsetsFromUTF8[extraBytesToRead];
if (ch <= kMaximumUCS2) {
if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
break;
}
if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
theUsedCharLen += decompLength;
} else {
++theUsedCharLen;
}
} else if (ch > kMaximumUTF16) {
/* Counted as one unit regardless of isStrict, whereas __CFFromUTF8() stops here in strict mode. */
++theUsedCharLen;
} else {
if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
/* Each decomposed value below 0x10000 counts as one unit, anything larger as two. */
while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
} else {
/* A value above kMaximumUCS2 needs a surrogate pair. */
theUsedCharLen += 2;
}
}
}
return theUsedCharLen;
}
/*
 * Converter record for UTF-8. It carries the buffer-to-buffer conversion
 * routines __CFToUTF8 / __CFFromUTF8 and the matching length estimators
 * __CFToUTF8Len / __CFFromUTF8Len.
 * NOTE(review): 6 and 2 are presumably the maximum bytes per character and
 * the maximum UTF-16 units per decoded character — confirm against the
 * CFStringEncodingConverter declaration, which is not visible in this file.
 */
__private_extern__ CFStringEncodingConverter __CFConverterUTF8 = {
__CFToUTF8, __CFFromUTF8, 6, 2, kCFStringEncodingConverterStandard,
__CFToUTF8Len, __CFFromUTF8Len, NULL, NULL, NULL, NULL,
};