#include "config.h"
#include "TextCodecGtk.h"
#include "GOwnPtr.h"
#include "Logging.h"
#include "PlatformString.h"
#include <wtf/Assertions.h>
#include <wtf/HashMap.h>
#include <wtf/text/CString.h>
using std::min;
namespace WebCore {
#if (G_BYTE_ORDER == G_BIG_ENDIAN)
const gchar* WebCore::TextCodecGtk::m_internalEncodingName = "UTF-16BE";
#else
const gchar* WebCore::TextCodecGtk::m_internalEncodingName = "UTF-16LE";
#endif
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_UTF_8 = { "UTF-8", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_1 = { "ISO-8859-1", "CP819", "IBM819", "ISO-IR-100", "ISO8859-1", "ISO_8859-1", "ISO_8859-1:1987", "L1", "LATIN1", "CSISOLATIN1", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACROMAN = { "MACROMAN", "MAC", "MACINTOSH", "CSMACINTOSH", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_SHIFT_JIS = { "Shift_JIS", "MS_KANJI", "SHIFT-JIS", "SJIS", "CSSHIFTJIS", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_JP = { "EUC-JP", "EUC_JP", "EUCJP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE", "CSEUCPKDFMTJAPANESE", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_2022_JP = { "ISO-2022-JP", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_BIG5 = { "BIG5", "BIG-5", "BIG-FIVE", "BIG5", "BIGFIVE", "CN-BIG5", "CSBIG5", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_BIG5_HKSCS = { "BIG5-HKSCS", "BIG5-HKSCS:2004", "BIG5HKSCS", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP950 = { "CP950", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_2022_KR = { "ISO-2022-KR", "CSISO2022KR", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP949 = { "CP949", "UHC", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_KR = { "EUC-KR", "CSEUCKR", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_6 = { "ISO-8859-6", "ARABIC", "ASMO-708", "ECMA-114", "ISO-IR-127", "ISO8859-6", "ISO_8859-6", "ISO_8859-6:1987", "CSISOLATINARABIC", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1256 = { "windows-1256", "CP1256", "MS-ARAB", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_8 = { "ISO-8859-8", "HEBREW", "ISO-8859-8", "ISO-IR-138", "ISO8859-8", "ISO_8859-8", "ISO_8859-8:1988", "CSISOLATINHEBREW", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1255 = { "windows-1255", "CP1255", "MS-HEBR", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_7 = { "ISO-8859-7", "ECMA-118", "ELOT_928", "GREEK", "GREEK8", "ISO-IR-126", "ISO8859-7", "ISO_8859-7", "ISO_8859-7:1987", "ISO_8859-7:2003", "CSI", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP869 = { "CP869", "869", "CP-GR", "IBM869", "CSIBM869", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_WINDOWS_1253 = { "WINDOWS-1253", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_5 = { "ISO-8859-5", "CYRILLIC", "ISO-IR-144", "ISO8859-5", "ISO_8859-5", "ISO_8859-5:1988", "CSISOLATINCYRILLIC", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_KOI8_R = { "KOI8-R", "CSKOI8R", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP866 = { "CP866", "866", "IBM866", "CSIBM866", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_KOI8_U = { "KOI8-U", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_WINDOWS_1251 = { "windows-1251", "CP1251", 0 }; TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACCYRILLIC = { "mac-cyrillic", "MACCYRILLIC", "x-mac-cyrillic", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP874 = { "CP874", "WINDOWS-874", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_TIS_620 = { "TIS-620", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_GBK = { "GBK", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_HZ = { "HZ", "HZ-GB-2312", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_GB18030 = { "GB18030", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_CN = { "EUC-CN", "EUCCN", "GB2312", "CN-GB", "CSGB2312", "EUC_CN", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_2312_80 = { "GB_2312-80", "CHINESE", "csISO58GB231280", "GB2312.1980-0", "ISO-IR-58" };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_2 = { "ISO-8859-2", "ISO-IR-101", "ISO8859-2", "ISO_8859-2", "ISO_8859-2:1987", "L2", "LATIN2", "CSISOLATIN2", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1250 = { "CP1250", "MS-EE", "WINDOWS-1250", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACCENTRALEUROPE = { "MAC-CENTRALEUROPE", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1258 = { "CP1258", "WINDOWS-1258", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1254 = { "CP1254", "MS-TURK", "WINDOWS-1254", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_9 = { "ISO-8859-9", "ISO-IR-148", "ISO8859-9", "ISO_8859-9", "ISO_8859-9:1989", "L5", "LATIN5", "CSISOLATIN5", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1257 = { "CP1257", "WINBALTRIM", "WINDOWS-1257", 0 };
TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_4 = { "ISO-8859-4", "ISO-IR-110", "ISO8859-4", "ISO_8859-4", "ISO_8859-4:1988", "L4", "LATIN4", "CSISOLATIN4", 0 };
gconstpointer const TextCodecGtk::m_iconvBaseCodecList[] = {
&m_codecAliases_UTF_8,
&m_codecAliases_ISO_8859_1
};
gconstpointer const TextCodecGtk::m_iconvExtendedCodecList[] =
{
&m_codecAliases_MACROMAN,
&m_codecAliases_SHIFT_JIS,
&m_codecAliases_EUC_JP,
&m_codecAliases_ISO_2022_JP,
&m_codecAliases_BIG5,
&m_codecAliases_BIG5_HKSCS,
&m_codecAliases_CP950,
&m_codecAliases_ISO_2022_KR,
&m_codecAliases_CP949,
&m_codecAliases_EUC_KR,
&m_codecAliases_ISO_8859_6,
&m_codecAliases_CP1256,
&m_codecAliases_ISO_8859_8,
&m_codecAliases_CP1255,
&m_codecAliases_ISO_8859_7,
&m_codecAliases_CP869,
&m_codecAliases_WINDOWS_1253,
&m_codecAliases_ISO_8859_5,
&m_codecAliases_KOI8_R,
&m_codecAliases_CP866,
&m_codecAliases_KOI8_U,
&m_codecAliases_WINDOWS_1251,
&m_codecAliases_MACCYRILLIC,
&m_codecAliases_CP874,
&m_codecAliases_TIS_620,
&m_codecAliases_GBK,
&m_codecAliases_HZ,
&m_codecAliases_GB18030,
&m_codecAliases_EUC_CN,
&m_codecAliases_2312_80,
&m_codecAliases_ISO_8859_2,
&m_codecAliases_CP1250,
&m_codecAliases_MACCENTRALEUROPE,
&m_codecAliases_CP1258,
&m_codecAliases_CP1254,
&m_codecAliases_ISO_8859_9,
&m_codecAliases_CP1257,
&m_codecAliases_ISO_8859_4
};
const size_t ConversionBufferSize = 16384;
static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*)
{
return new TextCodecGtk(encoding);
}
gboolean TextCodecGtk::isEncodingAvailable(const gchar* encName)
{
GIConv tester;
tester = g_iconv_open(m_internalEncodingName, encName);
if (tester == reinterpret_cast<GIConv>(-1)) {
return false;
} else {
g_iconv_close(tester);
tester = g_iconv_open(encName, m_internalEncodingName);
if (tester == reinterpret_cast<GIConv>(-1)) {
return false;
} else {
g_iconv_close(tester);
return true;
}
}
}
void TextCodecGtk::registerEncodingNames(EncodingNameRegistrar registrar, bool extended)
{
const void* const* encodingList;
unsigned int listLength = 0;
if (extended) {
encodingList = m_iconvExtendedCodecList;
listLength = sizeof(m_iconvExtendedCodecList)/sizeof(gpointer);
} else {
encodingList = m_iconvBaseCodecList;
listLength = sizeof(m_iconvBaseCodecList)/sizeof(gpointer);
}
for (unsigned int i = 0; i < listLength; ++i) {
codecAliasList *codecAliases = static_cast<codecAliasList*>(encodingList[i]);
int codecCount = 0;
const char *canonicalName;
canonicalName = (*codecAliases)[codecCount];
if (!isEncodingAvailable(canonicalName))
continue;
registrar(canonicalName, canonicalName);
const char *currentAlias;
while ((currentAlias = (*codecAliases)[++codecCount])) {
if (isEncodingAvailable(currentAlias))
registrar(currentAlias, canonicalName);
}
}
}
void TextCodecGtk::registerCodecs(TextCodecRegistrar registrar, bool extended)
{
const void* const* encodingList;
unsigned int listLength = 0;
if (extended) {
encodingList = m_iconvExtendedCodecList;
listLength = sizeof(m_iconvExtendedCodecList)/sizeof(gpointer);
} else {
encodingList = m_iconvBaseCodecList;
listLength = sizeof(m_iconvBaseCodecList)/sizeof(gpointer);
}
for (unsigned int i = 0; i < listLength; ++i) {
codecAliasList *codecAliases = static_cast<codecAliasList*>(encodingList[i]);
const gchar *codecName = (*codecAliases)[0];
if (isEncodingAvailable(codecName))
registrar(codecName, newTextCodecGtk, 0);
}
}
void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar)
{
registerEncodingNames(registrar, false);
}
void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar)
{
registerCodecs(registrar, false);
}
void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
registerEncodingNames(registrar, true);
}
void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar)
{
registerCodecs(registrar, true);
}
TextCodecGtk::TextCodecGtk(const TextEncoding& encoding)
: m_encoding(encoding)
, m_numBufferedBytes(0)
, m_iconvDecoder(reinterpret_cast<GIConv>(-1))
, m_iconvEncoder(reinterpret_cast<GIConv>(-1))
{
}
TextCodecGtk::~TextCodecGtk()
{
if (m_iconvDecoder != reinterpret_cast<GIConv>(-1)) {
g_iconv_close(m_iconvDecoder);
m_iconvDecoder = reinterpret_cast<GIConv>(-1);
}
if (m_iconvEncoder != reinterpret_cast<GIConv>(-1)) {
g_iconv_close(m_iconvEncoder);
m_iconvEncoder = reinterpret_cast<GIConv>(-1);
}
}
void TextCodecGtk::createIConvDecoder() const
{
ASSERT(m_iconvDecoder == reinterpret_cast<GIConv>(-1));
m_iconvDecoder = g_iconv_open(m_internalEncodingName, m_encoding.name());
}
void TextCodecGtk::createIConvEncoder() const
{
ASSERT(m_iconvDecoder == reinterpret_cast<GIConv>(-1));
m_iconvEncoder = g_iconv_open(m_encoding.name(), m_internalEncodingName);
}
String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
if (m_iconvDecoder == reinterpret_cast<GIConv>(-1)) {
createIConvDecoder();
ASSERT(m_iconvDecoder != reinterpret_cast<GIConv>(-1));
if (m_iconvDecoder == reinterpret_cast<GIConv>(-1)) {
LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
return String();
}
}
size_t countWritten, countRead, conversionLength;
const char* conversionBytes;
char* prefixedBytes = 0;
if (m_numBufferedBytes) {
conversionLength = length + m_numBufferedBytes;
prefixedBytes = static_cast<char*>(fastMalloc(conversionLength));
memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes);
memcpy(prefixedBytes + m_numBufferedBytes, bytes, length);
conversionBytes = prefixedBytes;
m_numBufferedBytes = 0;
} else {
conversionBytes = bytes;
conversionLength = length;
}
GOwnPtr<GError> err;
GOwnPtr<UChar> buffer;
buffer.outPtr() = reinterpret_cast<UChar*>(g_convert_with_iconv(conversionBytes, conversionLength, m_iconvDecoder, &countRead, &countWritten, &err.outPtr()));
if (err) {
LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", err->code, err->message);
m_numBufferedBytes = 0; fastFree(prefixedBytes);
sawError = true;
return String();
}
m_numBufferedBytes = conversionLength - countRead;
if (m_numBufferedBytes > 0) {
if (flush) {
LOG_ERROR("Partial bytes at end of input while flush requested.");
m_numBufferedBytes = 0; fastFree(prefixedBytes);
sawError = true;
return String();
}
memcpy(m_bufferedBytes, conversionBytes + countRead, m_numBufferedBytes);
}
fastFree(prefixedBytes);
Vector<UChar> result;
result.append(buffer.get(), countWritten / sizeof(UChar));
return String::adopt(result);
}
CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling)
{
if (!length)
return "";
if (m_iconvEncoder == reinterpret_cast<GIConv>(-1))
createIConvEncoder();
if (m_iconvEncoder == reinterpret_cast<GIConv>(-1))
return CString();
size_t count;
GOwnPtr<GError> err;
GOwnPtr<char> buffer;
buffer.outPtr() = g_convert_with_iconv(reinterpret_cast<const char*>(characters), length * sizeof(UChar), m_iconvEncoder, 0, &count, &err.outPtr());
if (err) {
LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", err->code, err->message);
return CString();
}
return CString(buffer.get(), count);
}
}