// collationbasedatabuilder.cpp [plain text]
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/localpointer.h"
#include "unicode/ucharstriebuilder.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "collation.h"
#include "collationbasedatabuilder.h"
#include "collationdata.h"
#include "collationdatabuilder.h"
#include "collationrootelements.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "utrie2.h"
#include "uvectr32.h"
#include "uvectr64.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
namespace {
/**
 * Three-way comparison of two 64-bit values by their unsigned bit patterns.
 * @return -1 if a sorts before b, 1 if a sorts after b, 0 if they are equal
 */
int32_t
compareInt64AsUnsigned(int64_t a, int64_t b) {
    const uint64_t ua = (uint64_t)a;
    const uint64_t ub = (uint64_t)b;
    if(ua == ub) { return 0; }
    return ua < ub ? -1 : 1;
}
/**
 * Binary search for ce in list, comparing elements with compareInt64AsUnsigned().
 * The list is expected to be sorted ascending in that order.
 * @return the index of ce if it is present;
 *         otherwise the bitwise complement (~) of the index where ce would be
 *         inserted, which is always negative
 */
int32_t
binarySearch(const UVector64 &list, int64_t ce) {
    int32_t hi = list.size();
    if(hi == 0) { return ~0; }
    int32_t lo = 0;
    for(;;) {
        const int32_t mid = (lo + hi) / 2;
        const int32_t order = compareInt64AsUnsigned(ce, list.elementAti(mid));
        if(order == 0) { return mid; }
        if(mid == lo) {
            // The window [lo, hi) is down to one element which is not ce:
            // ce belongs either right before it or right after it.
            return order < 0 ? ~lo : ~(lo + 1);
        }
        if(order < 0) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
}
}
/**
 * Constructs a builder with no Han primary range yet (first/last Han primary 0),
 * a Han step of 2 and numericPrimary 0x12000000.
 * errorCode is forwarded to the base class and to the rootElements vector.
 */
CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
        : CollationDataBuilder(errorCode),
          numericPrimary(0x12000000),
          firstHanPrimary(0),
          lastHanPrimary(0),
          hanStep(2),
          rootElements(errorCode) {}
// Nothing is released explicitly here; the body is intentionally empty.
CollationBaseDataBuilder::~CollationBaseDataBuilder() {
}
/**
 * One-time initialization of the builder's trie, compressible-byte table and
 * first root elements.
 *
 * Sets errorCode to U_INVALID_STATE_ERROR if a trie already exists.
 * Otherwise this
 * - clears all 256 compressibleBytes flags and sets only the one for
 *   Collation::UNASSIGNED_IMPLICIT_BYTE,
 * - opens the trie and stores the fixed mappings for U+0000..U+017F, U+FFFE,
 *   the Hangul syllable range, U+FFFD and U+FFFF,
 * - adds a mapping for the string U+FDD1 U+FDD0,
 * - seeds rootElements with the first-trailing, U+FFFD and U+FFFF CEs.
 */
void
CollationBaseDataBuilder::init(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(trie != NULL) {
        // init() has apparently been called before.
        errorCode = U_INVALID_STATE_ERROR;
        return;
    }

    uprv_memset(compressibleBytes, FALSE, 256);
    compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE;

    trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);

    // Explicitly store the value that was also passed to utrie2_open() above
    // for U+0000..U+017F.
    // NOTE(review): presumably this preallocates the trie blocks for this range -- confirm.
    UChar32 c = 0;
    while(c < 0x180) {
        utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
        ++c;
    }

    // U+FFFE gets a trie value only; no root element is added for it.
    utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);

    // All Hangul syllables share one CE32 with the HANGUL_TAG and index 0.
    const uint32_t syllableCE32 =
        Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
    utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END,
                      syllableCE32, TRUE, &errorCode);

    // Map the two-unit string U+FDD1 U+FDD0 (empty prefix)
    // to a CE with the first-unassigned primary.
    UnicodeString boundary((UChar)0xfdd1);
    boundary.append((UChar)0xfdd0);
    int64_t firstUnassignedCE = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
    add(UnicodeString(), boundary, &firstUnassignedCE, 1, errorCode);

    // The first-trailing CE gets a root element but no mapping.
    // It is appended directly rather than via addRootElement().
    const int64_t firstTrailingCE = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
    rootElements.addElement(firstTrailingCE, errorCode);

    // U+FFFD: trie value plus root element.
    uint32_t fffdCE32 = Collation::FFFD_CE32;
    utrie2_set32(trie, 0xfffd, fffdCE32, &errorCode);
    addRootElement(Collation::ceFromSimpleCE32(fffdCE32), errorCode);

    // U+FFFF: trie value plus root element.
    uint32_t maxCE32 = Collation::MAX_REGULAR_CE32;
    utrie2_set32(trie, 0xffff, maxCE32, &errorCode);
    addRootElement(Collation::ceFromSimpleCE32(maxCE32), errorCode);
}
void
CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode) || length == 0) { return; }
if((length & 1) != 0) { errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(isAssigned(0x4e00)) { errorCode = U_INVALID_STATE_ERROR;
return;
}
int32_t numHanCodePoints = 0;
for(int32_t i = 0; i < length; i += 2) {
UChar32 start = ranges[i];
UChar32 end = ranges[i + 1];
numHanCodePoints += end - start + 1;
}
int32_t gap = 1;
hanStep = gap + 1;
int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
int32_t numHanPerLeadByte = 254 * 254;
int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
hanPrimary |= 0x20200;
firstHanPrimary = hanPrimary;
for(int32_t i = 0; i < length; i += 2) {
UChar32 start = ranges[i];
UChar32 end = ranges[i + 1];
hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
}
lastHanPrimary = hanPrimary;
}
/**
 * Returns the compressible flag stored for primary lead byte b.
 * @param b lead byte value; must be in the range 0..0xff
 */
UBool
CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
    // b is 32 bits wide but indexes the table that init() clears with 256 entries.
    // Catch out-of-range callers in debug builds instead of silently reading
    // past the end of the table.
    U_ASSERT(b <= 0xff);
    return compressibleBytes[b];
}
/**
 * Marks primary lead byte b as compressible.
 * @param b lead byte value; must be in the range 0..0xff
 */
void
CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
    // b is 32 bits wide but indexes the table that init() clears with 256 entries.
    // Catch out-of-range callers in debug builds instead of silently writing
    // past the end of the table.
    U_ASSERT(b <= 0xff);
    compressibleBytes[b] = TRUE;
}
/**
 * Returns the number of two-byte primary steps from p1 to p2 (p2 minus p1).
 *
 * @param p1 first primary weight
 * @param p2 second primary weight
 * @param isCompressible selects the second-byte layout used when the lead bytes
 *        differ: 251 values counted from 4 if TRUE, 254 values counted from 2 if FALSE
 */
int32_t
CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
    if(((p1 ^ p2) & 0xff000000) == 0) {
        // Same lead byte: the second bytes can be subtracted directly.
        return (int32_t)(p2 - p1) >> 16;
    }
    // Different lead bytes: turn each primary into a linear index first.
    const int32_t minSecondByte = isCompressible ? 4 : 2;
    const int32_t secondsPerLead = isCompressible ? 251 : 254;
    const int32_t index1 =
        secondsPerLead * (int32_t)((p1 >> 24) & 0xff) +
        ((int32_t)((p1 >> 16) & 0xff) - minSecondByte);
    const int32_t index2 =
        secondsPerLead * (int32_t)((p2 >> 24) & 0xff) +
        ((int32_t)((p2 >> 16) & 0xff) - minSecondByte);
    return index2 - index1;
}
/**
 * Returns the number of three-byte primary steps from p1 to p2 (p2 minus p1).
 *
 * @param p1 first primary weight
 * @param p2 second primary weight
 * @param isCompressible selects the second-byte layout used when the top two
 *        bytes differ: 251 values counted from 4 if TRUE, 254 values counted
 *        from 2 if FALSE. The third byte always has 254 values counted from 2.
 */
int32_t
CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
    if(((p1 ^ p2) & 0xffff0000) == 0) {
        // Same first two bytes: the third bytes can be subtracted directly.
        return (int32_t)(p2 - p1) >> 8;
    }
    // Otherwise turn each primary into a linear index first.
    const int32_t minSecondByte = isCompressible ? 4 : 2;
    const int32_t primariesPerLead = isCompressible ? 251 * 254 : 254 * 254;
    const int32_t index1 =
        primariesPerLead * (int32_t)((p1 >> 24) & 0xff) +
        254 * ((int32_t)((p1 >> 16) & 0xff) - minSecondByte) +
        ((int32_t)((p1 >> 8) & 0xff) - 2);
    const int32_t index2 =
        primariesPerLead * (int32_t)((p2 >> 24) & 0xff) +
        254 * ((int32_t)((p2 >> 16) & 0xff) - minSecondByte) +
        ((int32_t)((p2 >> 8) & 0xff) - 2);
    return index2 - index1;
}
/**
 * Records every CE as a root element, then delegates the actual encoding
 * to CollationDataBuilder::encodeCEs() and returns its result.
 */
uint32_t
CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
    addRootElements(ces, cesLength, errorCode);
    const uint32_t encoded = CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
    return encoded;
}
void
CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
for(int32_t i = 0; i < cesLength; ++i) {
addRootElement(ces[i], errorCode);
}
}
/**
 * Inserts ce into the sorted rootElements list unless it is already there.
 *
 * Skipped without error: ce == 0, and CEs with common secondary/tertiary
 * weights whose primary lies in [firstHanPrimary, lastHanPrimary].
 * Sets U_ILLEGAL_ARGUMENT_ERROR if a non-zero secondary or tertiary weight is
 * below Collation::COMMON_WEIGHT16, or if the primary has a non-zero fourth byte.
 */
void
CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(ce == 0) { return; }

    // Clear bits 15..14 of the CE before storing/comparing it.
    // NOTE(review): presumably these are the case bits -- confirm.
    ce &= INT64_C(0xffffffffffff3fff);
    U_ASSERT((ce & 0xc0) == 0);

    const uint32_t primary = (uint32_t)(ce >> 32);
    const uint32_t lower32 = (uint32_t)ce;
    if(lower32 != Collation::COMMON_SEC_AND_TER_CE) {
        // Non-zero secondary and tertiary weights must not be lower than "common".
        const uint32_t sec = lower32 >> 16;
        const uint32_t ter = lower32 & Collation::ONLY_TERTIARY_MASK;
        if((sec != 0 && sec < Collation::COMMON_WEIGHT16) ||
                (ter != 0 && ter < Collation::COMMON_WEIGHT16)) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
    } else if(firstHanPrimary <= primary && primary <= lastHanPrimary) {
        // Han primary with common weights: not stored in rootElements.
        return;
    }

    // The lowest byte of the primary must be unused.
    if((primary & 0xff) != 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // A negative search result is the complement of the insertion index.
    const int32_t found = binarySearch(rootElements, ce);
    if(found < 0) {
        rootElements.insertElementAt(ce, ~found, errorCode);
    }
}
/**
 * Appends one reordering group to the scripts string as:
 * one unit (firstByte << 8) | lastByte, one unit with the number of script
 * codes, then the script codes themselves.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR if groupScripts is empty
 * or contains USCRIPT_UNKNOWN.
 */
void
CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
                                             const UnicodeString &groupScripts,
                                             UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(groupScripts.isEmpty() || groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const UChar leadByteRange = (UChar)((firstByte << 8) | lastByte);
    const UChar scriptCount = (UChar)groupScripts.length();
    scripts.append(leadByteRange);
    scripts.append(scriptCount);
    scripts.append(groupScripts);
}
/**
 * Fills data: builds the mappings, copies numericPrimary, points data at this
 * builder's compressibleBytes table and scripts buffer, and finally builds the
 * fast-Latin table.
 * Note that data receives pointers into members of this builder.
 */
void
CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
    buildMappings(data, errorCode);

    data.numericPrimary = numericPrimary;
    data.compressibleBytes = compressibleBytes;

    const UChar *scriptUnits = scripts.getBuffer();
    data.scripts = reinterpret_cast<const uint16_t *>(scriptUnits);
    data.scriptsLength = scripts.length();

    buildFastLatinTable(data, errorCode);
}
/**
 * Serializes the sorted rootElements list into table as 32-bit units:
 * - a primary weight unit for each new primary,
 * - a "primary | step" unit that ends a range of evenly spaced primaries,
 * - a "secTer | CollationRootElements::SEC_TER_DELTA_FLAG" unit for each CE
 *   whose secondary/tertiary weights are not the common ones,
 * - CollationRootElements::PRIMARY_SENTINEL as the final unit.
 *
 * CEs with Han primaries and common secondary/tertiary weights are not in
 * rootElements (addRootElement() skips them); they are written here as ranges
 * from firstHanPrimary to lastHanPrimary with step hanStep, interrupted by
 * any Han primaries that are in the list.
 *
 * @param table     receives the units (appended)
 * @param errorCode in/out ICU error code; nothing is written if already failed
 */
void
CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    uint32_t nextHanPrimary = firstHanPrimary;  // 0xffffffff once all Han primaries are written
    uint32_t prevPrimary = 0;
    UBool tryRange = FALSE;  // TRUE while the previous CE had common sec/ter weights
    for(int32_t i = 0; i < rootElements.size(); ++i) {
        int64_t ce = rootElements.elementAti(i);
        uint32_t p = (uint32_t)(ce >> 32);
        uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
        if(p != prevPrimary) {
            U_ASSERT((p & 0xff) == 0);
            int32_t end;
            if(p >= nextHanPrimary) {
                // We have reached or passed the pending Han primaries: write them now.
                U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE);
                if(p == nextHanPrimary) {
                    // A single Han primary that is itself in the list.
                    table.addElement((int32_t)p, errorCode);
                    if(p < lastHanPrimary) {
                        // More Han primaries follow after p.
                        nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
                    } else {
                        nextHanPrimary = 0xffffffff;
                    }
                } else {
                    // p > nextHanPrimary: start a Han range at nextHanPrimary.
                    table.addElement((int32_t)nextHanPrimary, errorCode);
                    if(nextHanPrimary == lastHanPrimary) {
                        // nextHanPrimary == lastHanPrimary < p:
                        // Only the single last Han primary was pending.
                        nextHanPrimary = 0xffffffff;
                        // Fix: p itself must be written too. prevPrimary becomes p below,
                        // and any following delta units belong to p,
                        // not to the Han primary just written.
                        table.addElement((int32_t)p, errorCode);
                    } else if(p < lastHanPrimary) {
                        // nextHanPrimary < p < lastHanPrimary:
                        // End the range on p and continue with Han primaries after p.
                        table.addElement((int32_t)p | hanStep, errorCode);
                        nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
                    } else if(p == lastHanPrimary) {
                        // nextHanPrimary < p == lastHanPrimary: End the last range on p.
                        table.addElement((int32_t)p | hanStep, errorCode);
                        nextHanPrimary = 0xffffffff;
                    } else {
                        // nextHanPrimary < lastHanPrimary < p:
                        // End the last Han range, then write p on its own.
                        table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
                        nextHanPrimary = 0xffffffff;
                        table.addElement((int32_t)p, errorCode);
                    }
                }
            } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
                    (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
                // A range unit was written which ends with the primary of rootElements[end].
                // Continue from that element.
                ce = rootElements.elementAti(end);
                p = (uint32_t)(ce >> 32);
                secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
                i = end;
            } else {
                // Plain primary unit.
                table.addElement((int32_t)p, errorCode);
            }
            prevPrimary = p;
        }
        if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
            // No delta unit is written for common weights.
            // The next primary may be combined into a range.
            tryRange = TRUE;
        } else {
            table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
            tryRange = FALSE;
        }
    }

    // Terminating sentinel.
    table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
}
/**
 * Tries to combine consecutive root elements whose primaries increase by a
 * constant step into one range, and if successful writes the range-end unit
 * "primary | step" to table.
 *
 * @param prevPrimary the primary preceding p; asserted to be less than p
 * @param p           the primary of the current root element
 * @param i           index of the first root element to examine after p's
 *                    (the visible caller passes its own index + 1)
 * @param table       receives the range-end unit, if any
 * @param errorCode   in/out ICU error code
 * @return 0 if no range was written; otherwise the rootElements index of the
 *         last element covered by the range
 */
int32_t
CollationBaseDataBuilder::writeRootElementsRange(
uint32_t prevPrimary, uint32_t p, int32_t i,
UVector32 &table, UErrorCode &errorCode) {
if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
U_ASSERT(prevPrimary < p);
// No range if the second bytes of the two primaries have no bit in common,
// which in particular covers a zero second byte in either one.
if((p & prevPrimary & 0xff0000) == 0) { return 0; }
// If either primary is compressible, then both must have the same lead byte.
UBool isCompressible = isCompressiblePrimary(p);
if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
(p & 0xff000000) != (prevPrimary & 0xff000000)) {
return 0;
}
// twoBytes: whether the primaries have a zero third byte.
// step: distance from prevPrimary to p, in primaries of that length.
UBool twoBytes;
int32_t step;
if((p & 0xff00) == 0) {
// p has no third byte; prevPrimary must not have one either.
if((prevPrimary & 0xff00) != 0) { return 0; } twoBytes = TRUE;
step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
} else {
// p has a third byte; prevPrimary must have one too.
if((prevPrimary & 0xff00) == 0) { return 0; } twoBytes = FALSE;
step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
}
// The step must fit into the bits that the range-end unit has for it.
if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
// end stays 0 unless at least one more element continues the sequence;
// prevPrimary and p alone do not make a range.
int32_t end = 0; for(;;) {
prevPrimary = p;
// Compute the primary that would continue the sequence.
uint32_t nextPrimary; if(twoBytes) {
nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
} else {
nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
}
// Look at the actual next root element.
int64_t ce = rootElements.elementAti(i);
p = (uint32_t)(ce >> 32);
uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
// Stop if the element does not continue the sequence, or if it does so
// only by moving to a different lead byte while either side is compressible.
if(p != nextPrimary ||
((p & 0xff000000) != (prevPrimary & 0xff000000) &&
(isCompressible || isCompressiblePrimary(p)))) {
// The range, if any, ends with the previous primary.
p = prevPrimary;
break;
}
// Include element i in the range and move on to the next one.
end = i++;
// An element with non-common secondary/tertiary weights ends the range,
// as does the end of the list.
if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
}
// Write the range-end unit only if the range was extended at least once.
if(end != 0) {
table.addElement((int32_t)p | step, errorCode);
}
return end;
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION