#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unormimp.h"
#include "ustr_imp.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/udata.h"
#include "unicode/uchar.h"
#include "unicode/uiter.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/unorm.h"
#include "cmemory.h"
#include "umutex.h"
#include "utrie.h"
#include "unicode/uset.h"
/* Number of elements of a true array (must not be used on a pointer). */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
/* Capacity, in UChars, of the on-stack scratch buffer used by _quickCheck(). */
enum {
_STACK_BUFFER_CAPACITY=100
};
/*
 * Masks for the normalization "options" word as it is used to select
 * a cached exclusion set (see nxCache[] and internalGetNX()).
 */
enum {
/* low 5 bits: tested together with the UNORM_NX_* constants */
_NORM_OPTIONS_NX_MASK=0x1f,
/* bits 5..7: tested against UNORM_UNICODE_3_2 in internalGetNXUnicode() */
_NORM_OPTIONS_UNICODE_MASK=0xe0,
/* all set-selecting bits; nxCache[] has _NORM_OPTIONS_SETS_MASK+1 slots */
_NORM_OPTIONS_SETS_MASK=0xff,
/* bit position of the _NORM_OPTIONS_UNICODE_MASK field */
_NORM_OPTIONS_UNICODE_SHIFT=5
};
/*
 * Tell whether c lies in the HANGUL_COUNT code units starting at HANGUL_BASE
 * and its offset from HANGUL_BASE is a multiple of JAMO_T_COUNT.
 */
static inline UBool
isHangulWithoutJamoT(UChar c) {
    /* UChar arithmetic wraps, so a value below HANGUL_BASE becomes a large offset */
    c-=HANGUL_BASE;
    if(c>=HANGUL_COUNT) {
        return FALSE;
    }
    return c%JAMO_T_COUNT==0;
}
/* TRUE if the trie value norm32 is below the _NORM_MIN_SPECIAL threshold. */
static inline UBool
isNorm32Regular(uint32_t norm32) {
    return !(norm32>=_NORM_MIN_SPECIAL);
}
/*
 * TRUE if the trie value norm32 is in the half-open range
 * [_NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP[.
 */
static inline UBool
isNorm32LeadSurrogate(uint32_t norm32) {
    if(norm32<_NORM_MIN_SPECIAL) {
        return FALSE;
    }
    return norm32<_NORM_SURROGATES_TOP;
}
/* TRUE if the trie value norm32 is at or above _NORM_MIN_HANGUL. */
static inline UBool
isNorm32HangulOrJamo(uint32_t norm32) {
    return !(norm32<_NORM_MIN_HANGUL);
}
/*
 * TRUE if norm32 is below _NORM_MIN_JAMO_V.
 * Judging by the name, the caller is expected to have established already
 * that norm32 is a Hangul/Jamo value - TODO confirm against callers.
 */
static inline UBool
isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
    return !(norm32>=_NORM_MIN_JAMO_V);
}
/*
 * TRUE if norm32 is below _NORM_JAMO_V_TOP.
 * Judging by the name, the caller is expected to pass only Jamo V/T values -
 * TODO confirm against callers.
 */
static inline UBool
isJamoVTNorm32JamoV(uint32_t norm32) {
    return !(norm32>=_NORM_JAMO_V_TOP);
}
/*
 * Forward declarations of helpers that _quickCheck() calls; their
 * definitions are not in this part of the file (presumably further down).
 */
static const UChar *
_findPreviousStarter(const UChar *start, const UChar *src,
uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe);
static const UChar *
_findNextStarter(const UChar *src, const UChar *limit,
uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe);
/*
 * _quickCheck() passes its stack buffer plus buffer/bufferCapacity by
 * reference and afterwards frees buffer if it no longer equals stackBuffer.
 */
static const UChar *
_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
const UChar *prevStarter, const UChar *src,
uint32_t qcMask, uint8_t &prevCC,
const UnicodeSet *nx,
UErrorCode *pErrorCode);
/* Name and type of the data item that loadNormData() opens with udata_openChoice(). */
#define DATA_NAME "unorm"
#define DATA_TYPE "icu"
/* The opened data item; set by loadNormData(), closed by unorm_cleanup(). */
static UDataMemory *normData=NULL;
/* Error code of the load attempt; _haveData() copies it to its caller once a load was attempted. */
static UErrorCode dataErrorCode=U_ZERO_ERROR;
/* Load state: 0 = not attempted yet, 1 = data available, -1 = loading failed. */
static int8_t haveNormData=0;
/* Copy of the int32_t index array at the start of the data. */
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
/* The three tries unserialized from the data by loadNormData(). */
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
/*
 * Pointers into the loaded data: extraData follows the norm trie,
 * combiningTable follows extraData, and canonStartSets is only set
 * when formatVersion_2_1 is TRUE.
 */
static const uint16_t *extraData=NULL,
*combiningTable=NULL,
*canonStartSets=NULL;
/* Format version of the loaded data, recorded by isAcceptable(). */
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
/* TRUE if the data format version is at least 2.1 / 2.2 respectively. */
static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
/* Data version of the loaded data, recorded by isAcceptable(). */
static UVersionInfo dataVersion={ 0, 0, 0, 0 };
/* Cache of exclusion sets, indexed by (options & _NORM_OPTIONS_SETS_MASK); owned here, freed by unorm_cleanup(). */
static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
U_CDECL_BEGIN
/*
 * Release everything this file caches: close the loaded data item,
 * reset the load state and error code, and delete all cached
 * exclusion sets. Always returns TRUE.
 */
UBool
unorm_cleanup() {
    if(normData!=NULL) {
        udata_close(normData);
        normData=NULL;
    }
    dataErrorCode=U_ZERO_ERROR;
    haveNormData=0;

    /* delete each cached set and clear its slot */
    UnicodeSet **slot=nxCache;
    UnicodeSet **slotLimit=nxCache+LENGTHOF(nxCache);
    for(; slot!=slotLimit; ++slot) {
        delete *slot;
        *slot=NULL;
    }
    return TRUE;
}
/*
 * Folding-offset callback installed on the norm trie by loadNormData().
 * For a lead-surrogate value it returns the trie index offset derived
 * from the bits above _NORM_EXTRA_SHIFT; for any other value it returns 0.
 */
static int32_t U_CALLCONV
getFoldingNormOffset(uint32_t norm32) {
    if(!isNorm32LeadSurrogate(norm32)) {
        return 0;
    }
    int32_t blockOffset=
        ((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
        (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS);
    return UTRIE_BMP_INDEX_LENGTH+blockOffset;
}
/*
 * Folding-offset callback installed on the FCD trie by loadNormData():
 * the trie value itself is used as the offset.
 */
static int32_t U_CALLCONV
getFoldingFCDOffset(uint32_t data) {
    int32_t offset=(int32_t)data;
    return offset;
}
/*
 * Folding-offset callback installed on the aux trie by loadNormData():
 * the _NORM_AUX_FNC_MASK bits of the value, shifted left by
 * UTRIE_SURROGATE_BLOCK_BITS.
 */
static int32_t U_CALLCONV
getFoldingAuxOffset(uint32_t data) {
    int32_t maskedValue=(int32_t)(data&_NORM_AUX_FNC_MASK);
    return maskedValue<<UTRIE_SURROGATE_BLOCK_BITS;
}
/*
 * Acceptance callback passed to udata_openChoice() by loadNormData().
 * Accepts a data item only if its header matches this platform and the
 * expected format; on acceptance the format and data versions are copied
 * into the file-level formatVersion[] and dataVersion[].
 */
static UBool U_CALLCONV
isAcceptable(void * ,
             const char * , const char * ,
             const UDataInfo *pInfo) {
    if(pInfo->size<20) {
        return FALSE;
    }
    /* must match this platform's byte order and charset family */
    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
       pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }
    /* dataFormat must be the bytes 0x4e 0x6f 0x72 0x6d (ASCII "Norm") */
    if(pInfo->dataFormat[0]!=0x4e ||
       pInfo->dataFormat[1]!=0x6f ||
       pInfo->dataFormat[2]!=0x72 ||
       pInfo->dataFormat[3]!=0x6d) {
        return FALSE;
    }
    /* major version 2, built with the trie shift parameters compiled in here */
    if(pInfo->formatVersion[0]!=2 ||
       pInfo->formatVersion[2]!=UTRIE_SHIFT ||
       pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }
    uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
    uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
    return TRUE;
}
/*
 * Range callback used with utrie_enum() by unorm_addPropertyStarts():
 * adds the start of each enumerated range to the USet passed as context
 * and returns TRUE.
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 , uint32_t ) {
    USet *starts=(USet *)context;
    uset_add(starts, start);
    return TRUE;
}
U_CDECL_END
/*
 * Load the normalization data on first use and publish it in the
 * file-level variables (normData, indexes, the three tries, the table
 * pointers and the formatVersion_2_x flags).
 *
 * Returns haveNormData: 1 if data is available, -1 if opening or
 * unserializing failed (the failure is also stored in dataErrorCode),
 * or 0 if no load was attempted because errorCode was unusable.
 */
static int8_t
loadNormData(UErrorCode &errorCode) {
if(haveNormData==0) {
UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
UDataMemory *data;
const int32_t *p=NULL;
const uint8_t *pb;
/* do not attempt a load with a missing or already-failed error code */
if(&errorCode==NULL || U_FAILURE(errorCode)) {
return 0;
}
/* open the data item; isAcceptable() records formatVersion[] and dataVersion[] */
data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
dataErrorCode=errorCode;
if(U_FAILURE(errorCode)) {
return haveNormData=-1;
}
/* the data starts with _NORM_INDEX_TOP int32_t indexes, followed by the norm trie */
p=(const int32_t *)udata_getMemory(data);
pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
_normTrie.getFoldingOffset=getFoldingNormOffset;
/* skip the norm trie and the two 16-bit tables (counts are multiplied by 2 to get bytes) */
pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
_fcdTrie.getFoldingOffset=getFoldingFCDOffset;
/*
 * NOTE(review): the aux trie is unserialized whenever the FCD trie size is
 * non-zero; the condition does not look at _NORM_INDEX_AUX_TRIE_SIZE -
 * confirm that this is intended.
 */
if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
_auxTrie.getFoldingOffset=getFoldingAuxOffset;
}
/* any unserialize failure: remember it and give the data item back */
if(U_FAILURE(errorCode)) {
dataErrorCode=errorCode;
udata_close(data);
return haveNormData=-1;
}
/* publish under the global mutex; only the first caller's data item is kept */
umtx_lock(NULL);
if(normData==NULL) {
normData=data;
data=NULL;
uprv_memcpy(&indexes, p, sizeof(indexes));
uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
} else {
/* data was published already; derive the pointers below from that item */
p=(const int32_t *)udata_getMemory(normData);
}
umtx_unlock(NULL);
/* NOTE(review): the following pointers and flags are written after the mutex was released */
extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
if(formatVersion_2_1) {
/* the canonical start sets follow the combining table and both further tries (byte sizes halved to 16-bit units) */
canonStartSets=combiningTable+
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
(indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
}
haveNormData=1;
/* data is still non-NULL only if it was not published above: close the duplicate */
if(data!=NULL) {
udata_close(data);
}
}
return haveNormData;
}
/*
 * Make sure the normalization data is loaded.
 *
 * Follows the usual error-code convention: if errorCode already indicates
 * a failure on entry, nothing is done, errorCode is left untouched and
 * FALSE is returned. (Previously an incoming failure was overwritten with
 * dataErrorCode once a load had been attempted, while the very first call
 * refused to load - the two paths now agree.)
 *
 * Otherwise, if a load was attempted before, errorCode is set to the code
 * remembered from that attempt; if not, loadNormData() is called.
 *
 * @return TRUE if the data is available
 */
static inline UBool
_haveData(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    if(haveNormData!=0) {
        /* a load was attempted already: report its outcome */
        errorCode=dataErrorCode;
        return (UBool)(haveNormData>0);
    }
    return (UBool)(loadNormData(errorCode)>0);
}
/*
 * Exported wrapper around _haveData(): returns TRUE if the normalization
 * data is available. pErrorCode is dereferenced unconditionally.
 */
U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode *pErrorCode) {
    UErrorCode &status=*pErrorCode;
    return _haveData(status);
}
/*
 * Return the index array of the FCD trie, or NULL if the normalization
 * data is not available. pErrorCode is dereferenced unconditionally.
 */
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrie(UErrorCode *pErrorCode) {
    if(!_haveData(*pErrorCode)) {
        return NULL;
    }
    return fcdTrie.index;
}
/* Look up the 32-bit norm trie value for a single code unit. */
static inline uint32_t
_getNorm32(UChar c) {
    uint32_t norm32=UTRIE_GET32_FROM_LEAD(&normTrie, c);
    return norm32;
}
/*
 * Look up the norm trie value for a surrogate pair.
 * norm32 is the trie value of the lead surrogate; it is turned into a trie
 * offset (same formula as getFoldingNormOffset()), which is then combined
 * with the trail surrogate c2.
 */
static inline uint32_t
_getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
    uint32_t offset=
        UTRIE_BMP_INDEX_LENGTH+
            ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
             (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
    return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, offset, c2);
}
/*
 * Look up the norm trie value for the code point starting at p.
 * The second code unit p[1] is read only if the first unit's value has
 * bits in mask and is a lead-surrogate value; the function does not check
 * that p[1] is a trail surrogate or lies inside the string.
 */
static inline uint32_t
_getNorm32(const UChar *p, uint32_t mask) {
    uint32_t norm32=_getNorm32(p[0]);
    if((norm32&mask)==0 || !isNorm32LeadSurrogate(norm32)) {
        return norm32;
    }
    return _getNorm32FromSurrogatePair(norm32, p[1]);
}
/*
 * Look up the 16-bit FCD trie value for a single code unit.
 * Callers in this file use the upper byte as the lead combining class
 * and the lower byte as the trail combining class.
 */
static inline uint16_t
_getFCD16(UChar c) {
    uint16_t fcd16=UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
    return fcd16;
}
/*
 * Look up the FCD trie value for a surrogate pair: fcd16 is the value of
 * the lead surrogate (used as the trie offset), c2 is the trail surrogate.
 */
static inline uint16_t
_getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
    uint16_t pairFCD16=UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
    return pairFCD16;
}
/* Return the address in extraData[] that is indexed by the bits of norm32 above _NORM_EXTRA_SHIFT. */
static inline const uint16_t *
_getExtraData(uint32_t norm32) {
    return &extraData[norm32>>_NORM_EXTRA_SHIFT];
}
/*
 * Return the cached exclusion set for UNORM_NX_HANGUL, creating it on first
 * use as the code point range U+AC00..U+D7A3.
 *
 * Does not check for an incoming failure in errorCode. On allocation
 * failure sets U_MEMORY_ALLOCATION_ERROR and returns NULL.
 * The returned set is owned by nxCache[].
 */
static const UnicodeSet *
internalGetNXHangul(UErrorCode &errorCode) {
    umtx_lock(NULL);
    UBool needsInit= nxCache[UNORM_NX_HANGUL]==NULL;
    umtx_unlock(NULL);

    if(!needsInit) {
        return nxCache[UNORM_NX_HANGUL];
    }

    /* build the set outside the mutex */
    UnicodeSet *fresh=new UnicodeSet(0xac00, 0xd7a3);
    if(fresh==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    /* install it unless another thread did so in the meantime */
    umtx_lock(NULL);
    if(nxCache[UNORM_NX_HANGUL]==NULL) {
        nxCache[UNORM_NX_HANGUL]=fresh;
        fresh=NULL;
    }
    umtx_unlock(NULL);

    /* non-NULL only if our copy lost the race */
    delete fresh;
    return nxCache[UNORM_NX_HANGUL];
}
/*
 * Return the cached exclusion set for UNORM_NX_CJK_COMPAT, creating it on
 * first use: all code points of [:Ideographic:] whose norm trie value has
 * the _NORM_QC_NFD bit set.
 *
 * Does not check for an incoming failure in errorCode. Sets errorCode and
 * returns NULL if a set cannot be allocated or the pattern fails.
 * NOTE(review): reads normTrie directly without calling _haveData() -
 * callers presumably must have loaded the data first; confirm.
 */
static const UnicodeSet *
internalGetNXCJKCompat(UErrorCode &errorCode) {
    umtx_lock(NULL);
    UBool needsInit= nxCache[UNORM_NX_CJK_COMPAT]==NULL;
    umtx_unlock(NULL);

    if(!needsInit) {
        return nxCache[UNORM_NX_CJK_COMPAT];
    }

    /* candidate code points */
    UnicodeSet *ideographs=new UnicodeSet(UNICODE_STRING("[:Ideographic:]", 15), errorCode);
    if(ideographs==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if(U_FAILURE(errorCode)) {
        delete ideographs;
        return NULL;
    }

    /* result: the candidates that have the NFD quick-check bit */
    UnicodeSet *decomposing=new UnicodeSet();
    if(decomposing==NULL) {
        delete ideographs;
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    UnicodeSetIterator ranges(*ideographs);
    while(ranges.nextRange() && !ranges.isString()) {
        UChar32 cp=ranges.getCodepoint();
        UChar32 rangeEnd=ranges.getCodepointEnd();
        for(; cp<=rangeEnd; ++cp) {
            uint32_t norm32;
            UTRIE_GET32(&normTrie, cp, norm32);
            if((norm32&_NORM_QC_NFD)!=0) {
                decomposing->add(cp);
            }
        }
    }

    /* install the result unless another thread did so in the meantime */
    umtx_lock(NULL);
    if(nxCache[UNORM_NX_CJK_COMPAT]==NULL) {
        nxCache[UNORM_NX_CJK_COMPAT]=decomposing;
        decomposing=NULL;
    }
    umtx_unlock(NULL);

    delete decomposing;
    delete ideographs;
    return nxCache[UNORM_NX_CJK_COMPAT];
}
/*
 * Return the cached exclusion set selected by the Unicode-version bits of
 * options (_NORM_OPTIONS_UNICODE_MASK), creating it on first use.
 *
 * Returns NULL without error if no version bits are set. The only value
 * handled is UNORM_UNICODE_3_2, which yields the set [:^Age=3.2:]; any
 * other value sets U_ILLEGAL_ARGUMENT_ERROR and returns NULL.
 * Does not check for an incoming failure in errorCode.
 */
static const UnicodeSet *
internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
    options&=_NORM_OPTIONS_UNICODE_MASK;
    if(options==0) {
        return NULL;
    }

    umtx_lock(NULL);
    UBool needsInit= nxCache[options]==NULL;
    umtx_unlock(NULL);

    if(!needsInit) {
        return nxCache[options];
    }

    if(options!=UNORM_UNICODE_3_2) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    UnicodeSet *fresh=new UnicodeSet(UNICODE_STRING("[:^Age=3.2:]", 12), errorCode);
    if(fresh==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if(U_FAILURE(errorCode)) {
        delete fresh;
        return NULL;
    }

    /* install it unless another thread did so in the meantime */
    umtx_lock(NULL);
    if(nxCache[options]==NULL) {
        nxCache[options]=fresh;
        fresh=NULL;
    }
    umtx_unlock(NULL);

    delete fresh;
    return nxCache[options];
}
/*
 * Return the cached exclusion set for (options & _NORM_OPTIONS_SETS_MASK),
 * creating it on first use.
 *
 * Option values that select exactly one basic set are delegated to the
 * function that builds and caches that set. Any other combination gets a
 * new set that is the union of the selected basic sets, cached under the
 * combined option value.
 * Does not check for an incoming failure in errorCode; returns NULL and
 * sets errorCode on failure.
 */
static const UnicodeSet *
internalGetNX(int32_t options, UErrorCode &errorCode) {
    options&=_NORM_OPTIONS_SETS_MASK;

    umtx_lock(NULL);
    UBool needsInit= nxCache[options]==NULL;
    umtx_unlock(NULL);

    if(!needsInit) {
        return nxCache[options];
    }

    /* exactly one basic set selected: use its own getter */
    if(options==UNORM_NX_HANGUL) {
        return internalGetNXHangul(errorCode);
    }
    if(options==UNORM_NX_CJK_COMPAT) {
        return internalGetNXCJKCompat(errorCode);
    }
    if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
        return internalGetNXUnicode(options, errorCode);
    }

    /* a combination: start empty and add each selected basic set */
    UnicodeSet *combined=new UnicodeSet();
    if(combined==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    const UnicodeSet *part;
    if((options&UNORM_NX_HANGUL)!=0) {
        part=internalGetNXHangul(errorCode);
        if(part!=NULL) {
            combined->addAll(*part);
        }
    }
    if((options&UNORM_NX_CJK_COMPAT)!=0) {
        part=internalGetNXCJKCompat(errorCode);
        if(part!=NULL) {
            combined->addAll(*part);
        }
    }
    if((options&_NORM_OPTIONS_UNICODE_MASK)!=0) {
        part=internalGetNXUnicode(options, errorCode);
        if(part!=NULL) {
            combined->addAll(*part);
        }
    }

    if(U_FAILURE(errorCode)) {
        delete combined;
        return NULL;
    }

    /* install the union unless another thread did so in the meantime */
    umtx_lock(NULL);
    if(nxCache[options]==NULL) {
        nxCache[options]=combined;
        combined=NULL;
    }
    umtx_unlock(NULL);

    delete combined;
    return nxCache[options];
}
/*
 * Get the exclusion set for the given options.
 * Returns NULL if errorCode already indicates a failure or if none of the
 * _NORM_OPTIONS_SETS_MASK bits are set; otherwise forwards to internalGetNX().
 */
static inline const UnicodeSet *
getNX(int32_t options, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    options&=_NORM_OPTIONS_SETS_MASK;
    if(options==0) {
        return NULL;
    }
    return internalGetNX(options, errorCode);
}
/* TRUE if an exclusion set is given and it contains the code point c; a NULL set contains nothing. */
static inline UBool
nx_contains(const UnicodeSet *nx, UChar32 c) {
    if(nx==NULL) {
        return FALSE;
    }
    return nx->contains(c);
}
/*
 * TRUE if an exclusion set is given and it contains the code point made of
 * the code units (c, c2). c2==0 means that c stands alone; otherwise the
 * two units are combined as a surrogate pair.
 */
static inline UBool
nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
    if(nx==NULL) {
        return FALSE;
    }
    UChar32 codePoint;
    if(c2==0) {
        codePoint=c;
    } else {
        codePoint=U16_GET_SUPPLEMENTARY(c, c2);
    }
    return nx->contains(codePoint);
}
/*
 * Read the decomposition stored in extraData[] for norm32.
 *
 * The first unit is a header. If a compatibility decomposition is wanted
 * (norm32&qcMask has the _NORM_QC_NFKD bit) and the header is >=0x100,
 * the first stored string is skipped and the upper byte of the header
 * becomes the header of the second one.
 *
 * @param length   receives the decomposition length (header & _NORM_DECOMP_LENGTH_MASK)
 * @param cc       receives the upper byte of the cc unit, or 0 if the
 *                 header has no _NORM_DECOMP_FLAG_LENGTH_HAS_CC flag
 * @param trailCC  receives the lower byte of the cc unit, or 0
 * @return pointer to the first unit of the decomposition
 */
static inline const UChar *
_decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
           uint8_t &cc, uint8_t &trailCC) {
    const UChar *extra=(const UChar *)_getExtraData(norm32);
    int32_t header=*extra++;

    const UBool wantCompat= (norm32&qcMask&_NORM_QC_NFKD)!=0;
    if(wantCompat && header>=0x100) {
        /* skip the first string: bit 7 tells whether a cc unit precedes it */
        int32_t firstHasCCUnit=(header>>7)&1;
        int32_t firstLength=header&_NORM_DECOMP_LENGTH_MASK;
        extra+=firstHasCCUnit+firstLength;
        header>>=8;
    }

    if((header&_NORM_DECOMP_FLAG_LENGTH_HAS_CC)!=0) {
        const UChar bothCCs=*extra++;
        cc=(uint8_t)(bothCCs>>8);
        trailCC=(uint8_t)bothCCs;
    } else {
        cc=0;
        trailCC=0;
    }

    length=header&_NORM_DECOMP_LENGTH_MASK;
    return extra;
}
/*
 * Read the first decomposition stored in extraData[] for norm32
 * (no switching to a second, compatibility string).
 *
 * @param length   receives the decomposition length (header & _NORM_DECOMP_LENGTH_MASK)
 * @param cc       receives the upper byte of the cc unit, or 0 if the
 *                 header has no _NORM_DECOMP_FLAG_LENGTH_HAS_CC flag
 * @param trailCC  receives the lower byte of the cc unit, or 0
 * @return pointer to the first unit of the decomposition
 */
static inline const UChar *
_decompose(uint32_t norm32, int32_t &length,
           uint8_t &cc, uint8_t &trailCC) {
    const UChar *extra=(const UChar *)_getExtraData(norm32);
    const int32_t header=*extra++;

    if((header&_NORM_DECOMP_FLAG_LENGTH_HAS_CC)!=0) {
        const UChar bothCCs=*extra++;
        cc=(uint8_t)(bothCCs>>8);
        trailCC=(uint8_t)bothCCs;
    } else {
        cc=0;
        trailCC=0;
    }

    length=header&_NORM_DECOMP_LENGTH_MASK;
    return extra;
}
/*
 * Get the decomposition of the single code point c, selected by the
 * _NORM_QC_NFD bit of its norm trie value.
 *
 * Returns NULL if that bit is not set (length is then not modified).
 * A Hangul syllable is decomposed arithmetically into 2 or 3 Jamo that are
 * written to buffer, and buffer is returned. Otherwise a pointer into
 * extraData[] is returned. In both cases length receives the number of units.
 */
static const UChar *
_decompose(UChar32 c, UChar buffer[4], int32_t &length) {
    uint32_t norm32;
    UTRIE_GET32(&normTrie, c, norm32);

    if((norm32&_NORM_QC_NFD)==0) {
        return NULL;
    }

    if(!isNorm32HangulOrJamo(norm32)) {
        /* stored decomposition; the combining classes are not needed here */
        uint8_t cc, trailCC;
        return _decompose(norm32, length, cc, trailCC);
    }

    /* split the syllable index into its L, V and T parts */
    c-=HANGUL_BASE;
    UChar jamoT=(UChar)(c%JAMO_T_COUNT);
    c/=JAMO_T_COUNT;
    if(jamoT>0) {
        buffer[2]=(UChar)(JAMO_T_BASE+jamoT);
        length=3;
    } else {
        length=2;
    }
    buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
    buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
    return buffer;
}
/*
 * Read the next code point at p (p<limit is assumed; p is advanced past it)
 * and return its combining class.
 *
 * c receives the first code unit. c2 receives the trail surrogate if a
 * supplementary code point with a non-zero class was read, else 0.
 * If the first unit's trie value has no _NORM_CC_MASK bits the function
 * returns 0 without looking at a following unit. A lead-surrogate value
 * that is not followed by a trail surrogate also yields 0.
 */
static inline uint8_t
_getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
    c=*p++;
    uint32_t norm32=_getNorm32(c);

    if((norm32&_NORM_CC_MASK)==0) {
        c2=0;
        return 0;
    }

    if(!isNorm32LeadSurrogate(norm32)) {
        c2=0;
        return (uint8_t)(norm32>>_NORM_CC_SHIFT);
    }

    /* lead surrogate: the real value needs the trail surrogate */
    if(p==limit) {
        c2=0;
        return 0;
    }
    c2=*p;
    if(!UTF_IS_SECOND_SURROGATE(c2)) {
        c2=0;
        return 0;
    }
    ++p;
    norm32=_getNorm32FromSurrogatePair(norm32, c2);
    return (uint8_t)(norm32>>_NORM_CC_SHIFT);
}
/*
 * Read the code point that ends just before src (start<src is assumed),
 * move src back to its beginning and return its norm trie value.
 *
 * Returns 0 without a trie lookup if the last unit is below minC, and 0 for
 * unpaired surrogates. For a surrogate pair, c receives the trail and c2 the
 * lead surrogate, and 0 is returned if the lead's value has no bits in mask.
 * In all other cases c receives the single unit and c2 is 0.
 */
static inline uint32_t
_getPrevNorm32(const UChar *start, const UChar *&src,
               uint32_t minC, uint32_t mask,
               UChar &c, UChar &c2) {
    c=*--src;
    c2=0;

    if(c<minC) {
        return 0;
    }
    if(!UTF_IS_SURROGATE(c)) {
        return _getNorm32(c);
    }
    if(UTF_IS_SURROGATE_FIRST(c)) {
        /* a lead surrogate at the end of the span is unpaired */
        return 0;
    }

    /* c is a trail surrogate: look for its lead */
    if(src==start) {
        c2=0;
        return 0;
    }
    c2=*(src-1);
    if(!UTF_IS_FIRST_SURROGATE(c2)) {
        c2=0;
        return 0;
    }
    --src;

    uint32_t leadNorm32=_getNorm32(c2);
    if((leadNorm32&mask)==0) {
        return 0;
    }
    return _getNorm32FromSurrogatePair(leadNorm32, c);
}
/*
 * Return the combining class of the code point that ends just before p
 * and move p back to the beginning of that code point.
 */
static inline uint8_t
_getPrevCC(const UChar *start, const UChar *&p) {
    UChar c, c2;
    uint32_t norm32=_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2);
    return (uint8_t)(norm32>>_NORM_CC_SHIFT);
}
/*
 * Test a norm trie value:
 * - TRUE if it has none of the ccOrQCMask bits;
 * - for a regular value with a decompQCMask bit: TRUE if the lead
 *   combining class of its stored decomposition is 0;
 * - otherwise: TRUE if it has no _NORM_CC_MASK bits.
 */
static inline UBool
_isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
    if((norm32&ccOrQCMask)==0) {
        return TRUE;
    }

    if(!isNorm32Regular(norm32) || (norm32&decompQCMask)==0) {
        return (norm32&_NORM_CC_MASK)==0;
    }

    /* decomposes: only the lead cc of the decomposition matters */
    int32_t length;
    uint8_t leadCC, trailCC;
    _decompose(norm32, decompQCMask, length, leadCC, trailCC);
    return leadCC==0;
}
/*
 * Test a norm trie value:
 * - TRUE if it has none of the ccOrQCMask bits;
 * - TRUE if it has a decompQCMask bit, the lead combining class of its
 *   stored decomposition is 0, and the first code point of that
 *   decomposition has none of the quick-check bits
 *   (ccOrQCMask & _NORM_QC_MASK);
 * - FALSE otherwise.
 */
static inline UBool
_isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
    if((norm32&ccOrQCMask)==0) {
        return TRUE;
    }
    if((norm32&decompQCMask)==0) {
        return FALSE;
    }

    int32_t length;
    uint8_t leadCC, trailCC;
    const UChar *decomp=_decompose(norm32, decompQCMask, length, leadCC, trailCC);
    if(leadCC!=0) {
        return FALSE;
    }

    /* look at the first code point of the decomposition */
    uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
    if((_getNorm32(decomp, qcMask)&qcMask)!=0) {
        return FALSE;
    }
    return TRUE;
}
/*
 * Return the combining class of c: the bits of its norm trie value above
 * _NORM_CC_SHIFT, truncated to 8 bits. Returns 0 if the normalization data
 * is not available.
 */
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode)) {
        return 0;
    }

    uint32_t norm32;
    UTRIE_GET32(&normTrie, c, norm32);
    return (uint8_t)(norm32>>_NORM_CC_SHIFT);
}
/*
 * TRUE if the aux trie value of c has the _NORM_AUX_COMP_EX_MASK bit(s) set.
 * Returns FALSE if the data is not available or its format is older than 2.1.
 */
U_CAPI UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode) || !formatVersion_2_1) {
        return FALSE;
    }

    uint16_t aux;
    UTRIE_GET16(&auxTrie, c, aux);
    return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
}
/*
 * TRUE if the aux trie value of c has none of the _NORM_AUX_UNSAFE_MASK bits.
 * Returns FALSE if the data is not available or its format is older than 2.1.
 */
U_CAPI UBool U_EXPORT2
unorm_isCanonSafeStart(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode) || !formatVersion_2_1) {
        return FALSE;
    }

    uint16_t aux;
    UTRIE_GET16(&auxTrie, c, aux);
    return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
}
/*
 * Look up c in the canonStartSets tables and, if it is found, fill fillSet
 * with the associated set.
 *
 * Returns FALSE if fillSet is NULL, c is outside 0..0x10ffff, the data (or
 * the canonStartSets table) is not available, or c has no table entry.
 * Otherwise returns TRUE for a single-code point result, or the result of
 * uset_getSerializedSet() for a stored serialized set.
 */
U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
UErrorCode errorCode=U_ZERO_ERROR;
if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
_haveData(errorCode) && canonStartSets!=NULL
) {
const uint16_t *table;
int32_t i, start, limit;
if(c<=0xffff) {
/* the BMP search table follows the serialized sets */
table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
start=0;
limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
/* binary search; each entry is a pair of units { code unit, result } */
while(start<limit-2) {
/* midpoint, rounded down to the start of a pair */
i=(uint16_t)(((start+limit)/4)*2);
if(c<table[i]) {
limit=i;
} else {
start=i;
}
}
/* found? */
if(c==table[start]) {
i=table[start+1];
if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
/* the result is an index into canonStartSets[] of a serialized set */
i&=(_NORM_MAX_CANON_SETS-1);
return uset_getSerializedSet(fillSet,
canonStartSets+i,
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
} else {
/* the result itself is the only code point of the set */
uset_setSerializedToOne(fillSet, (UChar32)i);
return TRUE;
}
}
} else {
uint16_t high, low, h;
/* the supplementary search table follows the BMP search table */
table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
start=0;
limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
/* search key: c split into its upper and lower 16 bits */
high=(uint16_t)(c>>16);
low=(uint16_t)c;
/* binary search; each entry is a triplet { flags+high, low, result } */
while(start<limit-3) {
/* midpoint, rounded down to the start of a triplet */
i=(uint16_t)(((start+limit)/6)*3);
/* the low 5 bits of the first unit hold the high word of the code point */
h=table[i]&0x1f;
if(high<h || (high==h && low<table[i+1])) {
limit=i;
} else {
start=i;
}
}
/* found? */
h=table[start];
if(high==(h&0x1f) && low==table[start+1]) {
i=table[start+2];
if((h&0x8000)==0) {
/* the result is an index into canonStartSets[] of a serialized set */
return uset_getSerializedSet(fillSet,
canonStartSets+i,
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
} else {
/* single-code point result: its bits 16..20 come from bits 8..12 of the first unit */
i|=((int32_t)h&0x1f00)<<8;
uset_setSerializedToOne(fillSet, (UChar32)i);
return TRUE;
}
}
}
}
return FALSE;
}
/*
 * Copy the string that the aux trie associates with c (its
 * _NORM_AUX_FNC_MASK bits index extraData[]) into dest and terminate it
 * with u_terminateUChars().
 *
 * @param dest          output buffer; may be NULL only if destCapacity==0
 * @param destCapacity  capacity of dest in UChars; must not be negative
 * @param pErrorCode    in/out error code; U_ILLEGAL_ARGUMENT_ERROR for bad
 *                      dest/destCapacity
 * @return the string length (0 if c has no such string), also when it
 *         does not fit; 0 on error, missing data or data format older than 2.1
 */
U_CAPI int32_t U_EXPORT2
u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(!_haveData(*pErrorCode) || !formatVersion_2_1) {
        return 0;
    }

    uint16_t aux;
    UTRIE_GET16(&auxTrie, c, aux);
    aux&=_NORM_AUX_FNC_MASK;
    if(aux==0) {
        /* no string for c */
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    const UChar *closure=(const UChar *)(extraData+aux);
    int32_t length=1;
    if(*closure>=0xff00) {
        /* the first unit is a header with the length in its low byte */
        length=*closure&0xff;
        ++closure;
    }
    if(0<length && length<=destCapacity) {
        uprv_memcpy(dest, closure, length*U_SIZEOF_UCHAR);
    }
    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
}
/*
 * Decide from the trie data whether c is "skippable" for the given mode.
 *
 * Returns FALSE if the data is not available or the mode is unknown, and
 * TRUE for UNORM_NONE. For UNORM_FCD the answer is fcd16<=1. For the other
 * modes c must first have none of the mode's mask bits in its norm trie
 * value; modes below UNORM_NFC are then skippable, while the remaining
 * modes additionally need: no _NORM_QC_NFD bit, or (for Hangul/Jamo values)
 * not isHangulWithoutJamoT(), or a clear _NORM_AUX_NFC_SKIP_F_MASK bit in
 * the aux trie (which requires data format 2.2, else FALSE).
 */
U_CAPI UBool U_EXPORT2
unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode)) {
        return FALSE;
    }

    /* bits of the norm trie value that make c non-skippable right away */
    uint32_t mask;
    switch(mode) {
    case UNORM_NONE:
        return TRUE;
    case UNORM_NFD:
        mask=_NORM_CC_MASK|_NORM_QC_NFD;
        break;
    case UNORM_NFKD:
        mask=_NORM_CC_MASK|_NORM_QC_NFKD;
        break;
    case UNORM_NFC:
        mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
        break;
    case UNORM_NFKC:
        mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
        break;
    case UNORM_FCD: {
        uint16_t fcd;
        UTRIE_GET16(&fcdTrie, c, fcd);
        return fcd<=1;
    }
    default:
        return FALSE;
    }

    uint32_t norm32;
    UTRIE_GET32(&normTrie, c, norm32);
    if((norm32&mask)!=0) {
        return FALSE;
    }

    /* modes below UNORM_NFC need nothing more; nor do characters without the NFD bit */
    if(mode<UNORM_NFC || (norm32&_NORM_QC_NFD)==0) {
        return TRUE;
    }

    if(isNorm32HangulOrJamo(norm32)) {
        return !isHangulWithoutJamoT((UChar)c);
    }

    /* the remaining test needs the flag that only data format 2.2 provides */
    if(!formatVersion_2_2) {
        return FALSE;
    }
    uint16_t aux;
    UTRIE_GET16(&auxTrie, c, aux);
    return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0;
}
/*
 * Add to set the start of every value range of the norm and FCD tries
 * (and of the aux trie for data format 2.1 or later), plus, for every
 * JAMO_T_COUNT-th code unit c in the Hangul block, c and c+1, and finally
 * the first code unit after the block.
 * Does nothing if the data is not available; pErrorCode is dereferenced
 * unconditionally.
 */
U_CAPI void U_EXPORT2
unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode) {
    if(!_haveData(*pErrorCode)) {
        return;
    }

    utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
    utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
    if(formatVersion_2_1) {
        utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
    }

    /* Hangul block, in steps of JAMO_T_COUNT */
    UChar c=HANGUL_BASE;
    while(c<HANGUL_BASE+HANGUL_COUNT) {
        uset_add(set, c);
        uset_add(set, c+1);
        c+=JAMO_T_COUNT;
    }
    uset_add(set, HANGUL_BASE+HANGUL_COUNT);
}
/*
 * Insert the character (c, c2) with combining class cc into the text
 * [start..current[ so that the combining classes stay ordered.
 *
 * [current..p[ is the space (1 or 2 units) reserved for the new character.
 * If the character before current has a higher class, the function walks
 * back to the first position whose preceding class is <=cc, shifts the
 * text from there up to the end at p, and stores (c, c2) in the gap;
 * otherwise (c, c2) is simply stored at current.
 *
 * @return the combining class of the character that ends the text afterwards
 */
static uint8_t
_insertOrdered(const UChar *start, UChar *current, UChar *p,
UChar c, UChar c2, uint8_t cc) {
const UChar *pBack, *pPreBack;
UChar *r;
uint8_t prevCC, trailCC=cc;
if(start<current && cc!=0) {
/* look at the character before the insertion point */
pPreBack=pBack=current;
prevCC=_getPrevCC(start, pPreBack);
if(cc<prevCC) {
/* that character will stay last, so its class is the trail class */
trailCC=prevCC;
pBack=pPreBack;
/* keep going back while the preceding class is still higher than cc */
while(start<pPreBack) {
prevCC=_getPrevCC(start, pPreBack);
if(cc>=prevCC) {
break;
}
pBack=pPreBack;
}
/* move [pBack..current[ up so that it ends at p; current ends up at pBack */
r=p;
do {
*--r=*--current;
} while(pBack!=current);
}
}
/* store the new character at the (possibly moved) insertion point */
*current=c;
if(c2!=0) {
*(current+1)=c2;
}
return trailCC;
}
/*
 * Merge the text [next..limit[ onto the end of [start..current[, inserting
 * each character with a non-zero combining class in order via
 * _insertOrdered().
 *
 * If current==next the two pieces are adjacent in one buffer and
 * characters that need no reordering are not copied. With isOrdered==TRUE
 * the merge stops reordering at the first class-0 character and copies the
 * rest as is; with FALSE every class-0 character restarts the ordered
 * segment. Nothing is reordered if the first piece is empty and isOrdered
 * is TRUE.
 *
 * @return the combining class of the last character of the merged text
 */
static uint8_t
_mergeOrdered(UChar *start, UChar *current,
const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
UChar *r;
UChar c, c2;
uint8_t cc, trailCC=0;
UBool adjacent;
adjacent= current==next;
if(start!=current || !isOrdered) {
while(next<limit) {
cc=_getNextCC(next, limit, c, c2);
if(cc==0) {
/* a class-0 character is never moved back */
trailCC=0;
if(adjacent) {
/* already in place: just step over it */
current=(UChar *)next;
} else {
*current++=c;
if(c2!=0) {
*current++=c2;
}
}
if(isOrdered) {
break;
} else {
/* characters after this one must not move in front of it */
start=current;
}
} else {
/* reserve 1 or 2 units for (c, c2) and insert it in order */
r=current+(c2==0 ? 1 : 2);
trailCC=_insertOrdered(start, current, r, c, c2, cc);
current=r;
}
}
}
if(next==limit) {
/* everything was processed above, so trailCC is up to date */
return trailCC;
} else {
if(!adjacent) {
/* copy the unprocessed rest of the second piece */
do {
*current++=*next++;
} while(next!=limit);
limit=current;
}
/* the class of the last character has to be looked up */
return _getPrevCC(start, limit);
}
}
/*
 * Check the combining-class order of src using the FCD trie: returns FALSE
 * as soon as a character's lead class (upper byte of its fcd16) is
 * non-zero and lower than the previous character's trail class (lower
 * byte), TRUE if the end of the text is reached.
 *
 * srcLength<0 means src is NUL-terminated. Characters contained in nx are
 * treated as if their fcd16 were 0.
 */
static UBool
unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
const UChar *limit;
UChar c, c2;
uint16_t fcd16;
/*
 * prevCC is either 0..0xff (the previous trail class) or negative:
 * the negated previous code unit, which was below _NORM_MIN_WITH_LEAD_CC
 * and whose trie lookup is deferred until it is needed.
 */
int16_t prevCC, cc;
prevCC=0;
if(srcLength>=0) {
limit=src+srcLength;
} else {
limit=NULL;
}
U_ALIGN_CODE(16);
for(;;) {
/* skip quickly over code units below the minimum or with fcd16==0 */
if(limit==NULL) {
for(;;) {
c=*src++;
if(c<_NORM_MIN_WITH_LEAD_CC) {
if(c==0) {
return TRUE;
}
prevCC=(int16_t)-c;
} else if((fcd16=_getFCD16(c))==0) {
prevCC=0;
} else {
break;
}
}
} else {
for(;;) {
if(src==limit) {
return TRUE;
} else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
prevCC=(int16_t)-c;
} else if((fcd16=_getFCD16(c))==0) {
prevCC=0;
} else {
break;
}
}
}
/* c has a non-zero fcd16; for a lead surrogate get the value of the pair */
if(UTF_IS_FIRST_SURROGATE(c)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
} else {
/* unpaired lead surrogate */
c2=0;
fcd16=0;
}
} else {
c2=0;
}
/* an excluded character behaves like fcd16==0 */
if(nx_contains(nx, c, c2)) {
prevCC=0;
continue;
}
/* check the order: this lead class against the previous trail class */
cc=(int16_t)(fcd16>>8);
if(cc!=0) {
if(prevCC<0) {
/* resolve the deferred lookup for the previous code unit */
if(!nx_contains(nx, (UChar32)-prevCC)) {
prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
} else {
prevCC=0;
}
}
if(cc<prevCC) {
return FALSE;
}
}
prevCC=(int16_t)(fcd16&0xff);
}
}
/*
 * Shared implementation of the quick-check and is-normalized functions.
 *
 * Scans src (NUL-terminated if srcLength==-1) and returns UNORM_NO if the
 * combining classes are out of order or a character has a "no"
 * quick-check bit for the mode, otherwise UNORM_YES - or UNORM_MAYBE if a
 * "maybe" character was seen and allowMaybe is TRUE. With allowMaybe
 * FALSE, the text around each "maybe" character is recomposed with
 * _composePart() and compared with the original to reach YES or NO.
 * UNORM_FCD is delegated to unorm_checkFCD().
 *
 * Characters contained in nx are treated as if their trie value were 0.
 * Returns UNORM_MAYBE on any error (NULL/failed pErrorCode, src==NULL,
 * srcLength<-1, unknown mode, missing data, failure in _composePart()).
 */
static UNormalizationCheckResult
_quickCheck(const UChar *src,
int32_t srcLength,
UNormalizationMode mode,
UBool allowMaybe,
const UnicodeSet *nx,
UErrorCode *pErrorCode) {
UChar stackBuffer[_STACK_BUFFER_CAPACITY];
UChar *buffer;
int32_t bufferCapacity;
const UChar *start, *limit;
uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
UChar c, c2, minNoMaybe;
uint8_t cc, prevCC;
UNormalizationCheckResult result;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return UNORM_MAYBE;
}
if(src==NULL || srcLength<-1) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return UNORM_MAYBE;
}
if(!_haveData(*pErrorCode)) {
return UNORM_MAYBE;
}
/* per mode: the lowest code unit that needs a lookup, and the quick-check bits to test */
switch(mode) {
case UNORM_NFC:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
qcMask=_NORM_QC_NFC;
break;
case UNORM_NFKC:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
qcMask=_NORM_QC_NFKC;
break;
case UNORM_NFD:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
qcMask=_NORM_QC_NFD;
break;
case UNORM_NFKD:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
qcMask=_NORM_QC_NFKD;
break;
case UNORM_FCD:
return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
default:
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return UNORM_MAYBE;
}
/* buffer/bufferCapacity are handed to _composePart() by reference; buffer is freed at the end if it was replaced */
buffer=stackBuffer;
bufferCapacity=_STACK_BUFFER_CAPACITY;
ccOrQCMask=_NORM_CC_MASK|qcMask;
result=UNORM_YES;
prevCC=0;
start=src;
if(srcLength>=0) {
limit=src+srcLength;
} else {
limit=NULL;
}
U_ALIGN_CODE(16);
for(;;) {
/* skip quickly over code units below minNoMaybe or without any relevant bits */
if(limit==NULL) {
for(;;) {
c=*src++;
if(c<minNoMaybe) {
if(c==0) {
goto endloop;
}
} else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
break;
}
prevCC=0;
}
} else {
for(;;) {
if(src==limit) {
goto endloop;
} else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
break;
}
prevCC=0;
}
}
/* c has relevant bits; for a lead surrogate get the value of the pair */
if(isNorm32LeadSurrogate(norm32)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
} else {
/* unpaired lead surrogate */
c2=0;
norm32=0;
}
} else {
c2=0;
}
/* an excluded character behaves like norm32==0 */
if(nx_contains(nx, c, c2)) {
norm32=0;
}
/* check the combining order */
cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
if(cc!=0 && cc<prevCC) {
result=UNORM_NO;
break;
}
prevCC=cc;
/* check the quick-check bits for "no" or "maybe" */
qcNorm32=norm32&qcMask;
if(qcNorm32&_NORM_QC_ANY_NO) {
result=UNORM_NO;
break;
} else if(qcNorm32!=0) {
if(allowMaybe) {
result=UNORM_MAYBE;
} else {
/* recompose the text around this character and compare it with the original */
const UChar *prevStarter;
uint32_t decompQCMask;
int32_t length;
/* derive the decomposition quick-check mask from the composition one */
decompQCMask=(qcMask<<2)&0xf;
/* back up to the beginning of the current character */
prevStarter=src-1;
if(UTF_IS_TRAIL(*prevStarter)) {
--prevStarter;
}
prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
/* src is moved forward to the position returned by _findNextStarter() */
src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
_composePart(stackBuffer, buffer, bufferCapacity,
length,
prevStarter,
src,
qcMask,
prevCC, nx, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
result=UNORM_MAYBE;
break;
}
/* any difference means that the original was not normalized */
if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
result=UNORM_NO;
break;
}
}
}
}
endloop:
/* release the buffer if _composePart() replaced the stack buffer */
if(buffer!=stackBuffer) {
uprv_free(buffer);
}
return result;
}
/*
 * Quick check of src for the given mode, without exclusions.
 * "Maybe" results are allowed, so the answer can be UNORM_MAYBE.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *src,
                 int32_t srcLength,
                 UNormalizationMode mode,
                 UErrorCode *pErrorCode) {
    const UBool allowMaybe=TRUE;
    const UnicodeSet *noExclusions=NULL;
    return _quickCheck(src, srcLength, mode, allowMaybe, noExclusions, pErrorCode);
}
/*
 * Quick check of src for the given mode, with the exclusion set selected
 * by options. "Maybe" results are allowed, so the answer can be UNORM_MAYBE.
 *
 * pErrorCode is validated and the normalization data is loaded before the
 * exclusion set is fetched: getNX() takes the error code by reference (so a
 * NULL pointer must not be dereferenced for it) and building an exclusion
 * set may read the norm trie. This is the same order that unorm_decompose()
 * uses. On a NULL or failed pErrorCode, or missing data, UNORM_MAYBE is
 * returned, which is also what _quickCheck() returns in these cases.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
                            UNormalizationMode mode, int32_t options,
                            UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return UNORM_MAYBE;
    }
    if(!_haveData(*pErrorCode)) {
        return UNORM_MAYBE;
    }
    return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
}
/*
 * TRUE if src is normalized for the given mode, without exclusions.
 * "Maybe" is not allowed, so _quickCheck() resolves it by recomposing;
 * any result other than UNORM_YES (including errors) yields FALSE.
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar *src, int32_t srcLength,
                   UNormalizationMode mode,
                   UErrorCode *pErrorCode) {
    UNormalizationCheckResult result=
        _quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode);
    return (UBool)(result==UNORM_YES);
}
/*
 * TRUE if src is normalized for the given mode, with the exclusion set
 * selected by options. Any result other than UNORM_YES (including errors)
 * yields FALSE.
 *
 * pErrorCode is validated and the normalization data is loaded before the
 * exclusion set is fetched: getNX() takes the error code by reference (so a
 * NULL pointer must not be dereferenced for it) and building an exclusion
 * set may read the norm trie. This is the same order that unorm_decompose()
 * uses.
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
                              UNormalizationMode mode, int32_t options,
                              UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return FALSE;
    }
    if(!_haveData(*pErrorCode)) {
        return FALSE;
    }
    return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
}
/*
 * Write the decomposition of the single code point c to dest
 * (canonical if compat is FALSE, otherwise compatibility).
 *
 * Return value:
 *   0       if c is out of range, the data is not available, or
 *           dest/destCapacity are inconsistent;
 *   -1/-2   if c does not decompose; the value is the negative number of
 *           code units of c itself, which is written to dest if it fits;
 *   >0      the length of the decomposition; for a stored decomposition it
 *           is written only if it fits entirely, for Hangul the parts that
 *           fit are written.
 * The output is not NUL-terminated.
 */
U_CAPI int32_t U_EXPORT2
unorm_getDecomposition(UChar32 c, UBool compat,
UChar *dest, int32_t destCapacity) {
UErrorCode errorCode=U_ZERO_ERROR;
if( (uint32_t)c<=0x10ffff &&
_haveData(errorCode) &&
((dest!=NULL && destCapacity>0) || destCapacity==0)
) {
uint32_t norm32, qcMask;
UChar32 minNoMaybe;
int32_t length;
if(!compat) {
minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
qcMask=_NORM_QC_NFD;
} else {
minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
qcMask=_NORM_QC_NFKD;
}
/* below the minimum: no lookup needed, c stands for itself */
if(c<minNoMaybe) {
if(destCapacity>0) {
dest[0]=(UChar)c;
}
return -1;
}
UTRIE_GET32(&normTrie, c, norm32);
if((norm32&qcMask)==0) {
/* no decomposition of the requested kind: output c itself */
if(c<=0xffff) {
if(destCapacity>0) {
dest[0]=(UChar)c;
}
return -1;
} else {
if(destCapacity>=2) {
dest[0]=UTF16_LEAD(c);
dest[1]=UTF16_TRAIL(c);
}
return -2;
}
} else if(isNorm32HangulOrJamo(norm32)) {
/* Hangul syllable: compute the 2 or 3 Jamo arithmetically */
UChar c2;
c-=HANGUL_BASE;
c2=(UChar)(c%JAMO_T_COUNT);
c/=JAMO_T_COUNT;
if(c2>0) {
if(destCapacity>=3) {
dest[2]=(UChar)(JAMO_T_BASE+c2);
}
length=3;
} else {
length=2;
}
if(destCapacity>=2) {
dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
}
return length;
} else {
/* stored decomposition: copy it from extraData[] if it fits */
const UChar *p, *limit;
uint8_t cc, trailCC;
p=_decompose(norm32, qcMask, length, cc, trailCC);
if(length<=destCapacity) {
limit=p+length;
do {
*dest++=*p++;
} while(p<limit);
}
return length;
}
} else {
return 0;
}
}
/*
 * Decompose src (NUL-terminated if srcLength<0) into dest: canonically if
 * compat is FALSE, otherwise with compatibility decompositions. Characters
 * with non-zero combining classes are kept in order; out-of-order ones are
 * inserted/merged backwards into the output written since the last
 * character with trail class 0.
 *
 * Each piece of output is written only if it still fits into destCapacity,
 * but counting always continues, so the return value is the full length of
 * the decomposition. The output is not NUL-terminated here.
 *
 * @param nx          characters that must be passed through unchanged
 *                    (treated as combining class 0); may be NULL
 * @param outTrailCC  receives the trail combining class of the last
 *                    character of the output
 * @return the length of the decomposition
 */
static int32_t
_decompose(UChar *dest, int32_t destCapacity,
           const UChar *src, int32_t srcLength,
           UBool compat, const UnicodeSet *nx,
           uint8_t &outTrailCC) {
    UChar buffer[3];
    const UChar *limit, *prevSrc, *p;
    uint32_t norm32, ccOrQCMask, qcMask;
    int32_t destIndex, reorderStartIndex, length;
    UChar c, c2, minNoMaybe;
    uint8_t cc, prevCC, trailCC;

    /* the lowest code unit that needs a lookup, and the quick-check bit to test */
    if(!compat) {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
        qcMask=_NORM_QC_NFD;
    } else {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
        qcMask=_NORM_QC_NFKD;
    }

    ccOrQCMask=_NORM_CC_MASK|qcMask;
    destIndex=reorderStartIndex=0;
    prevCC=0;
    norm32=0;
    c=0;

    if(srcLength>=0) {
        limit=src+srcLength;
    } else {
        limit=NULL;
    }

    U_ALIGN_CODE(16);
    for(;;) {
        /* skip quickly over code units below minNoMaybe or without any relevant bits */
        prevSrc=src;
        if(limit==NULL) {
            while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
                prevCC=0;
                ++src;
            }
        } else {
            while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
                prevCC=0;
                ++src;
            }
        }

        /* copy the skipped code units all at once; nothing before them can be reordered any more */
        if(src!=prevSrc) {
            length=(int32_t)(src-prevSrc);
            if((destIndex+length)<=destCapacity) {
                uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
            }
            destIndex+=length;
            reorderStartIndex=destIndex;
        }

        /* end of source reached? */
        if(limit==NULL ? c==0 : src==limit) {
            break;
        }

        /* c holds *src and norm32 is its trie value; step over it */
        ++src;

        /*
         * Determine what to output for this character:
         * p==NULL: output (c, c2); p!=NULL: output p[0..length[.
         * cc/trailCC are the lead/trail combining classes of that output.
         */
        if(isNorm32HangulOrJamo(norm32)) {
            if(nx_contains(nx, c)) {
                /*
                 * Excluded: pass the character through unchanged.
                 * cc/trailCC must be set here like on every other path;
                 * they used to be left uninitialized or stale from the
                 * previous character, which could corrupt prevCC and the
                 * reordering decisions below.
                 */
                cc=trailCC=0;
                c2=0;
                p=NULL;
                length=1;
            } else {
                /* Hangul syllable: compute the 2 or 3 Jamo arithmetically */
                p=buffer;
                cc=trailCC=0;

                c-=HANGUL_BASE;

                c2=(UChar)(c%JAMO_T_COUNT);
                c/=JAMO_T_COUNT;
                if(c2>0) {
                    buffer[2]=(UChar)(JAMO_T_BASE+c2);
                    length=3;
                } else {
                    length=2;
                }

                buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
                buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
            }
        } else {
            if(isNorm32Regular(norm32)) {
                c2=0;
                length=1;
            } else {
                /* lead-surrogate value: get the value of the pair */
                if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                    ++src;
                    length=2;
                    norm32=_getNorm32FromSurrogatePair(norm32, c2);
                } else {
                    /* unpaired lead surrogate */
                    c2=0;
                    length=1;
                    norm32=0;
                }
            }

            if(nx_contains(nx, c, c2)) {
                /* excluded: behaves like norm32==0 */
                cc=trailCC=0;
                p=NULL;
            } else if((norm32&qcMask)==0) {
                /* does not decompose: only its combining class matters */
                cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
                p=NULL;
            } else {
                /* stored decomposition */
                p=_decompose(norm32, qcMask, length, cc, trailCC);
                if(length==1) {
                    /* a single unit is handled like an undecomposed character */
                    c=*p;
                    c2=0;
                    p=NULL;
                }
            }
        }

        /* append the output, reordering if its lead class is lower than the previous trail class */
        if((destIndex+length)<=destCapacity) {
            UChar *reorderSplit=dest+destIndex;
            if(p==NULL) {
                if(cc!=0 && cc<prevCC) {
                    /* (c, c2) is out of order with respect to the preceding text */
                    destIndex+=length;
                    trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
                } else {
                    dest[destIndex++]=c;
                    if(c2!=0) {
                        dest[destIndex++]=c2;
                    }
                }
            } else {
                if(cc!=0 && cc<prevCC) {
                    /* the decomposition is out of order with respect to the preceding text */
                    destIndex+=length;
                    trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
                } else {
                    do {
                        dest[destIndex++]=*p++;
                    } while(--length>0);
                }
            }
        } else {
            /* does not fit: only count */
            destIndex+=length;
        }

        prevCC=trailCC;
        if(prevCC==0) {
            /* nothing can be reordered in front of a character with trail class 0 */
            reorderStartIndex=destIndex;
        }
    }

    outTrailCC=prevCC;
    return destIndex;
}
/*
 * Decompose src into dest (see _decompose()) with the exclusion set that
 * options selects, then terminate the output with u_terminateUChars().
 * Returns 0 if the data is not available or the exclusion set cannot be
 * obtained. pErrorCode is dereferenced unconditionally and the other
 * arguments are not validated here.
 */
U_CAPI int32_t U_EXPORT2
unorm_decompose(UChar *dest, int32_t destCapacity,
                const UChar *src, int32_t srcLength,
                UBool compat, int32_t options,
                UErrorCode *pErrorCode) {
    if(!_haveData(*pErrorCode)) {
        return 0;
    }

    const UnicodeSet *nx=getNX(options, *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* the trail combining class is not needed by this caller */
    uint8_t trailCC;
    int32_t length=_decompose(dest, destCapacity,
                              src, srcLength,
                              compat, nx,
                              trailCC);
    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
}
/*
 * Starting at src, skip forward over characters as long as the previous
 * character's trail class (low byte of fcd16) is non-zero and the current
 * character's lead class (high byte of its fcd16) is non-zero.
 *
 * fcd16 is the FCD value of the character before src. Stops at limit, at a
 * code unit below _NORM_MIN_WITH_LEAD_CC (which includes a terminating NUL)
 * and at an unpaired lead surrogate.
 *
 * @return the position where the scan stopped
 */
static const UChar *
_findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
    /* continue only while the previous trail cc!=0 and there is more text */
    while((fcd16&0xff)!=0 && src!=limit) {
        UChar c=*src;
        if(c<_NORM_MIN_WITH_LEAD_CC) {
            break;
        }
        fcd16=_getFCD16(c);
        if(fcd16==0) {
            break;
        }

        if(!UTF_IS_FIRST_SURROGATE(c)) {
            /* stop before a character with lead cc==0 */
            if(fcd16<=0xff) {
                break;
            }
            ++src;
        } else {
            /* need the trail surrogate to get the real fcd16 */
            if((src+1)==limit) {
                break;
            }
            UChar c2=*(src+1);
            if(!UTF_IS_SECOND_SURROGATE(c2)) {
                break;
            }
            fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
            if(fcd16<=0xff) {
                break;
            }
            src+=2;
        }
    }
    return src;
}
/*
 * Canonically decompose and reorder the source piece [src..decompLimit[
 * and append the result to dest at destIndex (which is updated).
 *
 * Output is written only while it fits into destCapacity; destIndex keeps
 * counting beyond that. Characters contained in nx are passed through
 * unchanged with combining class 0.
 *
 * @return the trail combining class of the last character that was output
 */
static uint8_t
_decomposeFCD(const UChar *src, const UChar *decompLimit,
UChar *dest, int32_t &destIndex, int32_t destCapacity,
const UnicodeSet *nx) {
const UChar *p;
uint32_t norm32;
int32_t reorderStartIndex, length;
UChar c, c2;
uint8_t cc, prevCC, trailCC;
/* reordering must not reach back before the output of this call */
reorderStartIndex=destIndex;
prevCC=0;
while(src<decompLimit) {
c=*src++;
norm32=_getNorm32(c);
if(isNorm32Regular(norm32)) {
c2=0;
length=1;
} else {
/* non-regular value: treated as a lead surrogate, get the value of the pair */
if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
length=2;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
} else {
c2=0;
length=1;
norm32=0;
}
}
/* p==NULL: output (c, c2); p!=NULL: output p[0..length[ */
if(nx_contains(nx, c, c2)) {
/* excluded: behaves like norm32==0 */
cc=trailCC=0;
p=NULL;
} else if((norm32&_NORM_QC_NFD)==0) {
/* does not decompose: only its combining class matters */
cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
p=NULL;
} else {
/* stored canonical decomposition */
p=_decompose(norm32, length, cc, trailCC);
if(length==1) {
/* a single unit is handled like an undecomposed character */
c=*p;
c2=0;
p=NULL;
}
}
/* append the output, reordering if its lead class is lower than the previous trail class */
if((destIndex+length)<=destCapacity) {
UChar *reorderSplit=dest+destIndex;
if(p==NULL) {
if(cc!=0 && cc<prevCC) {
destIndex+=length;
trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
} else {
dest[destIndex++]=c;
if(c2!=0) {
dest[destIndex++]=c2;
}
}
} else {
if(cc!=0 && cc<prevCC) {
destIndex+=length;
trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
} else {
do {
dest[destIndex++]=*p++;
} while(--length>0);
}
}
} else {
/* does not fit: only count */
destIndex+=length;
}
prevCC=trailCC;
if(prevCC==0) {
/* nothing can be reordered in front of a character with trail class 0 */
reorderStartIndex=destIndex;
}
}
return prevCC;
}
/*
 * Copy src (NUL-terminated if srcLength<0) to dest and make the result
 * pass the FCD order check: wherever a character's lead combining class
 * is non-zero and lower than the previous trail class, a limited piece of
 * the text around it is decomposed and reordered with _decomposeFCD().
 *
 * Output is written only while it fits into destCapacity; the result is
 * terminated with u_terminateUChars(), whose return value is returned.
 * Characters contained in nx are treated as if their fcd16 were 0.
 * Returns 0 if the data is not available; pErrorCode is dereferenced
 * unconditionally.
 */
static int32_t
unorm_makeFCD(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const UnicodeSet *nx,
UErrorCode *pErrorCode) {
const UChar *limit, *prevSrc, *decompStart;
int32_t destIndex, length;
UChar c, c2;
uint16_t fcd16;
/*
 * prevCC is either 0..0xff (the previous trail class) or negative:
 * the negated previous code unit, which was below _NORM_MIN_WITH_LEAD_CC
 * and whose trie lookup is deferred until it is needed.
 */
int16_t prevCC, cc;
if(!_haveData(*pErrorCode)) {
return 0;
}
/* decompStart: where a decomposition would have to begin if the order breaks */
decompStart=src;
destIndex=0;
prevCC=0;
c=0;
fcd16=0;
if(srcLength>=0) {
limit=src+srcLength;
} else {
limit=NULL;
}
U_ALIGN_CODE(16);
for(;;) {
/* skip quickly over code units below the minimum or with fcd16==0 */
prevSrc=src;
if(limit==NULL) {
for(;;) {
c=*src;
if(c<_NORM_MIN_WITH_LEAD_CC) {
if(c==0) {
break;
}
prevCC=(int16_t)-c;
} else if((fcd16=_getFCD16(c))==0) {
prevCC=0;
} else {
break;
}
++src;
}
} else {
for(;;) {
if(src==limit) {
break;
} else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
prevCC=(int16_t)-c;
} else if((fcd16=_getFCD16(c))==0) {
prevCC=0;
} else {
break;
}
++src;
}
}
/* copy the skipped code units all at once */
if(src!=prevSrc) {
length=(int32_t)(src-prevSrc);
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
}
destIndex+=length;
prevSrc=src;
/* a negative prevCC can only have been set by the skip loop above */
if(prevCC<0) {
/* resolve the deferred lookup for the previous code unit */
if(!nx_contains(nx, (UChar32)-prevCC)) {
prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
} else {
prevCC=0;
}
/* point at that previous code unit; moved past it below if prevCC==0 */
decompStart=prevSrc-1;
}
}
/* here prevSrc==src and prevCC>=0 */
/* end of source reached? */
if(limit==NULL ? c==0 : src==limit) {
break;
}
/* remember the position after the last character with trail class 0 */
if(prevCC==0) {
decompStart=prevSrc;
}
/* c holds *src and fcd16 is its trie value; step over it */
++src;
if(UTF_IS_FIRST_SURROGATE(c)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
} else {
/* unpaired lead surrogate */
c2=0;
fcd16=0;
}
} else {
c2=0;
}
/* the current character (c, c2) is at [prevSrc..src[ */
if(nx_contains(nx, c, c2)) {
fcd16=0;
}
/* check the order: this lead class against the previous trail class */
cc=(int16_t)(fcd16>>8);
if(cc==0 || cc>=prevCC) {
/* the order is fine: just append (c, c2) */
if(cc==0) {
decompStart=prevSrc;
}
prevCC=(int16_t)(fcd16&0xff);
length= c2==0 ? 1 : 2;
if((destIndex+length)<=destCapacity) {
dest[destIndex++]=c;
if(c2!=0) {
dest[destIndex++]=c2;
}
} else {
destIndex+=length;
}
} else {
/* back out what was already copied from [decompStart..prevSrc[; it is decomposed instead */
destIndex-=(int32_t)(prevSrc-decompStart);
/* extend the piece to decompose up to the position that _findSafeFCD() returns */
src=_findSafeFCD(src, limit, fcd16);
/* decompose and reorder [decompStart..src[ */
prevCC=_decomposeFCD(decompStart, src,
dest, destIndex, destCapacity,
nx);
decompStart=src;
}
}
return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
}
/*
 * Read one code point at p (advancing p) and return its "combines" flags.
 *
 * Outputs:
 * - c, c2: the code unit(s) read; c2==0 unless a surrogate pair was read
 * - combiningIndex: 0, or 0xfff0|extra bits for Hangul/Jamo data,
 *   or the value stored just before the code point's extra data
 * - cc: the combining class from norm32, or 0
 *
 * @return norm32&_NORM_COMBINES_ANY, or 0 if the code point has neither
 *         a combining class nor combines flags, is an unpaired lead surrogate,
 *         or is contained in nx
 */
static inline uint32_t
_getNextCombining(UChar *&p, const UChar *limit,
                  UChar &c, UChar &c2,
                  uint16_t &combiningIndex, uint8_t &cc,
                  const UnicodeSet *nx) {
    /* fetch the code unit and its data */
    c=*p++;
    uint32_t data=_getNorm32(c);

    /* default outputs, valid for most characters */
    c2=0;
    combiningIndex=0;
    cc=0;

    if((data&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
        /* nothing relevant for recomposition */
        return 0;
    }

    if(!isNorm32Regular(data)) {
        if(isNorm32HangulOrJamo(data)) {
            /* Hangul/Jamo data: special index with bit 15 set, cc stays 0 */
            combiningIndex=(uint16_t)(0xfff0|(data>>_NORM_EXTRA_SHIFT));
            return data&_NORM_COMBINES_ANY;
        }

        /* otherwise c is handled as a lead surrogate: a trail surrogate must follow */
        if(p==limit || !UTF_IS_SECOND_SURROGATE(c2=*p)) {
            c2=0;
            return 0;
        }
        ++p;
        data=_getNorm32FromSurrogatePair(data, c2);
    }

    if(nx_contains(nx, c, c2)) {
        /* excluded code point: behaves as if it had no data */
        return 0;
    }

    cc=(uint8_t)(data>>_NORM_CC_SHIFT);

    const uint32_t flags=data&_NORM_COMBINES_ANY;
    if(flags!=0) {
        /* the combining index is stored in the unit before the extra data */
        combiningIndex=_getExtraData(data)[-1];
    }
    return flags;
}
/*
 * Get the combining index for the code point (c, c2); c2==0 means that
 * c is a single code unit, otherwise (c, c2) is looked up as a surrogate pair.
 * The index is read from the unit stored just before the code point's extra data.
 */
static inline uint16_t
_getCombiningIndexFromStarter(UChar c, UChar c2) {
    uint32_t data=_getNorm32(c);
    if(c2!=0) {
        data=_getNorm32FromSurrogatePair(data, c2);
    }
    return _getExtraData(data)[-1];
}
/*
 * Search a composition table for combineBackIndex and fetch the result.
 *
 * The table is a list of entries, each a key unit followed by one result unit,
 * or by two result units if bit 15 of the first result unit is set.
 * The search stops at the first key that is >=combineBackIndex; bit 15 of a key
 * is masked off before the final comparison.
 *
 * On a match, value/value2 receive the result:
 * - bit 15 clear:          value=result&0x1fff, value2=0
 * - bit 15 set, 14 clear:  value=second result unit, value2=0
 * - bits 15 and 14 set:    value=(result&0x3ff)|0xd800, value2=second result unit
 *
 * @return 0 if there is no match (value and value2 are not modified),
 *         otherwise (first result unit&0x2000)+1, i.e. 1 or 0x2001
 */
static inline uint16_t
_combine(const uint16_t *table, uint16_t combineBackIndex,
         uint16_t &value, uint16_t &value2) {
    /* walk the keys until one is not smaller than the index we look for */
    uint16_t key=*table++;
    while(key<combineBackIndex) {
        /* skip this entry's result: two units if its bit 15 is set, else one */
        table+=(*table&0x8000)!=0 ? 2 : 1;
        key=*table++;
    }

    if((key&0x7fff)!=combineBackIndex) {
        /* not found */
        return 0;
    }

    /* found: table now points at the result */
    const uint16_t first=table[0];
    const uint16_t status=(uint16_t)((first&0x2000)+1);

    if((first&0x8000)==0) {
        /* one-unit result */
        value=(uint16_t)(first&0x1fff);
        value2=0;
    } else if((first&0x4000)==0) {
        /* the result code unit is in the second unit */
        value=table[1];
        value2=0;
    } else {
        /* two code units: 0xd800-based unit from the first, second as is */
        value=(uint16_t)((first&0x3ff)|0xd800);
        value2=table[1];
    }
    return status;
}
/*
 * Recompose the text [p..limit[ in place.
 *
 * The text is read one code point at a time with _getNextCombining().
 * A starter that combines forward is remembered; each following code point that
 * combines back is tried against it, either via the Jamo arithmetic below or via
 * _combine() on combiningTable. When a composition is found, the starter is
 * overwritten with the result, the combining code point is removed by moving the
 * following text down, and limit (an in/out parameter) is reduced accordingly.
 *
 * Compositions whose result is contained in nx are not performed.
 *
 * @return the value of prevCC when the end of the text is reached
 *         (the cc of the last code point that was not removed by a composition)
 */
static uint8_t
_recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
UChar *starter, *pRemove, *q, *r;
uint32_t combineFlags;
UChar c, c2;
uint16_t combineFwdIndex, combineBackIndex;
uint16_t result, value, value2;
uint8_t cc, prevCC;
UBool starterIsSupplementary;
/* no starter yet; the other values are only used once starter!=NULL or after being set */
starter=NULL;
combineFwdIndex=0;
combineBackIndex=0;
value=value2=0;
starterIsSupplementary=FALSE;
prevCC=0;
for(;;) {
combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
if(combineBackIndex&0x8000) {
/* special index with bit 15 set (see _getNextCombining()): Hangul/Jamo handling */
pRemove=NULL;
/* c2 temporarily holds the starter */
c2=*starter;
if(combineBackIndex==0xfff2) {
/* c is used as a Jamo V: compose with a preceding Jamo L and a following Jamo T */
c2=(UChar)(c2-JAMO_L_BASE);
if(c2<JAMO_L_COUNT) {
pRemove=p-1;
c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
/* also consume the following Jamo T */
++p;
c+=c2;
}
if(!nx_contains(nx, c)) {
*starter=c;
} else {
/* the syllable is excluded: undo reading the Jamo T, if any, and remove nothing */
if(!isHangulWithoutJamoT(c)) {
--p;
}
pRemove=NULL;
}
}
/* the following branch is compiled out */
#if 0
} else {
if(isHangulWithoutJamoT(c2)) {
pRemove=p-1;
*starter=(UChar)(c2+(c-JAMO_T_BASE));
}
#endif
}
if(pRemove!=NULL) {
/* remove the consumed Jamo(s): move [p..limit[ down to pRemove */
q=pRemove;
r=p;
while(r<limit) {
*q++=*r++;
}
p=pRemove;
limit=q;
}
/* c2 held starter data temporarily; restore "single code unit" for the code below */
c2=0;
} else if(
/* the starter's index is not a special (bit 15) index and */
!(combineFwdIndex&0x8000) &&
/* the preceding cc does not block this one and */
(prevCC<cc || prevCC==0) &&
/* the starter and (c, c2) have an entry in the composition table and */
0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
/* the composition result is not excluded */
!nx_contains(nx, value, value2)
) {
/* pointer to the combining code point that will be removed */
pRemove= c2==0 ? p-1 : p-2;
/* replace the starter with the composition */
*starter=(UChar)value;
if(starterIsSupplementary) {
if(value2!=0) {
/* both take two code units */
*(starter+1)=(UChar)value2;
} else {
/* the result is one unit shorter: move the text in between down by one */
starterIsSupplementary=FALSE;
q=starter+1;
r=q+1;
while(r<pRemove) {
*q++=*r++;
}
--pRemove;
}
} else if(value2!=0) {
/* the result is one unit longer: move the text in between up by one */
starterIsSupplementary=TRUE;
/* temporarily increment starter for the loop boundary */
++starter;
q=pRemove;
r=++pRemove;
while(starter<q) {
*--r=*--q;
}
*starter=(UChar)value2;
--starter;
}
/* remove what is left of the combining code point: move [p..limit[ down */
if(pRemove<p) {
q=pRemove;
r=p;
while(r<limit) {
*q++=*r++;
}
p=pRemove;
limit=q;
}
/* prevCC is not changed because the combining code point was removed */
if(p==limit) {
return prevCC;
}
/* result>1 means the 0x2000 flag was set in the table result, see _combine() */
if(result>1) {
combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
} else {
starter=NULL;
}
continue;
}
}
/* no composition for this code point */
prevCC=cc;
if(p==limit) {
return prevCC;
}
if(cc==0) {
/* (c, c2) has cc==0: it becomes the new starter if it combines forward */
if(combineFlags&_NORM_COMBINES_FWD) {
if(c2==0) {
starterIsSupplementary=FALSE;
starter=p-1;
} else {
starterIsSupplementary=TRUE;
starter=p-2;
}
combineFwdIndex=combineBackIndex;
} else {
starter=NULL;
}
}
}
}
/*
 * Starting at src, step backward over code points until one is a "true starter"
 * according to _isTrueStarter(), or until start is reached.
 *
 * Relies on _getPrevNorm32() moving src backward; its definition is not part
 * of this section of the file.
 *
 * @return the resulting value of src
 */
static const UChar *
_findPreviousStarter(const UChar *start, const UChar *src,
                     uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
    UChar unit, unit2;

    while(start<src &&
          !_isTrueStarter(
              _getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, unit, unit2),
              ccOrQCMask, decompQCMask)
    ) {
        /* not a true starter: keep going backward */
    }
    return src;
}
/*
 * Starting at src, step forward over code points until one of these is found:
 * - the end of the text (src==limit)
 * - a code unit below minNoMaybe
 * - a code point whose norm32 has none of the cc/qcMask bits
 * - a lead surrogate that is not followed by a trail surrogate
 * - a code point that decomposes (per decompQCMask) into text with lead cc 0
 *   whose first code point has none of the qcMask bits
 *
 * @return the pointer to the code point at which the search stopped
 */
static const UChar *
_findNextStarter(const UChar *src, const UChar *limit,
                 uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
    const uint32_t ccOrQCMask=_NORM_CC_MASK|qcMask;

    while(src!=limit) {
        const UChar lead=*src;
        if(lead<minNoMaybe) {
            break;
        }

        uint32_t data=_getNorm32(lead);
        if((data&ccOrQCMask)==0) {
            break;
        }

        UChar trail=0;
        if(isNorm32LeadSurrogate(data)) {
            /* need the trail surrogate to get the data for the code point */
            if((src+1)==limit) {
                break;
            }
            trail=*(src+1);
            if(!UTF_IS_SECOND_SURROGATE(trail)) {
                break;
            }
            data=_getNorm32FromSurrogatePair(data, trail);
            if((data&ccOrQCMask)==0) {
                break;
            }
        }

        if(data&decompQCMask) {
            /* the code point decomposes: check the start of its decomposition */
            int32_t decompLength;
            uint8_t leadCC, trailCC;
            const UChar *decomp=_decompose(data, decompQCMask, decompLength, leadCC, trailCC);
            if(leadCC==0 && (_getNorm32(decomp, qcMask)&qcMask)==0) {
                break;
            }
        }

        /* not a stopping point: skip this code point */
        src+= trail==0 ? 1 : 2;
    }
    return src;
}
/*
 * Decompose [prevStarter..src[ into the side buffer and recompose it there.
 *
 * buffer/bufferCapacity are in/out: if the decomposition does not fit, the buffer
 * is replaced via u_growBufferFromStatic() (with stackBuffer as its context) and
 * the decomposition is repeated.
 *
 * @param length  output: number of UChars of the recomposed text in buffer
 * @param prevCC  output: set from _recompose() only if the decomposition has
 *                at least 2 code units; otherwise left unchanged
 * @return buffer, or NULL with *pErrorCode=U_MEMORY_ALLOCATION_ERROR if
 *         growing the buffer failed
 */
static const UChar *
_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
             const UChar *prevStarter, const UChar *src,
             uint32_t qcMask, uint8_t &prevCC,
             const UnicodeSet *nx,
             UErrorCode *pErrorCode) {
    const UBool compat=(UBool)((qcMask&_NORM_QC_NFKC)!=0);
    const int32_t partLength=(int32_t)(src-prevStarter);
    uint8_t trailCC;

    /* first attempt with the current buffer */
    length=_decompose(buffer, bufferCapacity,
                      prevStarter, partLength,
                      compat, nx,
                      trailCC);
    if(length>bufferCapacity) {
        /* too small: get a bigger buffer (nothing to preserve) and decompose again */
        if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        length=_decompose(buffer, bufferCapacity,
                          prevStarter, partLength,
                          compat, nx,
                          trailCC);
    }

    /* recompose in place; _recompose() moves the end pointer down as it composes */
    UChar *end=buffer+length;
    if(length>=2) {
        prevCC=_recompose(buffer, end, nx);
    }

    length=(int32_t)(end-buffer);
    return buffer;
}
/*
 * Try to compose the Jamo V/T c with the preceding code unit prev.
 *
 * - If norm32 marks c as a Jamo V (isJamoVTNorm32JamoV()), prev must be a Jamo L.
 *   A Jamo T at *src is consumed as well (src is advanced); with compat, a BMP
 *   character whose NFKD decomposition is a single Jamo T is accepted, too.
 * - Otherwise c is handled as a Jamo T and prev must be a Hangul syllable
 *   without a Jamo T.
 *
 * A result that is contained in nx is rejected; in the Jamo V case src is then
 * moved back over a consumed Jamo T.
 *
 * @param dest  if not 0, receives the composed syllable
 * @return TRUE if a syllable was composed, otherwise FALSE
 */
static inline UBool
_composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
               UBool compat, UChar *dest, const UnicodeSet *nx) {
    UChar syllable;

    if(!isJamoVTNorm32JamoV(norm32)) {
        /* c is handled as a Jamo T */
        if(!isHangulWithoutJamoT(prev)) {
            return FALSE;
        }
        syllable=(UChar)(prev+(c-JAMO_T_BASE));
        if(nx_contains(nx, syllable)) {
            return FALSE;
        }
        if(dest!=0) {
            *dest=syllable;
        }
        return TRUE;
    }

    /* c is a Jamo V: prev must be a Jamo L */
    const UChar lIndex=(UChar)(prev-JAMO_L_BASE);
    if(lIndex>=JAMO_L_COUNT) {
        return FALSE;
    }
    syllable=(UChar)(HANGUL_BASE+(lIndex*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);

    /* look for a following Jamo T */
    if(src!=limit) {
        const UChar next=*src;
        UChar tIndex=(UChar)(next-JAMO_T_BASE);

        if(tIndex<JAMO_T_COUNT) {
            /* regular Jamo T */
            ++src;
            syllable+=tIndex;
        } else if(compat) {
            /* a character that decomposes (NFKD) into exactly one Jamo T */
            const uint32_t nextData=_getNorm32(next);
            if(isNorm32Regular(nextData) && (nextData&_NORM_QC_NFKD)) {
                int32_t decompLength;
                uint8_t leadCC, trailCC;
                const UChar *decomp=_decompose(nextData, _NORM_QC_NFKD, decompLength, leadCC, trailCC);

                if(decompLength==1 && (tIndex=(UChar)(*decomp-JAMO_T_BASE))<JAMO_T_COUNT) {
                    ++src;
                    syllable+=tIndex;
                }
            }
        }
    }

    if(nx_contains(nx, syllable)) {
        /* excluded: give back the Jamo T if one was consumed */
        if(!isHangulWithoutJamoT(syllable)) {
            --src;
        }
        return FALSE;
    }
    if(dest!=0) {
        *dest=syllable;
    }
    return TRUE;
}
/*
 * Compose src into dest: the quick check flags selected by compat
 * (_NORM_QC_NFC or _NORM_QC_NFKC) decide which text is copied as is and which
 * text is decomposed and recomposed via _composePart().
 *
 * Does not terminate dest and does not check _haveData(); callers do that.
 *
 * @param dest         output buffer
 * @param destCapacity capacity of dest in UChars
 * @param src          input text
 * @param srcLength    length of src, or a negative value for NUL-terminated input
 * @param compat       FALSE: use the NFC data, TRUE: use the NFKC data
 * @param nx           set of excluded code points, passed to nx_contains() and helpers
 * @param pErrorCode   set to U_MEMORY_ALLOCATION_ERROR by _composePart() on failure
 * @return the full output length (counted even when dest is too small),
 *         or 0 if _composePart() failed
 */
static int32_t
_compose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, const UnicodeSet *nx,
UErrorCode *pErrorCode) {
UChar stackBuffer[_STACK_BUFFER_CAPACITY];
UChar *buffer;
int32_t bufferCapacity;
const UChar *limit, *prevSrc, *prevStarter;
uint32_t norm32, ccOrQCMask, qcMask;
int32_t destIndex, reorderStartIndex, length;
UChar c, c2, minNoMaybe;
uint8_t cc, prevCC;
/* select the minimum relevant code unit and the quick check flag for the mode */
if(!compat) {
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
qcMask=_NORM_QC_NFC;
} else {
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
qcMask=_NORM_QC_NFKC;
}
/* the side buffer for _composePart() starts out on the stack */
buffer=stackBuffer;
bufferCapacity=_STACK_BUFFER_CAPACITY;
prevStarter=src;
ccOrQCMask=_NORM_CC_MASK|qcMask;
destIndex=reorderStartIndex=0;
prevCC=0;
/* norm32 and c are always set in the loops below before they are used */
norm32=0;
c=0;
if(srcLength>=0) {
/* string with length */
limit=src+srcLength;
} else {
/* NUL-terminated string: limit==NULL selects the c==0 checks below */
limit=NULL;
}
U_ALIGN_CODE(16);
for(;;) {
/*
 * skip a run of code units that are below minNoMaybe
 * or have none of the cc/quick check bits set
 */
prevSrc=src;
if(limit==NULL) {
while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
prevCC=0;
++src;
}
} else {
while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
prevCC=0;
++src;
}
}
/* copy the skipped run all at once */
if(src!=prevSrc) {
length=(int32_t)(src-prevSrc);
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
}
destIndex+=length;
reorderStartIndex=destIndex;
/* prevStarter points at the last code point of the run (both units of a surrogate pair) */
prevStarter=src-1;
if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
--prevStarter;
}
prevSrc=src;
}
/* end of source reached? */
if(limit==NULL ? c==0 : src==limit) {
break;
}
/* c holds *src and norm32 is set for it */
++src;
if(isNorm32HangulOrJamo(norm32)) {
/*
 * Hangul/Jamo data: try to compose c with the preceding code unit;
 * the values are set here already for the "continue" below
 */
prevCC=cc=0;
reorderStartIndex=destIndex;
if(
destIndex>0 &&
_composeHangul(
*(prevSrc-1), c, norm32, src, limit, compat,
/* only pass an output pointer if the previous output unit was actually written */
destIndex<=destCapacity ? dest+(destIndex-1) : 0,
nx)
) {
prevStarter=src;
continue;
}
/* no composition: append c as is below */
c2=0;
length=1;
prevStarter=prevSrc;
} else {
if(isNorm32Regular(norm32)) {
c2=0;
length=1;
} else {
/* handled as a lead surrogate: get the data for the surrogate pair if there is one */
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
length=2;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
} else {
/* unpaired: nothing to do for it */
c2=0;
length=1;
norm32=0;
}
}
/* the current character (c, c2) is at [prevSrc..src[ */
if(nx_contains(nx, c, c2)) {
/* excluded: treat as having no data */
cc=0;
} else if((norm32&qcMask)==0) {
/* quick check flag not set: only the cc matters, append below */
cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
} else {
const UChar *p;
uint32_t decompQCMask;
/* the decomposition quick check flag that corresponds to qcMask */
decompQCMask=(qcMask<<2)&0xf;
/*
 * find the start of the piece to recompose:
 * either the current character or prevStarter
 */
if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
prevStarter=prevSrc;
} else {
/* back out what was already copied from [prevStarter..prevSrc[ */
destIndex-=(int32_t)(prevSrc-prevStarter);
}
/* find the end of the piece; moves src there */
src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
/* decompose and recompose [prevStarter..src[; length and prevCC are outputs */
p=_composePart(stackBuffer, buffer, bufferCapacity,
length,
prevStarter, src,
qcMask,
prevCC,
nx,
pErrorCode);
if(p==NULL) {
/* an error occurred; pErrorCode was set by _composePart() */
destIndex=0;
break;
}
/* append the recomposed piece, or only count if it does not fit */
if((destIndex+length)<=destCapacity) {
while(length>0) {
dest[destIndex++]=*p++;
--length;
}
} else {
destIndex+=length;
}
prevStarter=src;
continue;
}
}
/* append the single code point (c, c2), or only count if it does not fit */
if((destIndex+length)<=destCapacity) {
if(cc!=0 && cc<prevCC) {
/* out of order with the preceding text: insert at the right position */
UChar *reorderSplit=dest+destIndex;
destIndex+=length;
prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
} else {
dest[destIndex++]=c;
if(c2!=0) {
dest[destIndex++]=c2;
}
prevCC=cc;
}
} else {
destIndex+=length;
prevCC=cc;
}
}
/* release the side buffer if it was replaced by an allocated one */
if(buffer!=stackBuffer) {
uprv_free(buffer);
}
return destIndex;
}
/*
 * Compose src into dest (NFC if compat is FALSE, NFKC if compat is TRUE)
 * and terminate the result.
 *
 * @param options  passed to getNX() to get the set of excluded code points
 * @return the result of u_terminateUChars() for the output length of _compose(),
 *         or 0 if the data is not available or getNX() failed
 */
U_CAPI int32_t U_EXPORT2
unorm_compose(UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
              UBool compat, int32_t options,
              UErrorCode *pErrorCode) {
    if(!_haveData(*pErrorCode)) {
        return 0;
    }

    const UnicodeSet *exclusions=getNX(options, *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    const int32_t composedLength=_compose(dest, destCapacity,
                                          src, srcLength,
                                          compat, exclusions,
                                          pErrorCode);

    return u_terminateUChars(dest, destCapacity, composedLength, pErrorCode);
}
/*
 * Normalize src into dest for the given mode, using the set of excluded
 * code points nx, and terminate the result.
 *
 * Does not check its arguments and does not call _haveData() itself
 * (except indirectly through unorm_makeFCD() for UNORM_FCD).
 *
 * @return the result of u_terminateUChars() for the output length;
 *         0 with *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR for an unknown mode
 */
static int32_t
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
                        const UChar *src, int32_t srcLength,
                        UNormalizationMode mode, const UnicodeSet *nx,
                        UErrorCode *pErrorCode) {
    int32_t outLength;

    switch(mode) {
    case UNORM_NFD:
    case UNORM_NFKD: {
        /* decomposition; the trail cc output is not needed here */
        uint8_t unusedTrailCC;
        outLength=_decompose(dest, destCapacity,
                             src, srcLength,
                             (UBool)(mode==UNORM_NFKD), nx, unusedTrailCC);
        break;
    }
    case UNORM_NFC:
    case UNORM_NFKC:
        outLength=_compose(dest, destCapacity,
                           src, srcLength,
                           (UBool)(mode==UNORM_NFKC), nx, pErrorCode);
        break;
    case UNORM_FCD:
        /* unorm_makeFCD() terminates the output itself */
        return unorm_makeFCD(dest, destCapacity,
                             src, srcLength,
                             nx,
                             pErrorCode);
    case UNORM_NONE:
        /* plain copy, only if the whole string fits */
        if(srcLength==-1) {
            srcLength=u_strlen(src);
        }
        if(srcLength>0 && srcLength<=destCapacity) {
            uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
        }
        outLength=srcLength;
        break;
    default:
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    return u_terminateUChars(dest, destCapacity, outLength, pErrorCode);
}
/*
 * Normalize src into dest for the given mode.
 * Checks that the data is available, turns options into the set of excluded
 * code points via getNX() and forwards to the overload that takes that set.
 *
 * @return the forwarded result, or 0 if the data is not available or getNX() failed
 */
U_CAPI int32_t U_EXPORT2
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
                        const UChar *src, int32_t srcLength,
                        UNormalizationMode mode, int32_t options,
                        UErrorCode *pErrorCode) {
    if(!_haveData(*pErrorCode)) {
        return 0;
    }

    const UnicodeSet *exclusions=getNX(options, *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    return unorm_internalNormalize(dest, destCapacity,
                                   src, srcLength,
                                   mode, exclusions,
                                   pErrorCode);
}
/*
 * Public entry point: check the arguments, then normalize src into dest.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 if
 * - destCapacity is negative, or dest is NULL with a positive capacity
 * - src is NULL or srcLength is less than -1
 * - src starts inside the dest buffer, or dest starts inside src[0..srcLength[
 *
 * @return the result of unorm_internalNormalize(), or 0 on an argument error
 *         or if pErrorCode is NULL or already indicates a failure
 */
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *src, int32_t srcLength,
                UNormalizationMode mode, int32_t options,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* basic argument values */
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(src==NULL || srcLength<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* overlap between source and destination */
    if(dest!=NULL) {
        const UBool srcStartsInDest=(UBool)(src>=dest && src<(dest+destCapacity));
        const UBool destStartsInSrc=(UBool)(srcLength>0 && dest>=src && dest<(src+srcLength));
        if(srcStartsInDest || destStartsInSrc) {
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
    }

    return unorm_internalNormalize(dest, destCapacity,
                                   src, srcLength,
                                   mode, options,
                                   pErrorCode);
}
/*
 * Move the iterator back by one code point and return its norm32.
 * The caller must make sure that the iterator has a previous code unit.
 *
 * Outputs: c is the code unit that was read first (the last unit of the code
 * point); c2 is the lead surrogate if a surrogate pair was read, otherwise 0.
 *
 * @return 0 if c is below minC, for unpaired surrogates, and for surrogate pairs
 *         whose lead surrogate data has none of the mask bits;
 *         otherwise the norm32 of the code point
 */
static inline uint32_t
_getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
    c=(UChar)src.previous(&src);
    c2=0;

    if(c<minC) {
        return 0;
    }
    if(!UTF_IS_SURROGATE(c)) {
        return _getNorm32(c);
    }
    if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
        /* a lead surrogate at this position, or a trail surrogate at the start: unpaired */
        return 0;
    }

    /* c is a trail surrogate: look at the unit before it */
    c2=(UChar)src.previous(&src);
    if(!UTF_IS_FIRST_SURROGATE(c2)) {
        /* unpaired trail surrogate: undo the second step back */
        src.move(&src, 1, UITER_CURRENT);
        c2=0;
        return 0;
    }

    const uint32_t leadData=_getNorm32(c2);
    if((leadData&mask)==0) {
        /* nothing relevant for any pair with this lead surrogate */
        return 0;
    }
    return _getNorm32FromSurrogatePair(leadData, c);
}
/*
 * Type of the boundary test functions that are passed to
 * _findPreviousIterationBoundary(), such as _isPrevNFDSafe() and _isPrevTrueStarter().
 * Such a function moves src back by one code point, returns the code unit(s)
 * in c and c2, and returns whether that code point is a boundary.
 */
typedef UBool
IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
/*
 * Move src back by one code point (see _getPrevNorm32()) and return the result
 * of _isNFDSafe() for it; the decomposition mask passed to _isNFDSafe() is the
 * quick check part of ccOrQCMask.
 */
static UBool
_isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
    const uint32_t data=_getPrevNorm32(src, minC, ccOrQCMask, c, c2);
    const uint32_t qcPart=ccOrQCMask&_NORM_QC_MASK;
    return _isNFDSafe(data, ccOrQCMask, qcPart);
}
/*
 * Move src back by one code point (see _getPrevNorm32()) and return the result
 * of _isTrueStarter() for it. The decomposition quick check mask is derived
 * from ccOrQCMask by shifting it left by 2 and keeping the low 4 bits.
 */
static UBool
_isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
    const uint32_t decompQCMask=(ccOrQCMask<<2)&0xf;
    const uint32_t data=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
    return _isTrueStarter(data, ccOrQCMask, decompQCMask);
}
/*
 * Iterate backward from the current position of src until isPrevBoundary()
 * reports a boundary or the start is reached, and collect the code units read.
 *
 * The buffer is filled from its end toward its start. On entry, buffer is the
 * caller's initial buffer; it is used as the context for u_growBufferFromStatic()
 * when more room is needed, in which case buffer/bufferCapacity are replaced.
 *
 * @param startIndex  output: index of the first collected code unit in buffer
 * @return the number of collected code units (they end at bufferCapacity),
 *         or 0 with *pErrorCode=U_MEMORY_ALLOCATION_ERROR if growing failed;
 *         src is then moved to its start
 */
static int32_t
_findPreviousIterationBoundary(UCharIterator &src,
                               IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
                               UChar *&buffer, int32_t &bufferCapacity,
                               int32_t &startIndex,
                               UErrorCode *pErrorCode) {
    UChar *const initialBuffer=buffer;
    UChar unit, unit2;

    startIndex=bufferCapacity;

    while(src.hasPrevious(&src)) {
        const UBool atBoundary=isPrevBoundary(src, minC, mask, unit, unit2);

        /* make room at the front for one or two code units */
        const int32_t needed= unit2==0 ? 1 : 2;
        if(startIndex<needed) {
            const int32_t oldCapacity=bufferCapacity;

            if(!u_growBufferFromStatic(initialBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, oldCapacity)) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                src.move(&src, 0, UITER_START);
                return 0;
            }

            /* the old contents are at the start of the new buffer: move them to its end */
            const int32_t shift=bufferCapacity-oldCapacity;
            uprv_memmove(buffer+shift, buffer, oldCapacity*U_SIZEOF_UCHAR);
            startIndex+=shift;
        }

        /* prepend the code point; unit2, if any, is the unit that comes first in the text */
        buffer[--startIndex]=unit;
        if(unit2!=0) {
            buffer[--startIndex]=unit2;
        }

        if(atBoundary) {
            break;
        }
    }

    return bufferCapacity-startIndex;
}
/*
 * Iterate backward from the current position of src to the previous boundary
 * for the given mode, and write the text between the boundary and the original
 * position to dest, optionally normalized.
 *
 * For UNORM_NONE, exactly one code point is read (a surrogate pair counts as one).
 *
 * @param src                the iterator; left at the boundary that was found
 * @param dest               output buffer
 * @param destCapacity       capacity of dest in UChars
 * @param mode               the normalization mode
 * @param options            passed on to unorm_internalNormalize()
 * @param doNormalize        if TRUE, the text is normalized; otherwise it is copied
 * @param pNeededToNormalize if not NULL, set to FALSE first and then, if doNormalize
 *                           and normalization succeeded, to whether the output
 *                           differs from the source text
 * @param pErrorCode         in/out error code
 * @return the length of the full output, as returned by u_terminateUChars()
 *         or unorm_internalNormalize(); 0 on an argument error
 */
U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator *src,
               UChar *dest, int32_t destCapacity,
               UNormalizationMode mode, int32_t options,
               UBool doNormalize, UBool *pNeededToNormalize,
               UErrorCode *pErrorCode) {
    UChar stackBuffer[100];
    UChar *buffer=NULL;
    IsPrevBoundaryFn *isPreviousBoundary=NULL;
    uint32_t mask=0;
    int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
    int32_t c=0, c2=0;
    UChar minC=0;

    /* check argument values */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        src==NULL
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(!_haveData(*pErrorCode)) {
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }

    /* select the boundary test, the minimum relevant code unit and the data mask */
    switch(mode) {
    case UNORM_NFD:
    case UNORM_FCD:
        isPreviousBoundary=_isPrevNFDSafe;
        minC=_NORM_MIN_WITH_LEAD_CC;
        mask=_NORM_CC_MASK|_NORM_QC_NFD;
        break;
    case UNORM_NFKD:
        isPreviousBoundary=_isPrevNFDSafe;
        minC=_NORM_MIN_WITH_LEAD_CC;
        mask=_NORM_CC_MASK|_NORM_QC_NFKD;
        break;
    case UNORM_NFC:
        isPreviousBoundary=_isPrevTrueStarter;
        minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
        mask=_NORM_CC_MASK|_NORM_QC_NFC;
        break;
    case UNORM_NFKC:
        isPreviousBoundary=_isPrevTrueStarter;
        minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
        mask=_NORM_CC_MASK|_NORM_QC_NFKC;
        break;
    case UNORM_NONE:
        /* no normalization: just read one code point */
        destLength=0;
        if((c=src->previous(src))>=0) {
            destLength=1;
            if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
                if(UTF_IS_LEAD(c2)) {
                    /*
                     * The iterator has consumed the whole surrogate pair, so the
                     * output length is 2 regardless of the capacity; this lets
                     * u_terminateUChars() report an overflow instead of silently
                     * returning a lone lead surrogate with length 1.
                     */
                    destLength=2;
                    if(destCapacity>=2) {
                        dest[1]=(UChar)c; /* trail surrogate */
                    }
                    c=c2; /* lead surrogate, written below */
                } else {
                    /* not a pair: undo reading the second code unit */
                    src->move(src, 1, UITER_CURRENT);
                }
            }

            if(destCapacity>0) {
                dest[0]=(UChar)c;
            }
        }
        return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    default:
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* collect the text back to the previous boundary in an intermediate buffer */
    buffer=stackBuffer;
    bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
    bufferLength=_findPreviousIterationBoundary(*src,
                                                isPreviousBoundary, minC, mask,
                                                buffer, bufferCapacity,
                                                startIndex,
                                                pErrorCode);
    if(bufferLength>0) {
        if(doNormalize) {
            destLength=unorm_internalNormalize(dest, destCapacity,
                                               buffer+startIndex, bufferLength,
                                               mode, options,
                                               pErrorCode);
            if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
                *pNeededToNormalize=
                    (UBool)(destLength!=bufferLength ||
                            0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
            }
        } else {
            /* just copy the source text, as much as fits */
            if(destCapacity>0) {
                uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
            }
            destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
        }
    } else {
        destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    /* release the intermediate buffer if it was replaced by an allocated one */
    if(buffer!=stackBuffer) {
        uprv_free(buffer);
    }

    return destLength;
}
/*
 * Move the iterator forward by one code point and return its norm32.
 * The caller must make sure that the iterator has a next code unit.
 *
 * Outputs: c is the first code unit read; c2 is the trail surrogate if a
 * surrogate pair was read, otherwise 0.
 *
 * @return 0 if c is below minC, for a lead surrogate that is not followed by a
 *         trail surrogate, and for surrogate pairs whose lead surrogate data has
 *         none of the mask bits; otherwise the norm32 of the code point
 */
static inline uint32_t
_getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
    c=(UChar)src.next(&src);
    c2=0;

    if(c<minC) {
        return 0;
    }

    const uint32_t data=_getNorm32(c);
    if(!UTF_IS_FIRST_SURROGATE(c)) {
        return data;
    }

    /* lead surrogate: peek at the next unit without moving */
    if(!src.hasNext(&src) || !UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
        /* unpaired lead surrogate */
        c2=0;
        return 0;
    }

    /* surrogate pair: skip the trail surrogate */
    src.move(&src, 1, UITER_CURRENT);
    if((data&mask)==0) {
        /* nothing relevant for any pair with this lead surrogate */
        return 0;
    }
    return _getNorm32FromSurrogatePair(data, c2);
}
/*
 * Type of the boundary test functions that are passed to
 * _findNextIterationBoundary(), such as _isNextNFDSafe() and _isNextTrueStarter().
 * Such a function moves src forward by one code point, returns the code unit(s)
 * in c and c2, and returns whether that code point is a boundary.
 */
typedef UBool
IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
/*
 * Move src forward by one code point (see _getNextNorm32()) and return the
 * result of _isNFDSafe() for it; the decomposition mask passed to _isNFDSafe()
 * is the quick check part of ccOrQCMask.
 */
static UBool
_isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
    const uint32_t data=_getNextNorm32(src, minC, ccOrQCMask, c, c2);
    const uint32_t qcPart=ccOrQCMask&_NORM_QC_MASK;
    return _isNFDSafe(data, ccOrQCMask, qcPart);
}
/*
 * Move src forward by one code point (see _getNextNorm32()) and return the
 * result of _isTrueStarter() for it. The decomposition quick check mask is
 * derived from ccOrQCMask by shifting it left by 2 and keeping the low 4 bits.
 */
static UBool
_isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
    const uint32_t decompQCMask=(ccOrQCMask<<2)&0xf;
    const uint32_t data=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
    return _isTrueStarter(data, ccOrQCMask, decompQCMask);
}
/*
 * Iterate forward from the current position of src and collect code units in
 * buffer until isNextBoundary() reports a boundary or the end is reached.
 *
 * The first code point is always taken without looking at its properties.
 * The iterator is left in front of the boundary code point.
 * On entry, buffer is the caller's initial buffer; it is used as the context
 * for u_growBufferFromStatic() when more room is needed, in which case
 * buffer/bufferCapacity are replaced.
 *
 * @return the number of collected code units; 0 if src has no next code unit,
 *         or 0 with *pErrorCode=U_MEMORY_ALLOCATION_ERROR if growing failed
 *         (src is then moved to its limit)
 */
static int32_t
_findNextIterationBoundary(UCharIterator &src,
                           IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
                           UChar *&buffer, int32_t &bufferCapacity,
                           UErrorCode *pErrorCode) {
    if(!src.hasNext(&src)) {
        return 0;
    }

    UChar *const initialBuffer=buffer;
    UChar unit, unit2;
    int32_t count;

    /* take the first code point unconditionally */
    buffer[0]=unit=(UChar)src.next(&src);
    count=1;
    if(UTF_IS_FIRST_SURROGATE(unit) && src.hasNext(&src)) {
        if(UTF_IS_SECOND_SURROGATE(unit2=(UChar)src.next(&src))) {
            buffer[count++]=unit2;
        } else {
            /* not a trail surrogate: put it back */
            src.move(&src, -1, UITER_CURRENT);
        }
    }

    /* take all following code points up to the next boundary */
    while(src.hasNext(&src)) {
        if(isNextBoundary(src, minC, mask, unit, unit2)) {
            /* step back over the boundary code point and stop */
            src.move(&src, unit2==0 ? -1 : -2, UITER_CURRENT);
            break;
        }

        /* grow the buffer only if the code point does not fit */
        if(count+(unit2==0 ? 1 : 2)>bufferCapacity &&
            !u_growBufferFromStatic(initialBuffer, &buffer, &bufferCapacity,
                                    2*bufferCapacity,
                                    count)
        ) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            src.move(&src, 0, UITER_LIMIT);
            return 0;
        }

        buffer[count++]=unit;
        if(unit2!=0) {
            buffer[count++]=unit2;
        }
    }

    return count;
}
/*
 * Iterate forward from the current position of src to the next boundary
 * for the given mode, and write the text between the original position and
 * the boundary to dest, optionally normalized.
 *
 * For UNORM_NONE, exactly one code point is read (a surrogate pair counts as one).
 *
 * @param src                the iterator; left at the boundary that was found
 * @param dest               output buffer
 * @param destCapacity       capacity of dest in UChars
 * @param mode               the normalization mode
 * @param options            passed on to unorm_internalNormalize()
 * @param doNormalize        if TRUE, the text is normalized; otherwise it is copied
 * @param pNeededToNormalize if not NULL, set to FALSE first and then, if doNormalize
 *                           and normalization succeeded, to whether the output
 *                           differs from the source text
 * @param pErrorCode         in/out error code
 * @return the length of the full output, as returned by u_terminateUChars()
 *         or unorm_internalNormalize(); 0 on an argument error
 */
U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator *src,
           UChar *dest, int32_t destCapacity,
           UNormalizationMode mode, int32_t options,
           UBool doNormalize, UBool *pNeededToNormalize,
           UErrorCode *pErrorCode) {
    UChar stackBuffer[100];
    UChar *buffer=NULL;
    IsNextBoundaryFn *isNextBoundary=NULL;
    uint32_t mask=0;
    int32_t bufferLength=0, bufferCapacity=0, destLength=0;
    int32_t c=0, c2=0;
    UChar minC=0;

    /* check argument values */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        src==NULL
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(!_haveData(*pErrorCode)) {
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }

    /* select the boundary test, the minimum relevant code unit and the data mask */
    switch(mode) {
    case UNORM_NFD:
    case UNORM_FCD:
        isNextBoundary=_isNextNFDSafe;
        minC=_NORM_MIN_WITH_LEAD_CC;
        mask=_NORM_CC_MASK|_NORM_QC_NFD;
        break;
    case UNORM_NFKD:
        isNextBoundary=_isNextNFDSafe;
        minC=_NORM_MIN_WITH_LEAD_CC;
        mask=_NORM_CC_MASK|_NORM_QC_NFKD;
        break;
    case UNORM_NFC:
        isNextBoundary=_isNextTrueStarter;
        minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
        mask=_NORM_CC_MASK|_NORM_QC_NFC;
        break;
    case UNORM_NFKC:
        isNextBoundary=_isNextTrueStarter;
        minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
        mask=_NORM_CC_MASK|_NORM_QC_NFKC;
        break;
    case UNORM_NONE:
        /* no normalization: just read one code point */
        destLength=0;
        if((c=src->next(src))>=0) {
            destLength=1;
            if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
                if(UTF_IS_TRAIL(c2)) {
                    /*
                     * The iterator has consumed the whole surrogate pair, so the
                     * output length is 2 regardless of the capacity; this lets
                     * u_terminateUChars() report an overflow instead of silently
                     * returning a lone lead surrogate with length 1.
                     */
                    destLength=2;
                    if(destCapacity>=2) {
                        dest[1]=(UChar)c2; /* trail surrogate */
                    }
                    /* the lead surrogate is written below */
                } else {
                    /* not a pair: undo reading the second code unit */
                    src->move(src, -1, UITER_CURRENT);
                }
            }

            if(destCapacity>0) {
                dest[0]=(UChar)c;
            }
        }
        return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    default:
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* collect the text up to the next boundary in an intermediate buffer */
    buffer=stackBuffer;
    bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
    bufferLength=_findNextIterationBoundary(*src,
                                            isNextBoundary, minC, mask,
                                            buffer, bufferCapacity,
                                            pErrorCode);
    if(bufferLength>0) {
        if(doNormalize) {
            destLength=unorm_internalNormalize(dest, destCapacity,
                                               buffer, bufferLength,
                                               mode, options,
                                               pErrorCode);
            if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
                *pNeededToNormalize=
                    (UBool)(destLength!=bufferLength ||
                            0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
            }
        } else {
            /* just copy the source text, as much as fits */
            if(destCapacity>0) {
                uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
            }
            destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
        }
    } else {
        destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    /* release the intermediate buffer if it was replaced by an allocated one */
    if(buffer!=stackBuffer) {
        uprv_free(buffer);
    }

    return destLength;
}
/*
 * Concatenate left and right so that only the text around the seam is normalized:
 *
 *   buffer = left[leftBoundary..leftLength[ + right[0..rightBoundary[
 *   dest   = left[0..leftBoundary[ + normalize(buffer) + right[rightBoundary..rightLength[
 *
 * leftBoundary and rightBoundary are found with unorm_previous() and unorm_next().
 * left may be the same pointer as dest; right must not overlap with dest.
 *
 * @param left, leftLength    first string; a length of -1 means NUL-terminated
 * @param right, rightLength  second string; a length of -1 means NUL-terminated
 * @param dest, destCapacity  output buffer and its capacity in UChars
 * @param mode, options       passed on to unorm_previous(), unorm_next() and
 *                            unorm_internalNormalize()
 * @param pErrorCode          in/out error code
 * @return the length of the full output, as returned by u_terminateUChars();
 *         0 on an argument error or if memory allocation failed
 */
U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
                  const UChar *right, int32_t rightLength,
                  UChar *dest, int32_t destCapacity,
                  UNormalizationMode mode, int32_t options,
                  UErrorCode *pErrorCode) {
    UChar stackBuffer[100];
    UChar *buffer;
    int32_t bufferLength, bufferCapacity;

    UCharIterator iter;
    int32_t leftBoundary, rightBoundary, destLength;

    /* check argument values */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        left==NULL || leftLength<-1 ||
        right==NULL || rightLength<-1
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* check for overlapping right and destination; left==dest is allowed */
    if( dest!=NULL &&
        ((right>=dest && right<(dest+destCapacity)) ||
         (rightLength>0 && dest>=right && dest<(right+rightLength)))
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* set up the intermediate buffer */
    buffer=stackBuffer;
    bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);

    /*
     * find a boundary at the end of the left string
     * and copy the end part into the buffer
     */
    uiter_setString(&iter, left, leftLength);
    iter.index=leftLength=iter.length; /* start at the end of the left string */

    bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
                                mode, options,
                                FALSE, NULL,
                                pErrorCode);
    leftBoundary=iter.index;
    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
        *pErrorCode=U_ZERO_ERROR;
        /* nothing to preserve: the whole end part is copied again right below */
        if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }

        /* the boundary is known already: copy directly from the left string */
        uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
    }

    /*
     * find a boundary at the beginning of the right string
     * and append the beginning part to the buffer
     */
    uiter_setString(&iter, right, rightLength);
    rightLength=iter.length; /* in case it was -1 */

    rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
                             mode, options,
                             FALSE, NULL,
                             pErrorCode);
    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
        *pErrorCode=U_ZERO_ERROR;
        /*
         * Preserve the bufferLength code units from the left string that are
         * already in the buffer; only the part from the right string is copied
         * again below. Passing 0 here would leave buffer[0..bufferLength[ unset
         * in the new buffer.
         */
        if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, bufferLength)) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }

        /* the boundary is known already: copy directly from the right string */
        uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
    }

    bufferLength+=rightBoundary;

    /* copy left[0..leftBoundary[ to dest, as much as fits */
    if(left!=dest && leftBoundary>0 && destCapacity>0) {
        uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
    }
    destLength=leftBoundary;

    /* append the normalization of the buffer to dest, or only count if dest is full */
    if(destCapacity>destLength) {
        destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
                                            buffer, bufferLength,
                                            mode, options,
                                            pErrorCode);
    } else {
        destLength+=unorm_internalNormalize(NULL, 0,
                                            buffer, bufferLength,
                                            mode, options,
                                            pErrorCode);
    }

    /* append right[rightBoundary..rightLength[ to dest, as much as fits */
    right+=rightBoundary;
    rightLength-=rightBoundary;
    if(rightLength>0 && destCapacity>destLength) {
        uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
    }
    destLength+=rightLength;

    /* release the intermediate buffer if it was replaced by an allocated one */
    if(buffer!=stackBuffer) {
        uprv_free(buffer);
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
#else
/*
 * Replacement used when normalization is configured out (UCONFIG_NO_NORMALIZATION):
 * there is never any data. Sets U_INTERNAL_PROGRAM_ERROR unless errorCode
 * already indicates a failure, and always returns FALSE.
 */
static inline UBool
_haveData(UErrorCode &errorCode) {
    if(!U_FAILURE(errorCode)) {
        /* do not overwrite an earlier failure */
        errorCode=U_INTERNAL_PROGRAM_ERROR;
    }
    return FALSE;
}
/*
 * Replacement used when normalization is configured out (UCONFIG_NO_NORMALIZATION):
 * ignores its (unnamed) parameters and always returns NULL, which
 * unorm_cmpEquivFold() treats as "no decomposition".
 */
static inline const UChar *
_decompose(UChar32 , UChar [4], int32_t &) {
return NULL;
}
#endif
/*
 * One saved level of a string in unorm_cmpEquivFold():
 * the start, current position and limit of the text that is resumed
 * after a case folding or decomposition buffer has been read to its end.
 * start==NULL marks an empty level that is skipped when popping.
 */
struct CmpEquivLevel {
const UChar *start, *s, *limit;
};
typedef struct CmpEquivLevel CmpEquivLevel;
/*
 * Compare two strings code unit by code unit; where they differ, replace the
 * differing code points by their case folding (option U_COMPARE_IGNORE_CASE)
 * and/or their decomposition (option _COMPARE_EQUIV) and continue comparing
 * inside those replacements.
 *
 * Each string has a small stack of levels (see CmpEquivLevel):
 * level 0 is the original string; a case folding is pushed only from level 0,
 * a decomposition is pushed while the level is below 2.
 *
 * No argument checking is done here.
 *
 * @param s1, length1  first string; length -1 means NUL-terminated
 * @param s2, length2  second string; length -1 means NUL-terminated
 * @param options      bit set of _COMPARE_EQUIV, U_COMPARE_IGNORE_CASE,
 *                     _STRNCMP_STYLE, U_COMPARE_CODE_POINT_ORDER; also passed
 *                     to u_internalFoldCase()
 * @param pErrorCode   in/out error code, used by the data availability checks
 * @return 0 if the strings compare equal, -1 if string 1 ends first,
 *         1 if string 2 ends first, otherwise the difference of the (possibly
 *         adjusted) differing code units; 0 if required data is not available
 */
U_CAPI int32_t U_EXPORT2
unorm_cmpEquivFold(const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
uint32_t options,
UErrorCode *pErrorCode) {
/* current-level start/limit; s1/s2 are the current positions */
const UChar *start1, *start2, *limit1, *limit2;
/* decomposition pointer and length */
const UChar *p;
int32_t length;
/* stacks of saved start/current/limit */
CmpEquivLevel stack1[2], stack2[2];
/* buffers passed to _decompose() */
UChar decomp1[4], decomp2[4];
/* case folding buffers */
UChar fold1[32], fold2[32];
/* current level per string */
int32_t level1, level2;
/* current code units, and code points for the lookups */
int32_t c1, c2, cp1, cp2;
/* is the data available that the options require? */
if( ((options&_COMPARE_EQUIV)!=0 && !_haveData(*pErrorCode)) ||
((options&U_COMPARE_IGNORE_CASE)!=0 && !uprv_haveProperties(pErrorCode))
) {
return 0;
}
/* initialize; a NULL limit means NUL-terminated */
start1=s1;
if(length1==-1) {
limit1=NULL;
} else {
limit1=s1+length1;
}
start2=s2;
if(length2==-1) {
limit2=NULL;
} else {
limit2=s2+length2;
}
level1=level2=0;
c1=c2=-1;
/* comparison loop */
for(;;) {
/*
 * at this point a code unit value of -1 means "fetch another code unit";
 * after fetching it means "this string is finished"
 */
if(c1<0) {
/* get the next code unit from string 1, post-increment */
for(;;) {
if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
if(level1==0) {
/* end of the original string */
c1=-1;
break;
}
} else {
++s1;
break;
}
/* end of a replacement buffer: pop to the previous non-empty level */
do {
--level1;
start1=stack1[level1].start;
} while(start1==NULL);
s1=stack1[level1].s;
limit1=stack1[level1].limit;
}
}
if(c2<0) {
/* get the next code unit from string 2, post-increment */
for(;;) {
if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
if(level2==0) {
/* end of the original string */
c2=-1;
break;
}
} else {
++s2;
break;
}
/* end of a replacement buffer: pop to the previous non-empty level */
do {
--level2;
start2=stack2[level2].start;
} while(start2==NULL);
s2=stack2[level2].s;
limit2=stack2[level2].limit;
}
}
/*
 * compare c1 and c2:
 * equal and both -1: both strings are finished, the strings are equal;
 * equal otherwise: fetch new code units;
 * only one is -1: that string ended first
 */
if(c1==c2) {
if(c1<0) {
return 0; }
c1=c2=-1; continue;
} else if(c1<0) {
return -1; } else if(c2<0) {
return 1; }
/* now c1!=c2 && c1>=0 && c2>=0 */
/* get the complete code point for c1 if it is part of a surrogate pair */
cp1=c1;
if(UTF_IS_SURROGATE(c1)) {
UChar c;
if(UTF_IS_SURROGATE_FIRST(c1)) {
/* lead surrogate: the trail surrogate is at s1; s1 is advanced only if cp1 is replaced */
if(s1!=limit1 && UTF_IS_TRAIL(c=*s1)) {
cp1=UTF16_GET_PAIR_VALUE(c1, c);
}
} else {
/* trail surrogate: the lead surrogate is two units before s1 */
if(start1<=(s1-2) && UTF_IS_LEAD(c=*(s1-2))) {
cp1=UTF16_GET_PAIR_VALUE(c, c1);
}
}
}
/* same for c2 */
cp2=c2;
if(UTF_IS_SURROGATE(c2)) {
UChar c;
if(UTF_IS_SURROGATE_FIRST(c2)) {
if(s2!=limit2 && UTF_IS_TRAIL(c=*s2)) {
cp2=UTF16_GET_PAIR_VALUE(c2, c);
}
} else {
if(start2<=(s2-2) && UTF_IS_LEAD(c=*(s2-2))) {
cp2=UTF16_GET_PAIR_VALUE(c, c2);
}
}
}
/*
 * try to go down one level for one of the strings;
 * continue with the main loop as soon as something was replaced
 */
/* case folding of cp1, only from the original string */
if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
(length=u_internalFoldCase((UChar32)cp1, fold1, 32, options))>=0
) {
if(UTF_IS_SURROGATE(c1)) {
if(UTF_IS_SURROGATE_FIRST(c1)) {
/* skip the trail surrogate of the replaced code point */
++s1;
} else {
/*
 * the difference was found at the trail surrogate:
 * step string 2 back so that the replacement is compared
 * with its code unit at the lead surrogate position
 */
--s2;
c2=*(s2-1);
}
}
/* push the current level */
stack1[0].start=start1;
stack1[0].s=s1;
stack1[0].limit=limit1;
++level1;
/* continue reading from the case folding */
start1=s1=fold1;
limit1=fold1+length;
c1=-1;
continue;
}
/* case folding of cp2, only from the original string */
if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
(length=u_internalFoldCase((UChar32)cp2, fold2, 32, options))>=0
) {
if(UTF_IS_SURROGATE(c2)) {
if(UTF_IS_SURROGATE_FIRST(c2)) {
/* skip the trail surrogate of the replaced code point */
++s2;
} else {
/* as above, with the roles of the strings exchanged */
--s1;
c1=*(s1-1);
}
}
/* push the current level */
stack2[0].start=start2;
stack2[0].s=s2;
stack2[0].limit=limit2;
++level2;
/* continue reading from the case folding */
start2=s2=fold2;
limit2=fold2+length;
c2=-1;
continue;
}
/* decomposition of cp1 */
if( level1<2 && (options&_COMPARE_EQUIV) &&
0!=(p=_decompose((UChar32)cp1, decomp1, length))
) {
if(UTF_IS_SURROGATE(c1)) {
if(UTF_IS_SURROGATE_FIRST(c1)) {
/* skip the trail surrogate of the replaced code point */
++s1;
} else {
/* as for case folding above */
--s2;
c2=*(s2-1);
}
}
/* push the current level */
stack1[level1].start=start1;
stack1[level1].s=s1;
stack1[level1].limit=limit1;
++level1;
/* if the case folding level was skipped, mark it as empty */
if(level1<2) {
stack1[level1++].start=NULL;
}
/* continue reading from the decomposition */
start1=s1=p;
limit1=p+length;
c1=-1;
continue;
}
/* decomposition of cp2 */
if( level2<2 && (options&_COMPARE_EQUIV) &&
0!=(p=_decompose((UChar32)cp2, decomp2, length))
) {
if(UTF_IS_SURROGATE(c2)) {
if(UTF_IS_SURROGATE_FIRST(c2)) {
/* skip the trail surrogate of the replaced code point */
++s2;
} else {
/* as for case folding above */
--s1;
c1=*(s1-1);
}
}
/* push the current level */
stack2[level2].start=start2;
stack2[level2].s=s2;
stack2[level2].limit=limit2;
++level2;
/* if the case folding level was skipped, mark it as empty */
if(level2<2) {
stack2[level2++].start=NULL;
}
/* continue reading from the decomposition */
start2=s2=p;
limit2=p+length;
c2=-1;
continue;
}
/*
 * nothing more to replace on either side: return the difference.
 * For code point order, code units >=0xd800 that are not part of a
 * surrogate pair are shifted down by 0x2800 so that they sort below
 * the units of surrogate pairs.
 */
if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
if(
(c1<=0xdbff && s1!=limit1 && UTF_IS_TRAIL(*s1)) ||
(UTF_IS_TRAIL(c1) && start1!=(s1-1) && UTF_IS_LEAD(*(s1-2)))
) {
/* part of a surrogate pair: leave as is */
} else {
c1-=0x2800;
}
if(
(c2<=0xdbff && s2!=limit2 && UTF_IS_TRAIL(*s2)) ||
(UTF_IS_TRAIL(c2) && start2!=(s2-1) && UTF_IS_LEAD(*(s2-2)))
) {
/* part of a surrogate pair: leave as is */
} else {
c2-=0x2800;
}
}
return c1-c2;
}
}
#if !UCONFIG_NO_NORMALIZATION
/*
 * Compare two strings for canonical equivalence, with the case folding and
 * ordering behavior selected by options.
 *
 * Unless the caller states via UNORM_INPUT_IS_FCD that the input is in FCD,
 * each string is quick-checked and, if the check does not return UNORM_YES,
 * normalized first (mode UNORM_FCD, or UNORM_NFD when
 * U_FOLD_CASE_EXCLUDE_SPECIAL_I is set, in which case UNORM_INPUT_IS_FCD is
 * ignored). The comparison itself is done by unorm_cmpEquivFold() with
 * _COMPARE_EQUIV added to the options.
 *
 * @param s1, length1  first string; length -1 means NUL-terminated
 * @param s2, length2  second string; length -1 means NUL-terminated
 * @param options      comparison options; the bits above
 *                     UNORM_COMPARE_NORM_OPTIONS_SHIFT are passed to getNX()
 * @param pErrorCode   in/out error code
 * @return the result of unorm_cmpEquivFold(), or 0 on any error
 */
U_CAPI int32_t U_EXPORT2
unorm_compare(const UChar *s1, int32_t length1,
              const UChar *s2, int32_t length2,
              uint32_t options,
              UErrorCode *pErrorCode) {
    UChar local1[300], local2[300];
    UChar *heap1=0, *heap2=0;
    const UnicodeSet *exclusions;
    UNormalizationMode mode;
    int32_t result=0;

    /* argument checking */
    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s1==0 || length1<-1 || s2==0 || length2<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* required data */
    if(!_haveData(*pErrorCode)) {
        return 0;
    }
    if(!uprv_haveProperties(pErrorCode)) {
        return 0;
    }

    exclusions=getNX((int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT), *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    options|=_COMPARE_EQUIV;

    /* select the normalization that is applied before the comparison */
    if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
        mode=UNORM_NFD;
        options&=~UNORM_INPUT_IS_FCD;
    } else {
        mode=UNORM_FCD;
    }

    if((options&UNORM_INPUT_IS_FCD)==0) {
        /* check both strings first, then normalize those that need it */
        const UBool ok1=(UBool)(UNORM_YES==_quickCheck(s1, length1, mode, TRUE, exclusions, pErrorCode));
        const UBool ok2=(UBool)(UNORM_YES==_quickCheck(s2, length2, mode, TRUE, exclusions, pErrorCode));
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        if(!ok1) {
            /* try the local buffer first */
            int32_t normLength=unorm_internalNormalize(local1, LENGTHOF(local1),
                                                       s1, length1,
                                                       mode, exclusions,
                                                       pErrorCode);
            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                /* too long: allocate the preflighted length and normalize again */
                heap1=(UChar *)uprv_malloc(normLength*U_SIZEOF_UCHAR);
                if(heap1==0) {
                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }

                *pErrorCode=U_ZERO_ERROR;
                normLength=unorm_internalNormalize(heap1, normLength,
                                                   s1, length1,
                                                   mode, exclusions,
                                                   pErrorCode);
                if(U_FAILURE(*pErrorCode)) {
                    goto cleanup;
                }

                s1=heap1;
            } else {
                s1=local1;
            }
            length1=normLength;
        }

        if(!ok2) {
            /* try the local buffer first */
            int32_t normLength=unorm_internalNormalize(local2, LENGTHOF(local2),
                                                       s2, length2,
                                                       mode, exclusions,
                                                       pErrorCode);
            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                /* too long: allocate the preflighted length and normalize again */
                heap2=(UChar *)uprv_malloc(normLength*U_SIZEOF_UCHAR);
                if(heap2==0) {
                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }

                *pErrorCode=U_ZERO_ERROR;
                normLength=unorm_internalNormalize(heap2, normLength,
                                                   s2, length2,
                                                   mode, exclusions,
                                                   pErrorCode);
                if(U_FAILURE(*pErrorCode)) {
                    goto cleanup;
                }

                s2=heap2;
            } else {
                s2=local2;
            }
            length2=normLength;
        }
    }

    if(U_SUCCESS(*pErrorCode)) {
        result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
    }

cleanup:
    /* release whatever was allocated above */
    if(heap1!=0) {
        uprv_free(heap1);
    }
    if(heap2!=0) {
        uprv_free(heap2);
    }

    return result;
}
#endif