store.c   [plain text]


/*
*******************************************************************************
*
*   Copyright (C) 2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  store.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004aug28
*   created by: Markus W. Scherer
*
*   Store Unicode case mapping properties efficiently for
*   random access.
*/

#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "utrie.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "propsvec.h"
#include "gencase.h"

/* Unicode case mapping properties file format ---------------------------------

The file format prepared and written here contains several data
structures that store indexes or data.

Before the data contents described below, there are the headers required by
the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.

The following is a description of format version 1 .

The file contains the following structures:

    const int32_t indexes[i0] with values i0, i1, ...:
    (see UCASE_IX_... constants for names of indexes)

    i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
    i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
    i2 trieSize; -- size in bytes of the case mapping properties trie
    i3 exceptionsLength; -- length in uint16_t of the exceptions array

    i4..i14 reservedIndexes; -- reserved values; 0 for now

    i15 maxFullLength; -- maximum length of a full case mapping/folding string


    Serizalied trie, see utrie.h;

    const uint16_t exceptions[exceptionsLength];


Trie data word:
Bits
if(exception) {
    15..4   unsigned exception index
} else {
    if(not uncased) {
        15..6   signed delta to simple case mapping code point
                (add delta to input code point)
    } else {
            6   the code point is case-ignorable
                (U+0307 is also case-ignorable but has an exception)
    }
     5..4   0 normal character with cc=0
            1 soft-dotted character
            2 cc=230
            3 other cc
}
    3   exception
    2   case sensitive
 1..0   0 uncased
        1 lowercase
        2 uppercase
        3 titlecase


Exceptions:
A sub-array of the exceptions array is indexed by the exception index in a
trie word.
The sub-array consists of the following fields:
    uint16_t excWord;
    uint16_t optional values [];
    UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase

excWord: (see UCASE_EXC_...)
Bits
    15  conditional case folding
    14  conditional special casing
13..12  same as non-exception trie data bits 5..4
        moved here because the exception index needs more bits than the delta
        0 normal character with cc=0
        1 soft-dotted character
        2 cc=230
        3 other cc
11.. 9  reserved
     8  if set, then for each optional-value slot there are 2 uint16_t values
        (high and low parts of 32-bit values)
        instead of single ones
 7.. 0  bits for which optional value is present

Optional-value slots:
0   lowercase mapping (code point)
1   case folding (code point)
2   uppercase mapping (code point)
3   titlecase mapping (code point)
4..6 reserved
7   there is at least one full (string) case mapping
    the length of each is encoded in a nibble of this optional value,
    and the strings follow this optional value in the same order:
    lower/fold/upper/title

For space saving, some values are not stored. Lookups are as follows:
- If special casing is conditional, then no full lower/upper/title mapping
  strings are stored.
- If case folding is conditional, then no simple or full case foldings are
  stored.
- Fall back in this order:
    full (string) mapping -- if full mappings are used
    simple (code point) mapping of the same type
    simple fold->simple lower
    simple title->simple upper
    finally, the original code point (no mapping)

----------------------------------------------------------------------------- */

/* UDataInfo cf. udata.h */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    /* dataFormat="cAsE" */
    { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
    { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
    { 4, 0, 1, 0 }                              /* dataVersion */
};

enum {
    /* maximum number of exceptions expected */
    MAX_EXC_COUNT=1000
};

/* exceptions values */
static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
static uint16_t exceptionsTop=0;
static Props excProps[MAX_EXC_COUNT];
static uint16_t exceptionsCount=0;

/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
static int32_t maxFullLength=U16_MAX_LENGTH;

/* -------------------------------------------------------------------------- */

extern void
setUnicodeVersion(const char *v) {
    UVersionInfo version;
    u_versionFromString(version, v);
    uprv_memcpy(dataInfo.dataVersion, version, 4);
}

/* store a character's properties ------------------------------------------- */

extern void
setProps(Props *p) {
    UErrorCode errorCode;
    uint32_t value, oldValue;
    int32_t delta;

    /* get the non-UnicodeData.txt properties */
    value=oldValue=upvec_getValue(pv, p->code, 0);

    /* default: map to self */
    delta=0;

    if(p->gc==U_TITLECASE_LETTER) {
        /* the Titlecase property is read late, from UnicodeData.txt */
        value|=UCASE_TITLE;
    }

    if(p->upperCase!=0) {
        /* uppercase mapping as delta if the character is lowercase */
        if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
            delta=p->upperCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->lowerCase!=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
            delta=p->lowerCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->upperCase!=p->titleCase) {
        value|=UCASE_EXCEPTION;
    }
    if(p->specialCasing!=NULL) {
        value|=UCASE_EXCEPTION;
    }
    if(p->caseFolding!=NULL) {
        value|=UCASE_EXCEPTION;
    }

    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        value|=UCASE_EXCEPTION;
    }

    if(p->cc!=0) {
        if(value&UCASE_DOT_MASK) {
            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
        if(p->cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    /* encode case-ignorable as delta==1 on uncased characters */
    if(
        (value&UCASE_TYPE_MASK)==UCASE_NONE &&
        p->code!=0x307 &&
        ((U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
            p->code==0x27 || p->code==0xad || p->code==0x2019)
    ) {
        /*
         * We use one of the delta/exception bits, which works because we only
         * store the case-ignorable flag for uncased characters.
         * There is no delta for uncased characters (see checks above).
         * If there is an exception for an uncased, case-ignorable character
         * (although there should not be any case mappings if it's uncased)
         * then we have a problem.
         * There is one character which is case-ignorable but has an exception:
         * U+0307 is uncased, Mn, has conditional special casing and
         * is therefore handled in code instead.
         */
        if(value&UCASE_EXCEPTION) {
            fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
                            (unsigned long)p->code);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }

        delta=1;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
        uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
        if(++exceptionsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "gencase: too many exceptions\n");
            exit(U_INDEX_OUTOFBOUNDS_ERROR);
        }
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    errorCode=U_ZERO_ERROR;
    if( value!=oldValue &&
        !upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)
    ) {
        fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
                        u_errorName(errorCode));
        exit(errorCode);
    }
}

extern void
addCaseSensitive(UChar32 first, UChar32 last) {
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode)) {
        fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
                        u_errorName(errorCode));
        exit(errorCode);
    }
}

extern void
makeCaseClosure() {
    /* TODO */
}

/* exceptions --------------------------------------------------------------- */

static UBool
fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
    int32_t i, length;
    UChar32 full;

    length=*s++;
    if(length==0 || length>U16_MAX_LENGTH) {
        return FALSE;
    }
    i=0;
    U16_NEXT(s, i, length, full);

    if(simple==0) {
        simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
    }
    return (UBool)(i==length && full==simple);
}

static uint16_t
makeException(uint32_t value, Props *p) {
    uint32_t slots[8];
    uint32_t slotBits;
    uint16_t excWord, excIndex, excTop, i, count, length, fullLengths;
    UBool doubleSlots;

    /* excIndex will be returned for storing in the trie word */
    excIndex=exceptionsTop;
    if(excIndex>=UCASE_MAX_EXCEPTIONS) {
        fprintf(stderr, "gencase error: too many exceptions words\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */

    /* copy and shift the soft-dotted bits */
    excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;

    /* update maxFullLength */
    if(p->specialCasing!=NULL) {
        length=p->specialCasing->lowerCase[0];
        if(length>maxFullLength) {
            maxFullLength=length;
        }
        length=p->specialCasing->upperCase[0];
        if(length>maxFullLength) {
            maxFullLength=length;
        }
        length=p->specialCasing->titleCase[0];
        if(length>maxFullLength) {
            maxFullLength=length;
        }
    }
    if(p->caseFolding!=NULL) {
        length=p->caseFolding->full[0];
        if(length>maxFullLength) {
            maxFullLength=length;
        }
    }

    /* set the bits for conditional mappings */
    if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
        excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
        p->specialCasing=NULL;
    }
    if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
        excWord|=UCASE_EXC_CONDITIONAL_FOLD;
        p->caseFolding=NULL;
    }

    /*
     * Note:
     * UCD stores no simple mappings when they are the same as the code point itself.
     * SpecialCasing and CaseFolding do store simple mappings even if they are
     * the same as the code point itself.
     * Comparisons between simple regular mappings and simple special/folding
     * mappings need to compensate for the difference by comparing with the
     * original code point if a simple UCD mapping is missing (0).
     */

    /* remove redundant data */
    if(p->specialCasing!=NULL) {
        /* do not store full mappings if they are the same as the simple ones */
        if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
            p->specialCasing->lowerCase[0]=0;
        }
        if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
            p->specialCasing->upperCase[0]=0;
        }
        if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
            p->specialCasing->titleCase[0]=0;
        }
    }
    if( p->caseFolding!=NULL &&
        fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
    ) {
        p->caseFolding->full[0]=0;
    }

    /* write the optional slots */
    slotBits=0;
    count=0;

    if(p->lowerCase!=0) {
        slots[count]=(uint32_t)p->lowerCase;
        slotBits|=slots[count];
        ++count;
        excWord|=U_MASK(UCASE_EXC_LOWER);
    }
    if( p->caseFolding!=NULL &&
        p->caseFolding->simple!=0 &&
        (p->lowerCase!=0 ?
            p->caseFolding->simple!=p->lowerCase :
            p->caseFolding->simple!=p->code)
    ) {
        slots[count]=(uint32_t)p->caseFolding->simple;
        slotBits|=slots[count];
        ++count;
        excWord|=U_MASK(UCASE_EXC_FOLD);
    }
    if(p->upperCase!=0) {
        slots[count]=(uint32_t)p->upperCase;
        slotBits|=slots[count];
        ++count;
        excWord|=U_MASK(UCASE_EXC_UPPER);
    }
    if(p->upperCase!=p->titleCase) {
        if(p->titleCase!=0) {
            slots[count]=(uint32_t)p->titleCase;
        } else {
            slots[count]=(uint32_t)p->code;
        }
        slotBits|=slots[count];
        ++count;
        excWord|=U_MASK(UCASE_EXC_TITLE);
    }

    /* lengths of full case mapping strings, stored in the last slot */
    fullLengths=0;
    if(p->specialCasing!=NULL) {
        fullLengths=p->specialCasing->lowerCase[0];
        fullLengths|=p->specialCasing->upperCase[0]<<8;
        fullLengths|=p->specialCasing->titleCase[0]<<12;
    }
    if(p->caseFolding!=NULL) {
        fullLengths|=p->caseFolding->full[0]<<4;
    }
    if(fullLengths!=0) {
        slots[count]=fullLengths;
        slotBits|=slots[count];
        ++count;
        excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
    }

    /* write slots */
    doubleSlots=(UBool)(slotBits>0xffff);
    if(!doubleSlots) {
        for(i=0; i<count; ++i) {
            exceptions[excTop++]=(uint16_t)slots[i];
        }
    } else {
        excWord|=UCASE_EXC_DOUBLE_SLOTS;
        for(i=0; i<count; ++i) {
            exceptions[excTop++]=(uint16_t)(slots[i]>>16);
            exceptions[excTop++]=(uint16_t)slots[i];
        }
    }

    /* write the full case mapping strings */
    if(p->specialCasing!=NULL) {
        length=(uint16_t)p->specialCasing->lowerCase[0];
        u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
        excTop+=length;
    }
    if(p->caseFolding!=NULL) {
        length=(uint16_t)p->caseFolding->full[0];
        u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
        excTop+=length;
    }
    if(p->specialCasing!=NULL) {
        length=(uint16_t)p->specialCasing->upperCase[0];
        u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
        excTop+=length;

        length=(uint16_t)p->specialCasing->titleCase[0];
        u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
        excTop+=length;
    }

    exceptionsTop=excTop;

    /* write the main exceptions word */
    exceptions[excIndex]=excWord;

    return excIndex;
}

extern void
makeExceptions() {
    uint32_t *row;
    uint32_t value;
    int32_t i;
    uint16_t excIndex;

    i=0;
    while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) {
        value=*row;
        if(value&UCASE_EXCEPTION) {
            excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT));
            *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
        }
        ++i;
    }
}

/* generate output data ----------------------------------------------------- */

extern void
generateData(const char *dataDir) {
    static int32_t indexes[UCASE_IX_TOP]={
        UCASE_IX_TOP
    };
    static uint8_t trieBlock[40000];

    const uint32_t *row;
    UChar32 start, limit;
    int32_t i;

    UNewDataMemory *pData;
    UNewTrie *pTrie;
    UErrorCode errorCode=U_ZERO_ERROR;
    int32_t trieSize;
    long dataLength;

    pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
    if(pTrie==NULL) {
        fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) {
        if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) {
            fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }
    }

    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
        exit(errorCode);
    }

    indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
    indexes[UCASE_IX_TRIE_SIZE]=trieSize;
    indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop;

    indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;

    if(beVerbose) {
        printf("trie size in bytes:                    %5d\n", (int)trieSize);
        printf("number of code points with exceptions: %5d\n", exceptionsCount);
        printf("size in bytes of exceptions:           %5d\n", 2*exceptionsTop);
        printf("data size:                             %5d\n", (int)indexes[UCASE_IX_LENGTH]);
    }

    /* write the data */
    pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
        exit(errorCode);
    }

    udata_writeBlock(pData, indexes, sizeof(indexes));
    udata_writeBlock(pData, trieBlock, trieSize);
    udata_writeBlock(pData, exceptions, 2*exceptionsTop);

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    if(dataLength!=indexes[UCASE_IX_LENGTH]) {
        fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
            dataLength, (int)indexes[UCASE_IX_LENGTH]);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    utrie_close(pTrie);
}

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */