// collationdata.cpp   [plain text]


/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdata.cpp
*
* created on: 2012jul28
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucol.h"
#include "unicode/udata.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "uassert.h"
#include "utrie2.h"

U_NAMESPACE_BEGIN

// Resolves one level of indirection for a special CE32.
// The caller must pass a special CE32 (asserted).
// - DIGIT_TAG: returns the entry of ce32s[] selected by the CE32's index,
//   i.e. the CE32 used when numeric collation is off.
// - LEAD_SURROGATE_TAG: returns Collation::UNASSIGNED_CE32.
// - U0000_TAG: returns ce32s[0], the normal CE32 for U+0000.
// Any other tag: the input is returned unchanged.
uint32_t
CollationData::getIndirectCE32(uint32_t ce32) const {
    U_ASSERT(Collation::isSpecialCE32(ce32));
    const int32_t tag = Collation::tagFromCE32(ce32);
    if(tag == Collation::DIGIT_TAG) {
        // The index embedded in the CE32 points at the non-numeric-collation CE32.
        return ce32s[Collation::indexFromCE32(ce32)];
    }
    if(tag == Collation::LEAD_SURROGATE_TAG) {
        return Collation::UNASSIGNED_CE32;
    }
    if(tag == Collation::U0000_TAG) {
        // The regular CE32 for U+0000 is stored in the first slot.
        return ce32s[0];
    }
    return ce32;
}

// Returns ce32 itself when it is not special;
// otherwise returns the result of getIndirectCE32() for it.
uint32_t
CollationData::getFinalCE32(uint32_t ce32) const {
    return Collation::isSpecialCE32(ce32) ? getIndirectCE32(ce32) : ce32;
}

// Returns the first primary weight of the reordering group that contains
// the given script or reorder code, or 0 if findScript() does not find it.
// The result has the group's first lead byte (bits 15..8 of the group head)
// in its top byte and zeros below.
uint32_t
CollationData::getFirstPrimaryForGroup(int32_t script) const {
    const int32_t groupStart = findScript(script);
    if(groupStart >= 0) {
        const uint32_t groupHead = scripts[groupStart];
        const uint32_t firstLeadByte = (groupHead >> 8) & 0xff;
        return firstLeadByte << 24;
    }
    return 0;
}

// Returns the last primary weight of the reordering group that contains
// the given script or reorder code, or 0 if findScript() does not find it.
// The result has the group's last lead byte (bits 7..0 of the group head)
// in its top byte and all lower 24 bits set.
uint32_t
CollationData::getLastPrimaryForGroup(int32_t script) const {
    const int32_t groupStart = findScript(script);
    if(groupStart >= 0) {
        const uint32_t groupHead = scripts[groupStart];
        const uint32_t lastLeadByte = groupHead & 0xff;
        // Same value as ((lastLeadByte + 1) << 24) - 1 for every 8-bit lead byte.
        return (lastLeadByte << 24) | 0xffffff;
    }
    return 0;
}

// Returns the first script/reorder code listed for the first group whose
// last lead byte is not below the lead byte of primary p,
// or -1 if no group in scripts[] qualifies.
// Each group is laid out as: head (first byte << 8 | last byte), code count, codes...
int32_t
CollationData::getGroupForPrimary(uint32_t p) const {
    // Only the primary lead byte distinguishes reordering groups.
    const uint32_t leadByte = p >> 24;
    int32_t groupStart = 0;
    while(groupStart < scriptsLength) {
        const uint32_t groupLastByte = scripts[groupStart] & 0xff;
        if(leadByte <= groupLastByte) {
            return scripts[groupStart + 2];
        }
        // Skip the head, the count, and the listed codes.
        groupStart += 2 + scripts[groupStart + 1];
    }
    return -1;
}

// Searches scripts[] for the group that lists the given script or reorder code.
// Returns the index of that group's head element, or -1 if the code is
// outside 0..0xffff or is not listed in any group.
int32_t
CollationData::findScript(int32_t script) const {
    if(script < 0 || script > 0xffff) { return -1; }
    int32_t groupStart = 0;
    while(groupStart < scriptsLength) {
        // A group is: head, code count, then that many codes.
        const int32_t groupLimit = groupStart + 2 + scripts[groupStart + 1];
        int32_t pos = groupStart + 2;
        while(pos < groupLimit && scripts[pos] != script) {
            ++pos;
        }
        if(pos < groupLimit) { return groupStart; }
        groupStart = groupLimit;
    }
    return -1;
}

// Copies all codes of the group that contains the given script or reorder code
// into dest[], in ascending order.
// Returns the number of codes in the group; returns 0 if errorCode already
// indicates a failure or if the code is not found.
// If the group has more codes than capacity, sets U_BUFFER_OVERFLOW_ERROR
// and returns the required length without writing to dest[].
int32_t
CollationData::getEquivalentScripts(int32_t script,
                                    int32_t dest[], int32_t capacity,
                                    UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    const int32_t groupStart = findScript(script);
    if(groupStart < 0) { return 0; }
    const int32_t count = scripts[groupStart + 1];
    U_ASSERT(count != 0);
    if(count > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return count;
    }
    const int32_t firstCode = groupStart + 2;
    dest[0] = scripts[firstCode];
    // Insertion sort: dest[0..n-1] is sorted before each iteration.
    for(int32_t n = 1; n < count; ++n) {
        const int32_t code = scripts[firstCode + n];
        int32_t slot = n;
        // Shift larger codes up by one until the right slot is free.
        while(slot > 0 && code < dest[slot - 1]) {
            dest[slot] = dest[slot - 1];
            --slot;
        }
        dest[slot] = code;
    }
    return count;
}

void
CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
                                uint8_t table[256], UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }

    // Initialize the table.
    // Never reorder special low and high primary lead bytes.
    int32_t lowByte;
    for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
        table[lowByte] = lowByte;
    }
    // lowByte == 03

    int32_t highByte;
    for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
        table[highByte] = highByte;
    }
    // highByte == FE

    // Set intermediate bytes to 0 to indicate that they have not been set yet.
    for(int32_t i = lowByte; i <= highByte; ++i) {
        table[i] = 0;
    }

    // Get the set of special reorder codes in the input list.
    // This supports up to 32 special reorder codes;
    // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
    uint32_t specials = 0;
    for(int32_t i = 0; i < length; ++i) {
        int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
        if(0 <= reorderCode && reorderCode <= 31) {
            specials |= (uint32_t)1 << reorderCode;
        }
    }

    // Start the reordering with the special low reorder codes that do not occur in the input.
    for(int32_t i = 0;; i += 3) {
        if(scripts[i + 1] != 1) { break; }  // Went beyond special single-code reorder codes.
        int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;
        if(reorderCode < 0) { break; }  // Went beyond special reorder codes.
        if((specials & ((uint32_t)1 << reorderCode)) == 0) {
            int32_t head = scripts[i];
            int32_t firstByte = head >> 8;
            int32_t lastByte = head & 0xff;
            do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
        }
    }

    // Reorder according to the input scripts, continuing from the bottom of the bytes range.
    for(int32_t i = 0; i < length;) {
        int32_t script = reorder[i++];
        if(script == USCRIPT_UNKNOWN) {
            // Put the remaining scripts at the top.
            while(i < length) {
                script = reorder[--length];
                if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
                        script == UCOL_REORDER_CODE_DEFAULT) {
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                int32_t index = findScript(script);
                if(index < 0) { continue; }
                int32_t head = scripts[index];
                int32_t firstByte = head >> 8;
                int32_t lastByte = head & 0xff;
                if(table[firstByte] != 0) {  // Duplicate or equivalent script.
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                do { table[lastByte--] = highByte--; } while(firstByte <= lastByte);
            }
            break;
        }
        if(script == UCOL_REORDER_CODE_DEFAULT) {
            // The default code must be the only one in the list, and that is handled by the caller.
            // Otherwise it must not be used.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        int32_t index = findScript(script);
        if(index < 0) { continue; }
        int32_t head = scripts[index];
        int32_t firstByte = head >> 8;
        int32_t lastByte = head & 0xff;
        if(table[firstByte] != 0) {  // Duplicate or equivalent script.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
    }

    // Put all remaining scripts into the middle.
    // Avoid table[0] which must remain 0.
    for(int32_t i = 1; i <= 0xff; ++i) {
        if(table[i] == 0) { table[i] = lowByte++; }
    }
    U_ASSERT(lowByte == highByte + 1);
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION