// collationbasedatabuilder.h


/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationbasedatabuilder.h
*
* created on: 2012aug11
* created by: Markus W. Scherer
*/

#ifndef __COLLATIONBASEDATABUILDER_H__
#define __COLLATIONBASEDATABUILDER_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "collation.h"
#include "collationdata.h"
#include "collationdatabuilder.h"
#include "normalizer2impl.h"
#include "utrie2.h"
#include "uvectr32.h"
#include "uvectr64.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

/**
 * Low-level base CollationData builder.
 *
 * Publicly extends CollationDataBuilder and adds the state declared in the
 * private section below: per-lead-byte compressibility flags, the numeric
 * primary, the Han primary range/step, the list of root elements and the
 * reordering-group script data.
 *
 * Only declarations are visible in this header; the notes marked
 * NOTE(review) below are inferred from names and signatures and should be
 * confirmed against the implementation file.
 */
class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder {
public:
    /**
     * Constructor.
     *
     * @param errorCode in/out error code
     */
    CollationBaseDataBuilder(UErrorCode &errorCode);

    virtual ~CollationBaseDataBuilder();

    /**
     * Initializes this builder.
     * NOTE(review): whether this must be called before the other methods
     * is not visible from this header -- confirm in the implementation.
     *
     * @param errorCode in/out error code
     */
    void init(UErrorCode &errorCode);

    /**
     * Sets the Han ranges as ranges of offset CE32s.
     * Note: Unihan extension A sorts after the other BMP ranges.
     * See http://www.unicode.org/reports/tr10/#Implicit_Weights
     *
     * @param ranges array of ranges of [:Unified_Ideograph:] in collation order,
     *               as (start, end) code point pairs
     * @param length number of code points (not pairs)
     * @param errorCode in/out error code
     */
    void initHanRanges(const UChar32 ranges[], int32_t length, UErrorCode &errorCode);

    /**
     * Stores np in the numericPrimary member; no validation is done here.
     *
     * @param np the numeric primary weight to remember
     */
    void setNumericPrimary(uint32_t np) { numericPrimary = np; }

    /**
     * Reports whether a primary-weight lead byte is compressible.
     * NOTE(review): presumably reads compressibleBytes[b] and overrides a
     * virtual of CollationDataBuilder -- confirm.
     *
     * @param b the lead byte value to test
     * @return whether b is a compressible lead byte
     */
    virtual UBool isCompressibleLeadByte(uint32_t b) const;

    /**
     * Marks a primary-weight lead byte as compressible.
     * NOTE(review): presumably sets compressibleBytes[b]; the flag array has
     * 256 entries, so b is presumably expected to be below 256 -- confirm.
     *
     * @param b the lead byte value to mark
     */
    void setCompressibleLeadByte(uint32_t b);

    /**
     * Static helpers that compute a difference between two primary weights.
     * NOTE(review): the exact metric (and whether p1 <= p2 is required) is
     * defined in the implementation file -- confirm there.
     *
     * @param p1 first primary weight
     * @param p2 second primary weight
     * @param isCompressible whether the primaries' lead byte is compressible
     * @return the computed difference
     */
    static int32_t diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible);
    static int32_t diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible);

    /**
     * Encodes a sequence of 64-bit CEs into a single 32-bit value.
     * NOTE(review): presumably overrides CollationDataBuilder::encodeCEs and
     * returns a CE32 -- confirm.
     *
     * @param ces array of CEs
     * @param cesLength number of CEs in ces
     * @param errorCode in/out error code
     * @return the encoded 32-bit value
     */
    virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);

    /**
     * Adds root elements, either from an array of CEs or from a single CE.
     * NOTE(review): presumably records them in the rootElements member --
     * confirm.
     *
     * @param ces array of CEs (array form)
     * @param cesLength number of CEs in ces (array form)
     * @param ce a single CE (single-element form)
     * @param errorCode in/out error code
     */
    void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
    void addRootElement(int64_t ce, UErrorCode &errorCode);

    /**
     * Adds a reordering group described by a lead-byte range and a string
     * of scripts.
     * NOTE(review): presumably appends to the scripts member; whether the
     * byte range is inclusive is not visible here -- confirm.
     *
     * @param firstByte first lead byte of the group
     * @param lastByte last lead byte of the group
     * @param groupScripts the scripts belonging to this group
     * @param errorCode in/out error code
     */
    void addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
                            const UnicodeString &groupScripts,
                            UErrorCode &errorCode);

    /**
     * Builds the collation data into the caller-supplied object.
     * NOTE(review): presumably overrides CollationDataBuilder::build --
     * confirm.
     *
     * @param data the CollationData object to fill in
     * @param errorCode in/out error code
     */
    virtual void build(CollationData &data, UErrorCode &errorCode);

    /**
     * Builds the root elements table into the caller-supplied vector.
     *
     * @param table output vector that receives the table
     * @param errorCode in/out error code
     */
    void buildRootElementsTable(UVector32 &table, UErrorCode &errorCode);

private:
    /**
     * Private helper that writes a range of root elements into table.
     * NOTE(review): presumably used by buildRootElementsTable(); the meaning
     * of i and of the return value (likely an updated index) must be
     * confirmed in the implementation.
     *
     * @param prevPrimary the previous primary weight
     * @param p the current primary weight
     * @param i an index (see note above)
     * @param table vector being written to
     * @param errorCode in/out error code
     */
    int32_t writeRootElementsRange(
            uint32_t prevPrimary, uint32_t p, int32_t i,
            UVector32 &table, UErrorCode &errorCode);

    // Flags for which primary-weight lead bytes are compressible.
    UBool compressibleBytes[256];
    // Value stored by setNumericPrimary().
    uint32_t numericPrimary;
    // Han primary range and step.
    // NOTE(review): presumably computed by initHanRanges() -- confirm.
    uint32_t firstHanPrimary;
    uint32_t lastHanPrimary;
    int32_t hanStep;
    // List of 64-bit root elements.
    // NOTE(review): presumably filled by addRootElement(s)() -- confirm.
    UVector64 rootElements;
    // Script data.
    // NOTE(review): presumably accumulated by addReorderingGroup() -- confirm.
    UnicodeString scripts;
};

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __COLLATIONBASEDATABUILDER_H__