// uscript_props.cpp   [plain text]


/*
*******************************************************************************
*   Copyright (C) 2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  uscript_props.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2013feb16
*   created by: Markus W. Scherer
*/

#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uscript.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"

// Element count of a statically sized array, expressed as an int32_t.
// The argument must be a real array; a pointer would give a meaningless result.
#define LENGTHOF(array) (static_cast<int32_t>(sizeof(array)/sizeof((array)[0])))

namespace {

// Script metadata (script properties).
// See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt

// Layout of one packed script-properties word:
//   0            = NOT_ENCODED: no sample character, every script property false
//   bits 20.. 0  = sample character
//   bits 23..21  = usage value
//   bits 31..24  = single-bit flags

// Usage values, already shifted into bits 23..21.
const int32_t USAGE_SHIFT = 21;
const int32_t UNKNOWN = 1 << USAGE_SHIFT;
const int32_t EXCLUSION = 2 << USAGE_SHIFT;
const int32_t LIMITED_USE = 3 << USAGE_SHIFT;
const int32_t ASPIRATIONAL = 4 << USAGE_SHIFT;
const int32_t RECOMMENDED = 5 << USAGE_SHIFT;

// Boolean flags, one bit each, starting at bit 24.
const int32_t FLAGS_SHIFT = 24;
const int32_t RTL = 1 << FLAGS_SHIFT;               // tested by uscript_isRightToLeft()
const int32_t LB_LETTERS = 1 << (FLAGS_SHIFT + 1);  // tested by uscript_breaksBetweenLetters()
const int32_t CASED = 1 << (FLAGS_SHIFT + 2);       // tested by uscript_isCased()

// Packed properties, one entry per script. getScriptProps() indexes this
// table directly with the UScriptCode value, so the order of the entries
// must not be changed by hand.
// Each non-zero entry is a sample character OR-ed with one usage value and
// any applicable single-bit flags. A bare 0 entry carries no sample
// character, no usage value and no flags.
// The entries between the Begin/End markers are generated output; regenerate
// them with the tool named below rather than editing individual lines.
const int32_t SCRIPT_PROPS[] = {
    // Begin copy-paste output from
    // tools/trunk/unicode/py/parsescriptmetadata.py
    0x0040 | UNKNOWN,  // Zyyy
    0x0308 | UNKNOWN,  // Zinh
    0x0628 | RECOMMENDED | RTL,  // Arab
    0x0531 | RECOMMENDED | CASED,  // Armn
    0x0995 | RECOMMENDED,  // Beng
    0x3105 | RECOMMENDED | LB_LETTERS,  // Bopo
    0x13C4 | LIMITED_USE,  // Cher
    0x03E2 | EXCLUSION | CASED,  // Copt
    0x042F | RECOMMENDED | CASED,  // Cyrl
    0x10414 | EXCLUSION | CASED,  // Dsrt
    0x0905 | RECOMMENDED,  // Deva
    0x12A0 | RECOMMENDED,  // Ethi
    0x10D3 | RECOMMENDED,  // Geor
    0x10330 | EXCLUSION,  // Goth
    0x03A9 | RECOMMENDED | CASED,  // Grek
    0x0A95 | RECOMMENDED,  // Gujr
    0x0A15 | RECOMMENDED,  // Guru
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hani
    0xAC00 | RECOMMENDED,  // Hang
    0x05D0 | RECOMMENDED | RTL,  // Hebr
    0x304B | RECOMMENDED | LB_LETTERS,  // Hira
    0x0C95 | RECOMMENDED,  // Knda
    0x30AB | RECOMMENDED | LB_LETTERS,  // Kana
    0x1780 | RECOMMENDED | LB_LETTERS,  // Khmr
    0x0EA5 | RECOMMENDED | LB_LETTERS,  // Laoo
    0x004C | RECOMMENDED | CASED,  // Latn
    0x0D15 | RECOMMENDED,  // Mlym
    0x1826 | ASPIRATIONAL,  // Mong
    0x1000 | RECOMMENDED | LB_LETTERS,  // Mymr
    0x168F | EXCLUSION,  // Ogam
    0x10300 | EXCLUSION,  // Ital
    0x0B15 | RECOMMENDED,  // Orya
    0x16A0 | EXCLUSION,  // Runr
    0x0D85 | RECOMMENDED,  // Sinh
    0x0710 | LIMITED_USE | RTL,  // Syrc
    0x0B95 | RECOMMENDED,  // Taml
    0x0C15 | RECOMMENDED,  // Telu
    0x078C | RECOMMENDED | RTL,  // Thaa
    0x0E17 | RECOMMENDED | LB_LETTERS,  // Thai
    0x0F40 | RECOMMENDED,  // Tibt
    0x14C0 | ASPIRATIONAL,  // Cans
    0xA288 | ASPIRATIONAL | LB_LETTERS,  // Yiii
    0x1703 | EXCLUSION,  // Tglg
    0x1723 | EXCLUSION,  // Hano
    0x1743 | EXCLUSION,  // Buhd
    0x1763 | EXCLUSION,  // Tagb
    0x2800 | UNKNOWN,  // Brai
    0x10800 | EXCLUSION | RTL,  // Cprt
    0x1900 | LIMITED_USE,  // Limb
    0x10000 | EXCLUSION,  // Linb
    0x10480 | EXCLUSION,  // Osma
    0x10450 | EXCLUSION,  // Shaw
    0x1950 | LIMITED_USE | LB_LETTERS,  // Tale
    0x10380 | EXCLUSION,  // Ugar
    0,
    0x1A00 | EXCLUSION,  // Bugi
    0x2C00 | EXCLUSION | CASED,  // Glag
    0x10A00 | EXCLUSION | RTL,  // Khar
    0xA800 | LIMITED_USE,  // Sylo
    0x1980 | LIMITED_USE | LB_LETTERS,  // Talu
    0x2D30 | ASPIRATIONAL,  // Tfng
    0x103A0 | EXCLUSION,  // Xpeo
    0x1B05 | LIMITED_USE | LB_LETTERS,  // Bali
    0x1BC0 | LIMITED_USE,  // Batk
    0,
    0x11005 | EXCLUSION,  // Brah
    0xAA00 | LIMITED_USE,  // Cham
    0,
    0,
    0,
    0,
    0x13153 | EXCLUSION,  // Egyp
    0,
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hans
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hant
    0,
    0,
    0,
    0xA984 | LIMITED_USE | LB_LETTERS,  // Java
    0xA90A | LIMITED_USE,  // Kali
    0,
    0,
    0x1C00 | LIMITED_USE,  // Lepc
    0,
    0x0840 | LIMITED_USE | RTL,  // Mand
    0,
    0x10980 | EXCLUSION | RTL,  // Mero
    0x07CA | LIMITED_USE | RTL,  // Nkoo
    0x10C00 | EXCLUSION | RTL,  // Orkh
    0,
    0xA840 | EXCLUSION,  // Phag
    0x10900 | EXCLUSION | RTL,  // Phnx
    0x16F00 | ASPIRATIONAL,  // Plrd
    0,
    0,
    0,
    0,
    0,
    0,
    0xA549 | LIMITED_USE,  // Vaii
    0,
    0x12000 | EXCLUSION,  // Xsux
    0,
    0xFDD0 | UNKNOWN,  // Zzzz
    0x102A0 | EXCLUSION,  // Cari
    0x304B | RECOMMENDED | LB_LETTERS,  // Jpan
    0x1A20 | LIMITED_USE | LB_LETTERS,  // Lana
    0x10280 | EXCLUSION,  // Lyci
    0x10920 | EXCLUSION | RTL,  // Lydi
    0x1C5A | LIMITED_USE,  // Olck
    0xA930 | EXCLUSION,  // Rjng
    0xA882 | LIMITED_USE,  // Saur
    0,
    0x1B83 | LIMITED_USE,  // Sund
    0,
    0xABC0 | LIMITED_USE,  // Mtei
    0x10840 | EXCLUSION | RTL,  // Armi
    0x10B00 | EXCLUSION | RTL,  // Avst
    0x11103 | LIMITED_USE,  // Cakm
    0xAC00 | RECOMMENDED,  // Kore
    0x11083 | EXCLUSION,  // Kthi
    0,
    0x10B60 | EXCLUSION | RTL,  // Phli
    0,
    0,
    0x10B40 | EXCLUSION | RTL,  // Prti
    0x0800 | EXCLUSION | RTL,  // Samr
    0xAA80 | LIMITED_USE | LB_LETTERS,  // Tavt
    0,
    0,
    0xA6A0 | LIMITED_USE,  // Bamu
    0xA4D0 | LIMITED_USE,  // Lisu
    0,
    0x10A60 | EXCLUSION | RTL,  // Sarb
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0x109A0 | EXCLUSION | RTL,  // Merc
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0x11183 | EXCLUSION,  // Shrd
    0x110D0 | EXCLUSION,  // Sora
    0x11680 | EXCLUSION,  // Takr
    0,
    0,
    0,
    0,
    0,
    // End copy-paste from parsescriptmetadata.py
};

// Returns the packed properties word for a script code.
// Any code outside the bounds of SCRIPT_PROPS (negative, or at/after the end
// of the table) yields 0, the same value as a table entry without data.
int32_t getScriptProps(UScriptCode script) {
    if (script < 0 || script >= LENGTHOF(SCRIPT_PROPS)) {
        return 0;
    }
    return SCRIPT_PROPS[script];
}

}  // namespace

// Writes the sample character of a script into a caller-supplied UTF-16 buffer.
//
// script      script code to look up
// dest        output buffer; may be NULL only when capacity is 0
// capacity    number of UChars available in dest; must not be negative
// pErrorCode  in/out ICU status; nothing is done if it already indicates failure
//
// Returns 0 on an incoming failure or on an illegal argument (in which case
// *pErrorCode is set to U_ILLEGAL_ARGUMENT_ERROR). Otherwise returns whatever
// u_terminateUChars() returns for the sample's length: 0 when the script has
// no sample character, else the number of UTF-16 code units it needs.
// NOTE(review): termination and any overflow/not-terminated status are left to
// u_terminateUChars(); its exact behavior is not visible in this file.
U_CAPI int32_t U_EXPORT2
uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(capacity < 0 || (dest == NULL && capacity != 0)) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // The low 21 bits of the properties word hold the sample character.
    const int32_t sample = getScriptProps(script) & 0x1fffff;
    int32_t length = 0;
    if(sample != 0) {
        length = U16_LENGTH(sample);
        // Write only when the whole character fits; the required length is
        // still passed on below when it does not.
        if(length <= capacity) {
            int32_t offset = 0;
            U16_APPEND_UNSAFE(dest, offset, sample);
        }
    }
    return u_terminateUChars(dest, capacity, length, pErrorCode);
}

// Returns the sample character of a script as a UnicodeString.
// The result is an empty string when the script's properties word has no
// sample character (low 21 bits all zero).
U_COMMON_API icu::UnicodeString U_EXPORT2
uscript_getSampleUnicodeString(UScriptCode script) {
    // Keep the value as int32_t so that append() receives a code point.
    int32_t codePoint = getScriptProps(script) & 0x1fffff;
    icu::UnicodeString result;
    if(codePoint == 0) {
        return result;
    }
    result.append(codePoint);
    return result;
}

// Returns the usage value of a script: the 3-bit field stored in bits 23..21
// of its properties word, converted to UScriptUsage. A script without table
// data yields the value 0.
U_CAPI UScriptUsage U_EXPORT2
uscript_getUsage(UScriptCode script) {
    const int32_t props = getScriptProps(script);
    const int32_t usageBits = (props >> 21) & 7;
    return static_cast<UScriptUsage>(usageBits);
}

// Reports whether the RTL flag is set in the script's properties word.
// Scripts without table data report false.
U_CAPI UBool U_EXPORT2
uscript_isRightToLeft(UScriptCode script) {
    const int32_t props = getScriptProps(script);
    return (props & RTL) != 0;
}

// Reports whether the LB_LETTERS flag is set in the script's properties word.
// Scripts without table data report false.
U_CAPI UBool U_EXPORT2
uscript_breaksBetweenLetters(UScriptCode script) {
    const int32_t props = getScriptProps(script);
    return (props & LB_LETTERS) != 0;
}

// Reports whether the CASED flag is set in the script's properties word.
// Scripts without table data report false.
U_CAPI UBool U_EXPORT2
uscript_isCased(UScriptCode script) {
    const int32_t props = getScriptProps(script);
    return (props & CASED) != 0;
}