testidn.cpp   [plain text]


/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  testidn.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003-02-06
*   created by: Ram Viswanadha
*
*   This program reads the rfc3454_*.txt files,
*   parses them, and extracts the data for Nameprep conformance.
*   It then preprocesses it and writes a binary file for efficient use
*   in various IDNA conversion processes.
*/

#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA && !UCONFIG_NO_TRANSLITERATION

#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "utrie.h"
#include "umutex.h"
#include "sprpimpl.h"
#include "testidna.h"

UBool beVerbose=FALSE, haveCopyright=TRUE;

/* prototypes --------------------------------------------------------------- */


static UBool isDataLoaded = FALSE;
static UTrie idnTrie={ 0,0,0,0,0,0,0 };
static UDataMemory *idnData=NULL;
static UErrorCode dataErrorCode =U_ZERO_ERROR;


static const uint16_t* mappingData = NULL;
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };


static void
parseMappings(const char *filename, UBool withNorm, UBool reportError,TestIDNA& test, UErrorCode *pErrorCode);

static void
parseTable(const char *filename, UBool isUnassigned, TestIDNA& test, UErrorCode *pErrorCode);

static UBool loadIDNData(UErrorCode &errorCode);

static UBool cleanup();

static void 
compareMapping(uint32_t codepoint, uint32_t* mapping, int32_t mapLength, 
               UBool withNorm);

static void
compareFlagsForRange(uint32_t start, uint32_t end,
                     UBool isUnassigned);

static void
testAllCodepoints(TestIDNA& test);

static TestIDNA* pTestIDNA =NULL;

static const char* fileNames[] = {
                                    "rfc3454_A_1.txt", /* contains unassigned code points */
                                    "rfc3454_C_X.txt", /* contains code points that are prohibited */
                                    "rfc3454_B_1.txt", /* contains case mappings when normalization is turned off */
                                    "rfc3454_B_2.txt", /* contains case mappings when normalization it turned on */
                                    /* "NormalizationCorrections.txt",contains NFKC case mappings whicha are not included in UTR 21  */
                                };
/* -------------------------------------------------------------------------- */

/* file definitions */
#define DATA_NAME "uidna"
#define DATA_TYPE "icu"

#define MISC_DIR "misc"

extern int
testData(TestIDNA& test) {
    char* filename = (char*) malloc(strlen(IntlTest::pathToDataDirectory())*3);
    //TODO get the srcDir dynamically 
    const char *srcDir=IntlTest::pathToDataDirectory();
    char *basename=NULL;
    UErrorCode errorCode=U_ZERO_ERROR;
    char *saveBasename =NULL;

    loadIDNData(errorCode);
    if(U_FAILURE(dataErrorCode)){
        test.errln( "Could not load data. Error: %s\n",u_errorName(dataErrorCode));
        return dataErrorCode;
    }
    
    //initialize
    pTestIDNA = &test;
    
    /* prepare the filename beginning with the source dir */
    if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){
        filename[0] = 0x2E;
        filename[1] = U_FILE_SEP_CHAR;
        uprv_strcpy(filename+2,srcDir);
    }else{
        uprv_strcpy(filename, srcDir);
    }
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }

    /* process unassigned */
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }
    
    /* first copy misc directory */
    saveBasename = basename;
    uprv_strcpy(basename,MISC_DIR);
    basename = basename + uprv_strlen(MISC_DIR);
    *basename++=U_FILE_SEP_CHAR;
    
    /* process unassigned */
    uprv_strcpy(basename,fileNames[0]);
    parseTable(filename,TRUE, test,&errorCode);
    if(U_FAILURE(errorCode)) {
        test.errln( "Could not open file %s for reading \n", filename);
        return errorCode;
    }
    /* process prohibited */
    uprv_strcpy(basename,fileNames[1]);
    parseTable(filename,FALSE, test, &errorCode);
    if(U_FAILURE(errorCode)) {
        test.errln( "Could not open file %s for reading \n", filename);
        return errorCode;
    }

    /* process mappings */
    uprv_strcpy(basename,fileNames[2]);
    parseMappings(filename, FALSE, FALSE,test, &errorCode);
    if(U_FAILURE(errorCode)) {
        test.errln( "Could not open file %s for reading \n", filename);
        return errorCode;
    }
    uprv_strcpy(basename,fileNames[3]);
    parseMappings(filename, TRUE, FALSE,test, &errorCode);
    if(U_FAILURE(errorCode)) {
        test.errln( "Could not open file %s for reading \n", filename);
        return errorCode;
    }

    testAllCodepoints(test);

    cleanup();
    pTestIDNA = NULL;
    free(filename);
    return errorCode;
}
U_CDECL_BEGIN
static void U_CALLCONV
caseMapLineFn(void *context,
              char *fields[][2], int32_t /*fieldCount*/,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *s;
    uint32_t code;
    int32_t length;
    UBool* mapWithNorm = (UBool*) context;

    /* get the character code, field 0 */
    code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        *pErrorCode=U_PARSE_ERROR;

    }

    s = fields[1][0];
    /* parse the mapping string */
    length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);

    /* store the mapping */

    compareMapping(code,mapping, length, *mapWithNorm);
}
U_CDECL_END

static void
parseMappings(const char *filename,UBool withNorm, UBool reportError, TestIDNA& test, UErrorCode *pErrorCode) {
    char *fields[3][2];

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 3, caseMapLineFn, &withNorm, pErrorCode);

    //fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);

    if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
        test.errln( "testidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    }
}

/* parser for UnicodeData.txt ----------------------------------------------- */
U_CDECL_BEGIN

static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t /*fieldCount*/,
                  UErrorCode *pErrorCode) {
    uint32_t rangeStart=0,rangeEnd =0;
    UBool* isUnassigned = (UBool*) context;

    u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode);
    
    if(U_FAILURE(*pErrorCode)){
        *pErrorCode  = U_PARSE_ERROR;
        return;
    }


    compareFlagsForRange(rangeStart,rangeEnd,*isUnassigned);

}

U_CDECL_END

static void
parseTable(const char *filename,UBool isUnassigned,TestIDNA& test, UErrorCode *pErrorCode) {
    char *fields[2][2];
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 1, unicodeDataLineFn, &isUnassigned, pErrorCode);


    if(U_FAILURE(*pErrorCode)) {
        test.errln( "testidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    }
}

static void
testAllCodepoints(TestIDNA& test){
    if(isDataLoaded){
        uint32_t i = 0;
        int32_t unassigned      = 0;
        int32_t prohibited      = 0;
        int32_t mappedWithNorm  = 0;
        int32_t mapped          = 0;
        int32_t noValueInTrie   = 0;


        for(i=0;i<=0x10FFFF;i++){
            uint32_t result = 0;
            UTRIE_GET16(&idnTrie,i, result);

            if(result != UIDNA_NO_VALUE ){
                if((result & 0x07) == UIDNA_UNASSIGNED){
                    unassigned++;
                }
                if((result & 0x07) == UIDNA_PROHIBITED){
                    prohibited++;
                }
                if((result>>5) == _IDNA_MAP_TO_NOTHING){
                    mapped++;
                }
                if((result & 0x07) == UIDNA_MAP_NFKC){
                    mappedWithNorm++;
                }
            }else{
                noValueInTrie++;
                if(result > 0){
                    test.errln("The return value for 0x%06X is wrong. %i\n",i,result);
                }
            }
        }

        test.logln("Number of Unassinged code points : %i \n",unassigned);
        test.logln("Number of Prohibited code points : %i \n",prohibited);
        test.logln("Number of Mapped code points : %i \n",mapped);
        test.logln("Number of Mapped with NFKC code points : %i \n",mappedWithNorm);
        test.logln("Number of code points that have no value in Trie: %i \n",noValueInTrie);

    }
}

static inline void getValues(uint32_t result, int8_t& flag, 
                             int8_t& length, int32_t& index){
    /* first 3 bits contain the flag */
    flag = (int8_t) (result & 0x07);
    /* next 2 bits contain the length */
    length = (int8_t) ((result>>3) & 0x03);
    /* next 11 bits contain the index */
    index  = (result>> 5);
}

static void 
compareMapping(uint32_t codepoint, uint32_t* mapping,int32_t mapLength, 
               UBool withNorm){
    if(isDataLoaded){
        uint32_t result = 0;
        UTRIE_GET16(&idnTrie,codepoint, result);

        int8_t flag, length;
        int32_t index;
        getValues(result,flag,length, index);


        if(withNorm){
            if(flag != UIDNA_MAP_NFKC){
                pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n",codepoint, UIDNA_MAP_NFKC, flag);
            }
        }else{
            if(flag==UIDNA_NO_VALUE || flag == UIDNA_PROHIBITED){
                if(index != _IDNA_MAP_TO_NOTHING ){
                    pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n", codepoint, _IDNA_MAP_TO_NOTHING, index);
                }
            }
        }
        if(length ==_IDNA_LENGTH_IN_MAPPING_TABLE){
            length = (int8_t)mappingData[index];
            index++;
        }
        int32_t realLength =0;
        /* figure out the real length */ 
        for(int32_t j=0; j<mapLength; j++){
            if(mapping[j] > 0xFFFF){
                realLength +=2;
            }else{
                realLength++;
            }      
        }

        if(realLength != length){
            pTestIDNA->errln( "Did not get the expected length. Expected: %i Got: %i\n", mapLength, length);
        }
        

        for(int8_t i =0; i< mapLength; i++){
            if(mapping[i] <= 0xFFFF){
                if(mappingData[index+i] != (uint16_t)mapping[i]){
                    pTestIDNA->errln("Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n", mapping[i], mappingData[index+i]);
                }
            }else{
                UChar lead  = UTF16_LEAD(mapping[i]);
                UChar trail = UTF16_TRAIL(mapping[i]);
                if(mappingData[index+i] != lead ||
                    mappingData[index+i+1] != trail){
                    pTestIDNA->errln( "Did not get the expected result. Expected: 0x%04X 0x%04X  Got: 0x%04X 0x%04X", lead, trail, mappingData[index+i], mappingData[index+i+1]);
                }
            }
        }

    }

}

static void
compareFlagsForRange(uint32_t start, uint32_t end,
                     UBool isUnassigned){
    if(isDataLoaded){
        uint32_t result =0 ;
        while(start < end+1){
            UTRIE_GET16(&idnTrie,start, result);
            if(isUnassigned){
                if(result != UIDNA_UNASSIGNED){
                    pTestIDNA->errln( "UIDNA_UASSIGNED flag failed for 0x%06X. Expected: %04X Got: %04X\n",start,UIDNA_UNASSIGNED, result);
                }
            }else{
                if((result & 0x03) != UIDNA_PROHIBITED){
                    pTestIDNA->errln( "UIDNA_PROHIBITED flag failed for 0x%06X. Expected: %04X Got: %04X\n\n",start,UIDNA_PROHIBITED, result);
                }
            }
            start++;
        }
    }
}

UBool
cleanup() {
    if(idnData!=NULL) {
        udata_close(idnData);
        idnData=NULL;
    }
    dataErrorCode=U_ZERO_ERROR;
    isDataLoaded=FALSE;

    return TRUE;
}
U_CDECL_BEGIN
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    if(
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY &&
        pInfo->dataFormat[0]==0x49 &&   /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41  */
        pInfo->dataFormat[1]==0x44 &&
        pInfo->dataFormat[2]==0x4e &&
        pInfo->dataFormat[3]==0x41 &&
        pInfo->formatVersion[0]==2 &&
        pInfo->formatVersion[2]==UTRIE_SHIFT &&
        pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
    ) {
        return TRUE;
    } else {
        return FALSE;
    }
}

/* idnTrie: the folding offset is the lead FCD value itself */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    if(data&0x8000) {
        return (int32_t)(data&0x7fff);
    } else {
        return 0;
    }
}
U_CDECL_END

static UBool
loadIDNData(UErrorCode &errorCode) {
    /* load Unicode normalization data from file */
    if(isDataLoaded==FALSE) {
        UTrie _idnTrie={ 0,0,0,0,0,0,0 };
        UDataMemory *data;
        const int32_t *p=NULL;
        const uint8_t *pb;
        if(&errorCode==NULL || U_FAILURE(errorCode)) {
            return 0;
        }

        /* open the data outside the mutex block */
        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
        dataErrorCode=errorCode;
        if(U_FAILURE(errorCode)) {
            return isDataLoaded=FALSE;
        }

        p=(const int32_t *)udata_getMemory(data);
        pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
        utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
        _idnTrie.getFoldingOffset=getFoldingOffset;


        if(U_FAILURE(errorCode)) {
            dataErrorCode=errorCode;
            udata_close(data);
            return isDataLoaded=FALSE;
        }

        /* in the mutex block, set the data for this process */
        umtx_lock(NULL);
        if(idnData==NULL) {
            idnData=data;
            data=NULL;
            uprv_memcpy(&indexes, p, sizeof(indexes));
            uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
        } else {
            p=(const int32_t *)udata_getMemory(idnData);
        }
        umtx_unlock(NULL);
        /* initialize some variables */
        mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);

        isDataLoaded = TRUE;

        /* if a different thread set it first, then close the extra data */
        if(data!=NULL) {
            udata_close(data); /* NULL if it was set correctly */
        }
    }

    return isDataLoaded;
}

#endif /* #if !UCONFIG_NO_IDNA */

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */