utfperf.cpp   [plain text]


/*  
 **********************************************************************
 *   Copyright (C) 2002-2007, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  utfperf.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2005Nov17
 *   created by: Raymond Yang
 *
 *   Ported from utfper.c created by Markus W. Scherer
 *   Performance test program for Unicode converters
 */

#include <stdio.h>
#include <stdlib.h>
#include "unicode/uperf.h"
#include "uoptions.h"

#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

/* definitions and text buffers */

#define INPUT_CAPACITY (1024*1024)
#define INTERMEDIATE_CAPACITY 4096
#define INTERMEDIATE_SMALL_CAPACITY 20
#define PIVOT_CAPACITY 1024
#define OUTPUT_CAPACITY INPUT_CAPACITY

static char utf8[INPUT_CAPACITY];
static UChar pivot[INTERMEDIATE_CAPACITY];

static UChar output[OUTPUT_CAPACITY];
static char intermediate[OUTPUT_CAPACITY];

static int32_t utf8Length, encodedLength, outputLength, countInputCodePoints;

static int32_t fromUCallbackCount;

// Command-line options specific to utfperf.
// Options do not have abbreviations: Force readable command lines.
// (Using U+0001 for abbreviation characters.)
enum {
    CHARSET,
    CHUNK_LENGTH,
    PIVOT_LENGTH,
    UTFPERF_OPTIONS_COUNT
};

static UOption options[UTFPERF_OPTIONS_COUNT]={
    UOPTION_DEF("charset",  '\x01', UOPT_REQUIRES_ARG),
    UOPTION_DEF("chunk",    '\x01', UOPT_REQUIRES_ARG),
    UOPTION_DEF("pivot",    '\x01', UOPT_REQUIRES_ARG)
};

static const char *const utfperf_usage =
    "\t--charset   Charset for which to test performance, e.g. windows-1251.\n"
    "\t            Default: UTF-8\n"
    "\t--chunk     Length (in bytes) of charset output chunks. [4096]\n"
    "\t--pivot     Length (in UChars) of the UTF-16 pivot buffer, if applicable.\n"
    "\t            [1024]\n";

// Test object.
class  UtfPerformanceTest : public UPerfTest{
public:
    UtfPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
            : UPerfTest(argc, argv, options, LENGTHOF(options), utfperf_usage, status) {
        if (U_SUCCESS(status)) {
            charset = options[CHARSET].value;

            chunkLength = atoi(options[CHUNK_LENGTH].value);
            if (chunkLength < 1 || OUTPUT_CAPACITY < chunkLength) {
                fprintf(stderr, "error: chunk length must be 1..%ld\n", (long)OUTPUT_CAPACITY);
                status = U_ILLEGAL_ARGUMENT_ERROR;
            }

            pivotLength = atoi(options[PIVOT_LENGTH].value);
            if (pivotLength < 1 || PIVOT_CAPACITY < pivotLength) {
                fprintf(stderr, "error: pivot length must be 1..%ld\n", (long)PIVOT_CAPACITY);
                status = U_ILLEGAL_ARGUMENT_ERROR;
            }

            int32_t inputLength;
            UPerfTest::getBuffer(inputLength, status);
            countInputCodePoints = u_countChar32(buffer, bufferLen);
            u_strToUTF8(utf8, (int32_t)sizeof(utf8), &utf8Length, buffer, bufferLen, &status);
        }
    }

    virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);

    const UChar *getBuffer() const { return buffer; }
    int32_t getBufferLen() const { return bufferLen; }

    const char *charset;
    int32_t chunkLength, pivotLength;
};

U_CDECL_BEGIN
// Custom callback for counting callback calls.
static void U_CALLCONV
fromUCallback(const void *context,
              UConverterFromUnicodeArgs *fromUArgs,
              const UChar *codeUnits,
              int32_t length,
              UChar32 codePoint,
              UConverterCallbackReason reason,
              UErrorCode *pErrorCode) {
    if (reason <= UCNV_IRREGULAR) {
        ++fromUCallbackCount;
    }
    UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, pErrorCode);
}
U_CDECL_END

// Base class for Roundtrip, FromUnicode and FromUTF8 with common setup.
class Command : public UPerfFunction {
protected:
    Command(const UtfPerformanceTest &testcase)
            : testcase(testcase),
              input(testcase.getBuffer()), inputLength(testcase.getBufferLen()),
              errorCode(U_ZERO_ERROR) {
        cnv=ucnv_open(testcase.charset, &errorCode);
        if (U_FAILURE(errorCode)) {
            fprintf(stderr, "error opening converter for \"%s\" - %s\n", testcase.charset, u_errorName(errorCode));
        }
        ucnv_setFromUCallBack(cnv, fromUCallback, NULL, NULL, NULL, &errorCode);
    }
public:
    virtual ~Command(){
        if(U_SUCCESS(errorCode)) {
            ucnv_close(cnv);
        }
    }
    // virtual void call(UErrorCode* pErrorCode) { ... }
    virtual long getOperationsPerIteration(){
        return countInputCodePoints;
    }

    const UtfPerformanceTest &testcase;
    const UChar *input;
    int32_t inputLength;
    UErrorCode errorCode;
    UConverter *cnv;
};

// Test roundtrip UTF-16->encoding->UTF-16.
class Roundtrip : public Command {
protected:
    Roundtrip(const UtfPerformanceTest &testcase) : Command(testcase) {}
public:
    static UPerfFunction* get(const UtfPerformanceTest &testcase) {
        Roundtrip * t = new Roundtrip(testcase);
        if (U_SUCCESS(t->errorCode)){
            return t;
        } else {
            delete t;
            return NULL;
        }
    }
    virtual void call(UErrorCode* pErrorCode){
        const UChar *pIn, *pInLimit;
        UChar *pOut, *pOutLimit;
        char *pInter, *pInterLimit;
        const char *p;
        UBool flush;

        ucnv_reset(cnv);
        fromUCallbackCount=0;

        pIn=input;
        pInLimit=input+inputLength;

        pOut=output;
        pOutLimit=output+OUTPUT_CAPACITY;

        pInterLimit=intermediate+testcase.chunkLength;

        encodedLength=outputLength=0;
        flush=FALSE;

        do {
            /* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
            pInter=intermediate;
            ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, TRUE, pErrorCode);
            encodedLength+=(int32_t)(pInter-intermediate);

            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                /* make sure that we convert once more to really flush */
                *pErrorCode=U_ZERO_ERROR;
            } else if(U_FAILURE(*pErrorCode)) {
                return;
            } else if(pIn==pInLimit) {
                flush=TRUE;
            }

            /* convert the block [intermediate..pInter[ back to UTF-16 */
            p=intermediate;
            ucnv_toUnicode(cnv, &pOut, pOutLimit,&p, pInter,NULL, flush,pErrorCode);
            if(U_FAILURE(*pErrorCode)) {
                return;
            }
            /* intermediate must have been consumed (p==pInter) because of the converter semantics */
        } while(!flush);

        outputLength=pOut-output;
        if(inputLength!=outputLength) {
            fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength, outputLength);
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
        }
    }
};

// Test one-way conversion UTF-16->encoding.
class FromUnicode : public Command {
protected:
    FromUnicode(const UtfPerformanceTest &testcase) : Command(testcase) {}
public:
    static UPerfFunction* get(const UtfPerformanceTest &testcase) {
        FromUnicode * t = new FromUnicode(testcase);
        if (U_SUCCESS(t->errorCode)){
            return t;
        } else {
            delete t;
            return NULL;
        }
    }
    virtual void call(UErrorCode* pErrorCode){
        const UChar *pIn, *pInLimit;
        char *pInter, *pInterLimit;

        ucnv_resetFromUnicode(cnv);
        fromUCallbackCount=0;

        pIn=input;
        pInLimit=input+inputLength;

        pInterLimit=intermediate+testcase.chunkLength;

        encodedLength=0;

        for(;;) {
            pInter=intermediate;
            ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, TRUE, pErrorCode);
            encodedLength+=(int32_t)(pInter-intermediate);

            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                /* make sure that we convert once more to really flush */
                *pErrorCode=U_ZERO_ERROR;
            } else if(U_FAILURE(*pErrorCode)) {
                return;
            } else {
                break;  // all done
            }
        }
    }
};

// Test one-way conversion UTF-8->encoding.
class FromUTF8 : public Command {
protected:
    FromUTF8(const UtfPerformanceTest &testcase)
            : Command(testcase),
              utf8Cnv(NULL),
              input8(utf8), input8Length(utf8Length) {
        utf8Cnv=ucnv_open("UTF-8", &errorCode);
    }
public:
    static UPerfFunction* get(const UtfPerformanceTest &testcase) {
        FromUTF8 * t = new FromUTF8(testcase);
        if (U_SUCCESS(t->errorCode)){
            return t;
        } else {
            delete t;
            return NULL;
        }
    }
    ~FromUTF8() {
        ucnv_close(utf8Cnv);
    }
    virtual void call(UErrorCode* pErrorCode){
        const char *pIn, *pInLimit;
        char *pInter, *pInterLimit;
        UChar *pivotSource, *pivotTarget, *pivotLimit;

        ucnv_resetToUnicode(utf8Cnv);
        ucnv_resetFromUnicode(cnv);
        fromUCallbackCount=0;

        pIn=input8;
        pInLimit=input8+input8Length;

        pInterLimit=intermediate+testcase.chunkLength;

        pivotSource=pivotTarget=pivot;
        pivotLimit=pivot+testcase.pivotLength;

        encodedLength=0;

        for(;;) {
            pInter=intermediate;
            ucnv_convertEx(cnv, utf8Cnv,
                           &pInter, pInterLimit,
                           &pIn, pInLimit,
                           pivot, &pivotSource, &pivotTarget, pivotLimit,
                           FALSE, TRUE, pErrorCode);
            encodedLength+=(int32_t)(pInter-intermediate);

            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                /* make sure that we convert once more to really flush */
                *pErrorCode=U_ZERO_ERROR;
            } else if(U_FAILURE(*pErrorCode)) {
                return;
            } else {
                break;  // all done
            }
        }
    }
protected:
    UConverter *utf8Cnv;
    const char *input8;
    int32_t input8Length;
};

UPerfFunction* UtfPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
    switch (index) {
        case 0: name = "Roundtrip";     if (exec) return Roundtrip::get(*this); break;
        case 1: name = "FromUnicode";   if (exec) return FromUnicode::get(*this); break;
        case 2: name = "FromUTF8";      if (exec) return FromUTF8::get(*this); break;
        default: name = ""; break;
    }
    return NULL;
}

int main(int argc, const char *argv[])
{
    // Default values for command-line options.
    options[CHARSET].value = "UTF-8";
    options[CHUNK_LENGTH].value = "4096";
    options[PIVOT_LENGTH].value = "1024";

    UErrorCode status = U_ZERO_ERROR;
    UtfPerformanceTest test(argc, argv, status);

	if (U_FAILURE(status)){
        printf("The error is %s\n", u_errorName(status));
        test.usage();
        return status;
    }
        
    if (test.run() == FALSE){
        fprintf(stderr, "FAILED: Tests could not be run please check the "
			            "arguments.\n");
        return -1;
    }

    if (fromUCallbackCount > 0) {
        printf("Number of fromUnicode callback calls in the last iteration: %ld\n", (long)fromUCallbackCount);
    }

    return 0;
}