strsrch.cpp   [plain text]


/********************************************************************
 * COPYRIGHT:
 * Copyright (C) 2002-2006 IBM, Inc.   All Rights Reserved.
 *
 ********************************************************************/

/** 
 * This program demos string collation
 */

const char gHelpString[] =
    "usage: strsrch [options*] -source source_string -pattern pattern_string\n"
    "-help            Display this message.\n"
    "-locale name     ICU locale to use.  Default is en_US\n"
    "-rules rule      Collation rules file (overrides locale)\n"
    "-french          French accent ordering\n"
    "-norm            Normalizing mode on\n"
    "-shifted         Shifted mode\n"
    "-lower           Lower case first\n"
    "-upper           Upper case first\n"
    "-case            Enable separate case level\n"
    "-level n         Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
	"-source string   Source string\n"
	"-pattern string  Pattern string to look for in source\n"
	"-overlap         Enable searching to be done on overlapping patterns\n"
	"-canonical       Enable searching to be done matching canonical equivalent patterns"
    "Example strsrch -rules \\u0026b\\u003ca -source a\\u0020b\\u0020bc -pattern b\n"
	"The format \\uXXXX is supported for the rules and comparison strings\n"
	;

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <unicode/utypes.h>
#include <unicode/ucol.h>
#include <unicode/usearch.h>
#include <unicode/ustring.h>

/** 
 * Command line option variables
 *    These global variables are set according to the options specified
 *    on the command line by the user.
 */
char * opt_locale      = "en_US";
char * opt_rules       = 0;
UBool  opt_help        = FALSE;
UBool  opt_norm        = FALSE;
UBool  opt_french      = FALSE;
UBool  opt_shifted     = FALSE;
UBool  opt_lower       = FALSE;
UBool  opt_upper       = FALSE;
UBool  opt_case        = FALSE;
UBool  opt_overlap     = FALSE;
UBool  opt_canonical   = FALSE;
int    opt_level       = 0;
char * opt_source      = "International Components for Unicode";
char * opt_pattern     = "Unicode";
UCollator * collator   = 0;
UStringSearch * search = 0;
UChar rules[100];
UChar source[100];
UChar pattern[100];

/** 
 * Definitions for the command line options
 */
struct OptSpec {
    const char *name;
    enum {FLAG, NUM, STRING} type;
    void *pVar;
};

OptSpec opts[] = {
    {"-locale",      OptSpec::STRING, &opt_locale},
    {"-rules",       OptSpec::STRING, &opt_rules},
	{"-source",      OptSpec::STRING, &opt_source},
    {"-pattern",     OptSpec::STRING, &opt_pattern},
    {"-norm",        OptSpec::FLAG,   &opt_norm},
    {"-french",      OptSpec::FLAG,   &opt_french},
    {"-shifted",     OptSpec::FLAG,   &opt_shifted},
    {"-lower",       OptSpec::FLAG,   &opt_lower},
    {"-upper",       OptSpec::FLAG,   &opt_upper},
    {"-case",        OptSpec::FLAG,   &opt_case},
    {"-level",       OptSpec::NUM,    &opt_level},
	{"-overlap",     OptSpec::FLAG,   &opt_overlap},
	{"-canonical",   OptSpec::FLAG,   &opt_canonical},
    {"-help",        OptSpec::FLAG,   &opt_help},
    {"-?",           OptSpec::FLAG,   &opt_help},
    {0, OptSpec::FLAG, 0}
};

/**  
 * processOptions()  Function to read the command line options.
 */
UBool processOptions(int argc, const char **argv, OptSpec opts[])
{
    for (int argNum = 1; argNum < argc; argNum ++) {
        const char *pArgName = argv[argNum];
        OptSpec *pOpt;
        for (pOpt = opts;  pOpt->name != 0; pOpt ++) {
            if (strcmp(pOpt->name, pArgName) == 0) {
                switch (pOpt->type) {
                case OptSpec::FLAG:
                    *(UBool *)(pOpt->pVar) = TRUE;
                    break;
                case OptSpec::STRING:
                    argNum ++;
                    if (argNum >= argc) {
                        fprintf(stderr, "value expected for \"%s\" option.\n", 
							    pOpt->name);
                        return FALSE;
                    }
                    *(const char **)(pOpt->pVar) = argv[argNum];
                    break;
                case OptSpec::NUM:
                    argNum ++;
                    if (argNum >= argc) {
                        fprintf(stderr, "value expected for \"%s\" option.\n", 
							    pOpt->name);
                        return FALSE;
                    }
                    char *endp;
                    int i = strtol(argv[argNum], &endp, 0);
                    if (endp == argv[argNum]) {
                        fprintf(stderr, 
							    "integer value expected for \"%s\" option.\n", 
								pOpt->name);
                        return FALSE;
                    }
                    *(int *)(pOpt->pVar) = i;
                }
                break;
            }
        }
        if (pOpt->name == 0)
        {
            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
            return FALSE;
        }
    }
	return TRUE;
}

/**
 * Creates a collator
 */
UBool processCollator()
{
	// Set up an ICU collator
    UErrorCode status = U_ZERO_ERROR;

    if (opt_rules != 0) {
		u_unescape(opt_rules, rules, 100);
        collator = ucol_openRules(rules, -1, UCOL_OFF, UCOL_TERTIARY, 
			                  NULL, &status);
    }
    else {
        collator = ucol_open(opt_locale, &status);
    }
	if (U_FAILURE(status)) {
        fprintf(stderr, "Collator creation failed.: %d\n", status);
        return FALSE;
    }
    if (status == U_USING_DEFAULT_WARNING) {
        fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", 
			    opt_locale);
    }
    if (status == U_USING_FALLBACK_WARNING) {
        fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", 
			    opt_locale);
    }
    if (opt_norm) {
        ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
    }
    if (opt_french) {
        ucol_setAttribute(collator, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
    }
    if (opt_lower) {
        ucol_setAttribute(collator, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, 
			              &status);
    }
    if (opt_upper) {
        ucol_setAttribute(collator, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, 
			              &status);
    }
    if (opt_case) {
        ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_ON, &status);
    }
    if (opt_shifted) {
        ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, 
			              &status);
    }
    if (opt_level != 0) {
        switch (opt_level) {
        case 1:
            ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &status);
            break;
        case 2:
            ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_SECONDARY, 
				              &status);
            break;
        case 3:
            ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_TERTIARY, &status);
            break;
        case 4:
            ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_QUATERNARY, 
				              &status);
            break;
        case 5:
            ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_IDENTICAL, 
				              &status);
            break;
        default:
            fprintf(stderr, "-level param must be between 1 and 5\n");
            return FALSE;
        }
    }
    if (U_FAILURE(status)) {
        fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
        return FALSE;
    }
	return TRUE;
}

/**
 * Creates a string search
 */
UBool processStringSearch()
{
	u_unescape(opt_source, source, 100);
	u_unescape(opt_pattern, pattern, 100);
	UErrorCode status = U_ZERO_ERROR;
	search = usearch_openFromCollator(pattern, -1, source, -1, collator, NULL, 
		                              &status);
	if (U_FAILURE(status)) {
		return FALSE;
	}
	if (opt_overlap == TRUE) {
		usearch_setAttribute(search, USEARCH_OVERLAP, USEARCH_ON, &status);
	}
	if (opt_canonical == TRUE) {
		usearch_setAttribute(search, USEARCH_CANONICAL_MATCH, USEARCH_ON, 
			                 &status);
	}
	if (U_FAILURE(status)) {
		fprintf(stderr, "Error setting search attributes\n");
		return FALSE;
	}
	return TRUE;
}

UBool findPattern()
{
	UErrorCode status = U_ZERO_ERROR;
	int32_t offset = usearch_next(search, &status);
	if (offset == USEARCH_DONE) {
		fprintf(stdout, "Pattern not found in source\n");
	}
	while (offset != USEARCH_DONE) {
		fprintf(stdout, "Pattern found at offset %d size %d\n", offset,
				usearch_getMatchedLength(search));
		offset = usearch_next(search, &status);
	}
	if (U_FAILURE(status)) {
		fprintf(stderr, "Error in searching for pattern %d\n", status);
		return FALSE;
	}
	fprintf(stdout, "End of search\n");
	return TRUE;
}

/** 
 * Main   --  process command line, read in and pre-process the test file,
 *            call other functions to do the actual tests.
 */
int main(int argc, const char** argv) 
{
    if (processOptions(argc, argv, opts) != TRUE || opt_help) {
        printf(gHelpString);
        return -1;
    }

    if (processCollator() != TRUE) {
		fprintf(stderr, "Error creating collator\n");
		return -1;
	}

	if (processStringSearch() != TRUE) {
		fprintf(stderr, "Error creating string search\n");
		return -1;
	}

	fprintf(stdout, "Finding pattern %s in source %s\n", opt_pattern, 
		    opt_source);

	findPattern();
	ucol_close(collator);
	usearch_close(search);
	return 0;
}