// TextBoundaries.mm [plain text]

/*
 * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#import "config.h"
#import "TextBoundaries.h"

using namespace WTF::Unicode;

#import "TextBreakIteratorInternalICU.h"
#import <CoreFoundation/CFStringTokenizer.h>
#import <Foundation/Foundation.h>
#import <unicode/ubrk.h>
#import <unicode/uchar.h>
#import <unicode/ustring.h>
#import <unicode/utypes.h>
#import <wtf/unicode/CharacterNames.h>

namespace WebCore {


// Returns true for the characters findNextWordFromIndex() skips over when
// settling on a word break: no-break space (U+00A0), newline, a handful of
// sentence punctuation marks, and anything ICU's u_isspace() accepts.
static bool isSkipCharacter(UChar32 c)
{
    switch (c) {
    case 0xA0:
    case '\n':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
        return true;
    default:
        return u_isspace(c);
    }
}

// Returns true for no-break space (U+00A0), newline, and any character that
// ICU's u_isspace() classifies as whitespace.
static bool isWhitespaceCharacter(UChar32 c)
{
    if (c == 0xA0 || c == '\n')
        return true;
    return u_isspace(c);
}

// Returns true when |c| ends a word in the simple (non-tokenizer) path, i.e.
// when it is not a member of CoreFoundation's predefined alphanumeric set.
static bool isWordDelimitingCharacter(UChar32 c)
{
    // Ampersand is an exception added to treat AT&T as a single word (see <rdar://problem/5022264>).
    if (c == '&')
        return false;

    CFCharacterSetRef alphaNumerics = CFCharacterSetGetPredefined(kCFCharacterSetAlphaNumeric);
    return !CFCharacterSetIsLongCharacterMember(alphaNumerics, c);
}

// Returns true when |c| belongs to CoreFoundation's predefined symbol character set.
static bool isSymbolCharacter(UChar32 c)
{
    return CFCharacterSetIsLongCharacterMember(CFCharacterSetGetPredefined(kCFCharacterSetSymbol), c);
}

// Identifies characters that can act as word boundaries yet also occur inside
// words: the ASCII apostrophe, the right single quotation mark, and the Hebrew
// punctuation gershayim.
static bool isAmbiguousBoundaryCharacter(UChar32 character)
{
    if (character == '\'')
        return true;
    if (character == rightSingleQuotationMark)
        return true;
    return character == hebrewPunctuationGershayim;
}

// Returns a word-boundary CFStringTokenizer positioned over the whole of |str|.
//
// The tokenizer and the locale it was created with are kept in function-local
// statics and reused across calls: the first successful call creates them, and
// later calls only retarget the existing tokenizer at the new string. The
// caller therefore must not release the returned tokenizer. The statics are
// accessed without any synchronization.
//
// Returns NULL if the locale identifier string or the locale cannot be
// created, or if the tokenizer itself cannot be created.
static CFStringTokenizerRef tokenizerForString(CFStringRef str)
{
    static CFStringTokenizerRef tokenizer = NULL;
    static CFLocaleRef locale = NULL;

    if (!locale) {
        const char* currentLocaleID = currentTextBreakLocaleID();
        // The bytes are wrapped without copying; kCFAllocatorNull keeps CF from
        // trying to free the buffer returned by currentTextBreakLocaleID().
        CFStringRef lang = CFStringCreateWithBytesNoCopy(NULL, reinterpret_cast<const UInt8*>(currentLocaleID), strlen(currentLocaleID), kCFStringEncodingASCII, false, kCFAllocatorNull);
        if (!lang)
            return NULL;

        locale = CFLocaleCreate(NULL, lang);
        // Release the identifier before checking the result so that it is not
        // leaked when locale creation fails.
        CFRelease(lang);
        if (!locale)
            return NULL;
    }

    CFRange entireRange = CFRangeMake(0, CFStringGetLength(str));

    if (!tokenizer)
        tokenizer = CFStringTokenizerCreate(NULL, str, entireRange, kCFStringTokenizerUnitWordBoundary, locale);
    else
        CFStringTokenizerSetString(tokenizer, str, entireRange);

    return tokenizer;
}


// Simple case: a word is a stream of characters
// delimited by a special set of word-delimiting characters.
//
// chars/len  UTF-16 buffer and its length in code units.
// position   Offset into |chars| to search around; may equal |len| (the
//            caller, findWordBoundary(), forwards end-of-run positions).
// start/end  Out-parameters receiving the half-open range [*start, *end) of
//            the word found around |position|.
//
// An ambiguous boundary character (see isAmbiguousBoundaryCharacter()) is kept
// inside the word only when the characters on both sides of it are
// non-delimiters.
static void findSimpleWordBoundary(const UChar* chars, int len, int position, int* start, int* end)
{
    ASSERT(position >= 0);
    // position == len is a legal input: the backward scan below works from it
    // and the forward scan simply does not run.
    ASSERT(position <= len);

    // Walk backwards from |position| until the character before startPos is a
    // real delimiter.
    int startPos = position;
    while (startPos > 0) {
        int i = startPos;
        UChar32 characterBeforeStartPos;
        U16_PREV(chars, 0, i, characterBeforeStartPos);
        if (isWordDelimitingCharacter(characterBeforeStartPos)) {
            ASSERT(i >= 0);
            // A delimiter at the very start of the buffer has nothing before it
            // that could make it part of a word.
            if (!i)
                break;

            if (!isAmbiguousBoundaryCharacter(characterBeforeStartPos))
                break;

            // Ambiguous character: only step over it if it is preceded by a
            // non-delimiter.
            UChar32 characterBeforeBeforeStartPos;
            U16_PREV(chars, 0, i, characterBeforeBeforeStartPos);
            if (isWordDelimitingCharacter(characterBeforeBeforeStartPos))
                break;
        }
        U16_BACK_1(chars, 0, startPos);
    }

    // Walk forwards from |position| until the character at endPos is a real
    // delimiter.
    int endPos = position;
    while (endPos < len) {
        UChar32 character;
        U16_GET(chars, 0, endPos, len, character);
        if (isWordDelimitingCharacter(character)) {
            int i = endPos;
            U16_FWD_1(chars, i, len);
            ASSERT(i <= len);
            // A delimiter that is the last character cannot be followed by more
            // of the word.
            if (i == len)
                break;
            UChar32 characterAfterEndPos;
            U16_NEXT(chars, i, len, characterAfterEndPos);
            if (!isAmbiguousBoundaryCharacter(character))
                break;
            // Ambiguous character: only step over it if it is followed by a
            // non-delimiter.
            if (isWordDelimitingCharacter(characterAfterEndPos))
                break;
        }
        U16_FWD_1(chars, endPos, len);
    }

    // The text may consist of all delimiter characters (e.g. "++++++++" or a series of emoji), and returning an empty range
    // makes no sense (and doesn't match findComplexWordBoundary() behavior).
    if (startPos == endPos && endPos < len) {
        UChar32 character;
        U16_GET(chars, 0, endPos, len, character);
        if (isSymbolCharacter(character))
            U16_FWD_1(chars, endPos, len);
    }

    *start = startPos;
    *end = endPos;
}

// Complex case: use CFStringTokenizer to find word boundary.
//
// chars/len  UTF-16 buffer and its length in code units.
// position   Offset into |chars| whose enclosing token is wanted.
// start/end  Out-parameters receiving the half-open range [*start, *end) of
//            the token containing |position|, or the whole buffer [0, len)
//            when the tokenizer reports no token there.
//
// Falls back to findSimpleWordBoundary() when the temporary CFString or the
// tokenizer cannot be obtained.
static void findComplexWordBoundary(const UChar* chars, int len, int position, int* start, int* end)
{
    // Wrap the caller's buffer without copying it; kCFAllocatorNull means CF
    // never frees |chars|.
    CFStringRef charString = CFStringCreateWithCharactersNoCopy(NULL, chars, len, kCFAllocatorNull);
    ASSERT(charString);
    if (!charString) {
        // Release builds: do not hand NULL to CFStringGetLength()/CFRelease().
        findSimpleWordBoundary(chars, len, position, start, end);
        return;
    }

    CFStringTokenizerRef tokenizer = tokenizerForString(charString);
    if (!tokenizer) {
        // error creating tokenizer
        findSimpleWordBoundary(chars, len, position, start, end);
        CFRelease(charString);
        return;
    }

    CFStringTokenizerTokenType token = CFStringTokenizerGoToTokenAtIndex(tokenizer, position);
    CFRange result;
    if (token != kCFStringTokenizerTokenNone)
        result = CFStringTokenizerGetCurrentTokenRange(tokenizer);
    else {
        // if no token found: select entire block
        // NB: I never hit this section in all my testing...
        result.location = 0;
        result.length = len;
    }

    // CFRange fields are CFIndex; the range lies within a string of |len|
    // (an int) code units, so narrowing is made explicit here.
    *start = static_cast<int>(result.location);
    *end = static_cast<int>(result.location + result.length);

    CFRelease(charString);
}

// Finds the word surrounding |position| in the UTF-16 buffer |chars| of |len|
// code units and stores its half-open range in [*start, *end).
//
// The character at |position| (or the last code unit when |position| == |len|)
// and the first character of the run are each tested with
// requiresContextForWordBoundary(); if either needs context, the
// CFStringTokenizer-based path is used, otherwise the simple delimiter scan.
//
// An empty buffer yields the empty range [0, 0).
void findWordBoundary(const UChar* chars, int len, int position, int* start, int* end)
{
    // U16_NEXT below always reads at least one code unit, so an empty run must
    // be handled before sampling any characters.
    if (len <= 0) {
        *start = 0;
        *end = 0;
        return;
    }

    // When asked about the end of the run, sample the last code unit instead of
    // reading past the buffer.
    int pos = position;
    if (position == len && position != 0)
        pos--;

    // For complex text (Thai, Japanese, Chinese), visible_units will pass the text in as a
    // single contiguous run of characters, providing as much context as is possible.
    // We only need one character to determine if the text is complex.
    UChar32 ch;
    int i = pos;
    U16_NEXT(chars, i, len, ch);
    bool isComplex = requiresContextForWordBoundary(ch);

    // FIXME: This check improves our word boundary behavior, but doesn't actually go far enough.
    // See <rdar://problem/8853951> Take complex word boundary finding path when necessary
    if (!isComplex) {
        // Check again for complex text, at the start of the run.
        i = 0;
        U16_NEXT(chars, i, len, ch);
        isComplex = requiresContextForWordBoundary(ch);
    }

    // Note that the original |position| (not the clamped |pos|) is forwarded.
    if (isComplex)
        findComplexWordBoundary(chars, len, position, start, end);
    else
        findSimpleWordBoundary(chars, len, position, start, end);

#define LOG_WORD_BREAK 0
#if LOG_WORD_BREAK
    CFStringRef uniString = CFStringCreateWithCharacters(NULL, chars, len);
    CFStringRef foundWord = CFStringCreateWithCharacters(NULL, (const UniChar *) chars + *start, *end - *start);        
    NSLog(@"%s_BREAK '%@' (%d,%d) in '%@' (%p) at %d, length=%d", isComplex ? "COMPLEX" : "SIMPLE", foundWord, *start, *end, uniString, uniString, position, len);
    CFRelease(foundWord);
    CFRelease(uniString);
#endif

}

// Returns the offset of the next word break after |position| (when |forward|
// is true) or before it (when false) in the UTF-16 buffer |chars| of |len|
// code units, using an ICU word break iterator for the current text-break
// locale.
//
// Breaks that fall between a non-skip character and a skip character (see
// isSkipCharacter()) are stepped over when moving forward; when moving
// backward, breaks that land on a skip character not preceded by whitespace
// are stepped over. Running off either end clamps to |len| or 0.
//
// If the break iterator cannot be opened, |position| is returned unchanged.
int findNextWordFromIndex(const UChar* chars, int len, int position, bool forward)
{
    // This very likely won't behave exactly like the non-iPhone version, but it works
    // for the contexts in which it is used on iPhone, and in the future will be
    // tuned to improve the iPhone-specific behavior for the keyboard and text editing.
    int pos = position;
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator* boundary = ubrk_open(UBRK_WORD, currentTextBreakLocaleID(), const_cast<unichar*>(reinterpret_cast<const unichar*>(chars)), len, &status);
    if (!boundary)
        return pos;

    if (U_SUCCESS(status)) {
        if (forward) {
            do {
                pos = ubrk_following(boundary, pos);
                if (pos == UBRK_DONE)
                    pos = len;
                // Keep going while this break sits right before a skip
                // character that follows a non-skip character.
            } while (pos < len && (pos == 0 || !isSkipCharacter(chars[pos - 1])) && isSkipCharacter(chars[pos]));
        } else {
            do {
                pos = ubrk_preceding(boundary, pos);
                if (pos == UBRK_DONE)
                    pos = 0;
                // Keep going while this break lands on a skip character that is
                // not preceded by whitespace.
            } while (pos > 0 && isSkipCharacter(chars[pos]) && !isWhitespaceCharacter(chars[pos - 1]));
        }
    }

    // Close the iterator whenever one was returned, including when it came
    // back together with a failure status, so it is never leaked.
    ubrk_close(boundary);
    return pos;
}

}