MarkupTokenBase.h [plain text]

/*
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 * Copyright (C) 2011 Apple Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef MarkupTokenBase_h
#define MarkupTokenBase_h

#include "ElementAttributeData.h"
#include <wtf/Vector.h>

#ifndef NDEBUG
#include <stdio.h>
#endif

namespace WebCore {

class DoctypeDataBase {
    WTF_MAKE_NONCOPYABLE(DoctypeDataBase);
public:
    DoctypeDataBase()
        : m_hasPublicIdentifier(false)
        , m_hasSystemIdentifier(false)
    {
    }

    bool m_hasPublicIdentifier;
    bool m_hasSystemIdentifier;
    WTF::Vector<UChar> m_publicIdentifier;
    WTF::Vector<UChar> m_systemIdentifier;
};

class AttributeBase {
public:
    class Range {
    public:
        int m_start;
        int m_end;
    };

    Range m_nameRange;
    Range m_valueRange;
    WTF::Vector<UChar, 32> m_name;
    WTF::Vector<UChar, 32> m_value;
};

template<typename TypeSet, typename DoctypeDataType = DoctypeDataBase, typename AttributeType = AttributeBase>
class MarkupTokenBase {
    WTF_MAKE_NONCOPYABLE(MarkupTokenBase);
    WTF_MAKE_FAST_ALLOCATED;
public:
    typedef TypeSet Type;
    typedef AttributeType Attribute;
    typedef DoctypeDataType DoctypeData;

    typedef WTF::Vector<Attribute, 10> AttributeList;
    typedef WTF::Vector<UChar, 1024> DataVector;

    MarkupTokenBase() { clear(); }
    virtual ~MarkupTokenBase() { }

    virtual void clear()
    {
        m_type = TypeSet::Uninitialized;
        m_range.m_start = 0;
        m_range.m_end = 0;
        m_baseOffset = 0;
        m_data.clear();
    }

    bool isUninitialized() { return m_type == TypeSet::Uninitialized; }

    int startIndex() const { return m_range.m_start; }
    int endIndex() const { return m_range.m_end; }

    void setBaseOffset(int offset)
    {
        m_baseOffset = offset;
    }

    void end(int endOffset)
    {
        m_range.m_end = endOffset - m_baseOffset;
    }

    void makeEndOfFile()
    {
        ASSERT(m_type == TypeSet::Uninitialized);
        m_type = TypeSet::EndOfFile;
    }

    void beginStartTag(UChar character)
    {
        ASSERT(character);
        ASSERT(m_type == TypeSet::Uninitialized);
        m_type = TypeSet::StartTag;
        m_selfClosing = false;
        m_currentAttribute = 0;
        m_attributes.clear();

        m_data.append(character);
    }

    template<typename T>
    void beginEndTag(T characters)
    {
        ASSERT(m_type == TypeSet::Uninitialized);
        m_type = TypeSet::EndTag;
        m_selfClosing = false;
        m_currentAttribute = 0;
        m_attributes.clear();

        m_data.append(characters);
    }

    // Starting a character token works slightly differently than starting
    // other types of tokens because we want to save a per-character branch.
    void ensureIsCharacterToken()
    {
        ASSERT(m_type == TypeSet::Uninitialized || m_type == TypeSet::Character);
        m_type = TypeSet::Character;
    }

    void beginComment()
    {
        ASSERT(m_type == TypeSet::Uninitialized);
        m_type = TypeSet::Comment;
    }

    void beginDOCTYPE()
    {
        ASSERT(m_type == TypeSet::Uninitialized);
        m_type = TypeSet::DOCTYPE;
        m_doctypeData = adoptPtr(new DoctypeData);
    }

    void beginDOCTYPE(UChar character)
    {
        ASSERT(character);
        beginDOCTYPE();
        m_data.append(character);
    }

    template<typename T>
    void appendToCharacter(T characters)
    {
        ASSERT(m_type == TypeSet::Character);
        m_data.append(characters);
    }

    void appendToComment(UChar character)
    {
        ASSERT(character);
        ASSERT(m_type == TypeSet::Comment);
        m_data.append(character);
    }

    void addNewAttribute()
    {
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        m_attributes.grow(m_attributes.size() + 1);
        m_currentAttribute = &m_attributes.last();
#ifndef NDEBUG
        m_currentAttribute->m_nameRange.m_start = 0;
        m_currentAttribute->m_nameRange.m_end = 0;
        m_currentAttribute->m_valueRange.m_start = 0;
        m_currentAttribute->m_valueRange.m_end = 0;
#endif
    }

    void beginAttributeName(int offset)
    {
        m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
    }

    void endAttributeName(int offset)
    {
        int index = offset - m_baseOffset;
        m_currentAttribute->m_nameRange.m_end = index;
        m_currentAttribute->m_valueRange.m_start = index;
        m_currentAttribute->m_valueRange.m_end = index;
    }

    void beginAttributeValue(int offset)
    {
        m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
#ifndef NDEBUG
        m_currentAttribute->m_valueRange.m_end = 0;
#endif
    }

    void endAttributeValue(int offset)
    {
        m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
    }

    void appendToAttributeName(UChar character)
    {
        ASSERT(character);
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        // FIXME: We should be able to add the following ASSERT once we fix
        // https://bugs.webkit.org/show_bug.cgi?id=62971
        //   ASSERT(m_currentAttribute->m_nameRange.m_start);
        m_currentAttribute->m_name.append(character);
    }

    void appendToAttributeValue(UChar character)
    {
        ASSERT(character);
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        ASSERT(m_currentAttribute->m_valueRange.m_start);
        m_currentAttribute->m_value.append(character);
    }

    void appendToAttributeValue(size_t i, const String& value)
    {
        ASSERT(!value.isEmpty());
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        m_attributes[i].m_value.append(value.characters(), value.length());
    }

    typename Type::Type type() const { return m_type; }

    bool selfClosing() const
    {
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        return m_selfClosing;
    }

    void setSelfClosing()
    {
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        m_selfClosing = true;
    }

    const AttributeList& attributes() const
    {
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        return m_attributes;
    }

    void eraseCharacters()
    {
        ASSERT(m_type == TypeSet::Character);
        m_data.clear();
    }

    void eraseValueOfAttribute(size_t i)
    {
        ASSERT(m_type == TypeSet::StartTag || m_type == TypeSet::EndTag);
        m_attributes[i].m_value.clear();
    }

    const DataVector& characters() const
    {
        ASSERT(m_type == TypeSet::Character);
        return m_data;
    }

    const DataVector& comment() const
    {
        ASSERT(m_type == TypeSet::Comment);
        return m_data;
    }

    // FIXME: Distinguish between a missing public identifer and an empty one.
    const WTF::Vector<UChar>& publicIdentifier() const
    {
        ASSERT(m_type == TypeSet::DOCTYPE);
        return m_doctypeData->m_publicIdentifier;
    }

    // FIXME: Distinguish between a missing system identifer and an empty one.
    const WTF::Vector<UChar>& systemIdentifier() const
    {
        ASSERT(m_type == TypeSet::DOCTYPE);
        return m_doctypeData->m_systemIdentifier;
    }

    void setPublicIdentifierToEmptyString()
    {
        ASSERT(m_type == TypeSet::DOCTYPE);
        m_doctypeData->m_hasPublicIdentifier = true;
        m_doctypeData->m_publicIdentifier.clear();
    }

    void setSystemIdentifierToEmptyString()
    {
        ASSERT(m_type == TypeSet::DOCTYPE);
        m_doctypeData->m_hasSystemIdentifier = true;
        m_doctypeData->m_systemIdentifier.clear();
    }

    void appendToPublicIdentifier(UChar character)
    {
        ASSERT(character);
        ASSERT(m_type == TypeSet::DOCTYPE);
        ASSERT(m_doctypeData->m_hasPublicIdentifier);
        m_doctypeData->m_publicIdentifier.append(character);
    }

    void appendToSystemIdentifier(UChar character)
    {
        ASSERT(character);
        ASSERT(m_type == TypeSet::DOCTYPE);
        ASSERT(m_doctypeData->m_hasSystemIdentifier);
        m_doctypeData->m_systemIdentifier.append(character);
    }

protected:

#ifndef NDEBUG
    void printString(const DataVector& string) const
    {
        DataVector::const_iterator iter = string.begin();
        for (; iter != string.end(); ++iter)
            fprintf(stderr, "%lc", wchar_t(*iter));
    }
#endif // NDEBUG

    inline void appendToName(UChar character)
    {
        ASSERT(character);
        m_data.append(character);
    }
    
    inline const DataVector& name() const
    {
        return m_data;
    }

    // FIXME: I'm not sure what the final relationship between MarkupTokenBase and
    // AtomicMarkupTokenBase will be. I'm marking this a friend for now, but we'll
    // want to end up with a cleaner interface between the two classes.
    template<typename Token>
    friend class AtomicMarkupTokenBase;

    typename Type::Type m_type;
    typename Attribute::Range m_range; // Always starts at zero.
    int m_baseOffset;
    DataVector m_data;

    // For DOCTYPE
    OwnPtr<DoctypeData> m_doctypeData;

    // For StartTag and EndTag
    bool m_selfClosing;
    AttributeList m_attributes;

    // A pointer into m_attributes used during lexing.
    Attribute* m_currentAttribute;
};

template<typename Token>
class AtomicMarkupTokenBase {
    WTF_MAKE_NONCOPYABLE(AtomicMarkupTokenBase);
public:
    AtomicMarkupTokenBase(Token* token)
        : m_type(token->type())
    {
        ASSERT(token);

        switch (m_type) {
        case Token::Type::Uninitialized:
            ASSERT_NOT_REACHED();
            break;
        case Token::Type::DOCTYPE:
            m_name = AtomicString(token->name().data(), token->name().size());
            m_doctypeData = token->m_doctypeData.release();
            break;
        case Token::Type::EndOfFile:
            break;
        case Token::Type::StartTag:
        case Token::Type::EndTag: {
            m_selfClosing = token->selfClosing();
            m_name = AtomicString(token->name().data(), token->name().size());
            initializeAttributes(token->attributes());
            break;
        }
        case Token::Type::Comment:
            m_data = String(token->comment().data(), token->comment().size());
            break;
        case Token::Type::Character:
            m_externalCharacters = &token->characters();
            break;
        default:
            break;
        }
    }

    AtomicMarkupTokenBase(typename Token::Type::Type type, AtomicString name, const Vector<Attribute>& attributes = Vector<Attribute>())
        : m_type(type)
        , m_name(name)
        , m_externalCharacters(0)
        , m_attributes(attributes)
    {
        ASSERT(usesName());
    }

    typename Token::Type::Type type() const { return m_type; }

    const AtomicString& name() const
    {
        ASSERT(usesName());
        return m_name;
    }

    void setName(const AtomicString& name)
    {
        ASSERT(usesName());
        m_name = name;
    }

    bool selfClosing() const
    {
        ASSERT(m_type == Token::Type::StartTag || m_type == Token::Type::EndTag);
        return m_selfClosing;
    }

    Attribute* getAttributeItem(const QualifiedName& attributeName)
    {
        ASSERT(usesAttributes());
        return findAttributeInVector(m_attributes, attributeName);
    }

    Vector<Attribute>& attributes()
    {
        ASSERT(usesAttributes());
        return m_attributes;
    }

    const Vector<Attribute>& attributes() const
    {
        ASSERT(usesAttributes());
        return m_attributes;
    }

    const typename Token::DataVector& characters() const
    {
        ASSERT(m_type == Token::Type::Character);
        return *m_externalCharacters;
    }

    const String& comment() const
    {
        ASSERT(m_type == Token::Type::Comment);
        return m_data;
    }

    // FIXME: Distinguish between a missing public identifer and an empty one.
    WTF::Vector<UChar>& publicIdentifier() const
    {
        ASSERT(m_type == Token::Type::DOCTYPE);
        return m_doctypeData->m_publicIdentifier;
    }

    // FIXME: Distinguish between a missing system identifer and an empty one.
    WTF::Vector<UChar>& systemIdentifier() const
    {
        ASSERT(m_type == Token::Type::DOCTYPE);
        return m_doctypeData->m_systemIdentifier;
    }

protected:
    typename Token::Type::Type m_type;

    void initializeAttributes(const typename Token::AttributeList& attributes);
    QualifiedName nameForAttribute(const typename Token::Attribute&) const;

    bool usesName() const;

    bool usesAttributes() const;

    // "name" for DOCTYPE, StartTag, and EndTag
    AtomicString m_name;

    // "data" for Comment
    String m_data;

    // "characters" for Character
    //
    // We don't want to copy the the characters out of the Token, so we
    // keep a pointer to its buffer instead. This buffer is owned by the
    // Token and causes a lifetime dependence between these objects.
    //
    // FIXME: Add a mechanism for "internalizing" the characters when the
    //        HTMLToken is destructed.
    const typename Token::DataVector* m_externalCharacters;

    // For DOCTYPE
    OwnPtr<typename Token::DoctypeData> m_doctypeData;

    // For StartTag and EndTag
    bool m_selfClosing;

    Vector<Attribute> m_attributes;
};

template<typename Token>
inline void AtomicMarkupTokenBase<Token>::initializeAttributes(const typename Token::AttributeList& attributes)
{
    size_t size = attributes.size();
    if (!size)
        return;

    m_attributes.clear();
    m_attributes.reserveInitialCapacity(size);
    for (size_t i = 0; i < size; ++i) {
        const typename Token::Attribute& attribute = attributes[i];
        if (attribute.m_name.isEmpty())
            continue;

        // FIXME: We should be able to add the following ASSERT once we fix
        // https://bugs.webkit.org/show_bug.cgi?id=62971
        //   ASSERT(attribute.m_nameRange.m_start);
        ASSERT(attribute.m_nameRange.m_end);
        ASSERT(attribute.m_valueRange.m_start);
        ASSERT(attribute.m_valueRange.m_end);

        AtomicString value(attribute.m_value.data(), attribute.m_value.size());
        const QualifiedName& name = nameForAttribute(attribute);
        if (!findAttributeInVector(m_attributes, name))
            m_attributes.append(Attribute(name, value));
    }
}

}

#endif // MarkupTokenBase_h