xmlparser.h   [plain text]


/*
*******************************************************************************
*
*   Copyright (C) 2004-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  xmlparser.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004jul21
*   created by: Andy Heninger
*
* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
* Not suitable for production use. Not supported.
* Not conformant. Not efficient.
* But very small.
*/

#ifndef __XMLPARSER_H__
#define __XMLPARSER_H__

#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "uvector.h"
#include "hash.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

enum UXMLNodeType {
    /** Node type string (text contents), stored as a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Node type element, stored as a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    UXML_NODE_TYPE_COUNT
};

U_NAMESPACE_BEGIN

class UXMLParser;

/**
 * This class represents an element node in a parsed XML tree.
 */
class U_TOOLUTIL_API UXMLElement : public UObject {
public:
    /**
     * Destructor.
     */
    virtual ~UXMLElement();

    /**
     * Get the tag name of this element.
     */
    const UnicodeString &getTagName() const;
    /**
     * Get the text contents of the element.
     * Append the contents of all text child nodes.
     * @param recurse If TRUE, also recursively appends the contents of all
     *        text child nodes of element children.
     * @return The text contents.
     */
    UnicodeString getText(UBool recurse) const;
    /**
     * Get the number of attributes.
     */
    int32_t countAttributes() const;
    /**
     * Get the i-th attribute.
     * @param i Index of the attribute.
     * @param name Output parameter, receives the attribute name.
     * @param value Output parameter, receives the attribute value.
     * @return A pointer to the attribute value (may be &value or a pointer to an
     *         internal string object), or NULL if i is out of bounds.
     */
    const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
    /**
     * Get the value of the attribute with the given name.
     * @param name Attribute name to be looked up.
     * @return A pointer to the attribute value, or NULL if this element
     * does not have this attribute.
     */
    const UnicodeString *getAttribute(const UnicodeString &name) const;
    /**
     * Get the number of child nodes.
     */
    int32_t countChildren() const;
    /**
     * Get the i-th child node.
     * @param i Index of the child node.
     * @param type The child node type.
     * @return A pointer to the child node object, or NULL if i is out of bounds.
     */
    const UObject *getChild(int32_t i, UXMLNodeType &type) const;
    /**
     * Get the next child element node, skipping non-element child nodes.
     * @param i Enumeration index; initialize to 0 before getting the first child element.
     * @return A pointer to the next child element, or NULL if there is none.
     */
    const UXMLElement *nextChildElement(int32_t &i) const;
    /**
     * Get the immediate child element with the given name.
     * If there are multiple child elements with this name, then return
     * the first one.
     * @param name Element name to be looked up.
     * @return A pointer to the element node, or NULL if this element
     * does not have this immediate child element.
     */
    const UXMLElement *getChildElement(const UnicodeString &name) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    static UClassID U_EXPORT2 getStaticClassID();

private:
    // prevent default construction etc.
    UXMLElement();
    UXMLElement(const UXMLElement &other);
    UXMLElement &operator=(const UXMLElement &other);

    void appendText(UnicodeString &text, UBool recurse) const;

    friend class UXMLParser;

    UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);

    const UXMLParser *fParser;
    const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
    UnicodeString       fContent;        // The text content of this node.  All element content is 
                                         //   concatenated even when there are intervening nested elements
                                         //   (which doesn't happen with most xml files we care about)
                                         //   Sections of content containing only white space are dropped,
                                         //   which gets rid  the bogus white space content from
                                         //   elements which are primarily containers for nested elements.
    UVector             fAttNames;       // A vector containing the names of this element's attributes
                                         //    The names are UnicodeString objects, owned by the UXMLParser.
    UVector             fAttValues;      // A vector containing the attribute values for
                                         //    this element's attributes.  The order is the same
                                         //    as that of the attribute name vector.

    UVector             fChildren;       // The child nodes of this element (a Vector)

    UXMLElement        *fParent;         // A pointer to the parent element of this element.
};

/**
 * A simple XML parser; it is neither efficient nor conformant and only useful for
 * restricted types of XML documents.
 *
 * The parse methods parse whole documents and return the parse trees via their
 * root elements.
 */
class U_TOOLUTIL_API UXMLParser : public UObject {
public:
    /**
     * Create an XML parser.
     */
    static UXMLParser *createParser(UErrorCode &errorCode);
    /**
     * Destructor.
     */
    virtual ~UXMLParser();

    /**
     * Parse an XML document, create the entire document tree, and
     * return a pointer to the root element of the parsed tree.
     * The caller must delete the element.
     */
    UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
    /**
     * Parse an XML file, create the entire document tree, and
     * return a pointer to the root element of the parsed tree.
     * The caller must delete the element.
     */
    UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    static UClassID U_EXPORT2 getStaticClassID();

private:
    // prevent default construction etc.
    UXMLParser();
    UXMLParser(const UXMLParser &other);
    UXMLParser &operator=(const UXMLParser &other);

    // constructor
    UXMLParser(UErrorCode &status);

    void           parseMisc(UErrorCode &status);
    UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
    void           error(const char *message, UErrorCode &status);
    UnicodeString  scanContent(UErrorCode &status);
    void           replaceCharRefs(UnicodeString &s, UErrorCode &status);

    const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
public:
    // public for UXMLElement only
    const UnicodeString *findName(const UnicodeString &s) const;
private:

    // There is one ICU regex matcher for each of the major XML syntax items
    //  that are recognized.
    RegexMatcher mXMLDecl;
    RegexMatcher mXMLComment;
    RegexMatcher mXMLSP;
    RegexMatcher mXMLDoctype;
    RegexMatcher mXMLPI;
    RegexMatcher mXMLElemStart;
    RegexMatcher mXMLElemEnd;
    RegexMatcher mXMLElemEmpty;
    RegexMatcher mXMLCharData;
    RegexMatcher mAttrValue;
    RegexMatcher mAttrNormalizer;
    RegexMatcher mNewLineNormalizer;
    RegexMatcher mAmps;

    Hashtable             fNames;           // interned element/attribute name strings
    UStack                fElementStack;    // Stack holds the parent elements when nested
                                            //    elements are being parsed.  All items on this
                                            //    stack are of type UXMLElement.
    int32_t               fPos;             // String index of the current scan position in
                                            //    xml source (in fSrc).
    UnicodeString         fOneLF;
};

U_NAMESPACE_END
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

#endif