// WebPageSerializerImpl.cpp [plain text]
#include "config.h"
#include "WebPageSerializerImpl.h"
#include "Document.h"
#include "DocumentType.h"
#include "Element.h"
#include "FrameLoader.h"
#include "HTMLAllCollection.h"
#include "HTMLElement.h"
#include "HTMLFormElement.h"
#include "HTMLMetaElement.h"
#include "HTMLNames.h"
#include "KURL.h"
#include "PlatformString.h"
#include "StringBuilder.h"
#include "TextEncoding.h"
#include "markup.h"
#include "DOMUtilitiesPrivate.h"
#include "WebFrameImpl.h"
#include "WebURL.h"
#include "WebVector.h"
using namespace WebCore;
namespace WebKit {
// Flush threshold for m_dataBuffer: encodeAndFlushBuffer() leaves the buffer
// untouched while its length() is at or below this value, unless the caller
// forces a flush.
static const unsigned dataBufferCapacity = 65536;
// Builds the per-frame state that is threaded through the serialization
// helpers.
//
// currentFrameURL - URL of the frame being serialized.
// textEncoding    - encoding used when the buffered markup is flushed.
// doc             - document of the frame; dereferenced here, so it must not
//                   be null.
// directoryName   - local directory name supplied by the caller.
//
// Every bookkeeping flag starts out cleared.
WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& currentFrameURL,
                                                            const TextEncoding& textEncoding,
                                                            Document* doc,
                                                            const String& directoryName)
    : currentFrameURL(currentFrameURL)
    , textEncoding(textEncoding)
    , doc(doc)
    , directoryName(directoryName)
    , hasDoctype(false)
    , hasCheckedMeta(false)
    , skipMetaElement(0)
    , isInScriptOrStyleTag(false)
    , hasDocDeclaration(false)
{
    // Cache the document kind so the helpers do not have to query it again.
    isHTMLDocument = doc->isHTMLDocument();
    // openTagToString() and endTagToString() read this flag. Give it a defined
    // value up front, like every other flag, instead of relying on
    // postActionAfterSerializeOpenTag() always running first. It is assigned in
    // the body (as isHTMLDocument is) because the member declaration order is
    // not visible from this file.
    hasAddedContentsBeforeEnd = false;
}
// Produces the markup that has to precede an element's open tag and decides
// whether the element should be left out of the output altogether.
//
// element  - element whose open tag is about to be written.
// param    - per-frame serialization state; several flags are updated here.
// needSkip - out parameter; reset to false and set to true only for an HTML
//            <meta http-equiv="content-type"> whose content mentions "charset".
//
// Returns the text to emit in front of the open tag (possibly empty).
String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
    const Element* element, SerializeDomParam* param, bool* needSkip)
{
    StringBuilder prefix;
    *needSkip = false;

    // Non-HTML (XML) documents: emit the XML declaration and the doctype, each
    // at most once per frame.
    if (!param->isHTMLDocument) {
        if (!param->hasDocDeclaration) {
            param->hasDocDeclaration = true;
            // Encoding preference: the document's own xmlEncoding, then the
            // loader writer's encoding, then UTF-8.
            String declaredEncoding = param->doc->xmlEncoding();
            if (declaredEncoding.isEmpty())
                declaredEncoding = param->doc->frame()->loader()->writer()->encoding();
            if (declaredEncoding.isEmpty())
                declaredEncoding = UTF8Encoding().name();

            prefix.append("<?xml version=\"");
            prefix.append(param->doc->xmlVersion());
            prefix.append("\" encoding=\"");
            prefix.append(declaredEncoding);
            if (param->doc->xmlStandalone())
                prefix.append("\" standalone=\"yes");
            prefix.append("\"?>\n");
        }
        if (!param->hasDoctype) {
            param->hasDoctype = true;
            prefix.append(createMarkup(param->doc->doctype()));
        }
        return prefix.toString();
    }

    // HTML <meta>: a charset-declaring content-type meta is dropped and
    // remembered so that its end tag can be dropped as well.
    if (element->hasTagName(HTMLNames::metaTag)) {
        const HTMLMetaElement* metaElement = static_cast<const HTMLMetaElement*>(element);
        String httpEquivalent = metaElement->httpEquiv();
        if (!equalIgnoringCase(httpEquivalent, "content-type"))
            return prefix.toString();

        String metaContent = metaElement->content();
        // NOTE(review): the `false` argument is presumably the case-sensitivity
        // flag of String::contains - confirm.
        if (metaContent.length() && metaContent.contains("charset", false)) {
            param->skipMetaElement = element;
            *needSkip = true;
        }
        return prefix.toString();
    }

    // HTML <html>: write the doctype if it has not been written yet, followed
    // by the mark-of-the-web declaration for this frame's URL.
    if (element->hasTagName(HTMLNames::htmlTag)) {
        if (!param->hasDoctype) {
            param->hasDoctype = true;
            prefix.append(createMarkup(param->doc->doctype()));
        }
        prefix.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->currentFrameURL));
        return prefix.toString();
    }

    // HTML <base>: open a comment around the tag; the comment is closed by
    // postActionAfterSerializeEndTag().
    if (element->hasTagName(HTMLNames::baseTag))
        prefix.append("<!--");

    return prefix.toString();
}
// Produces the markup that has to follow an element's open tag.
//
// element - element whose open tag has just been assembled.
// param   - per-frame serialization state. hasAddedContentsBeforeEnd is reset
//           on every call and set again only when content is returned.
//
// Returns the extra content (possibly empty). Non-HTML documents never get
// extra content.
String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
    const Element* element, SerializeDomParam* param)
{
    StringBuilder appended;
    param->hasAddedContentsBeforeEnd = false;

    if (param->isHTMLDocument) {
        const bool isFirstHead = !param->hasCheckedMeta
            && element->hasTagName(HTMLNames::headTag);
        if (isFirstHead) {
            // Insert the charset declaration once, right after the first <head>.
            param->hasCheckedMeta = true;
            appended.append(WebPageSerializer::generateMetaCharsetDeclaration(
                String(param->textEncoding.name())));
            param->hasAddedContentsBeforeEnd = true;
        } else if (element->hasTagName(HTMLNames::scriptTag)
                   || element->hasTagName(HTMLNames::styleTag)) {
            // Track that we are inside <script>/<style>; the flag is cleared
            // again by preActionBeforeSerializeEndTag().
            param->isInScriptOrStyleTag = true;
        }
    }

    return appended.toString();
}
// Runs before an element's end tag is written.
//
// element  - element whose end tag is about to be written.
// param    - per-frame serialization state.
// needSkip - out parameter; reset to false and set to true when `element` is
//            the meta element recorded in param->skipMetaElement.
//
// Returns the text to emit in front of the end tag; this implementation never
// adds any, so the returned string is always default-constructed.
String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
    const Element* element, SerializeDomParam* param, bool* needSkip)
{
    String prefix;
    *needSkip = false;

    // Only HTML documents need any end-tag bookkeeping.
    if (!param->isHTMLDocument)
        return prefix;

    if (param->skipMetaElement == element) {
        // The open tag of this meta element was dropped; drop the end tag too.
        *needSkip = true;
        return prefix;
    }

    const bool leavesScriptOrStyle = element->hasTagName(HTMLNames::scriptTag)
        || element->hasTagName(HTMLNames::styleTag);
    if (leavesScriptOrStyle) {
        ASSERT(param->isInScriptOrStyleTag);
        param->isInScriptOrStyleTag = false;
    }

    return prefix;
}
// Produces the markup that has to follow an element's end tag.
//
// For an HTML <base> element this closes the comment opened by
// preActionBeforeSerializeOpenTag() and appends a generated base tag
// declaration built from the document's base target. Every other case yields
// no extra content.
String WebPageSerializerImpl::postActionAfterSerializeEndTag(
    const Element* element, SerializeDomParam* param)
{
    StringBuilder trailer;

    if (param->isHTMLDocument && element->hasTagName(HTMLNames::baseTag)) {
        trailer.append("-->");
        trailer.append(WebPageSerializer::generateBaseTagDeclaration(
            param->doc->baseTarget()));
    }

    return trailer.toString();
}
// Appends `result` to the pending data buffer and gives
// encodeAndFlushBuffer() a chance to flush it. The flush is not forced, so
// data only leaves the buffer once it has grown past dataBufferCapacity.
void WebPageSerializerImpl::saveHTMLContentToBuffer(
    const String& result, SerializeDomParam* param)
{
    m_dataBuffer.append(result);

    const bool forceFlush = false;
    encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
                         param,
                         forceFlush);
}
// Encodes the buffered markup with the frame's text encoding and hands it to
// the client.
//
// status - serialization status reported to the client with the data.
// param  - per-frame state; supplies the encoding and the frame URL.
// force  - when false, nothing happens unless the buffer has grown beyond
//          dataBufferCapacity; when true, the buffer is always flushed (even
//          if it is empty).
void WebPageSerializerImpl::encodeAndFlushBuffer(
    WebPageSerializerClient::PageSerializationStatus status,
    SerializeDomParam* param,
    bool force)
{
    const bool withinCapacity = m_dataBuffer.length() <= dataBufferCapacity;
    if (withinCapacity && !force)
        return;

    // Take the pending text out of the buffer before encoding it.
    String pendingText = m_dataBuffer.toString();
    m_dataBuffer.clear();

    CString encodedBytes = param->textEncoding.encode(pendingText.characters(),
                                                      pendingText.length(),
                                                      EntitiesForUnencodables);

    WebCString payload(encodedBytes.data(), encodedBytes.length());
    m_client->didSerializeDataForFrame(param->currentFrameURL, payload, status);
}
void WebPageSerializerImpl::openTagToString(const Element* element,
SerializeDomParam* param)
{
bool needSkip;
String result = preActionBeforeSerializeOpenTag(element, param, &needSkip);
if (needSkip)
return;
result += "<" + element->nodeName().lower();
const NamedNodeMap *attrMap = element->attributes(true);
if (attrMap) {
unsigned numAttrs = attrMap->length();
for (unsigned i = 0; i < numAttrs; i++) {
result += " ";
const Attribute *attribute = attrMap->attributeItem(i);
result += attribute->name().toString();
result += "=\"";
if (!attribute->value().isEmpty()) {
const String& attrValue = attribute->value();
const QualifiedName& attrName = attribute->name();
if (elementHasLegalLinkAttribute(element, attrName)) {
if (attrValue.startsWith("javascript:", false))
result += attrValue;
else {
String completeURL = param->doc->completeURL(attrValue);
if (m_localLinks.contains(completeURL)) {
if (!m_localDirectoryName.isEmpty())
result += "./" + m_localDirectoryName + "/";
result += m_localLinks.get(completeURL);
} else
result += completeURL;
}
} else {
if (param->isHTMLDocument)
result += m_htmlEntities.convertEntitiesInString(attrValue);
else
result += m_xmlEntities.convertEntitiesInString(attrValue);
}
}
result += "\"";
}
}
String addedContents = postActionAfterSerializeOpenTag(element, param);
if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd)
result += ">";
result += addedContents;
saveHTMLContentToBuffer(result, param);
}
void WebPageSerializerImpl::endTagToString(const Element* element,
SerializeDomParam* param)
{
bool needSkip;
String result = preActionBeforeSerializeEndTag(element,
param,
&needSkip);
if (needSkip)
return;
if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) {
result += "</";
result += element->nodeName().lower();
result += ">";
} else {
if (param->isHTMLDocument) {
result += ">";
const HTMLElement* htmlElement =
static_cast<const HTMLElement*>(element);
if (htmlElement->endTagRequirement() == TagStatusRequired) {
result += "</";
result += element->nodeName().lower();
result += ">";
}
} else {
result += " />";
}
}
result += postActionAfterSerializeEndTag(element, param);
saveHTMLContentToBuffer(result, param);
}
void WebPageSerializerImpl::buildContentForNode(const Node* node,
SerializeDomParam* param)
{
switch (node->nodeType()) {
case Node::ELEMENT_NODE:
openTagToString(static_cast<const Element*>(node), param);
for (const Node *child = node->firstChild(); child; child = child->nextSibling())
buildContentForNode(child, param);
endTagToString(static_cast<const Element*>(node), param);
break;
case Node::TEXT_NODE:
saveHTMLContentToBuffer(createMarkup(node), param);
break;
case Node::ATTRIBUTE_NODE:
case Node::DOCUMENT_NODE:
case Node::DOCUMENT_FRAGMENT_NODE:
ASSERT_NOT_REACHED();
break;
case Node::DOCUMENT_TYPE_NODE:
param->hasDoctype = true;
default:
saveHTMLContentToBuffer(createMarkup(node), param);
break;
}
}
// Creates a serializer for `frame`.
//
// frame                  - frame to serialize; must not be null.
// recursiveSerialization - whether frames nested inside `frame` are
//                          collected as well (see collectTargetFrames()).
// client                 - receiver of the serialized data; must not be null.
// links / localPaths     - parallel arrays: links[i] is mapped to
//                          localPaths[i] in m_localLinks. Both must have the
//                          same size.
// localDirectoryName     - directory name used as a prefix when links are
//                          rewritten to local paths.
WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
                                             bool recursiveSerialization,
                                             WebPageSerializerClient* client,
                                             const WebVector<WebURL>& links,
                                             const WebVector<WebString>& localPaths,
                                             const WebString& localDirectoryName)
    : m_client(client)
    , m_recursiveSerialization(recursiveSerialization)
    , m_framesCollected(false)
    , m_localDirectoryName(localDirectoryName)
    , m_htmlEntities(false)
    , m_xmlEntities(true)
{
    ASSERT(frame);
    ASSERT(client);
    ASSERT(links.size() == localPaths.size());

    m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);

    // Build the URL -> local path lookup table; duplicate URLs are not
    // expected.
    const size_t linkCount = links.size();
    for (size_t index = 0; index != linkCount; ++index) {
        KURL url = links[index];
        ASSERT(!m_localLinks.contains(url.string()));
        m_localLinks.set(url.string(), localPaths[index]);
    }

    ASSERT(!m_dataBuffer.length());
}
void WebPageSerializerImpl::collectTargetFrames()
{
ASSERT(!m_framesCollected);
m_framesCollected = true;
m_frames.append(m_specifiedWebFrameImpl);
if (!m_recursiveSerialization)
return;
for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
WebFrameImpl* currentFrame = m_frames[i];
Document* currentDoc = currentFrame->frame()->document();
RefPtr<HTMLAllCollection> all = currentDoc->all();
for (Node* node = all->firstItem(); node; node = all->nextItem()) {
if (!node->isHTMLElement())
continue;
Element* element = static_cast<Element*>(node);
WebFrameImpl* webFrame =
WebFrameImpl::fromFrameOwnerElement(element);
if (webFrame)
m_frames.append(webFrame);
}
}
}
bool WebPageSerializerImpl::serialize()
{
if (!m_framesCollected)
collectTargetFrames();
bool didSerialization = false;
KURL mainPageURL = m_specifiedWebFrameImpl->frame()->loader()->url();
for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
WebFrameImpl* currentFrame = m_frames[i];
Document* currentDoc = currentFrame->frame()->document();
const KURL& currentFrameURL = currentFrame->frame()->loader()->url();
if (m_localLinks.contains(currentFrameURL.string())) {
didSerialization = true;
String encoding = currentFrame->frame()->loader()->writer()->encoding();
TextEncoding textEncoding(encoding);
SerializeDomParam param(currentFrameURL,
encoding.length() ? textEncoding : UTF8Encoding(),
currentDoc,
currentFrameURL == mainPageURL ? m_localDirectoryName : "");
Element* rootElement = currentDoc->documentElement();
if (rootElement)
buildContentForNode(rootElement, ¶m);
encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished,
¶m,
1);
}
}
ASSERT(!m_dataBuffer.length());
m_client->didSerializeDataForFrame(KURL(),
WebCString("", 0),
WebPageSerializerClient::AllFramesAreFinished);
return didSerialization;
}
}