RSSFilterStream.cpp [plain text]
#include "config.h"
#include "RSSFilterStream.h"
#include "RSS10Parser.h"
#include "RSS20Parser.h"
#include "RSSAtomParser.h"
#include "RSSGenerator.h"
#include "TextCodecICU.h"
#include <BlackBerryPlatformAssert.h>
#include <BlackBerryPlatformLog.h>
#include <ScopePointer.h>
#include <network/NetworkRequest.h>
#include <wtf/ASCIICType.h>
#include <wtf/OwnPtr.h>
#include <wtf/PassOwnPtr.h>
#include <wtf/text/CString.h>
#include <wtf/text/WTFString.h>
using BlackBerry::Platform::LogLevelCritical;
using BlackBerry::Platform::LogLevelInfo;
using BlackBerry::Platform::LogLevelWarn;
using BlackBerry::Platform::NetworkRequest;
namespace WebCore {
// Encoding names used in this file: UTF-8 is the target of every transcode and
// the encoding handed to the parsers; the other two are per-language fallbacks
// returned by defaultEncodingForLanguage().
static const char* const s_utf8EncodingName = "UTF-8";
static const char* const s_latin1EncodingName = "ISO-8859-1";
static const char* const s_gbkEncodingName = "GBK";
// Response header names that are inspected or rewritten by RSSFilterStream.
static const char* const s_contentEncodingHeaderKey = "Content-Encoding";
static const char* const s_contentLengthHeaderKey = "Content-Length";
static const char* const s_contentTypeHeaderKey = "Content-Type";
// Reports whether the low eight bits of |ch| are an ASCII whitespace character.
// The mask keeps a sign-extended (negative) char value inside 0..255 before it
// reaches isASCIISpace().
static int isASCIISpaceLowerByte(int ch)
{
    const int lowByte = ch & 0xff;
    return isASCIISpace<int>(lowByte);
}
// Removes leading and trailing ASCII whitespace from |str| in place and returns
// a reference to the same string so calls can be chained.
static std::string& stripWhiteSpace(std::string& str)
{
    // Leading run.
    std::string::size_type leading = 0;
    while (leading < str.size() && isASCIISpaceLowerByte(str[leading]))
        ++leading;
    str.erase(0, leading);

    // Trailing run (the string may be empty by now).
    std::string::size_type keep = str.size();
    while (keep > 0 && isASCIISpaceLowerByte(str[keep - 1]))
        --keep;
    str.erase(keep);

    return str;
}
// Case-insensitive equality between a std::string and a C string.
static inline bool equalIgnoringCase(const std::string& first, const char* second)
{
    // strcasecmp() yields 0 when both strings match ignoring case.
    const int comparison = strcasecmp(first.c_str(), second);
    return comparison == 0;
}
// True when |mimeType| is exactly the Atom media type, ignoring case.
static inline bool isAtomMIMEType(const std::string& mimeType)
{
    const char* const atomMIMEType = "application/atom+xml";
    return equalIgnoringCase(mimeType, atomMIMEType);
}
// True when |mimeType| is exactly the RSS media type, ignoring case.
static inline bool isRSSMIMEType(const std::string& mimeType)
{
    const char* const rssMIMEType = "application/rss+xml";
    return equalIgnoringCase(mimeType, rssMIMEType);
}
// True when |mimeType| starts with one of the generic XML media types
// ("text/xml" or "application/xml"), which may or may not carry a feed.
// Only the prefix is checked, so trailing parameters such as "; charset=..."
// are accepted. Media type names are case-insensitive, so the comparison
// ignores case, matching isAtomMIMEType() and isRSSMIMEType().
static inline bool isPotentialRSSMIMEType(const std::string& mimeType)
{
    const char* const value = mimeType.c_str();
    return !strncasecmp(value, "text/xml", 8) || !strncasecmp(value, "application/xml", 15);
}
// True for the three resource types that identify a feed (RSS 1.0, RSS 2.0, Atom).
static inline bool isRSSContent(RSSFilterStream::ResourceType type)
{
    switch (type) {
    case RSSFilterStream::TypeRSS10:
    case RSSFilterStream::TypeRSS20:
    case RSSFilterStream::TypeRSSAtom:
        return true;
    default:
        return false;
    }
}
// Determines the feed type announced by a Content-Type header value.
//
// The value is split on ';'. An Atom media type returns TypeRSSAtom at once.
// An RSS media type returns TypeRSS10 when a "version" parameter equals "1.0"
// and TypeRSS20 otherwise. Anything else (including an empty value) returns
// TypeUnknown.
//
// Each component and each parameter name/value is stripped of surrounding
// whitespace before it is compared. Parameters are normally written as
// "type/subtype; name=value", and without the trimming " version=1.0" is never
// recognised as the version parameter.
static RSSFilterStream::ResourceType RSSTypeFromContentType(const std::string& contentType)
{
    if (contentType.empty())
        return RSSFilterStream::TypeUnknown;

    std::string mimeType;
    std::string version;
    bool isRSS = false;
    // |delimiter| becomes npos on the last component, which ends the loop after
    // that component has been processed.
    for (size_t start = 0, delimiter = 0; delimiter != std::string::npos; start = delimiter + 1) {
        delimiter = contentType.find(';', start);
        std::string component = contentType.substr(start, delimiter == std::string::npos ? delimiter : delimiter - start);
        stripWhiteSpace(component);

        if (mimeType.empty()) {
            if (isAtomMIMEType(component))
                return RSSFilterStream::TypeRSSAtom;
            if (isRSSMIMEType(component)) {
                mimeType = component;
                isRSS = true;
                continue;
            }
        }

        size_t equalsign = component.find('=');
        if (equalsign != std::string::npos) {
            std::string parameterName = component.substr(0, equalsign);
            if (equalIgnoringCase(stripWhiteSpace(parameterName), "version")) {
                version = component.substr(equalsign + 1);
                stripWhiteSpace(version);
            }
        }

        // Nothing more to learn once both the media type and version are known.
        if (!mimeType.empty() && !version.empty())
            break;
    }

    if (isRSS) {
        if (!version.compare("1.0"))
            return RSSFilterStream::TypeRSS10;
        return RSSFilterStream::TypeRSS20;
    }
    return RSSFilterStream::TypeUnknown;
}
// True when at least |length| bytes are available in [pos, end) and they match
// |tag| ignoring case.
static bool matchesTagName(const char* pos, const char* end, const char* tag, int length)
{
    return end - pos >= length && !strncasecmp(pos, tag, length);
}

// Bounded substring search: returns the first occurrence of the |length|-byte
// |needle| that lies completely inside [pos, end), or 0 when there is none.
static const char* findInBuffer(const char* pos, const char* end, const char* needle, int length)
{
    for (; end - pos >= length; ++pos) {
        if (!memcmp(pos, needle, length))
            return pos;
    }
    return 0;
}

// Sniffs the feed type from the first element of an XML document.
//
// |str| points at |strLen| bytes of data. The buffer is treated as
// length-delimited: every search, comparison and log preview is bounded by
// |strLen|, so nothing is read past the end even if the data is not
// NUL-terminated.
//
// XML declarations ("<?xml ... ?>") and comments ("<!-- ... -->") are skipped.
// The first other tag decides the result: "rss" -> TypeRSS20,
// "feed" -> TypeRSSAtom, "rdf:RDF" -> TypeRSS10, each only when the name is
// followed by whitespace. Everything else yields TypeUnknown.
static RSSFilterStream::ResourceType RSSTypeFromContent(const char* str, int strLen)
{
    if (!str || strLen <= 0) {
        BBLOG(LogLevelInfo, "RSSTypeFromContent(): can not find any start tag");
        return RSSFilterStream::TypeUnknown;
    }

    const char* const end = str + strLen;
    const char* pos = str;
    for (;;) {
        pos = pos < end ? static_cast<const char*>(memchr(pos, '<', end - pos)) : 0;
        if (!pos) {
            BBLOG(LogLevelInfo, "RSSTypeFromContent(): can not find any start tag");
            return RSSFilterStream::TypeUnknown;
        }
        ++pos;

        // Log previews show at most 32 bytes and never run past the buffer.
        const int previewLength = end - pos < 32 ? static_cast<int>(end - pos) : 32;

        if (matchesTagName(pos, end, "rss", 3)) {
            if (end - pos > 3 && isSpaceOrNewline(*(pos + 3))) {
                BBLOG(LogLevelInfo, "RSSTypeFromContent(): found RSS 2.0 tag");
                return RSSFilterStream::TypeRSS20;
            }
            BBLOG(LogLevelInfo, "RSSTypeFromContent(): wrong RSS 2.0 tag: %.*s", previewLength, pos);
            return RSSFilterStream::TypeUnknown;
        }
        if (matchesTagName(pos, end, "feed", 4)) {
            if (end - pos > 4 && isSpaceOrNewline(*(pos + 4))) {
                BBLOG(LogLevelInfo, "RSSTypeFromContent(): found Atom 1.0 tag");
                return RSSFilterStream::TypeRSSAtom;
            }
            BBLOG(LogLevelInfo, "RSSTypeFromContent(): wrong Atom 1.0 tag: %.*s", previewLength, pos);
            return RSSFilterStream::TypeUnknown;
        }
        if (matchesTagName(pos, end, "rdf:RDF", 7)) {
            if (end - pos > 7 && isSpaceOrNewline(*(pos + 7))) {
                BBLOG(LogLevelInfo, "RSSTypeFromContent(): found RSS 1.0 tag");
                return RSSFilterStream::TypeRSS10;
            }
            BBLOG(LogLevelInfo, "RSSTypeFromContent(): wrong RSS 1.0 tag: %.*s", previewLength, pos);
            return RSSFilterStream::TypeUnknown;
        }
        if (matchesTagName(pos, end, "?xml", 4)) {
            BBLOG(LogLevelInfo, "RSSTypeFromContent(): found another xml tag");
            pos = findInBuffer(pos + 4, end, "?>", 2);
            if (!pos) {
                BBLOG(LogLevelInfo, "RSSTypeFromContent(): extra xml tag too long");
                return RSSFilterStream::TypeUnknown;
            }
            pos += 2;
            continue;
        }
        if (matchesTagName(pos, end, "!--", 3)) {
            BBLOG(LogLevelInfo, "RSSTypeFromContent(): found comments");
            pos = findInBuffer(pos + 3, end, "-->", 3);
            if (!pos) {
                BBLOG(LogLevelInfo, "RSSTypeFromContent(): comments too long");
                return RSSFilterStream::TypeUnknown;
            }
            pos += 3;
            continue;
        }

        BBLOG(LogLevelInfo, "RSSTypeFromContent(): non rss type (%.*s) found", previewLength, pos);
        return RSSFilterStream::TypeUnknown;
    }
}
// Creates the parser that matches a feed |type|. Callers are expected to pass a
// feed type (asserted); any other value falls back to the RSS 2.0 parser after
// tripping ASSERT_NOT_REACHED.
static PassOwnPtr<RSSParserBase> createParser(RSSFilterStream::ResourceType type)
{
    BLACKBERRY_ASSERT(isRSSContent(type));

    if (type == RSSFilterStream::TypeRSSAtom)
        return adoptPtr(new RSSAtomParser());
    if (type == RSSFilterStream::TypeRSS10)
        return adoptPtr(new RSS10Parser());
    if (type != RSSFilterStream::TypeRSS20)
        ASSERT_NOT_REACHED();
    return adoptPtr(new RSS20Parser());
}
// Locates the value of the encoding pseudo-attribute in the first XML
// declaration ("<?xml ... ?>") of the NUL-terminated string |str|.
//
// On success returns true with [encodingStart, encodingEnd) delimiting the
// value between its quotes. On failure returns false; the output parameters may
// have been modified and must not be used.
//
// Bytes are compared as unsigned char when skipping whitespace/control
// characters, so bytes >= 0x80 are not mistaken for whitespace on platforms
// where plain char is signed.
static bool findXMLEncodingPosition(const char* str, const char*& encodingStart, const char*& encodingEnd)
{
    encodingStart = strstr(str, "<?xml ");
    if (!encodingStart)
        return false;
    encodingStart += 6;

    const char* endPos = strstr(encodingStart, "?>");
    if (!endPos)
        return false;

    // The attribute has to live inside the declaration.
    encodingStart = strstr(encodingStart, "encoding");
    if (!encodingStart || encodingStart > endPos)
        return false;
    encodingStart += 8;

    // Optional whitespace, '=', optional whitespace.
    while (encodingStart < endPos && static_cast<unsigned char>(*encodingStart) <= ' ')
        ++encodingStart;
    if (encodingStart == endPos || *encodingStart != '=')
        return false;
    ++encodingStart;
    while (encodingStart < endPos && static_cast<unsigned char>(*encodingStart) <= ' ')
        ++encodingStart;
    if (encodingStart == endPos)
        return false;

    // The value must be quoted; the closing quote must match the opening one
    // and the value must not be empty.
    const char quote = *encodingStart;
    if (quote != '\"' && quote != '\'')
        return false;
    ++encodingStart;
    encodingEnd = strchr(encodingStart, quote);
    if (!encodingEnd || encodingEnd == encodingStart || encodingEnd > endPos)
        return false;
    return true;
}
// Locates the text of the first "<language>...</language>" element in the
// NUL-terminated string |str|.
//
// On success returns true with [langStart, langEnd) delimiting the element text
// with leading and trailing whitespace/control characters removed. Returns
// false when the element is missing, unterminated or blank; the output
// parameters may have been modified in that case.
//
// Bytes are compared as unsigned char while trimming, so non-ASCII bytes
// (>= 0x80) are kept as part of the value on platforms where plain char is
// signed.
static bool findXMLLanguagePosition(const char* str, const char*& langStart, const char*& langEnd)
{
    langStart = strstr(str, "<language>");
    if (!langStart)
        return false;
    langStart += 10;

    const char* endPos = strstr(langStart, "</language>");
    if (!endPos)
        return false;

    // Trim the front.
    while (langStart < endPos && static_cast<unsigned char>(*langStart) <= ' ')
        ++langStart;
    if (langStart == endPos)
        return false;

    // Trim the back; |langEnd| ends up one past the last kept character.
    langEnd = endPos - 1;
    while (langEnd > langStart && static_cast<unsigned char>(*langEnd) <= ' ')
        --langEnd;
    ++langEnd;
    return true;
}
// Maps a feed language tag to the encoding assumed when the feed declares none.
// Matching ignores case. Returns 0 for languages without a default.
static const char* defaultEncodingForLanguage(const char* language)
{
    const bool isEnglish = !strcasecmp(language, "en") || !strcasecmp(language, "en-US");
    if (isEnglish)
        return s_latin1EncodingName;

    const bool isChineseCN = !strcasecmp(language, "zh-cn");
    return isChineseCN ? s_gbkEncodingName : 0;
}
static bool isTranscodingNeeded(const std::string& encoding)
{
if (encoding.empty())
return true;
return TextEncoding(s_utf8EncodingName) != TextEncoding(encoding.c_str());
}
// Outcome of transcode().
enum TranscodeResult {
// The converted bytes were written to the target buffer.
Success,
// The source encoding name did not yield a valid TextEncoding.
SourceEncodingUnsupported,
// The decoder reported an error while decoding the source bytes.
SourceBroken,
// The target encoding name did not yield a valid TextEncoding.
TargetEncodingUnsupported,
// The encoded output is longer than the target buffer.
TargetBufferInsufficient
};
// Converts |sourceLength| bytes at |sourceStart| from |sourceEncoding| to
// |targetEncoding|, writing at most |targetLength| bytes at |targetStart|.
//
// On Success |targetStart| is advanced past the bytes written; no terminating
// NUL is appended and |sourceStart| is left unchanged. On any other result
// nothing is written.
static TranscodeResult transcode(const char* sourceEncoding,
    const char* targetEncoding,
    const char*& sourceStart,
    int sourceLength,
    char*& targetStart,
    unsigned targetLength)
{
    // Validate both encodings before doing any work.
    TextEncoding fromEncoding(sourceEncoding);
    if (!fromEncoding.isValid())
        return SourceEncodingUnsupported;

    TextEncoding toEncoding(targetEncoding);
    if (!toEncoding.isValid())
        return TargetEncodingUnsupported;

    // Decode to an intermediate String first.
    bool decodeFailed = false;
    String decoded = TextCodecICU(fromEncoding).decode(sourceStart, sourceLength, true, true, decodeFailed);
    if (decodeFailed)
        return SourceBroken;

    // Then encode to the target encoding.
    CString encoded = TextCodecICU(toEncoding).encode(decoded.characters(), decoded.length(), WebCore::EntitiesForUnencodables);
    const size_t encodedLength = encoded.length();
    if (encodedLength > targetLength)
        return TargetBufferInsufficient;

    strncpy(targetStart, encoded.data(), encodedLength);
    targetStart += encodedLength;
    return Success;
}
// Produces a UTF-8 version of the XML document |content|, which is labelled
// with |encoding|, and stores it in |result|.
//
// The XML declaration is rewritten (or synthesised when missing) so that it
// declares UTF-8. Everything after the declaration is transcoded. If
// transcoding fails for any reason the body is copied with every
// non-printable-ASCII byte replaced by '?'.
//
// Returns false only when an XML declaration is opened but never closed.
static bool transcodeContent(const std::string& content, const std::string& encoding, std::string& result)
{
    // One byte of a legacy single-byte encoding can become up to three UTF-8
    // bytes, so reserve three output bytes per input byte. A smaller buffer
    // makes transcode() report TargetBufferInsufficient for such content.
    unsigned utf8length = content.size() * 3;
    ScopeArray<char> buffer(new char[utf8length + 1]);

    std::string xmlHeaderString;
    const char* start = content.c_str();
    const char* xmlStart = strstr(start, "<?xml ");
    const char* xmlEnd;
    if (xmlStart) {
        xmlEnd = strstr(xmlStart, "?>");
        if (!xmlEnd)
            return false;
        xmlEnd += 2;

        const char* encodingStart;
        const char* encodingEnd;
        if (findXMLEncodingPosition(start, encodingStart, encodingEnd)) {
            // Replace the declared encoding name, keeping the rest of the declaration.
            xmlHeaderString.assign(start, encodingStart - start);
            xmlHeaderString.append(s_utf8EncodingName);
            xmlHeaderString.append(encodingEnd, xmlEnd - encodingEnd);
        } else {
            // No encoding attribute: insert one just before the closing "?>".
            xmlHeaderString.assign(start, (xmlEnd - 2) - start);
            xmlHeaderString.append(" encoding=\"UTF-8\"?>");
        }
    } else {
        xmlEnd = start;
        xmlHeaderString = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
    }

    const char* contentStart = xmlEnd;
    char* bufferStart = &buffer[0];
    // The body runs from |xmlEnd| to the end of |content|. Measure it from
    // |start|: |xmlStart| is null when there is no declaration and need not be
    // at offset 0 when there is one.
    const int bodyLength = static_cast<int>(content.size() - (xmlEnd - start));
    TranscodeResult transcodeResult = transcode(encoding.c_str(), s_utf8EncodingName, contentStart, bodyLength, bufferStart, utf8length);
    if (Success != transcodeResult) {
        // Fallback: keep printable ASCII, mask everything else.
        int i = 0;
        for (; contentStart < start + content.size(); ++contentStart, ++i) {
            if (isASCIIPrintable(*contentStart))
                buffer[i] = *contentStart;
            else
                buffer[i] = '?';
        }
        buffer[i] = 0;
    } else
        *bufferStart = 0; // transcode() does not NUL-terminate its output.

    result.assign(xmlHeaderString);
    result.append(&buffer[0]);
    return true;
}
// Starts with an undetermined resource type, no Content-Type header located
// (index -1), and neither the charset() nor the encoding() result cached yet.
RSSFilterStream::RSSFilterStream()
: FilterStream()
, m_resourceType(TypeUnknown)
, m_contentTypeIndex(-1)
, m_charsetChecked(false)
, m_encodingChecked(false)
{
}
// Forwards the status downstream. A negative status, or a status in the
// 300..399 range other than 304, marks the resource as not being a feed and
// releases any headers that were being held back.
void RSSFilterStream::notifyStatusReceived(int status, const BlackBerry::Platform::String& message)
{
    const bool isNegativeStatus = status < 0;
    const bool is3xxOtherThan304 = status >= 300 && status < 400 && status != 304;
    if (isNegativeStatus || is3xxOtherThan304) {
        m_resourceType = TypeNotRSS;
        sendSavedHeaders();
    }

    FilterStream::notifyStatusReceived(status, message);
}
// Inspects the response headers to decide whether the body may be a feed.
//
// Only the first Content-Type header is examined. If its value contains "xml",
// its position is remembered in m_contentTypeIndex and the value is classified.
// Headers of a possible feed are held back with saveHeaders(); otherwise they
// are forwarded downstream right away.
void RSSFilterStream::notifyHeadersReceived(const NetworkRequest::HeaderList& headers)
{
    if (!isRSSContent(m_resourceType)) {
        const NetworkRequest::HeaderList::const_iterator first = headers.begin();
        const NetworkRequest::HeaderList::const_iterator last = headers.end();
        for (NetworkRequest::HeaderList::const_iterator header = first; header != last; ++header) {
            if (!equalIgnoringCase(header->first.toStdString(), s_contentTypeHeaderKey))
                continue;

            if (header->second.find("xml") != std::string::npos) {
                m_contentTypeIndex = std::distance(first, header);
                const std::string contentType = header->second.toStdString();
                ResourceType type = RSSTypeFromContentType(contentType);
                if (isRSSContent(type))
                    m_resourceType = type;
                else if (!isPotentialRSSMIMEType(contentType))
                    m_resourceType = TypeNotRSS;
            }
            break;
        }
    }

    // Without a recorded Content-Type header the resource is not treated as a feed.
    if (m_contentTypeIndex < 0)
        m_resourceType = TypeNotRSS;

    if (m_resourceType != TypeNotRSS) {
        saveHeaders(headers);
        return;
    }

    sendSavedHeaders();
    FilterStream::notifyHeadersReceived(headers);
}
// Handles a chunk of body data. While the type is still undetermined the chunk
// is sniffed; if that is inconclusive the resource is treated as not a feed.
// Feed data is accumulated for later conversion, anything else is passed
// through after the held-back headers.
void RSSFilterStream::notifyDataReceived(BlackBerry::Platform::NetworkBuffer* buffer)
{
    if (m_resourceType == TypeUnknown) {
        m_resourceType = RSSTypeFromContent(BlackBerry::Platform::networkBufferData(buffer), BlackBerry::Platform::networkBufferDataLength(buffer));
        if (m_resourceType == TypeUnknown)
            m_resourceType = TypeNotRSS;
    }

    if (!isRSSContent(m_resourceType)) {
        sendSavedHeaders();
        FilterStream::notifyDataReceived(buffer);
        return;
    }

    appendData(buffer);
}
// Handles the end of the stream. Accumulated feed content is converted and
// delivered by handleRSSContent(); otherwise the held-back headers and the
// close notification are forwarded unchanged.
void RSSFilterStream::notifyClose(int status)
{
    // A stream that ends before its type was determined is not a feed.
    if (m_resourceType == TypeUnknown)
        m_resourceType = TypeNotRSS;

    if (!isRSSContent(m_resourceType)) {
        sendSavedHeaders();
        FilterStream::notifyClose(status);
        return;
    }

    handleRSSContent();
}
// Parses the accumulated feed and renders it as HTML into |result|.
// The content is transcoded to UTF-8 first when its encoding requires it.
// Returns false, leaving |result| untouched, if transcoding or parsing fails.
bool RSSFilterStream::convertContentToHtml(std::string& result)
{
    const std::string& base = url().toStdString();
    const std::string& enc = encoding();
    OwnPtr<RSSParserBase> parser = createParser(m_resourceType);

    bool parsed = false;
    if (!isTranscodingNeeded(enc))
        parsed = parser->parseBuffer(m_content.data(), m_content.length(), base.c_str(), s_utf8EncodingName);
    else {
        std::string transcodedContent;
        if (transcodeContent(m_content, enc, transcodedContent))
            parsed = parser->parseBuffer(transcodedContent.data(), transcodedContent.length(), base.c_str(), s_utf8EncodingName);
    }
    if (!parsed)
        return false;

    OwnPtr<RSSGenerator> generator = adoptPtr(new RSSGenerator());
    String html = generator->generateHtml(parser->m_root);
    // The generated markup is copied out through characters8(), so it must be 8-bit.
    ASSERT(html.is8Bit());
    result.assign(reinterpret_cast<const char*>(html.characters8()), html.length());
    return true;
}
// Delivers the accumulated feed downstream. When conversion succeeds the
// generated HTML is sent with rewritten headers; otherwise the original content
// is sent with the original headers. The stream is then closed with
// StatusSuccess.
void RSSFilterStream::handleRSSContent()
{
    BLACKBERRY_ASSERT(isRSSContent(m_resourceType));

    std::string html;
    const bool converted = convertContentToHtml(html);
    if (converted)
        updateRSSHeaders(html.size());

    const std::string& payload = converted ? html : m_content;
    BlackBerry::Platform::NetworkBuffer* buffer = BlackBerry::Platform::createNetworkBufferByCopyingData(payload.c_str(), payload.size());

    sendSavedHeaders();
    FilterStream::notifyDataReceived(buffer);
    BlackBerry::Platform::releaseNetworkBuffer(buffer);
    FilterStream::notifyClose(FilterStream::StatusSuccess);
}
// Returns the charset named by the saved Content-Type header, or an empty
// string when the header has no "charset=" parameter. The result is computed
// once and cached.
//
// The value is cut at the next ';' so trailing parameters are not included, is
// stripped of surrounding whitespace, and loses a matching pair of enclosing
// quotes, so that `charset="utf-8"; foo=bar` yields `utf-8`.
const std::string& RSSFilterStream::charset()
{
    BLACKBERRY_ASSERT(m_contentTypeIndex >= 0);
    if (m_charsetChecked)
        return m_charset;
    m_charsetChecked = true;

    // Builds without assertions must not index m_headers with -1 when no
    // Content-Type header was recorded; report "no charset" instead.
    if (m_contentTypeIndex < 0)
        return m_charset;

    static const char* charsetPrefix = "charset=";
    static const int charsetPrefixLen = strlen(charsetPrefix);
    size_t pos = (m_headers.at(m_contentTypeIndex).second).find(charsetPrefix);
    if (pos != std::string::npos) {
        m_charset = m_headers.at(m_contentTypeIndex).second.substr(pos + charsetPrefixLen).toStdString();

        // Keep only this parameter's value.
        const std::string::size_type parameterEnd = m_charset.find(';');
        if (parameterEnd != std::string::npos)
            m_charset.erase(parameterEnd);
        stripWhiteSpace(m_charset);

        // Unwrap a quoted value.
        const std::string::size_type length = m_charset.size();
        if (length >= 2 && (m_charset[0] == '"' || m_charset[0] == '\'') && m_charset[length - 1] == m_charset[0]) {
            m_charset = m_charset.substr(1, length - 2);
            stripWhiteSpace(m_charset);
        }
    }
    return m_charset;
}
const std::string& RSSFilterStream::encoding()
{
if (m_encodingChecked)
return m_encoding;
m_encodingChecked = true;
const std::string& str = charset();
if (!str.empty()) {
m_encoding = str;
return m_encoding;
}
const char* start = m_content.c_str();
const char* encodingStart = 0;
const char* encodingEnd = 0;
if (findXMLEncodingPosition(start, encodingStart, encodingEnd)) {
m_encoding = std::string(encodingStart, encodingEnd - encodingStart);
return m_encoding;
}
if (findXMLLanguagePosition(start, encodingStart, encodingEnd)) {
std::string languageString(encodingStart, encodingEnd - encodingStart);
const char* defaultEncoding = defaultEncodingForLanguage(languageString.c_str());
if (defaultEncoding) {
m_encoding = defaultEncoding;
return m_encoding;
}
}
return m_encoding;
}
// Keeps a copy of |headers| in m_headers, replacing any previously saved list,
// so they can be forwarded later by sendSavedHeaders().
void RSSFilterStream::saveHeaders(const BlackBerry::Platform::NetworkRequest::HeaderList& headers)
{
m_headers = headers;
}
void RSSFilterStream::removeHeader(const char* key)
{
NetworkRequest::HeaderList::iterator end = m_headers.end();
for (NetworkRequest::HeaderList::iterator iter = m_headers.begin(); iter != end; ++iter) {
if (!strcasecmp(iter->first.c_str(), key)) {
m_headers.erase(iter);
return;
}
}
}
// Sets the first saved header named |key| (ignoring case) to |value|, or
// appends a new header at the end of the list when none matches.
void RSSFilterStream::updateHeader(const char* key, const BlackBerry::Platform::String& value)
{
    NetworkRequest::HeaderList::iterator header = m_headers.begin();
    const NetworkRequest::HeaderList::iterator last = m_headers.end();
    while (header != last && strcasecmp(header->first.c_str(), key))
        ++header;

    if (header != last) {
        header->second = value;
        return;
    }

    m_headers.insert(last, std::make_pair(BlackBerry::Platform::String::fromAscii(key), value));
}
void RSSFilterStream::updateRSSHeaders(size_t size)
{
STATIC_LOCAL_STRING(s_htmlContentType, "text/html; charset=utf-8");
removeHeader(s_contentEncodingHeaderKey);
updateHeader(s_contentTypeHeaderKey, s_htmlContentType);
updateHeader(s_contentLengthHeaderKey, BlackBerry::Platform::String::number(size));
}
// Forwards the held-back headers downstream, then forgets them together with
// the recorded Content-Type position. Does nothing when no headers are saved,
// so repeated calls are harmless.
void RSSFilterStream::sendSavedHeaders()
{
    if (m_headers.empty())
        return;

    FilterStream::notifyHeadersReceived(m_headers);
    m_contentTypeIndex = -1;
    m_headers.clear();
}
// Appends the bytes held by |buffer| to the accumulated feed content.
void RSSFilterStream::appendData(BlackBerry::Platform::NetworkBuffer* buffer)
{
    const char* data = networkBufferData(buffer);
    const size_t length = networkBufferDataLength(buffer);
    m_content.append(data, length);
}
}