#include "regexp.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
using KJS::CString;
using KJS::RegExp;
using KJS::UString;
#ifdef HAVE_PCREPOSIX
// Encodes the 16-bit code units of s as UTF-8 and returns the bytes as a
// CString.
//
// Each code unit is encoded on its own: values below 0x80 become one byte,
// values below 0x800 become two bytes, and everything else becomes three.
// Surrogate halves are therefore written as separate three-byte sequences;
// they are not combined into a single four-byte sequence.
static CString convertToUTF8(const UString &s)
{
    const int unitCount = s.size();

    // Worst case is three bytes per code unit, plus one for the trailing NUL.
    const unsigned bytesNeeded = unitCount * 3 + 1;

    // Small results are built on the stack; larger ones use heap storage that
    // is released again before returning.
    char stackStorage[1024];
    char *const utf8 = bytesNeeded > sizeof(stackStorage) ? new char [bytesNeeded] : stackStorage;

    char *out = utf8;
    for (int index = 0; index != unitCount; ++index) {
        const unsigned short unit = s[index].unicode();
        if (unit >= 0x800) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            *out++ = static_cast<char>(0xE0 | (unit >> 12));
            *out++ = static_cast<char>(0x80 | ((unit >> 6) & 0x3F));
            *out++ = static_cast<char>(0x80 | (unit & 0x3F));
        } else if (unit >= 0x80) {
            // 110xxxxx 10xxxxxx
            *out++ = static_cast<char>(0xC0 | (unit >> 6));
            *out++ = static_cast<char>(0x80 | (unit & 0x3F));
        } else {
            // Plain 7-bit value: copied through unchanged.
            *out++ = static_cast<char>(unit);
        }
    }
    *out = 0;

    // The CString is built from the buffer before the buffer is released.
    CString result(utf8);
    if (utf8 != stackStorage) {
        delete [] utf8;
    }
    return result;
}
// One entry of the sorted work array built by createSortedOffsetsArray():
// an offset value paired with the index it was read from, so that a result
// can be written back to the right slot after sorting.
struct StringOffset {
// The offset value copied from the caller's array (the sort key).
int offset;
// Index of this value in the caller's original, unsorted offsets array.
int locationInOffsetsArray;
};
static int compareStringOffsets(const void *a, const void *b)
{
const StringOffset *oa = static_cast<const StringOffset *>(a);
const StringOffset *ob = static_cast<const StringOffset *>(b);
if (oa->offset < ob->offset) {
return -1;
}
if (oa->offset > ob->offset) {
return +1;
}
return 0;
}
// Number of entries callers reserve on the stack for the sorted work array.
const int sortedOffsetsFixedBufferSize = 128;

// Builds an array of (offset, original index) pairs from offsets[0..numOffsets)
// and sorts it by ascending offset.
//
// The caller-supplied fixed buffer is used when it is large enough; otherwise
// the array is allocated with new[]. The caller must compare the returned
// pointer with its fixed buffer and delete[] it when they differ.
static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
    StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
{
    StringOffset *const entries = numOffsets > sortedOffsetsFixedBufferSize
        ? new StringOffset [numOffsets]
        : sortedOffsetsFixedBuffer;

    // Remember where each value came from before the sort shuffles them.
    int index = 0;
    while (index != numOffsets) {
        StringOffset &entry = entries[index];
        entry.offset = offsets[index];
        entry.locationInOffsetsArray = index;
        ++index;
    }

    qsort(entries, numOffsets, sizeof(StringOffset), compareStringOffsets);
    return entries;
}
static void convertCharacterOffsetsToUTF8ByteOffsets(const char *s, int *offsets, int numOffsets)
{
StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
int characterOffset = 0;
const char *p = s;
for (int oi = 0; oi != numOffsets; ++oi) {
const int nextOffset = sortedOffsets[oi].offset;
while (*p && characterOffset < nextOffset) {
++characterOffset;
do ++p; while ((*p & 0xC0) == 0x80); }
offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s;
}
if (sortedOffsets != fixedBuffer) {
delete [] sortedOffsets;
}
}
// Rewrites each entry of offsets[0..numOffsets) from a byte offset within the
// NUL-terminated UTF-8 string s into the corresponding character offset.
//
// The offsets may be given in any order: a sorted copy is walked so that s is
// scanned only once. A byte whose top two bits are 10 is treated as a
// continuation byte and counted as part of the preceding character. Offsets
// beyond the end of s resolve to the number of characters in s.
//
// Negative entries are left exactly as they are. pcre_exec() reports a
// capturing group that did not take part in the match as -1; converting that
// marker would turn it into character offset 0, which is indistinguishable
// from a real match at the start of the string.
static void convertUTF8ByteOffsetsToCharacterOffsets(const char *s, int *offsets, int numOffsets)
{
    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);

    int characterOffset = 0;
    const char *p = s;
    for (int oi = 0; oi != numOffsets; ++oi) {
        const int nextOffset = sortedOffsets[oi].offset;

        // Not a position in s: keep the caller's value (see above).
        if (nextOffset < 0)
            continue;

        // Advance one whole character at a time until the requested byte
        // offset (or the end of the string) is reached.
        while (*p && (p - s) < nextOffset) {
            ++characterOffset;
            do ++p; while ((*p & 0xC0) == 0x80); // skip continuation bytes
        }

        // Store the answer in the slot the request originally came from.
        offsets[sortedOffsets[oi].locationInOffsetsArray] = characterOffset;
    }

    // Only a heap-allocated work array needs to be released.
    if (sortedOffsets != fixedBuffer) {
        delete [] sortedOffsets;
    }
}
#endif // HAVE_PCREPOSIX
// Compiles the pattern p with the given flags.
//
// PCRE build: the pattern is converted to UTF-8 and compiled with PCRE_UTF8;
// IgnoreCase and Multiline map to PCRE_CASELESS and PCRE_MULTILINE. When
// compilation fails, _regex is null (match() checks for this) and, in debug
// builds, the PCRE error message is printed to stderr. When
// PCRE_INFO_CAPTURECOUNT is available, the number of capturing groups is
// stored in _numSubPatterns.
//
// POSIX build: the pattern is compiled from p.ascii() with REG_EXTENDED and,
// for IgnoreCase, REG_ICASE (each only when the macro is defined). The
// Multiline flag is not consulted in this branch.
RegExp::RegExp(const UString &p, int flags)
  : _flags(flags), _numSubPatterns(0)
{
#ifdef HAVE_PCREPOSIX

    int options = PCRE_UTF8;
    if (flags & IgnoreCase)
        options |= PCRE_CASELESS;
    if (flags & Multiline)
        options |= PCRE_MULTILINE;

    const char *errorMessage;
    int errorOffset;
    _regex = pcre_compile(convertToUTF8(p).c_str(), options, &errorMessage, &errorOffset, NULL);
    if (!_regex) {
#ifndef NDEBUG
        fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
#endif
        return;
    }

#ifdef PCRE_INFO_CAPTURECOUNT
    // Get number of subpatterns that will be returned.
    pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
#endif

#else /* HAVE_PCREPOSIX */

    int regflags = 0;
#ifdef REG_EXTENDED
    regflags |= REG_EXTENDED;
#endif
#ifdef REG_ICASE
    // Test the constructor's `flags` argument; the previous code referred to
    // an identifier `f` that does not exist in this scope.
    if (flags & IgnoreCase)
        regflags |= REG_ICASE;
#endif

    // NOTE(review): the result of regcomp() is not checked, so a pattern that
    // fails to compile is not detected in this branch.
    regcomp(&_regex, p.ascii(), regflags);

#endif /* HAVE_PCREPOSIX */
}
// Releases the compiled expression held in _regex, using the release function
// of whichever regex library this build was compiled against.
RegExp::~RegExp()
{
#ifndef HAVE_PCREPOSIX
    // POSIX build: _regex is a regex_t owned by this object.
    regfree(&_regex);
#else
    // PCRE build: _regex is the pointer returned by pcre_compile().
    pcre_free(_regex);
#endif
}
// Searches s for a match of this expression starting at character index i
// (negative values are treated as 0).
//
// Parameters:
//   s       - the subject string.
//   i       - character index at which the search starts.
//   pos     - optional; receives the character index of the match, or -1 when
//             nothing matched.
//   ovector - optional; on success receives a new[]-allocated array of
//             (start, end) character-offset pairs: pair 0 is the whole match,
//             pair n is capturing group n. A group that did not participate
//             in the match is reported as -1. The caller owns the array and
//             must delete[] it. On failure *ovector is set to 0.
//
// Returns the matched substring, or UString::null() when s is null, i lies
// beyond the end of s, the expression is unusable, or nothing matched.
UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
{
    if (i < 0)
        i = 0;

    // Let the rest of the function write through pos unconditionally.
    int dummyPos;
    if (!pos)
        pos = &dummyPos;
    *pos = -1;
    if (ovector)
        *ovector = 0;

    if (i > s.size() || s.isNull())
        return UString::null();

#ifdef HAVE_PCREPOSIX

    // The constructor leaves _regex null when the pattern failed to compile.
    if (!_regex)
        return UString::null();

    // Set up the offset vector for the result: three ints per reported group,
    // which is the sizing the pcre_exec() documentation asks for. Without an
    // ovector request only the overall match is needed, so a small stack
    // array is enough.
    int *offsetVector;
    int offsetVectorSize;
    int fixedSizeOffsetVector[3];
    if (!ovector) {
        offsetVectorSize = 3;
        offsetVector = fixedSizeOffsetVector;
    } else {
        offsetVectorSize = (_numSubPatterns + 1) * 3;
        offsetVector = new int [offsetVectorSize];
    }

    // PCRE works on UTF-8 bytes, so both the subject and the start index have
    // to be translated before the call...
    const CString buffer(convertToUTF8(s));
    convertCharacterOffsetsToUTF8ByteOffsets(buffer.c_str(), &i, 1);
    const int numMatches = pcre_exec(_regex, NULL, buffer.c_str(), buffer.size(), i, 0, offsetVector, offsetVectorSize);

    if (numMatches < 0) {
#ifndef NDEBUG
        if (numMatches != PCRE_ERROR_NOMATCH)
            fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
#endif
        if (offsetVector != fixedSizeOffsetVector)
            delete [] offsetVector;
        return UString::null();
    }

    // ...and the reported byte offsets translated back afterwards. A return
    // value of 0 is handled as "only pair 0 is usable".
    convertUTF8ByteOffsetsToCharacterOffsets(buffer.c_str(), offsetVector, (numMatches == 0 ? 1 : numMatches) * 2);

    *pos = offsetVector[0];
    if (ovector)
        *ovector = offsetVector; // ownership passes to the caller
    return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);

#else

    const uint maxMatch = 10;
    regmatch_t rmatch[maxMatch];

    // regexec() runs on a heap copy of s.ascii(); bail out cleanly if the
    // copy could not be allocated instead of dereferencing a null pointer.
    char *str = strdup(s.ascii());
    if (!str)
        return UString::null();
    if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
        free(str);
        return UString::null();
    }
    free(str);

    // rmatch offsets are relative to str + i, hence the "+ i" below.
    if (!ovector) {
        *pos = rmatch[0].rm_so + i;
        return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
    }

    // Map the rmatch array to the ovector layout used in the PCRE case.
    // The group count comes from the compiled expression rather than from the
    // first rmatch entry that is unset: with a pattern such as (a)|(b), group
    // 1 can be unset while group 2 matched, and stopping at the first unset
    // entry would drop group 2. The count is capped by what rmatch can hold.
    uint groupCount = static_cast<uint>(_regex.re_nsub);
    if (groupCount > maxMatch - 1)
        groupCount = maxMatch - 1;
    _numSubPatterns = groupCount;

    int ovecsize = (groupCount + 1) * 3; // same sizing as the PCRE branch
    *ovector = new int[ovecsize];
    for (uint j = 0; j <= groupCount; j++) {
        if (rmatch[j].rm_so < 0) {
            // Group did not participate in the match.
            (*ovector)[2*j] = -1;
            (*ovector)[2*j+1] = -1;
        } else {
            (*ovector)[2*j] = rmatch[j].rm_so + i;
            (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
        }
    }

    *pos = (*ovector)[0];
    return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);

#endif
}