#include "EXTERN.h"
#define PERL_IN_UTF8_C
#include "perl.h"
static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
U8 *
Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
if (ckWARN(WARN_UTF8)) {
if (UNICODE_IS_SURROGATE(uv) &&
!(flags & UNICODE_ALLOW_SURROGATE))
Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
else if (
((uv >= 0xFDD0 && uv <= 0xFDEF &&
!(flags & UNICODE_ALLOW_FDD0))
||
((uv & 0xFFFE) == 0xFFFE &&
!(flags & UNICODE_ALLOW_FFFF))) &&
((uv <= PERL_UNICODE_MAX) ||
!(flags & UNICODE_ALLOW_SUPER))
)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
"Unicode character 0x%04"UVxf" is illegal", uv);
}
if (UNI_IS_INVARIANT(uv)) {
*d++ = (U8)UTF_TO_NATIVE(uv);
return d;
}
#if defined(EBCDIC)
else {
STRLEN len = UNISKIP(uv);
U8 *p = d+len-1;
while (p > d) {
*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
uv >>= UTF_ACCUMULATION_SHIFT;
}
*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
return d+len;
}
#else
if (uv < 0x800) {
*d++ = (U8)(( uv >> 6) | 0xc0);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x10000) {
*d++ = (U8)(( uv >> 12) | 0xe0);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x200000) {
*d++ = (U8)(( uv >> 18) | 0xf0);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x4000000) {
*d++ = (U8)(( uv >> 24) | 0xf8);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x80000000) {
*d++ = (U8)(( uv >> 30) | 0xfc);
*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
#ifdef HAS_QUAD
if (uv < UTF8_QUAD_MAX)
#endif
{
*d++ = 0xfe;
*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
#ifdef HAS_QUAD
{
*d++ = 0xff;
*d++ = 0x80;
*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);
*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
#endif
#endif
}
U8 *
Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
{
return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
}
STRLEN
Perl_is_utf8_char(pTHX_ U8 *s)
{
U8 u = *s;
STRLEN slen, len;
UV uv, ouv;
if (UTF8_IS_INVARIANT(u))
return 1;
if (!UTF8_IS_START(u))
return 0;
len = UTF8SKIP(s);
if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
return 0;
slen = len - 1;
s++;
u &= UTF_START_MASK(len);
uv = u;
ouv = uv;
while (slen--) {
if (!UTF8_IS_CONTINUATION(*s))
return 0;
uv = UTF8_ACCUMULATE(uv, *s);
if (uv < ouv)
return 0;
ouv = uv;
s++;
}
if ((STRLEN)UNISKIP(uv) < len)
return 0;
return len;
}
bool
Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
{
U8* x = s;
U8* send;
STRLEN c;
if (!len)
len = strlen((char *)s);
send = s + len;
while (x < send) {
if (UTF8_IS_INVARIANT(*x))
c = 1;
else if (!UTF8_IS_START(*x))
return FALSE;
else {
c = is_utf8_char(x);
if (!c)
return FALSE;
}
x += c;
}
if (x != send)
return FALSE;
return TRUE;
}
UV
Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
U8 *s0 = s;
UV uv = *s, ouv = 0;
STRLEN len = 1;
bool dowarn = ckWARN_d(WARN_UTF8);
UV startbyte = *s;
STRLEN expectlen = 0;
U32 warning = 0;
#define UTF8_WARN_EMPTY 1
#define UTF8_WARN_CONTINUATION 2
#define UTF8_WARN_NON_CONTINUATION 3
#define UTF8_WARN_FE_FF 4
#define UTF8_WARN_SHORT 5
#define UTF8_WARN_OVERFLOW 6
#define UTF8_WARN_SURROGATE 7
#define UTF8_WARN_LONG 8
#define UTF8_WARN_FFFF 9
if (curlen == 0 &&
!(flags & UTF8_ALLOW_EMPTY)) {
warning = UTF8_WARN_EMPTY;
goto malformed;
}
if (UTF8_IS_INVARIANT(uv)) {
if (retlen)
*retlen = 1;
return (UV) (NATIVE_TO_UTF(*s));
}
if (UTF8_IS_CONTINUATION(uv) &&
!(flags & UTF8_ALLOW_CONTINUATION)) {
warning = UTF8_WARN_CONTINUATION;
goto malformed;
}
if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
#ifdef EBCDIC
uv = NATIVE_TO_UTF(uv);
#else
if ((uv == 0xfe || uv == 0xff) &&
!(flags & UTF8_ALLOW_FE_FF)) {
warning = UTF8_WARN_FE_FF;
goto malformed;
}
#endif
if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
#ifdef EBCDIC
else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
else { len = 7; uv &= 0x01; }
#else
else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
else if (!(uv & 0x01)) { len = 7; uv = 0; }
else { len = 13; uv = 0; }
#endif
if (retlen)
*retlen = len;
expectlen = len;
if ((curlen < expectlen) &&
!(flags & UTF8_ALLOW_SHORT)) {
warning = UTF8_WARN_SHORT;
goto malformed;
}
len--;
s++;
ouv = uv;
while (len--) {
if (!UTF8_IS_CONTINUATION(*s) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
s--;
warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
else
uv = UTF8_ACCUMULATE(uv, *s);
if (!(uv > ouv)) {
if (uv == ouv) {
if (!(flags & UTF8_ALLOW_LONG)) {
warning = UTF8_WARN_LONG;
goto malformed;
}
}
else {
warning = UTF8_WARN_OVERFLOW;
goto malformed;
}
}
s++;
ouv = uv;
}
if (UNICODE_IS_SURROGATE(uv) &&
!(flags & UTF8_ALLOW_SURROGATE)) {
warning = UTF8_WARN_SURROGATE;
goto malformed;
} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
!(flags & UTF8_ALLOW_LONG)) {
warning = UTF8_WARN_LONG;
goto malformed;
} else if (UNICODE_IS_ILLEGAL(uv) &&
!(flags & UTF8_ALLOW_FFFF)) {
warning = UTF8_WARN_FFFF;
goto malformed;
}
return uv;
malformed:
if (flags & UTF8_CHECK_ONLY) {
if (retlen)
*retlen = -1;
return 0;
}
if (dowarn) {
SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
switch (warning) {
case 0: break;
case UTF8_WARN_EMPTY:
Perl_sv_catpvf(aTHX_ sv, "(empty string)");
break;
case UTF8_WARN_CONTINUATION:
Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
break;
case UTF8_WARN_NON_CONTINUATION:
if (s == s0)
Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
(UV)s[1], startbyte);
else
Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
break;
case UTF8_WARN_FE_FF:
Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
break;
case UTF8_WARN_SHORT:
Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
expectlen = curlen;
break;
case UTF8_WARN_OVERFLOW:
Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
ouv, *s, startbyte);
break;
case UTF8_WARN_SURROGATE:
Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
break;
case UTF8_WARN_LONG:
Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
break;
case UTF8_WARN_FFFF:
Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
break;
default:
Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
break;
}
if (warning) {
char *s = SvPVX(sv);
if (PL_op)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
"%s in %s", s, OP_DESC(PL_op));
else
Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
}
}
if (retlen)
*retlen = expectlen ? expectlen : len;
return 0;
}
UV
Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
{
return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
UV
Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
{
return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
STRLEN
Perl_utf8_length(pTHX_ U8 *s, U8 *e)
{
STRLEN len = 0;
if (e < s) {
if (ckWARN_d(WARN_UTF8)) {
if (PL_op)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
"%s in %s", unees, OP_DESC(PL_op));
else
Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
}
return 0;
}
while (s < e) {
U8 t = UTF8SKIP(s);
if (e - s < t) {
if (ckWARN_d(WARN_UTF8)) {
if (PL_op)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
unees, OP_DESC(PL_op));
else
Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
}
return len;
}
s += t;
len++;
}
return len;
}
IV
Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
{
IV off = 0;
if (a < b) {
while (a < b) {
U8 c = UTF8SKIP(a);
if (b - a < c) {
if (ckWARN_d(WARN_UTF8)) {
if (PL_op)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
"%s in %s", unees, OP_DESC(PL_op));
else
Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
}
return off;
}
a += c;
off--;
}
}
else {
while (b < a) {
U8 c = UTF8SKIP(b);
if (a - b < c) {
if (ckWARN_d(WARN_UTF8)) {
if (PL_op)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
"%s in %s", unees, OP_DESC(PL_op));
else
Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
}
return off;
}
b += c;
off++;
}
}
return off;
}
U8 *
Perl_utf8_hop(pTHX_ U8 *s, I32 off)
{
if (off >= 0) {
while (off--)
s += UTF8SKIP(s);
}
else {
while (off++) {
s--;
while (UTF8_IS_CONTINUATION(*s))
s--;
}
}
return s;
}
U8 *
Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
{
U8 *send;
U8 *d;
U8 *save = s;
for (send = s + *len; s < send; ) {
U8 c = *s++;
if (!UTF8_IS_INVARIANT(c) &&
(!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
|| !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
*len = -1;
return 0;
}
}
d = s = save;
while (s < send) {
STRLEN ulen;
*d++ = (U8)utf8_to_uvchr(s, &ulen);
s += ulen;
}
*d = '\0';
*len = d - save;
return save;
}
U8 *
Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
{
U8 *d;
U8 *start = s;
U8 *send;
I32 count = 0;
if (!*is_utf8)
return start;
for (send = s + *len; s < send;) {
U8 c = *s++;
if (!UTF8_IS_INVARIANT(c)) {
if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
(c = *s++) && UTF8_IS_CONTINUATION(c))
count++;
else
return start;
}
}
*is_utf8 = 0;
Newz(801, d, (*len) - count + 1, U8);
s = start; start = d;
while (s < send) {
U8 c = *s++;
if (!UTF8_IS_INVARIANT(c)) {
c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
c = ASCII_TO_NATIVE(c);
}
*d++ = c;
}
*d = '\0';
*len = d - start;
return start;
}
U8*
Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
{
U8 *send;
U8 *d;
U8 *dst;
send = s + (*len);
Newz(801, d, (*len) * 2 + 1, U8);
dst = d;
while (s < send) {
UV uv = NATIVE_TO_ASCII(*s++);
if (UNI_IS_INVARIANT(uv))
*d++ = (U8)UTF_TO_NATIVE(uv);
else {
*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
}
}
*d = '\0';
*len = d-dst;
return dst;
}
U8*
Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
U8* pend;
U8* dstart = d;
if (bytelen & 1)
Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
pend = p + bytelen;
while (p < pend) {
UV uv = (p[0] << 8) + p[1];
p += 2;
if (uv < 0x80) {
*d++ = (U8)uv;
continue;
}
if (uv < 0x800) {
*d++ = (U8)(( uv >> 6) | 0xc0);
*d++ = (U8)(( uv & 0x3f) | 0x80);
continue;
}
if (uv >= 0xd800 && uv < 0xdbff) {
UV low = *p++;
if (low < 0xdc00 || low >= 0xdfff)
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
}
if (uv < 0x10000) {
*d++ = (U8)(( uv >> 12) | 0xe0);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
continue;
}
else {
*d++ = (U8)(( uv >> 18) | 0xf0);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
continue;
}
}
*newlen = d - dstart;
return d;
}
U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
U8* s = (U8*)p;
U8* send = s + bytelen;
while (s < send) {
U8 tmp = s[0];
s[0] = s[1];
s[1] = tmp;
s += 2;
}
return utf16_to_utf8(p, d, bytelen, newlen);
}
bool
Perl_is_uni_alnum(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_alnum(tmpbuf);
}
bool
Perl_is_uni_alnumc(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_alnumc(tmpbuf);
}
bool
Perl_is_uni_idfirst(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_idfirst(tmpbuf);
}
bool
Perl_is_uni_alpha(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_alpha(tmpbuf);
}
bool
Perl_is_uni_ascii(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_ascii(tmpbuf);
}
bool
Perl_is_uni_space(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_space(tmpbuf);
}
bool
Perl_is_uni_digit(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_digit(tmpbuf);
}
bool
Perl_is_uni_upper(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_upper(tmpbuf);
}
bool
Perl_is_uni_lower(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_lower(tmpbuf);
}
bool
Perl_is_uni_cntrl(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_cntrl(tmpbuf);
}
bool
Perl_is_uni_graph(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_graph(tmpbuf);
}
bool
Perl_is_uni_print(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_print(tmpbuf);
}
bool
Perl_is_uni_punct(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_punct(tmpbuf);
}
bool
Perl_is_uni_xdigit(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_xdigit(tmpbuf);
}
UV
Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
{
uvchr_to_utf8(p, c);
return to_utf8_upper(p, p, lenp);
}
UV
Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
{
uvchr_to_utf8(p, c);
return to_utf8_title(p, p, lenp);
}
UV
Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
{
uvchr_to_utf8(p, c);
return to_utf8_lower(p, p, lenp);
}
UV
Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
{
uvchr_to_utf8(p, c);
return to_utf8_fold(p, p, lenp);
}
bool
Perl_is_uni_alnum_lc(pTHX_ UV c)
{
return is_uni_alnum(c);
}
bool
Perl_is_uni_alnumc_lc(pTHX_ UV c)
{
return is_uni_alnumc(c);
}
bool
Perl_is_uni_idfirst_lc(pTHX_ UV c)
{
return is_uni_idfirst(c);
}
bool
Perl_is_uni_alpha_lc(pTHX_ UV c)
{
return is_uni_alpha(c);
}
bool
Perl_is_uni_ascii_lc(pTHX_ UV c)
{
return is_uni_ascii(c);
}
bool
Perl_is_uni_space_lc(pTHX_ UV c)
{
return is_uni_space(c);
}
bool
Perl_is_uni_digit_lc(pTHX_ UV c)
{
return is_uni_digit(c);
}
bool
Perl_is_uni_upper_lc(pTHX_ UV c)
{
return is_uni_upper(c);
}
bool
Perl_is_uni_lower_lc(pTHX_ UV c)
{
return is_uni_lower(c);
}
bool
Perl_is_uni_cntrl_lc(pTHX_ UV c)
{
return is_uni_cntrl(c);
}
bool
Perl_is_uni_graph_lc(pTHX_ UV c)
{
return is_uni_graph(c);
}
bool
Perl_is_uni_print_lc(pTHX_ UV c)
{
return is_uni_print(c);
}
bool
Perl_is_uni_punct_lc(pTHX_ UV c)
{
return is_uni_punct(c);
}
bool
Perl_is_uni_xdigit_lc(pTHX_ UV c)
{
return is_uni_xdigit(c);
}
U32
Perl_to_uni_upper_lc(pTHX_ U32 c)
{
STRLEN len;
U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
return (U32)to_uni_upper(c, tmpbuf, &len);
}
U32
Perl_to_uni_title_lc(pTHX_ U32 c)
{
STRLEN len;
U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
return (U32)to_uni_title(c, tmpbuf, &len);
}
U32
Perl_to_uni_lower_lc(pTHX_ U32 c)
{
STRLEN len;
U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
return (U32)to_uni_lower(c, tmpbuf, &len);
}
bool
Perl_is_utf8_alnum(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
#ifdef SURPRISINGLY_SLOWER
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "",
sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
#endif
}
bool
Perl_is_utf8_alnumc(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
#ifdef SURPRISINGLY_SLOWER
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "",
sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
#endif
}
bool
Perl_is_utf8_idfirst(pTHX_ U8 *p)
{
if (*p == '_')
return TRUE;
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_idstart)
PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
}
bool
Perl_is_utf8_idcont(pTHX_ U8 *p)
{
if (*p == '_')
return TRUE;
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_idcont)
PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
}
bool
Perl_is_utf8_alpha(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_alpha)
PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
}
bool
Perl_is_utf8_ascii(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_ascii)
PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
}
bool
Perl_is_utf8_space(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_space)
PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_space, p, TRUE) != 0;
}
bool
Perl_is_utf8_digit(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_digit)
PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
}
bool
Perl_is_utf8_upper(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_upper)
PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
}
bool
Perl_is_utf8_lower(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_lower)
PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
}
bool
Perl_is_utf8_cntrl(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_cntrl)
PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
}
bool
Perl_is_utf8_graph(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_graph)
PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
}
bool
Perl_is_utf8_print(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_print)
PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_print, p, TRUE) != 0;
}
bool
Perl_is_utf8_punct(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_punct)
PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
}
bool
Perl_is_utf8_xdigit(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_xdigit)
PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
}
bool
Perl_is_utf8_mark(pTHX_ U8 *p)
{
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_mark)
PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
}
UV
Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
{
UV uv0, uv1;
U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
STRLEN len = 0;
uv0 = utf8_to_uvchr(p, 0);
uv1 = NATIVE_TO_UNI(uv0);
uvuni_to_utf8(tmpbuf, uv1);
if (!*swashp)
*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
if (special) {
HV *hv;
SV *keysv;
HE *he;
SV *val;
if ((hv = get_hv(special, FALSE)) &&
(keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
(he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
(val = HeVAL(he))) {
char *s;
s = SvPV(val, len);
if (len == 1)
len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
else {
#ifdef EBCDIC
U8 *t = (U8*)s, *tend = t + len, *d;
d = tmpbuf;
if (SvUTF8(val)) {
STRLEN tlen = 0;
while (t < tend) {
UV c = utf8_to_uvchr(t, &tlen);
if (tlen > 0) {
d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
t += tlen;
}
else
break;
}
}
else {
while (t < tend) {
d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
t++;
}
}
len = d - tmpbuf;
Copy(tmpbuf, ustrp, len, U8);
#else
Copy(s, ustrp, len, U8);
#endif
}
}
}
if (!len && *swashp) {
UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
if (uv2) {
UV uv3 = UNI_TO_NATIVE(uv2);
len = uvchr_to_utf8(ustrp, uv3) - ustrp;
}
}
if (!len)
len = uvchr_to_utf8(ustrp, uv0) - ustrp;
if (lenp)
*lenp = len;
return len ? utf8_to_uvchr(ustrp, 0) : 0;
}
UV
Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
{
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
}
UV
Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
{
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
}
UV
Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
{
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
}
UV
Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
{
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
}
SV*
Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
{
SV* retval;
SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
dSP;
HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
SV* errsv_save;
if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {
ENTER;
errsv_save = newSVsv(ERRSV);
Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
if (!SvTRUE(ERRSV))
sv_setsv(ERRSV, errsv_save);
SvREFCNT_dec(errsv_save);
LEAVE;
}
SPAGAIN;
PUSHSTACKi(PERLSI_MAGIC);
PUSHMARK(SP);
EXTEND(SP,5);
PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
PUSHs(listsv);
PUSHs(sv_2mortal(newSViv(minbits)));
PUSHs(sv_2mortal(newSViv(none)));
PUTBACK;
ENTER;
SAVEI32(PL_hints);
PL_hints = 0;
save_re_context();
if (PL_curcop == &PL_compiling) {
SAVEI32(PL_in_my);
PL_in_my = 0;
sv_setpv(tokenbufsv, PL_tokenbuf);
}
errsv_save = newSVsv(ERRSV);
if (call_method("SWASHNEW", G_SCALAR))
retval = newSVsv(*PL_stack_sp--);
else
retval = &PL_sv_undef;
if (!SvTRUE(ERRSV))
sv_setsv(ERRSV, errsv_save);
SvREFCNT_dec(errsv_save);
LEAVE;
POPSTACK;
if (PL_curcop == &PL_compiling) {
STRLEN len;
char* pv = SvPV(tokenbufsv, len);
Copy(pv, PL_tokenbuf, len+1, char);
PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
}
if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
if (SvPOK(retval))
Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
retval);
Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
}
return retval;
}
UV
Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
{
HV* hv = (HV*)SvRV(sv);
U32 klen;
U32 off;
STRLEN slen;
STRLEN needents;
U8 *tmps = NULL;
U32 bit;
SV *retval;
U8 tmputf8[2];
UV c = NATIVE_TO_ASCII(*ptr);
if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
ptr = tmputf8;
}
klen = UTF8SKIP(ptr) - 1;
off = ptr[klen];
if (klen == 0)
{
needents = UTF_CONTINUATION_MARK;
off = NATIVE_TO_UTF(ptr[klen]);
}
else
{
needents = (1 << UTF_ACCUMULATION_SHIFT);
off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
}
if (hv == PL_last_swash_hv &&
klen == PL_last_swash_klen &&
(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
{
tmps = PL_last_swash_tmps;
slen = PL_last_swash_slen;
}
else {
SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
dSP;
UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
SV *errsv_save;
ENTER;
SAVETMPS;
save_re_context();
PUSHSTACKi(PERLSI_MAGIC);
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
PUSHs(sv_2mortal(newSViv((klen) ?
(code_point & ~(needents - 1)) : 0)));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
errsv_save = newSVsv(ERRSV);
if (call_method("SWASHGET", G_SCALAR))
retval = newSVsv(*PL_stack_sp--);
else
retval = &PL_sv_undef;
if (!SvTRUE(ERRSV))
sv_setsv(ERRSV, errsv_save);
SvREFCNT_dec(errsv_save);
POPSTACK;
FREETMPS;
LEAVE;
if (PL_curcop == &PL_compiling)
PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
svp = hv_store(hv, (char*)ptr, klen, retval, 0);
if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
}
PL_last_swash_hv = hv;
PL_last_swash_klen = klen;
PL_last_swash_tmps = tmps;
PL_last_swash_slen = slen;
if (klen)
Copy(ptr, PL_last_swash_key, klen, U8);
}
switch ((int)((slen << 3) / needents)) {
case 1:
bit = 1 << (off & 7);
off >>= 3;
return (tmps[off] & bit) != 0;
case 8:
return tmps[off];
case 16:
off <<= 1;
return (tmps[off] << 8) + tmps[off + 1] ;
case 32:
off <<= 2;
return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
}
Perl_croak(aTHX_ "panic: swash_fetch");
return 0;
}
#undef Perl_uvchr_to_utf8
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
}
U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
}
#undef Perl_utf8n_to_uvchr
UV
Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
return UNI_TO_NATIVE(uv);
}
char *
Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
{
int truncated = 0;
char *s, *e;
sv_setpvn(dsv, "", 0);
for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
UV u;
bool ok = FALSE;
if (pvlim && SvCUR(dsv) >= pvlim) {
truncated++;
break;
}
u = utf8_to_uvchr((U8*)s, 0);
if (u < 256) {
if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
switch (u & 0xFF) {
case '\n':
Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
case '\r':
Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
case '\t':
Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
case '\f':
Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
case '\a':
Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
case '\\':
Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
default: break;
}
}
if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
ok = TRUE;
}
}
if (!ok)
Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
}
if (truncated)
sv_catpvn(dsv, "...", 3);
return SvPVX(dsv);
}
char *
Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
{
return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
pvlim, flags);
}
I32
Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
{
register U8 *p1 = (U8*)s1;
register U8 *p2 = (U8*)s2;
register U8 *e1 = 0, *f1 = 0, *q1 = 0;
register U8 *e2 = 0, *f2 = 0, *q2 = 0;
STRLEN n1 = 0, n2 = 0;
U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
U8 natbuf[1+1];
STRLEN foldlen1, foldlen2;
bool match;
if (pe1)
e1 = *(U8**)pe1;
if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
f1 = (U8*)s1 + l1;
if (pe2)
e2 = *(U8**)pe2;
if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
f2 = (U8*)s2 + l2;
if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
return 1;
if (!u1 || !u2)
natbuf[1] = 0;
while ((e1 == 0 || p1 < e1) &&
(f1 == 0 || p1 < f1) &&
(e2 == 0 || p2 < e2) &&
(f2 == 0 || p2 < f2)) {
if (n1 == 0) {
if (u1)
to_utf8_fold(p1, foldbuf1, &foldlen1);
else {
natbuf[0] = *p1;
to_utf8_fold(natbuf, foldbuf1, &foldlen1);
}
q1 = foldbuf1;
n1 = foldlen1;
}
if (n2 == 0) {
if (u2)
to_utf8_fold(p2, foldbuf2, &foldlen2);
else {
natbuf[0] = *p2;
to_utf8_fold(natbuf, foldbuf2, &foldlen2);
}
q2 = foldbuf2;
n2 = foldlen2;
}
while (n1 && n2) {
if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
(UTF8SKIP(q1) == 1 && *q1 != *q2) ||
memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
return 1;
n1 -= UTF8SKIP(q1);
q1 += UTF8SKIP(q1);
n2 -= UTF8SKIP(q2);
q2 += UTF8SKIP(q2);
}
if (n1 == 0)
p1 += u1 ? UTF8SKIP(p1) : 1;
if (n2 == 0)
p2 += u2 ? UTF8SKIP(p2) : 1;
}
match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
if (match) {
if (pe1)
*pe1 = (char*)p1;
if (pe2)
*pe2 = (char*)p2;
}
return match ? 0 : 1;
}