#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "po-lex.h"
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#if HAVE_ICONV
# include <iconv.h>
#endif
#include "c-ctype.h"
#include "linebreak.h"
#include "vasprintf.h"
#include "gettext.h"
#include "po-charset.h"
#include "xalloc.h"
#include "exit.h"
#include "error.h"
#include "error-progname.h"
#include "pos.h"
#include "str-list.h"
#include "po-gram-gen2.h"
#define _(str) gettext(str)
#if HAVE_ICONV
# include "utf8-ucs4.h"
#endif
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif
/* Current position of the lexer in the input file.  lex_start sets the
   file name and line number 1; lex_getc increments the line number at
   each newline it reads.  */
lex_pos_ty gram_pos;
/* Current column within the line, counted from 0.  lex_getc advances it
   by mb_width of each character read and resets it to 0 at a newline;
   po_gram_error prints it as gram_pos_column + 1.  */
int gram_pos_column;
#if !(__STDC__ && \
((defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L && !defined __DECC) \
|| (defined __GNUC__ && __GNUC__ >= 2 && !defined __APPLE_CC__)))
/* Report a grammar error at the current lexer position.

   FMT and the following arguments are formatted printf-style.  The message
   is printed through error() with a "FILE:LINE:COLUMN: " prefix taken from
   gram_pos and gram_pos_column (the column is printed one-based), while
   error_with_progname is temporarily switched off.

   If FMT starts with '.', error_message_count is decremented again after
   printing.  Otherwise, once error_message_count has reached
   gram_max_allowed_errors, the program terminates with EXIT_FAILURE.  */
void
po_gram_error (const char *fmt, ...)
{
  va_list args;
  char *message;
  int formatted;

  va_start (args, fmt);
  formatted = vasprintf (&message, fmt, args);
  if (formatted < 0)
    error (EXIT_FAILURE, 0, _("memory exhausted"));
  va_end (args);

  /* Print without the program name prefix.  */
  error_with_progname = false;
  error (0, 0, "%s:%lu:%d: %s", gram_pos.file_name,
         (unsigned long) gram_pos.line_number, gram_pos_column + 1, message);
  error_with_progname = true;
  free (message);

  if (fmt[0] == '.')
    {
      --error_message_count;
      return;
    }
  if (error_message_count >= gram_max_allowed_errors)
    error (EXIT_FAILURE, 0, _("too many errors, aborting"));
}
/* Report a grammar error at the position PP.

   FMT and the following arguments are formatted printf-style.  The message
   is printed through error_at_line() using PP->file_name and
   PP->line_number, while error_with_progname is temporarily switched off.

   If FMT starts with '.', error_message_count is decremented again after
   printing.  Otherwise, once error_message_count has reached
   gram_max_allowed_errors, the program terminates with EXIT_FAILURE.  */
void
po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
{
  va_list args;
  char *message;
  int formatted;

  va_start (args, fmt);
  formatted = vasprintf (&message, fmt, args);
  if (formatted < 0)
    error (EXIT_FAILURE, 0, _("memory exhausted"));
  va_end (args);

  /* Print without the program name prefix.  */
  error_with_progname = false;
  error_at_line (0, 0, pp->file_name, pp->line_number, "%s", message);
  error_with_progname = true;
  free (message);

  if (fmt[0] == '.')
    {
      --error_message_count;
      return;
    }
  if (error_message_count >= gram_max_allowed_errors)
    error (EXIT_FAILURE, 0, _("too many errors, aborting"));
}
#endif
/* Capacity, in bytes, of the byte buffer of one multibyte character.
   mbfile_getc also uses it as the limit after which an incomplete
   multibyte sequence is given up on.  */
#define MBCHAR_BUF_SIZE 24
/* One (possibly multibyte) character as read from the input.  */
struct mbchar
{
/* Number of bytes stored in buf; 0 denotes end of file (see mb_iseof).  */
size_t bytes;
#if HAVE_ICONV
/* True if uc holds the character's code, as decoded by mbfile_getc.  */
bool uc_valid;
/* Character code; only meaningful when uc_valid is true.  */
unsigned int uc;
#endif
/* The raw bytes of the character, buf[0..bytes-1].  */
char buf[MBCHAR_BUF_SIZE];
};
/* Array-of-one typedef, so that an mbchar_t variable is passed to
   functions by address without an explicit '&'.  */
typedef struct mbchar mbchar_t[1];
/* Copy N bytes from SRC to DST, one byte at a time in ascending order.
   Does nothing when N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;
  size_t i;

  for (i = 0; i < n; i++)
    out[i] = in[i];
}
/* Return true if MBC is the end-of-file marker, i.e. holds zero bytes.  */
static inline bool
mb_iseof (const mbchar_t mbc)
{
  const size_t len = mbc->bytes;

  return len == 0;
}
/* Return a pointer to the first byte of MBC's byte buffer.  */
static inline const char *
mb_ptr (const mbchar_t mbc)
{
  return &mbc->buf[0];
}
/* Return the number of bytes MBC occupies (0 for end of file).  */
static inline size_t
mb_len (const mbchar_t mbc)
{
  const size_t len = mbc->bytes;

  return len;
}
/* Return true if MBC is exactly the single byte SC.
   The comparison by character code is compiled out ("&& 0"); only the
   byte representation is compared.  */
static inline bool
mb_iseq (const mbchar_t mbc, char sc)
{
#if HAVE_ICONV && 0
  if (mbc->uc_valid)
    return (mbc->uc == sc);
#endif
  if (mbc->bytes != 1)
    return false;
  return mbc->buf[0] == sc;
}
/* Return true if MBC is the NUL character: character code 0 when the code
   is known, otherwise a single zero byte.  */
static inline bool
mb_isnul (const mbchar_t mbc)
{
#if HAVE_ICONV
  if (mbc->uc_valid)
    return mbc->uc == 0;
#endif
  if (mbc->bytes != 1)
    return false;
  return mbc->buf[0] == 0;
}
/* Three-way comparison of MBC1 and MBC2.
   When both character codes are known, the result is their difference.
   Otherwise the byte sequences are compared: equal lengths use memcmp
   directly; for different lengths the common prefix is compared and a tie
   makes the shorter sequence the smaller one.  */
static inline int
mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
{
  size_t len1;
  size_t len2;

#if HAVE_ICONV
  if (mbc1->uc_valid && mbc2->uc_valid)
    return (int) mbc1->uc - (int) mbc2->uc;
#endif

  len1 = mbc1->bytes;
  len2 = mbc2->bytes;

  if (len1 == len2)
    return memcmp (mbc1->buf, mbc2->buf, len1);

  if (len1 < len2)
    /* MBC1 is shorter: it wins only if its prefix is strictly greater.  */
    return memcmp (mbc1->buf, mbc2->buf, len1) > 0 ? 1 : -1;

  /* MBC2 is shorter: MBC1 is greater unless its prefix is smaller.  */
  return memcmp (mbc1->buf, mbc2->buf, len2) >= 0 ? 1 : -1;
}
/* Return true if MBC1 and MBC2 denote the same character: equal character
   codes when both are known, otherwise identical byte sequences.  */
static inline bool
mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
{
#if HAVE_ICONV
  if (mbc1->uc_valid && mbc2->uc_valid)
    return mbc1->uc == mbc2->uc;
#endif
  if (mbc1->bytes != mbc2->bytes)
    return false;
  return memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0;
}
/* Return true if MBC is an ASCII character: a character code in the range
   0x00..0x7F when the code is known, otherwise a single byte in that
   range.  */
static inline bool
mb_isascii (const mbchar_t mbc)
{
#if HAVE_ICONV
  if (mbc->uc_valid)
    /* uc is unsigned, so only the upper bound needs checking.  */
    return mbc->uc <= 0x007F;
#endif
  if (mbc->bytes != 1)
    return false;
  /* Viewing the byte as unsigned covers both the "not negative" and the
     "not above 0x7F" requirement, whatever the signedness of char.  */
  return (unsigned char) mbc->buf[0] <= 0x7F;
}
/* Width assumed for characters whose width cannot be determined.  */
#define MB_UNPRINTABLE_WIDTH 1

/* Return the number of screen columns MBC occupies.
   A tab advances to the next multiple of 8 relative to gram_pos_column;
   other control characters have width 0.  When the character code is
   known, uc_width is asked first and the rules above only apply when it
   returns a negative value.  Everything else has width
   MB_UNPRINTABLE_WIDTH.  */
static int
mb_width (const mbchar_t mbc)
{
#if HAVE_ICONV
  if (mbc->uc_valid)
    {
      const unsigned int code = mbc->uc;
      const char *encoding = "";
      int width;

      if (po_lex_iconv != (iconv_t)(-1))
        encoding = po_lex_charset;

      width = uc_width (code, encoding);
      if (width >= 0)
        return width;

      /* Tab: distance to the next tab stop.  */
      if (code == 0x0009)
        return 8 - (gram_pos_column & 7);

      /* Control characters and line/paragraph separators.  */
      if (code <= 0x001F
          || (code >= 0x007F && code <= 0x009F)
          || (code >= 0x2028 && code <= 0x2029))
        return 0;

      return MB_UNPRINTABLE_WIDTH;
    }
#endif

  if (mbc->bytes == 1)
    {
      const char byte = mbc->buf[0];

      /* Tab: distance to the next tab stop.  */
      if (byte == 0x09)
        return 8 - (gram_pos_column & 7);

      /* Remaining ASCII control characters, including DEL.  */
      if ((byte >= 0x00 && byte <= 0x1F) || byte == 0x7F)
        return 0;
    }
  return MB_UNPRINTABLE_WIDTH;
}
/* Write the bytes of MBC to STREAM.  The result of fwrite is not
   checked.  */
static inline void
mb_putc (const mbchar_t mbc, FILE *stream)
{
  const char *bytes = mbc->buf;
  const size_t count = mbc->bytes;

  fwrite (bytes, 1, count, stream);
}
/* Make MBC the one-byte character SC; when character codes are tracked,
   the code is set to SC as well and marked valid.  */
static inline void
mb_setascii (mbchar_t mbc, char sc)
{
  mbc->buf[0] = sc;
  mbc->bytes = 1;
#if HAVE_ICONV
  mbc->uc = sc;
  mbc->uc_valid = true;
#endif
}
/* Copy the character OLD into NEW.  Only the bytes in use are copied, and
   the character code is copied only when it is valid.  */
static inline void
mb_copy (mbchar_t new, const mbchar_t old)
{
  const size_t len = old->bytes;

  memcpy_small (new->buf, old->buf, len);
  new->bytes = len;
#if HAVE_ICONV
  new->uc_valid = old->uc_valid;
  if (new->uc_valid)
    new->uc = old->uc;
#endif
}
/* Maximum number of characters that can be pushed back onto an mbfile;
   mbfile_ungetc aborts when it is exceeded.  */
#define NPUSHBACK 2
/* A stdio stream read one multibyte character at a time.  */
struct mbfile
{
/* The underlying stream.  */
FILE *fp;
/* Set once getc has returned EOF; mbfile_getc does not call getc again
   afterwards.  */
bool eof_seen;
/* Number of valid entries in pushback[].  */
int have_pushback;
/* Number of bytes in buf[] that were read from fp but not yet returned
   as part of a character.  */
unsigned int bufcount;
/* Bytes read ahead from fp, buf[0..bufcount-1].  */
char buf[MBCHAR_BUF_SIZE];
/* Stack of pushed-back characters; the most recently pushed one is
   returned first.  */
struct mbchar pushback[NPUSHBACK];
};
/* Array-of-one typedef, so that an mbfile_t variable is passed to
   functions by address without an explicit '&'.  */
typedef struct mbfile mbfile_t[1];
/* When true, mbfile_getc reports invalid or incomplete multibyte sequences
   through po_gram_error.  po_gram_lex clears it while reading a comment.  */
static bool signal_eilseq;
/* Prepare MBF for reading from STREAM: no end of file seen, nothing pushed
   back, no bytes read ahead.  */
static inline void
mbfile_init (mbfile_t mbf, FILE *stream)
{
  mbf->bufcount = 0;
  mbf->have_pushback = 0;
  mbf->eof_seen = false;
  mbf->fp = stream;
}
/* Read the next character from MBF and store it in MBC.
   At end of file, or when reading fails, MBC is set to the end-of-file
   marker (zero bytes).  Depending on po_lex_iconv the character is
   delimited with iconv, by the po_lex_weird_cjk two-byte rule, or taken to
   be a single byte.  */
static void
mbfile_getc (mbchar_t mbc, mbfile_t mbf)
{
size_t bytes;
/* Once EOF has been seen, never call getc on the stream again.  */
if (mbf->eof_seen)
goto eof;
/* Pushed-back characters are returned first, last pushed first.  */
if (mbf->have_pushback > 0)
{
mbf->have_pushback--;
mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
return;
}
/* Make sure there is at least one byte in the read-ahead buffer.  */
if (mbf->bufcount == 0)
{
int c = getc (mbf->fp);
if (c == EOF)
{
mbf->eof_seen = true;
goto eof;
}
mbf->buf[0] = (unsigned char) c;
mbf->bufcount++;
}
#if HAVE_ICONV
if (po_lex_iconv != (iconv_t)(-1))
{
/* Run iconv on the buffered bytes, reading one more byte from the stream
   each time iconv reports an incomplete sequence, until one character
   has been converted or the sequence is given up on.  */
for (;;)
{
unsigned char scratchbuf[64];
const char *inptr = &mbf->buf[0];
size_t insize = mbf->bufcount;
char *outptr = (char *) &scratchbuf[0];
size_t outsize = sizeof (scratchbuf);
size_t res = iconv (po_lex_iconv,
(ICONV_CONST char **) &inptr, &insize,
&outptr, &outsize);
/* Input must have been consumed if and only if output was produced.  */
if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
abort ();
if (outsize == sizeof (scratchbuf))
{
/* Nothing was produced, so iconv must have reported an error.  */
if (res != (size_t)(-1))
abort ();
if (errno == EILSEQ)
{
/* Invalid sequence: return its first byte as a character of its own.  */
if (signal_eilseq)
po_gram_error (_("invalid multibyte sequence"));
bytes = 1;
mbc->uc_valid = false;
break;
}
else if (errno == EINVAL)
{
/* Incomplete sequence: more input bytes are needed.  */
int c;
if (mbf->bufcount == MBCHAR_BUF_SIZE)
{
/* The buffer is full and the sequence is still incomplete: give up and
   return a single byte.  */
bytes = 1;
mbc->uc_valid = false;
break;
}
c = getc (mbf->fp);
if (c == EOF)
{
mbf->eof_seen = true;
if (ferror (mbf->fp))
goto eof;
if (signal_eilseq)
po_gram_error (_("\
incomplete multibyte sequence at end of file"));
/* Return all remaining buffered bytes as one character.  */
bytes = mbf->bufcount;
mbc->uc_valid = false;
break;
}
mbf->buf[mbf->bufcount++] = (unsigned char) c;
if (c == '\n')
{
if (signal_eilseq)
po_gram_error (_("\
incomplete multibyte sequence at end of line"));
/* Return the bytes before the newline; the newline itself stays in the
   buffer for the next call.  */
bytes = mbf->bufcount - 1;
mbc->uc_valid = false;
break;
}
}
else
error (EXIT_FAILURE, errno, _("iconv failure"));
}
else
{
/* One character was converted; 'bytes' is the number of input bytes it
   occupied.  */
size_t outbytes = sizeof (scratchbuf) - outsize;
bytes = mbf->bufcount - insize;
if (bytes == 0)
abort ();
if (outbytes == 0)
abort ();
/* Decode the iconv output into a character code with u8_mbtouc
   (presumably po_lex_iconv converts to UTF-8 -- confirm where it is
   opened).  A result shorter than outbytes is treated as invalid.  */
if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes)
{
if (signal_eilseq)
po_gram_error (_("invalid multibyte sequence"));
mbc->uc_valid = false;
break;
}
mbc->uc_valid = true;
break;
}
}
}
else
#endif
{
if (po_lex_weird_cjk
&& (unsigned char) mbf->buf[0] >= 0x80)
{
/* A byte >= 0x80 may start a two-byte character: look at the next
   byte.  */
if (mbf->bufcount == 1)
{
int c = getc (mbf->fp);
if (c == EOF)
{
/* Only a read error ends input here; a plain EOF leaves the single
   buffered byte to be returned below.  */
if (ferror (mbf->fp))
{
mbf->eof_seen = true;
goto eof;
}
}
else
{
mbf->buf[1] = (unsigned char) c;
mbf->bufcount++;
}
}
/* A second byte >= 0x30 belongs to the same character.  */
if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
bytes = 2;
else
bytes = 1;
}
else
{
bytes = 1;
}
#if HAVE_ICONV
mbc->uc_valid = false;
#endif
}
/* Hand out buf[0..bytes-1] as the character.  */
memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
mbc->bytes = bytes;
mbf->bufcount -= bytes;
/* Move any remaining read-ahead bytes to the front of the buffer.  */
if (mbf->bufcount > 0)
{
unsigned int count = mbf->bufcount;
char *p = &mbf->buf[0];
do
{
*p = *(p + bytes);
p++;
}
while (--count > 0);
}
return;
eof:
/* Return the end-of-file marker.  */
mbc->bytes = 0;
#if HAVE_ICONV
mbc->uc_valid = false;
#endif
return;
}
/* Push MBC back onto MBF so that the next mbfile_getc returns it.
   Aborts if NPUSHBACK characters are already pushed back.  */
static void
mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
{
  const int slot = mbf->have_pushback;

  if (slot >= NPUSHBACK)
    abort ();
  mb_copy (&mbf->pushback[slot], mbc);
  mbf->have_pushback = slot + 1;
}
/* The input currently being lexed; initialized by lex_start.  */
static mbfile_t mbf;
/* Number of errors after which po_gram_error and po_gram_error_at_line
   terminate the program.  */
unsigned int gram_max_allowed_errors = 20;
/* True after a "#~" prefix has been read; po_gram_lex resets it at a
   newline and copies it into the 'obsolete' field of each token value.  */
static bool po_lex_obsolete;
/* When true, po_gram_lex returns comments as COMMENT tokens instead of
   discarding them.  Set through po_lex_pass_comments.  */
static bool pass_comments = false;
/* Set through po_lex_pass_obsolete_entries; not read within this file.  */
bool pass_obsolete_entries = false;
/* Start lexing the stream FP.
   REAL_FILENAME is duplicated and used as the file name of gram_pos;
   LOGICAL_FILENAME is not used.  The position is reset to line 1,
   column 0, multibyte errors are enabled and the charset state is
   initialized.  */
void
lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
{
  gram_pos.file_name = xstrdup (real_filename);

  mbfile_init (mbf, fp);

  po_lex_obsolete = false;
  signal_eilseq = true;
  gram_pos_column = 0;
  gram_pos.line_number = 1;

  po_lex_charset_init ();
}
/* Stop lexing: forget the stream and the current position, disable
   multibyte error reporting and close the charset state.  The file name
   string stored in gram_pos is not freed here.  */
void
lex_end ()
{
  mbf->fp = NULL;

  po_lex_obsolete = false;
  signal_eilseq = false;
  gram_pos_column = 0;
  gram_pos.line_number = 0;
  gram_pos.file_name = NULL;

  po_lex_charset_close ();
}
/* Read the next character of the input into MBC, keeping gram_pos and
   gram_pos_column up to date.
   A backslash immediately followed by a newline is skipped entirely (both
   characters), so callers never see such a line continuation.  A read
   error on the stream terminates the program.  */
static void
lex_getc (mbchar_t mbc)
{
  for (;;)
    {
      mbchar_t lookahead;

      mbfile_getc (mbc, mbf);

      if (mb_iseof (mbc))
        {
          if (ferror (mbf->fp))
            goto read_error;
          return;
        }

      if (mb_iseq (mbc, '\n'))
        {
          gram_pos.line_number++;
          gram_pos_column = 0;
          return;
        }

      gram_pos_column += mb_width (mbc);

      if (!mb_iseq (mbc, '\\'))
        return;

      /* A backslash: peek at the following character.  */
      mbfile_getc (lookahead, mbf);

      if (mb_iseof (lookahead))
        {
          if (ferror (mbf->fp))
            goto read_error;
          return;
        }

      if (!mb_iseq (lookahead, '\n'))
        {
          /* Ordinary backslash: return it and keep the peeked character
             for the next call.  */
          mbfile_ungetc (lookahead, mbf);
          return;
        }

      /* Backslash-newline: count the line and read on.  */
      gram_pos.line_number++;
      gram_pos_column = 0;
    }

read_error:
  error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
         gram_pos.file_name);
}
/* Push MBC back onto the input and undo its effect on the position: a
   newline decrements the line number (the column is left alone), any other
   character decrements the column by its width.  The end-of-file marker is
   ignored.  */
static void
lex_ungetc (const mbchar_t mbc)
{
  if (mb_iseof (mbc))
    return;

  if (!mb_iseq (mbc, '\n'))
    gram_pos_column -= mb_width (mbc);
  else
    gram_pos.line_number--;

  mbfile_ungetc (mbc, mbf);
}
/* Map the identifier S to its token code.
   Returns DOMAIN, MSGID, MSGID_PLURAL or MSGSTR for the corresponding
   keyword.  Any other identifier is reported as an error at the current
   position and NAME is returned.  */
static int
keyword_p (const char *s)
{
  static const struct
  {
    const char *name;
    int token;
  }
  keywords[] =
  {
    { "domain", DOMAIN },
    { "msgid", MSGID },
    { "msgid_plural", MSGID_PLURAL },
    { "msgstr", MSGSTR }
  };
  size_t i;

  for (i = 0; i < sizeof (keywords) / sizeof (keywords[0]); i++)
    {
      if (strcmp (s, keywords[i].name) == 0)
        return keywords[i].token;
    }

  po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
  return NAME;
}
/* Parse the remainder of a backslash escape inside a string; the backslash
   itself has already been read.  Returns the value of the escape.

   Recognized: \n \t \b \r \f \v \a \\ \", one to three octal digits, and
   \x followed by one or more hexadecimal digits.  Anything else is pushed
   back, reported as "invalid control sequence", and ' ' is returned.

   The hexadecimal value is accumulated in unsigned arithmetic: the number
   of digits is not limited, and accumulating in a signed int would
   overflow (undefined behaviour) on a long digit run in the input.  */
static int
control_sequence ()
{
  mbchar_t mbc;
  int val;
  int max;
  unsigned int hexval;

  lex_getc (mbc);
  if (mb_len (mbc) == 1)
    switch (mb_ptr (mbc) [0])
      {
      case 'n':
        return '\n';

      case 't':
        return '\t';

      case 'b':
        return '\b';

      case 'r':
        return '\r';

      case 'f':
        return '\f';

      case 'v':
        return '\v';

      case 'a':
        return '\a';

      case '\\':
      case '"':
        return mb_ptr (mbc) [0];

      case '0': case '1': case '2': case '3':
      case '4': case '5': case '6': case '7':
        /* Octal escape: at most three digits, so val stays below 512.  */
        val = 0;
        max = 0;
        for (;;)
          {
            char c = mb_ptr (mbc) [0];
            val = val * 8 + (c - '0');
            if (++max == 3)
              break;
            lex_getc (mbc);
            if (mb_len (mbc) == 1)
              switch (mb_ptr (mbc) [0])
                {
                case '0': case '1': case '2': case '3':
                case '4': case '5': case '6': case '7':
                  /* Continues the enclosing for loop.  */
                  continue;

                default:
                  break;
                }
            /* Not an octal digit: it belongs to what follows.  */
            lex_ungetc (mbc);
            break;
          }
        return val;

      case 'x':
        lex_getc (mbc);
        if (mb_iseof (mbc) || mb_len (mbc) != 1
            || !c_isxdigit (mb_ptr (mbc) [0]))
          /* No hex digit after \x: fall out to the error path below.  */
          break;

        hexval = 0;
        for (;;)
          {
            char c = mb_ptr (mbc) [0];
            /* Unsigned arithmetic wraps instead of overflowing.  */
            hexval *= 16;
            if (c_isdigit (c))
              hexval += c - '0';
            else if (c_isupper (c))
              hexval += c - 'A' + 10;
            else
              hexval += c - 'a' + 10;

            lex_getc (mbc);
            if (mb_len (mbc) == 1)
              switch (mb_ptr (mbc) [0])
                {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                  /* Continues the enclosing for loop.  */
                  continue;

                default:
                  break;
                }
            /* Not a hex digit: it belongs to what follows.  */
            lex_ungetc (mbc);
            break;
          }
        /* Masking keeps the result representable as int; values that fit
           in an int are returned unchanged.  */
        return (int) (hexval & INT_MAX);
      }
  lex_ungetc (mbc);
  po_gram_error (_("invalid control sequence"));
  return ' ';
}
/* Return the next token of the input, or 0 at end of file.

   Token codes: COMMENT (only when pass_comments is set), STRING, NAME,
   DOMAIN, MSGID, MSGID_PLURAL, MSGSTR, NUMBER, '[' and ']'.  Any other
   character, including every multibyte character outside strings and
   comments, yields JUNK.  Whitespace is skipped.  The token value is
   stored in po_gram_lval together with the current position and the
   obsolete flag.

   A "#~" prefix is discarded and marks the rest of the line as obsolete.

   Text is accumulated in a static buffer that grows in steps of 100
   bytes.  STRING and NAME values are xstrdup'ed copies; a COMMENT value
   points directly into the static buffer and is therefore only valid
   until the next call.

   NUMBER tokens are converted with strtol, which saturates at LONG_MAX
   for digit strings that do not fit into a long; atol has undefined
   behaviour in that case.  */
int
po_gram_lex ()
{
  static char *buf;
  static size_t bufmax;
  mbchar_t mbc;
  size_t bufpos;

  for (;;)
    {
      lex_getc (mbc);

      if (mb_iseof (mbc))
        return 0;

      if (mb_len (mbc) == 1)
        switch (mb_ptr (mbc) [0])
          {
          case '\n':
            /* The obsolete marker only applies to its own line.  */
            po_lex_obsolete = false;
            break;

          case ' ':
          case '\t':
          case '\r':
          case '\f':
          case '\v':
            break;

          case '#':
            lex_getc (mbc);
            if (mb_iseq (mbc, '~'))
              {
                /* "#~" is not a comment; drop the prefix and lex the rest
                   of the line as an obsolete entry.  */
                po_lex_obsolete = true;
                break;
              }

            /* Inside a comment, invalid multibyte sequences are not
               reported.  */
            signal_eilseq = false;
            if (pass_comments)
              {
                bufpos = 0;
                for (;;)
                  {
                    while (bufpos + mb_len (mbc) >= bufmax)
                      {
                        bufmax += 100;
                        buf = xrealloc (buf, bufmax);
                      }
                    if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
                      break;

                    memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
                    bufpos += mb_len (mbc);

                    lex_getc (mbc);
                  }
                buf[bufpos] = '\0';

                po_gram_lval.string.string = buf;
                po_gram_lval.string.pos = gram_pos;
                po_gram_lval.string.obsolete = po_lex_obsolete;
                po_lex_obsolete = false;
                signal_eilseq = true;
                return COMMENT;
              }
            else
              {
                /* Discard the comment up to and including the newline.  */
                while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
                  lex_getc (mbc);
                po_lex_obsolete = false;
                signal_eilseq = true;
              }
            break;

          case '"':
            /* Accumulate a string up to the closing quote.  */
            bufpos = 0;
            for (;;)
              {
                lex_getc (mbc);
                while (bufpos + mb_len (mbc) >= bufmax)
                  {
                    bufmax += 100;
                    buf = xrealloc (buf, bufmax);
                  }
                if (mb_iseof (mbc))
                  {
                    po_gram_error_at_line (&gram_pos,
                                           _("end-of-file within string"));
                    break;
                  }
                if (mb_iseq (mbc, '\n'))
                  {
                    po_gram_error_at_line (&gram_pos,
                                           _("end-of-line within string"));
                    break;
                  }
                if (mb_iseq (mbc, '"'))
                  break;
                if (mb_iseq (mbc, '\\'))
                  {
                    /* An escape contributes exactly one byte.  */
                    buf[bufpos++] = control_sequence ();
                    continue;
                  }

                memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
                bufpos += mb_len (mbc);
              }
            buf[bufpos] = '\0';

            /* xstrdup stops at the first NUL byte, so a string containing
               an embedded \000 is truncated there.  */
            po_gram_lval.string.string = xstrdup (buf);
            po_gram_lval.string.pos = gram_pos;
            po_gram_lval.string.obsolete = po_lex_obsolete;
            return STRING;

          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
          case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
          case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
          case 's': case 't': case 'u': case 'v': case 'w': case 'x':
          case 'y': case 'z':
          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
          case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
          case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
          case 'Y': case 'Z':
          case '_': case '$':
            /* Accumulate an identifier: letters, digits, '_' and '$'.  */
            bufpos = 0;
            for (;;)
              {
                char c = mb_ptr (mbc) [0];
                if (bufpos + 1 >= bufmax)
                  {
                    bufmax += 100;
                    buf = xrealloc (buf, bufmax);
                  }
                buf[bufpos++] = c;
                lex_getc (mbc);
                if (mb_len (mbc) == 1)
                  switch (mb_ptr (mbc) [0])
                    {
                    default:
                      break;
                    case 'a': case 'b': case 'c': case 'd': case 'e':
                    case 'f': case 'g': case 'h': case 'i': case 'j':
                    case 'k': case 'l': case 'm': case 'n': case 'o':
                    case 'p': case 'q': case 'r': case 's': case 't':
                    case 'u': case 'v': case 'w': case 'x': case 'y':
                    case 'z':
                    case 'A': case 'B': case 'C': case 'D': case 'E':
                    case 'F': case 'G': case 'H': case 'I': case 'J':
                    case 'K': case 'L': case 'M': case 'N': case 'O':
                    case 'P': case 'Q': case 'R': case 'S': case 'T':
                    case 'U': case 'V': case 'W': case 'X': case 'Y':
                    case 'Z':
                    case '_': case '$':
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                      /* Continues the enclosing for loop.  */
                      continue;
                    }
                break;
              }
            /* The character that ended the identifier is not part of it.  */
            lex_ungetc (mbc);

            buf[bufpos] = '\0';

            {
              int k = keyword_p (buf);
              if (k == NAME)
                {
                  po_gram_lval.string.string = xstrdup (buf);
                  po_gram_lval.string.pos = gram_pos;
                  po_gram_lval.string.obsolete = po_lex_obsolete;
                }
              else
                {
                  po_gram_lval.pos.pos = gram_pos;
                  po_gram_lval.pos.obsolete = po_lex_obsolete;
                }
              return k;
            }

          case '0': case '1': case '2': case '3': case '4':
          case '5': case '6': case '7': case '8': case '9':
            /* Accumulate a run of decimal digits.  */
            bufpos = 0;
            for (;;)
              {
                char c = mb_ptr (mbc) [0];
                if (bufpos + 1 >= bufmax)
                  {
                    bufmax += 100;
                    buf = xrealloc (buf, bufmax);
                  }
                buf[bufpos++] = c;
                lex_getc (mbc);
                if (mb_len (mbc) == 1)
                  switch (mb_ptr (mbc) [0])
                    {
                    default:
                      break;

                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                      /* Continues the enclosing for loop.  */
                      continue;
                    }
                break;
              }
            /* The character that ended the number is not part of it.  */
            lex_ungetc (mbc);

            buf[bufpos] = '\0';

            /* strtol saturates on overflow instead of invoking undefined
               behaviour.  */
            po_gram_lval.number.number = strtol (buf, NULL, 10);
            po_gram_lval.number.pos = gram_pos;
            po_gram_lval.number.obsolete = po_lex_obsolete;
            return NUMBER;

          case '[':
            po_gram_lval.pos.pos = gram_pos;
            po_gram_lval.pos.obsolete = po_lex_obsolete;
            return '[';

          case ']':
            po_gram_lval.pos.pos = gram_pos;
            po_gram_lval.pos.obsolete = po_lex_obsolete;
            return ']';

          default:
            /* Any other single-byte character.  */
            return JUNK;
          }
      else
        /* A multibyte character outside a string or comment.  */
        return JUNK;
    }
}
/* Select whether po_gram_lex returns comments as COMMENT tokens (FLAG
   true) or silently discards them (FLAG false).  */
void
po_lex_pass_comments (bool flag)
{
pass_comments = flag;
}
/* Store FLAG in the global pass_obsolete_entries.  The flag is not read
   anywhere in this file; presumably the grammar consults it -- confirm
   against the parser.  */
void
po_lex_pass_obsolete_entries (bool flag)
{
pass_obsolete_entries = flag;
}