tld.c   [plain text]


/* tld.c --- Handle TLD restriction checking.
 * Copyright (C) 2004  Simon Josefsson.
 * Copyright (C) 2003, 2004  Free Software Foundation, Inc.
 *
 * Author: Thomas Jacob, Internet24.de
 *
 * This file is part of GNU Libidn.
 *
 * GNU Libidn is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * GNU Libidn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GNU Libidn; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 *
 */

/* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
#include <stringprep.h>

/* Get strcmp(). */
#include <string.h>

/* Get specifications. */
#include <tld.h>

/* Array of built-in domain restriction structures.  See tlds.c.  */
extern const Tld_table *_tld_tables[];

/**
 * tld_get_table - get table for a TLD name in table
 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
 * @tables: Zero terminated array of #Tld_table info-structures for
 *   TLDs.
 *
 * Get the TLD table for a named TLD by searching through the given
 * TLD table array.
 *
 * Return value: Return structure corresponding to TLD @tld by going
 *   thru @tables, or return %NULL if no such structure is found.
 */
const Tld_table *
tld_get_table (const char *tld, const Tld_table ** tables)
{
  const Tld_table **tldtable = NULL;

  if (!tld || !tables)
    return NULL;

  for (tldtable = tables; *tldtable; tldtable++)
    if (!strcmp ((*tldtable)->name, tld))
      return *tldtable;

  return NULL;
}

/**
 * tld_default_table - get table for a TLD name
 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
 * @overrides: Additional zero terminated array of #Tld_table
 *   info-structures for TLDs, or %NULL to only use library deault
 *   tables.
 *
 * Get the TLD table for a named TLD, using the internal defaults,
 * possibly overrided by the (optional) supplied tables.
 *
 * Return value: Return structure corresponding to TLD @tld_str, first
 *   looking through @overrides then thru built-in list, or %NULL if
 *   no such structure found.
 */
const Tld_table *
tld_default_table (const char *tld, const Tld_table ** overrides)
{
  const Tld_table *tldtable = NULL;

  if (!tld)
    return NULL;

  if (overrides)
    tldtable = tld_get_table (tld, overrides);

  if (!tldtable)
    tldtable = tld_get_table (tld, _tld_tables);

  return tldtable;
}

#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 ||	\
		 (c) == 0xFF0E || (c) == 0xFF61)

/**
 * tld_get_4 - extract top level domain part in input Unicode string
 * @in: Array of unicode code points to process. Does not need to be
 *   zero terminated.
 * @inlen: Number of unicode code points.
 * @out: Zero terminated ascii result string pointer.
 *
 * Isolate the top-level domain of @in and return it as an ASCII
 * string in @out.
 *
 * Return value: Return %TLD_SUCCESS on success, or the corresponding
 *   #Tld_rc error code otherwise.
 */
int
tld_get_4 (const uint32_t * in, size_t inlen, char **out)
{
  const uint32_t *ipos;
  size_t olen;

  *out = NULL;
  if (!in || inlen == 0)
    return TLD_NODATA;

  ipos = &in[inlen - 1];
  olen = 0;
  /* Scan backwards for non(latin)letters. */
  while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
			(*ipos >= 0x61 && *ipos <= 0x7A)))
    ipos--, olen++;

  if (olen > 0 && DOTP (*ipos))	/* Found something that appears a TLD. */
    {
      char *out_s = malloc (sizeof (char) * (olen + 1));
      char *opos = out_s;

      if (!opos)
	return TLD_MALLOC_ERROR;

      ipos++;
      /* Transcribe to lowercase ascii string. */
      for (; ipos < &in[inlen]; ipos++, opos++)
	*opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
      *opos = 0;
      *out = out_s;
      return TLD_SUCCESS;
    }

  return TLD_NO_TLD;
}

/**
 * tld_get_4z - extract top level domain part in input Unicode string
 * @in: Zero terminated array of unicode code points to process.
 * @out: Zero terminated ascii result string pointer.
 *
 * Isolate the top-level domain of @in and return it as an ASCII
 * string in @out.
 *
 * Return value: Return %TLD_SUCCESS on success, or the corresponding
 *   #Tld_rc error code otherwise.
 */
int
tld_get_4z (const uint32_t * in, char **out)
{
  const uint32_t *ipos = in;

  if (!in)
    return TLD_NODATA;

  while (*ipos)
    ipos++;

  return tld_get_4 (in, ipos - in, out);
}

/**
 * tld_get_z - extract top level domain part in input string
 * @in: Zero terminated character array to process.
 * @out: Zero terminated ascii result string pointer.
 *
 * Isolate the top-level domain of @in and return it as an ASCII
 * string in @out.  The input string @in may be UTF-8, ISO-8859-1 or
 * any ASCII compatible character encoding.
 *
 * Return value: Return %TLD_SUCCESS on success, or the corresponding
 *   #Tld_rc error code otherwise.
 */
int
tld_get_z (const char *in, char **out)
{
  uint32_t *iucs;
  size_t i, ilen;
  int rc;

  ilen = strlen (in);
  iucs = calloc (ilen, sizeof (*iucs));

  if (!iucs)
    return TLD_MALLOC_ERROR;

  for (i = 0; i < ilen; i++)
    iucs[i] = in[i];

  rc = tld_get_4 (iucs, ilen, out);

  free (iucs);

  return rc;
}

/*
 * tld_checkchar - verify that character is permitted
 * @ch: 32 bit unicode character to check.
 * @tld: A #Tld_table data structure to check @ch against.
 *
 * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
 * character in @tld.
 *
 * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
 *   valid character for the TLD @tld or if @tld is %NULL,
 *   %TLD_INVALID if @ch is invalid as defined by @tld.
 */
static int
_tld_checkchar (uint32_t ch, const Tld_table * tld)
{
  const Tld_table_element *s, *e, *m;

  if (!tld)
    return TLD_SUCCESS;

  /* Check for [-a-z0-9.]. */
  if ((ch >= 0x61 && ch <= 0x7A) ||
      (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
    return TLD_SUCCESS;

  s = tld->valid;
  e = s + tld->nvalid;
  while (s < e)
    {
      m = s + ((e - s) >> 1);
      if (ch < m->start)
	e = m;
      else if (ch > m->end)
	s = m + 1;
      else
	return TLD_SUCCESS;
    }

  return TLD_INVALID;
}

/**
 * tld_check_4t - verify that characters are permitted
 * @in: Array of unicode code points to process. Does not need to be
 *   zero terminated.
 * @inlen: Number of unicode code points.
 * @errpos: Position of offending character is returned here.
 * @tld: A #Tld_table data structure representing the restrictions for
 *   which the input should be tested.
 *
 * Test each of the code points in @in for whether or not
 * they are allowed by the data structure in @tld, return
 * the position of the first character for which this is not
 * the case in @errpos.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
 *   points are valid or when @tld is null, %TLD_INVALID if a
 *   character is not allowed, or additional error codes on general
 *   failure conditions.
 */
int
tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
	      const Tld_table * tld)
{
  const uint32_t *ipos;
  int rc;

  if (!tld)			/* No data for TLD so everything is valid. */
    return TLD_SUCCESS;

  ipos = in;
  while (ipos < &in[inlen])
    {
      rc = _tld_checkchar (*ipos, tld);
      if (rc != TLD_SUCCESS)
	{
	  if (errpos)
	    *errpos = ipos - in;
	  return rc;
	}
      ipos++;
    }
  return TLD_SUCCESS;
}

/**
 * tld_check_4tz - verify that characters are permitted
 * @in: Zero terminated array of unicode code points to process.
 * @errpos: Position of offending character is returned here.
 * @tld: A #Tld_table data structure representing the restrictions for
 *   which the input should be tested.
 *
 * Test each of the code points in @in for whether or not
 * they are allowed by the data structure in @tld, return
 * the position of the first character for which this is not
 * the case in @errpos.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
 *   points are valid or when @tld is null, %TLD_INVALID if a
 *   character is not allowed, or additional error codes on general
 *   failure conditions.
 */
int
tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
{
  const uint32_t *ipos = in;

  if (!ipos)
    return TLD_NODATA;

  while (*ipos)
    ipos++;

  return tld_check_4t (in, ipos - in, errpos, tld);
}

/**
 * tld_check_4 - verify that characters are permitted
 * @in: Array of unicode code points to process. Does not need to be
 *   zero terminated.
 * @inlen: Number of unicode code points.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *  structures that complement and supersede the built-in information.
 *
 * Test each of the code points in @in for whether or not they are
 * allowed by the information in @overrides or by the built-in TLD
 * restriction data. When data for the same TLD is available both
 * internally and in @overrides, the information in @overrides takes
 * precedence. If several entries for a specific TLD are found, the
 * first one is used.  If @overrides is %NULL, only the built-in
 * information is used.  The position of the first offending character
 * is returned in @errpos.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
 *   points are valid or when @tld is null, %TLD_INVALID if a
 *   character is not allowed, or additional error codes on general
 *   failure conditions.
 */
int
tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
	     const Tld_table ** overrides)
{
  const Tld_table *tld;
  char *domain;
  int rc;

  if (errpos)
    *errpos = 0;

  /* Get TLD name. */
  rc = tld_get_4 (in, inlen, &domain);

  if (rc != TLD_SUCCESS)
    {
      if (rc == TLD_NO_TLD)	/* No TLD, say OK */
	return TLD_SUCCESS;
      else
	return rc;
    }

  /* Retrieve appropriate data structure. */
  tld = tld_default_table (domain, overrides);
  free (domain);

  return tld_check_4t (in, inlen, errpos, tld);
}

/**
 * tld_check_4z - verify that characters are permitted
 * @in: Zero-terminated array of unicode code points to process.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Test each of the code points in @in for whether or not they are
 * allowed by the information in @overrides or by the built-in TLD
 * restriction data. When data for the same TLD is available both
 * internally and in @overrides, the information in @overrides takes
 * precedence. If several entries for a specific TLD are found, the
 * first one is used.  If @overrides is %NULL, only the built-in
 * information is used.  The position of the first offending character
 * is returned in @errpos.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
 *   points are valid or when @tld is null, %TLD_INVALID if a
 *   character is not allowed, or additional error codes on general
 *   failure conditions.
 */
int
tld_check_4z (const uint32_t * in, size_t * errpos,
	      const Tld_table ** overrides)
{
  const uint32_t *ipos = in;

  if (!ipos)
    return TLD_NODATA;

  while (*ipos)
    ipos++;

  return tld_check_4 (in, ipos - in, errpos, overrides);
}

/**
 * tld_check_8z - verify that characters are permitted
 * @in: Zero-terminated UTF8 string to process.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Test each of the characters in @in for whether or not they are
 * allowed by the information in @overrides or by the built-in TLD
 * restriction data. When data for the same TLD is available both
 * internally and in @overrides, the information in @overrides takes
 * precedence. If several entries for a specific TLD are found, the
 * first one is used.  If @overrides is %NULL, only the built-in
 * information is used.  The position of the first offending character
 * is returned in @errpos.  Note that the error position refers to the
 * decoded character offset rather than the byte position in the
 * string.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
 *   characters are valid or when @tld is null, %TLD_INVALID if a
 *   character is not allowed, or additional error codes on general
 *   failure conditions.
 */
int
tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
{
  uint32_t *iucs;
  size_t ilen;
  int rc;

  if (!in)
    return TLD_NODATA;

  iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);

  if (!iucs)
    return TLD_MALLOC_ERROR;

  rc = tld_check_4 (iucs, ilen, errpos, overrides);

  free (iucs);

  return rc;
}

/**
 * tld_check_lz - verify that characters are permitted
 * @in: Zero-terminated string in the current locales encoding to process.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Test each of the characters in @in for whether or not they are
 * allowed by the information in @overrides or by the built-in TLD
 * restriction data. When data for the same TLD is available both
 * internally and in @overrides, the information in @overrides takes
 * precedence. If several entries for a specific TLD are found, the
 * first one is used.  If @overrides is %NULL, only the built-in
 * information is used.  The position of the first offending character
 * is returned in @errpos.  Note that the error position refers to the
 * decoded character offset rather than the byte position in the
 * string.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
 *   characters are valid or when @tld is null, %TLD_INVALID if a
 *   character is not allowed, or additional error codes on general
 *   failure conditions.
 */
int
tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
{
  char *utf8;
  int rc;

  if (!in)
    return TLD_NODATA;

  utf8 = stringprep_locale_to_utf8 (in);
  if (!utf8)
    return TLD_ICONV_ERROR;


  rc = tld_check_8z (utf8, errpos, overrides);

  free (utf8);

  return rc;
}

/**
 * Tld_rc:
 * @TLD_SUCCESS: Successful operation.  This value is guaranteed to
 *   always be zero, the remaining ones are only guaranteed to hold
 *   non-zero values, for logical comparison purposes.
 * @TLD_INVALID: Invalid character found.
 * @TLD_NODATA: No input data was provided.
 * @TLD_MALLOC_ERROR: Error during memory allocation.
 * @TLD_ICONV_ERROR: Error during iconv string conversion.
 * @TLD_NO_TLD: No top-level domain found in domain string.
 *
 * Enumerated return codes of the TLD checking functions.
 * The value 0 is guaranteed to always correspond to success.
 */