IDNA.java [plain text]

/**
 * Copyright (C) 2004, 2005, 2006, 2007  Free Software Foundation, Inc.
 *
 * Author: Oliver Hitz
 *
 * This file is part of GNU Libidn.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

package gnu.inet.encoding;

public class IDNA
{
  public final static String ACE_PREFIX = "xn--";

  /**
   * Converts a Unicode string to ASCII using the procedure in RFC3490
   * section 4.1. Unassigned characters are not allowed and STD3 ASCII
   * rules are enforced. The input string may be a domain name
   * containing dots.
   *
   * @param input Unicode string.
   * @return Encoded string.
   */
  public static String toASCII(String input)
    throws IDNAException
  {
    StringBuffer o = new StringBuffer();
    StringBuffer h = new StringBuffer();

    for (int i = 0; i < input.length(); i++) {
      char c = input.charAt(i);
      if (c == '.' || c == '\u3002' || c == '\uff0e' || c == '\uff61') {
	o.append(toASCII(h.toString(), false, true));
	o.append('.');
	h = new StringBuffer();
      } else {
	h.append(c);
      }
    }
    o.append(toASCII(h.toString(), false, true));
    return o.toString();
  }

  /**
   * Converts a Unicode string to ASCII using the procedure in RFC3490
   * section 4.1. Unassigned characters are not allowed and STD3 ASCII
   * rules are enforced.
   *
   * @param input Unicode string.
   * @param allowUnassigned Unassigned characters, allowed or not?
   * @param useSTD3ASCIIRules STD3 ASCII rules, enforced or not?
   * @return Encoded string.
   */
  public static String toASCII(String input, boolean allowUnassigned, boolean useSTD3ASCIIRules)
    throws IDNAException
  {
    // Step 1: Check if the string contains code points outside
    //         the ASCII range 0..0x7c.

    boolean nonASCII = false;

    for (int i = 0; i < input.length(); i++) {
      int c = input.charAt(i);
      if (c > 0x7f) {
	nonASCII = true;
	break;
      }
    }

    // Step 2: Perform the nameprep operation.

    if (nonASCII) {
      try {
	input = Stringprep.nameprep(input, allowUnassigned);
      } catch (StringprepException e) {
	throw new IDNAException(e);
      }
    }

    // Step 3: - Verify the absence of non-LDH ASCII code points
    //           0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60,
    //           0x7b..0x7f
    //         - Verify the absence of leading and trailing
    //           hyphen-minus

    if (useSTD3ASCIIRules) {
      for (int i = 0; i < input.length(); i++) {
	int c = input.charAt(i);
	if ((c <= 0x2c) || 
	    (c >= 0x2e && c <= 0x2f) || 
	    (c >= 0x3a && c <= 0x40) ||
	    (c >= 0x5b && c <= 0x60) ||
	    (c >= 0x7b && c <= 0x7f)) {
	  throw new IDNAException(IDNAException.CONTAINS_NON_LDH);
	}
      }

      if (input.startsWith("-") || input.endsWith("-")) {
	throw new IDNAException(IDNAException.CONTAINS_HYPHEN);
      }
    }

    // Step 4: If all code points are inside 0..0x7f, skip to step 8

    nonASCII = false;

    for (int i = 0; i < input.length(); i++) {
      int c = input.charAt(i);
      if (c > 0x7f) {
	nonASCII = true;
	break;
      }
    }

    String output = input;

    if (nonASCII) {

      // Step 5: Verify that the sequence does not begin with the ACE prefix.

      if (input.startsWith(ACE_PREFIX)) {
	throw new IDNAException(IDNAException.CONTAINS_ACE_PREFIX);
      }
      
      // Step 6: Punycode

      try {
	output = Punycode.encode(input);
      } catch (PunycodeException e) {
	throw new IDNAException(e);
      }

      // Step 7: Prepend the ACE prefix.

      output = ACE_PREFIX + output;
    }

    // Step 8: Check that the length is inside 1..63.

    if (output.length() < 1 || output.length() > 63) {
      throw new IDNAException(IDNAException.TOO_LONG);
    }

    return output;
  }

  /**
   * Converts an ASCII-encoded string to Unicode. Unassigned
   * characters are not allowed and STD3 hostnames are enforced. Input
   * may be domain name containing dots.
   *
   * @param input ASCII input string.
   * @return Unicode string.
   */
  public static String toUnicode(String input)
  {
    StringBuffer o = new StringBuffer();
    StringBuffer h = new StringBuffer();

    for (int i = 0; i < input.length(); i++) {
      char c = input.charAt(i);
      if (c == '.' || c == '\u3002' || c == '\uff0e' || c == '\uff61') {
	o.append(toUnicode(h.toString(), false, true));
	o.append(c);
	h = new StringBuffer();
      } else {
	h.append(c);
      }
    }
    o.append(toUnicode(h.toString(), false, true));
    return o.toString();
  }

  /**
   * Converts an ASCII-encoded string to Unicode.
   *
   * @param input ASCII input string.
   * @param allowUnassigned Allow unassigned Unicode characters.
   * @param useSTD3ASCIIRules Check that the output conforms to STD3.
   * @return Unicode string.
   */
  public static String toUnicode(String input, boolean allowUnassigned, boolean useSTD3ASCIIRules)
  {
    String original = input;
    boolean nonASCII = false;

    // Step 1: If all code points are inside 0..0x7f, skip to step 3.

    for (int i = 0; i < input.length(); i++) {
      int c = input.charAt(i);
      if (c > 0x7f) {
	nonASCII = true;
	break;
      }
    }

    // Step 2: Perform the Nameprep operation.

    if (nonASCII) {
      try {
	input = Stringprep.nameprep(input, allowUnassigned);
      } catch (StringprepException e) {
	// ToUnicode never fails!
	return original;
      }
    }

    // Step 3: Verify the sequence starts with the ACE prefix.

    if (!input.startsWith(ACE_PREFIX)) {
      // ToUnicode never fails!
      return original;
    }

    String stored = input;

    // Step 4: Remove the ACE prefix.

    input = input.substring(ACE_PREFIX.length());

    // Step 5: Decode using punycode

    String output;

    try {
      output = Punycode.decode(input);
    } catch (PunycodeException e) {
      // ToUnicode never fails!
      return original;
    }

    // Step 6: Apply toASCII

    String ascii;

    try {
      ascii = toASCII(output, allowUnassigned, useSTD3ASCIIRules);
    } catch (IDNAException e) {
      // ToUnicode never fails!
      return original;
    }

    // Step 7: Compare case-insensitively.

    if (!ascii.equalsIgnoreCase(stored)) {
      // ToUnicode never fails!
      return original;
    }

    // Step 8: Return the result.

    return output;
  }
}