gen-casefold-txt.pl   [plain text]


#! /usr/bin/perl -w

#    Copyright (C) 1998, 1999 Tom Tromey
#    Copyright (C) 2001 Red Hat Software

#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2, or (at your option)
#    any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#    02111-1307, USA.

# gen-casefold-test.pl - Generate test cases for casefolding from Unicode data.
# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
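# Usage: gen-casefold-txt.pl UNICODE-VERSION CaseFolding.txt
#        (the generated test cases are written to standard output)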
# I consider the output of this program to be unrestricted.  Use it as
# you will.

require 5.006;

use strict;
use warnings;

# Generate case-folding test cases from a Unicode CaseFolding table.
#
# Command-line arguments (exactly two):
#   UNICODE-VERSION   version string, echoed into the generated header
#   CaseFolding.txt   path of the case-folding table to read
#
# Output (STDOUT): a fixed header followed by one line per full/common
# folding entry, of the form "<character>\t<folded string>".  The automatic
# test lines are written through a ":utf8" layer.  Entries with an unexpected
# number of fields are reported on STDERR and skipped.  The script exits with
# status 1 if the table cannot be opened.

# Names of fields in the CaseFolding table
my $FOLDING_CODE    = 0;
my $FOLDING_STATUS  = 1;
my $FOLDING_MAPPING = 2;

# An entry of the form "code; status; mapping;" splits into the three
# fields above plus whatever follows the final ';' (normally empty).
my $FIELDS_PER_ENTRY = 4;

if (@ARGV != 2) {
    # Report only the script's base name in the usage message.
    $0 =~ s@.*/@@;
    die "Usage: $0 UNICODE-VERSION  CaseFolding.txt\n";
}

my ($unicode_version, $table_path) = @ARGV;

print <<EOT;
# Test cases generated from Unicode $ARGV[0] data
# by gen-casefold-test.pl. Do not edit.
#
# Some special hand crafted tests
#
AaBbCc@@\taabbcc@@
#
# Now the automatic tests
#
EOT

# The header above is emitted before the UTF-8 layer is enabled; the
# generated test lines below contain non-ASCII characters and need it.
binmode STDOUT, ":utf8";

# Three-argument open: the path is taken literally, so whitespace or mode
# characters in the file name cannot change how the file is opened.
my $input;
unless (open ($input, '<', $table_path)) {
    print STDERR "$0: cannot open $table_path: $!\n";
    exit 1;
}

while (my $line = <$input>)
{
    # chomp (not chop) so that a final line lacking a newline keeps its
    # terminating ';'.
    chomp $line;

    # Drop comments first, then skip anything that is left blank.  This
    # also covers indented comment-only lines.
    $line =~ s/\s*#.*//;
    next if $line =~ /^\s*$/;

    my @fields = split (/\s*;\s*/, $line, 30);

    my $raw_code = $fields[$FOLDING_CODE];

    if (@fields != $FIELDS_PER_ENTRY)
    {
        # Pass the code as an argument so a stray '%' in the input cannot
        # be interpreted as a printf directive.
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $raw_code, scalar @fields);
        next;
    }

    # skip simple and Turkic mappings
    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);

    my $code   = hex ($raw_code);
    my @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];

    # One test per line: the source character, a tab, its folded form.
    printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values));
}

close $input;