gen-casemap-txt.pl [plain text]
require 5.006;
use utf8;
if (@ARGV != 3) {
$0 =~ s@.*/@@;
die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
}
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;
$CASE_CODE = 0;
$CASE_LOWER = 1;
$CASE_TITLE = 2;
$CASE_UPPER = 3;
$CASE_CONDITION = 4;
my @upper;
my @title;
my @lower;
binmode STDOUT, ":utf8";
open (INPUT, "< $ARGV[1]") || exit 1;
$last_code = -1;
while (<INPUT>)
{
chop;
@fields = split (';', $_, 30);
if ($ {
printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $ }
$code = hex ($fields[$CODE]);
if ($code > $last_code + 1)
{
if ($fields[$NAME] =~ /Last>/)
{
@gfields = @fields;
}
else
{
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
'', '', '', '');
}
for (++$last_code; $last_code < $code; ++$last_code)
{
$gfields{$CODE} = sprintf ("%04x", $last_code);
&process_one ($last_code, @gfields);
}
}
&process_one ($code, @fields);
$last_code = $code;
}
close INPUT;
open (INPUT, "< $ARGV[2]") || exit 1;
while (<INPUT>)
{
my $code;
chop;
next if /^ next if /^\s*$/;
s/\s*
@fields = split ('\s*;\s*', $_, 30);
$raw_code = $fields[$CASE_CODE];
$code = hex ($raw_code);
if ($ {
printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $ next;
}
if (defined $fields[5]) {
next;
}
$upper[$code] = &make_hex ($fields[$CASE_UPPER]);
$lower[$code] = &make_hex ($fields[$CASE_LOWER]);
$title[$code] = &make_hex ($fields[$CASE_TITLE]);
}
close INPUT;
print <<EOT;
tr_TR\ti\ti\t\x{0130}\t\x{0130}\ttr_TR\tI\t\x{0131}\tI\tI\ttr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\ttr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\ttr_TR.UTF-8\tI\t\x{0131}\tI\tI\ttr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
ΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\tEOT
&print_tests;
exit 0;
sub process_one
{
my ($code, @fields) = @_;
my $type = $fields[$CATEGORY];
if ($type eq 'Ll')
{
$upper[$code] = make_hex ($fields[$UPPER]);
$lower[$code] = pack ("U", $code);
$title[$code] = make_hex ($fields[$TITLE]);
}
elsif ($type eq 'Lu')
{
$lower[$code] = make_hex ($fields[$LOWER]);
$upper[$code] = pack ("U", $code);
$title[$code] = make_hex ($fields[$TITLE]);
}
if ($type eq 'Lt')
{
$upper[$code] = make_hex ($fields[$UPPER]);
$lower[$code] = pack ("U", hex ($fields[$LOWER]));
$title[$code] = make_hex ($fields[$LOWER]);
}
}
sub print_tests
{
for ($i = 0; $i < 0x10ffff; $i++) {
if ($i == 0x3A3) {
next;
}
my $lower = $lower[$i];
my $title = $title[$i];
my $upper = $upper[$i];
if (defined $upper || defined $lower || defined $title) {
printf "\t%s\t%s\t%s\t%s\t# %4X\n",
pack ("U", $i),
(defined $lower ? $lower : ""),
(defined $title ? $title : ""),
(defined $upper ? $upper : ""),
$i;
}
}
}
sub make_hex
{
my $codes = shift;
$codes =~ s/^\s+//;
$codes =~ s/\s+$//;
if ($codes eq "0" || $codes eq "") {
return "";
} else {
return pack ("U*", map { hex ($_) } split /\s+/, $codes);
}
}