mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-12 07:26:15 +01:00
tests: Port gen-casefold-txt.pl and gen-casemap-txt.pl to Python 3. See #1332
I've tried to keep the code structure roughly the same.
This commit is contained in:
parent
603d40467c
commit
a580185cdc
@ -146,8 +146,8 @@ endif
|
||||
|
||||
EXTRA_DIST += \
|
||||
$(test_scripts) \
|
||||
gen-casefold-txt.pl \
|
||||
gen-casemap-txt.pl \
|
||||
gen-casefold-txt.py \
|
||||
gen-casemap-txt.py \
|
||||
iochannel-test-infile \
|
||||
timeloop-basic.c \
|
||||
assert-msg-test.gdb
|
||||
|
@ -1,5 +1,5 @@
|
||||
# Test cases generated from Unicode 10.0.0 data
|
||||
# by gen-casefold-test.pl. Do not edit.
|
||||
# by gen-casefold-txt.py. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
#
|
||||
|
@ -1,5 +1,5 @@
|
||||
# Test cases generated from Unicode 10.0.0 data
|
||||
# by gen-case-tests.pl. Do not edit.
|
||||
# by gen-casemap-txt.py. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
#
|
||||
|
@ -1,82 +0,0 @@
|
||||
#! /usr/bin/perl -w
|
||||
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
# Copyright (C) 2001 Red Hat Software
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# gen-casefold-test.pl - Generate test cases for casefolding from Unicode data.
|
||||
# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
# Usage: gen-casefold-txt.pl UNICODE-VERSION CaseFolding.txt
|
||||
# I consider the output of this program to be unrestricted. Use it as
|
||||
# you will.
|
||||
|
||||
require 5.006;
use strict;
use warnings;

# Indices of the fields in a CaseFolding.txt record.
my $FOLDING_CODE    = 0;
my $FOLDING_STATUS  = 1;
my $FOLDING_MAPPING = 2;

if (@ARGV != 2) {
    $0 =~ s@.*/@@;
    die "Usage: $0 UNICODE-VERSION CaseFolding.txt\n";
}

# Header plus the hand-written case; $ARGV[0] (the Unicode version) is
# only echoed here, it does not influence the generated lines.
print <<EOT;
# Test cases generated from Unicode $ARGV[0] data
# by gen-casefold-test.pl. Do not edit.
#
# Some special hand crafted tests
#
AaBbCc@@\taabbcc@@
#
# Now the automatic tests
#
EOT

binmode STDOUT, ":utf8";

# Three-argument open on a lexical handle: the file name can no longer
# be misread as an open mode.  The exit status on failure stays 1, but
# the reason is now reported instead of exiting silently.
my $path = $ARGV[1];
open (my $input, '<', $path) or do {
    print STDERR "Cannot open $path: $!\n";
    exit 1;
};

while (my $line = <$input>) {
    # chomp, not chop: a final line without a newline must keep its
    # last character.
    chomp $line;

    next if $line =~ /^#/;
    next if $line =~ /^\s*$/;

    # Drop the trailing comment.
    $line =~ s/\s*#.*//;

    my @fields = split (/\s*;\s*/, $line, 30);
    my $raw_code = $fields[$FOLDING_CODE];

    # Each record ends with ';', so a well-formed line splits into the
    # three data fields plus one empty trailing field.
    if ($#fields != 3) {
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $raw_code, $#fields);
        next;
    }

    # skip simple and Turkic mappings
    next if $fields[$FOLDING_STATUS] =~ /^[ST]$/;

    my @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
    printf ("%s\t%s\n", pack ("U", hex ($raw_code)), pack ("U*", @values));
}

close $input;
|
78
tests/gen-casefold-txt.py
Executable file
78
tests/gen-casefold-txt.py
Executable file
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
# Copyright (C) 2001 Red Hat Software
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
gen-casefold-txt.py - Generate test cases for casefolding from Unicode data.
|
||||
See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
Usage: gen-casefold-txt.py UNICODE-VERSION CaseFolding.txt
|
||||
I consider the output of this program to be unrestricted.
|
||||
Use it as you will.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
def main(argv):
    """Write casefolding test cases for one Unicode release to stdout.

    argv is the full argument vector, program name first.  The two
    positional arguments are the Unicode version, which is only echoed
    into the header, and the path of CaseFolding.txt.

    Raises SystemExit if a record has fewer than three fields.
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for casefolding from Unicode data")
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("CaseFolding.txt")
    args = parser.parse_args(argv[1:])
    # The argument names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename = getattr(args, "CaseFolding.txt")

    print("""\
# Test cases generated from Unicode {} data
# by gen-casefold-txt.py. Do not edit.
#
# Some special hand crafted tests
#
AaBbCc@@\taabbcc@@
#
# Now the automatic tests
#""".format(version))

    # Names of fields in the CaseFolding table
    CODE, STATUS, MAPPING = range(3)

    with open(filename, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            fields = [f.strip() for f in line.split(";", 3)[:3]]
            if len(fields) != 3:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CODE], len(fields)))

            status = fields[STATUS]
            # skip simple and Turkic mappings.  Compare against the two
            # exact codes: a substring test against "ST" would also
            # match an empty status and the string "ST" itself.
            if status in ("S", "T"):
                continue

            code = chr(int(fields[CODE], 16))
            values = "".join(
                [chr(int(v, 16)) for v in fields[MAPPING].split()])
            print("{}\t{}".format(code, values))


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
@ -1,256 +0,0 @@
|
||||
#! /usr/bin/perl -w
|
||||
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
# Copyright (C) 2001 Red Hat Software
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
|
||||
# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
# I consider the output of this program to be unrestricted. Use it as
|
||||
# you will.
|
||||
|
||||
require 5.006;
use utf8;

if (@ARGV != 3) {
    $0 =~ s@.*/@@;
    die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
}

# Indices of the fields in a UnicodeData.txt record.  They stay package
# variables because the subs later in this file refer to them by name.
our ($CODE, $NAME, $CATEGORY, $COMBINING_CLASSES, $BIDI_CATEGORY,
     $DECOMPOSITION, $DECIMAL_VALUE, $DIGIT_VALUE, $NUMERIC_VALUE,
     $MIRRORED, $OLD_NAME, $COMMENT, $UPPER, $LOWER, $TITLE) = (0 .. 14);

# Kept from the original declaration list; nothing assigns them here.
our ($BREAK_CODE, $BREAK_CATEGORY, $BREAK_NAME);

# Indices of the fields in a SpecialCasing.txt record.
our ($CASE_CODE, $CASE_LOWER, $CASE_TITLE, $CASE_UPPER, $CASE_CONDITION)
    = (0 .. 4);

# Mapping tables indexed by code point; filled by process_one and by the
# SpecialCasing loop below.
my @upper;
my @title;
my @lower;

binmode STDOUT, ":utf8";

# Three-argument open on lexical handles.  The exit status on failure
# stays 1, but the reason is now reported.
open (my $udata, '<', $ARGV[1]) or do {
    print STDERR "Cannot open $ARGV[1]: $!\n";
    exit 1;
};

my $last_code = -1;
while (my $line = <$udata>) {
    # chomp, not chop: a final line without a newline must keep the
    # last character of its last field.
    chomp $line;

    my @fields = split (';', $line, 30);
    if ($#fields != 14) {
        # Reported but, as before, the entry is still processed.
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $fields[$CODE], $#fields);
    }

    my $code = hex ($fields[$CODE]);

    if ($code > $last_code + 1) {
        # Found a gap.
        my @gfields;
        if ($fields[$NAME] =~ /Last>/) {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
        }
        else {
            # The gap represents undefined characters.  Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
                        '', '', '', '');
        }
        for my $gap_code ($last_code + 1 .. $code - 1) {
            # Array element, not a hash entry: the original wrote
            # $gfields{$CODE}, which created an unrelated %gfields.
            $gfields[$CODE] = sprintf ("%04x", $gap_code);
            process_one ($gap_code, @gfields);
        }
    }
    process_one ($code, @fields);
    $last_code = $code;
}

close $udata;

open (my $casing, '<', $ARGV[2]) or do {
    print STDERR "Cannot open $ARGV[2]: $!\n";
    exit 1;
};

while (my $line = <$casing>) {
    chomp $line;

    next if $line =~ /^#/;
    next if $line =~ /^\s*$/;

    # Drop the trailing comment.
    $line =~ s/\s*#.*//;

    my @fields = split (/\s*;\s*/, $line, 30);
    my $raw_code = $fields[$CASE_CODE];

    # Records end with ';', so the split yields one empty trailing
    # field: last index 4 without a condition, 5 with one.
    if ($#fields != 4 && $#fields != 5) {
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $raw_code, $#fields);
        next;
    }

    # Ignore conditional special cases - we'll handle them manually
    next if defined $fields[5];

    my $code = hex ($raw_code);
    $upper[$code] = make_hex ($fields[$CASE_UPPER]);
    $lower[$code] = make_hex ($fields[$CASE_LOWER]);
    $title[$code] = make_hex ($fields[$CASE_TITLE]);
}

close $casing;
|
||||
|
||||
# Print the file header and the hand-written test cases.  The heredoc is
# interpolating, so $ARGV[0] and the \t and \x{...} escapes are expanded.
print <<EOT;
|
||||
# Test cases generated from Unicode $ARGV[0] data
|
||||
# by gen-case-tests.pl. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
#
|
||||
tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
# Test reordering of YPOGEGRAMMENI across other accents
|
||||
\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
|
||||
\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
|
||||
# Handling of final and nonfinal sigma
|
||||
ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
|
||||
ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
|
||||
ΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
|
||||
# Lithuanian rule of i followed by letter with dot. Not at all sure
|
||||
# about the titlecase part here
|
||||
lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
|
||||
lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
|
||||
lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
|
||||
lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
|
||||
lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
# Special case not at initial position
|
||||
\ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
|
||||
#
|
||||
# Now the automatic tests
|
||||
#
|
||||
EOT
|
||||
# Append the generated per-code-point lines.  Called with an explicit
# empty argument list; the bare &name; form would forward @_ implicitly.
print_tests();

exit 0;
|
||||
|
||||
# Record the case mappings of one UnicodeData.txt entry.
#
# Takes the numeric code point followed by the entry's fields.  Only the
# general categories Ll, Lu and Lt store anything in @upper, @lower and
# @title; entries of any other category are ignored.
sub process_one
{
    my ($cp, @entry) = @_;

    my $category = $entry[$CATEGORY];

    if ($category eq 'Ll') {
        # A lowercase letter is its own lowercase form.
        $upper[$cp] = make_hex ($entry[$UPPER]);
        $lower[$cp] = pack ("U", $cp);
        $title[$cp] = make_hex ($entry[$TITLE]);
    }
    elsif ($category eq 'Lu') {
        # An uppercase letter is its own uppercase form.
        $lower[$cp] = make_hex ($entry[$LOWER]);
        $upper[$cp] = pack ("U", $cp);
        $title[$cp] = make_hex ($entry[$TITLE]);
    }
    elsif ($category eq 'Lt') {
        # NOTE(review): the title mapping is taken from the LOWER field
        # here, not from TITLE - confirm whether that is intended.
        $upper[$cp] = make_hex ($entry[$UPPER]);
        $lower[$cp] = pack ("U", hex ($entry[$LOWER]));
        $title[$cp] = make_hex ($entry[$LOWER]);
    }
}
|
||||
|
||||
# Print one tab-separated test line (character, lower, title, upper,
# then the code point in hex as a comment) for every code point that has
# at least one entry in @lower, @title or @upper.
sub print_tests
{
    # The range is inclusive so that U+10FFFF, the last code point, is
    # visited too; the loop variable is lexical, not a package global.
    for my $cp (0 .. 0x10ffff) {
        # Greek sigma needs special tests
        next if $cp == 0x3A3;

        my $lower = $lower[$cp];
        my $title = $title[$cp];
        my $upper = $upper[$cp];

        next unless defined $upper || defined $lower || defined $title;

        # A missing mapping is written as an empty column.
        printf "\t%s\t%s\t%s\t%s\t# %4X\n",
            pack ("U", $cp),
            (defined $lower ? $lower : ""),
            (defined $title ? $title : ""),
            (defined $upper ? $upper : ""),
            $cp;
    }
}
|
||||
|
||||
# Convert a whitespace-separated list of hexadecimal code points into
# the corresponding character string.  Surrounding whitespace is
# ignored; an empty list or the single value "0" gives the empty string.
sub make_hex
{
    my ($spec) = @_;

    # Trim both ends in one pass.
    $spec =~ s/^\s+|\s+$//g;

    return "" if $spec eq "" || $spec eq "0";

    my @points = map { hex ($_) } split /\s+/, $spec;
    return pack ("U*", @points);
}
|
200
tests/gen-casemap-txt.py
Executable file
200
tests/gen-casemap-txt.py
Executable file
@ -0,0 +1,200 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
# Copyright (C) 2001 Red Hat Software
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
|
||||
See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
Usage: gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt
|
||||
I consider the output of this program to be unrestricted.
|
||||
Use it as you will.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
def main(argv):
    """Build the case mapping tables and print the test file to stdout.

    argv is the full argument vector, program name first.  The three
    positional arguments are the Unicode version (passed on to
    print_tests), the path of UnicodeData.txt and the path of
    SpecialCasing.txt.

    Raises SystemExit if a record in either file has an unexpected
    number of fields.
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data")
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("UnicodeData.txt")
    parser.add_argument("SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    # The argument names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename_udata = getattr(args, "UnicodeData.txt")
    filename_casing = getattr(args, "SpecialCasing.txt")

    # Names of fields in Unicode data table.
    CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
        DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
        COMMENT, UPPER, LOWER, TITLE = range(15)

    # Names of fields in the SpecialCasing table
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

    # Mapping tables keyed by integer code point.
    upper = {}
    title = {}
    lower = {}

    def make_hex(codes):
        """Converts a string of white space separated code points encoded as
        hex values to a Unicode string. Any extra white space is ignored.
        """
        return "".join([chr(int(c, 16)) for c in codes.split()])

    def process_one(code, fields):
        """Store the mappings of one UnicodeData.txt entry; only the
        categories Ll, Lu and Lt are recorded.
        """
        type_ = fields[CATEGORY]
        if type_ == "Ll":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lu":
            lower[code] = make_hex(fields[LOWER])
            upper[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lt":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = make_hex(fields[LOWER])
            title[code] = make_hex(fields[LOWER])

    with open(filename_udata, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            line = line.strip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CODE], len(fields)))

            code = int(fields[CODE], 16)

            if code > last_code + 1:
                # Found a gap
                if fields[NAME].endswith("Last>"):
                    # Fill the gap with the last character read,
                    # since this was a range specified in the char database.
                    # Work on a copy so that rewriting the CODE field
                    # below does not modify the record itself.
                    gfields = list(fields)
                else:
                    # The gap represents undefined characters.  Only the type
                    # matters.
                    gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '',
                               '', '', '', '']

                for gap_code in range(last_code + 1, code):
                    gfields[CODE] = "%04x" % gap_code
                    process_one(gap_code, gfields)

            process_one(code, fields)
            last_code = code

    with open(filename_casing, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CASE_CODE], len(fields)))

            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them manually
                continue

            code = int(fields[CASE_CODE], 16)

            # SpecialCasing entries replace whatever UnicodeData.txt gave.
            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)
|
||||
|
||||
|
||||
def print_tests(version, upper, title, lower):
    """Print the header with the hand-written tests, then one generated
    line for each code point that has an entry in upper, title or lower.

    version is inserted into the header; upper, title and lower map an
    integer code point to its mapped string.
    """
|
||||
print("""\
|
||||
# Test cases generated from Unicode {} data
|
||||
# by gen-casemap-txt.py. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
#
|
||||
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
# Test reordering of YPOGEGRAMMENI across other accents
|
||||
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
|
||||
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
|
||||
# Handling of final and nonfinal sigma
|
||||
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
|
||||
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
|
||||
\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
|
||||
# Lithuanian rule of i followed by letter with dot. Not at all sure
|
||||
# about the titlecase part here
|
||||
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
|
||||
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
|
||||
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
|
||||
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
|
||||
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
# Special case not at initial position
|
||||
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
|
||||
#
|
||||
# Now the automatic tests
|
||||
#""".format(version))
|
||||
|
||||
# One line per code point with at least one non-empty mapping.  The
# upper bound is 0x110000 so that U+10FFFF, the last code point, is
# covered as well.
for i in range(0x110000):
    if i == 0x3A3:
        # Greek sigma needs special tests
        continue

    # A missing mapping is written as an empty column.
    lo = lower.get(i, "")
    ti = title.get(i, "")
    up = upper.get(i, "")

    if lo or ti or up:
        print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
|
||||
|
||||
|
||||
# Script entry point: hand the full argument vector to main() and use
# its return value as the process exit status.
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv))
|
@ -57,7 +57,7 @@ int main (int argc, char **argv)
|
||||
|
||||
test = strings[1];
|
||||
|
||||
/* gen-casemap-txt.pl uses an empty string when a single character
|
||||
/* gen-casemap-txt.py uses an empty string when a single character
|
||||
* doesn't have an equivalent in a particular case; since that behavior
|
||||
* is nonsense for multicharacter strings, it would make more sense
|
||||
* to put the expected result .. the original character unchanged. But
|
||||
|
Loading…
Reference in New Issue
Block a user