mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-07-30 13:53:30 +02:00
Merge test/unicode-caseconc.c into glib/tests/unicode.c
Related to issue #1434
This commit is contained in:
1499
tests/casefold.txt
1499
tests/casefold.txt
File diff suppressed because it is too large
Load Diff
4023
tests/casemap.txt
4023
tests/casemap.txt
File diff suppressed because it is too large
Load Diff
@@ -1,83 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
# Copyright (C) 2001 Red Hat Software
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
gen-casefold-txt.py - Generate test cases for casefolding from Unicode data.
|
||||
See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
Usage:
|
||||
I consider the output of this program to be unrestricted.
|
||||
Use it as you will.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
def main(argv):
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate test cases for casefolding from Unicode data"
|
||||
)
|
||||
parser.add_argument("UNICODE-VERSION")
|
||||
parser.add_argument("CaseFolding.txt")
|
||||
args = parser.parse_args(argv[1:])
|
||||
version = getattr(args, "UNICODE-VERSION")
|
||||
filename = getattr(args, "CaseFolding.txt")
|
||||
|
||||
print(
|
||||
"""\
|
||||
# Test cases generated from Unicode {} data
|
||||
# by gen-casefold-txt.py. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
#
|
||||
AaBbCc@@\taabbcc@@
|
||||
#
|
||||
# Now the automatic tests
|
||||
#""".format(
|
||||
version
|
||||
)
|
||||
)
|
||||
|
||||
# Names of fields in the CaseFolding table
|
||||
CODE, STATUS, MAPPING = range(3)
|
||||
|
||||
with open(filename, encoding="utf-8") as fileobj:
|
||||
for line in fileobj:
|
||||
# strip comments and skip empty lines
|
||||
line = line.split("#", 1)[0].strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
fields = [f.strip() for f in line.split(";", 3)[:3]]
|
||||
if len(fields) != 3:
|
||||
raise SystemExit(
|
||||
"Entry for %s has wrong number of fields (%d)"
|
||||
% (fields[CODE], len(fields))
|
||||
)
|
||||
|
||||
status = fields[STATUS]
|
||||
# skip simple and Turkic mappings
|
||||
if status in "ST":
|
||||
continue
|
||||
|
||||
code = chr(int(fields[CODE], 16))
|
||||
values = "".join([chr(int(v, 16)) for v in fields[MAPPING].split()])
|
||||
print("{}\t{}".format(code, values))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv))
|
@@ -1,240 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
# Copyright (C) 2001 Red Hat Software
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
|
||||
See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
Usage:
|
||||
I consider the output of this program to be unrestricted.
|
||||
Use it as you will.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
# Disable line length warnings as wrapping the test templates would be hard
|
||||
# flake8: noqa: E501
|
||||
|
||||
|
||||
def main(argv):
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate test cases for case mapping from Unicode data"
|
||||
)
|
||||
parser.add_argument("UNICODE-VERSION")
|
||||
parser.add_argument("UnicodeData.txt")
|
||||
parser.add_argument("SpecialCasing.txt")
|
||||
args = parser.parse_args(argv[1:])
|
||||
version = getattr(args, "UNICODE-VERSION")
|
||||
filename_udata = getattr(args, "UnicodeData.txt")
|
||||
filename_casing = getattr(args, "SpecialCasing.txt")
|
||||
|
||||
# Names of fields in Unicode data table.
|
||||
(
|
||||
CODE,
|
||||
NAME,
|
||||
CATEGORY,
|
||||
COMBINING_CLASSES,
|
||||
BIDI_CATEGORY,
|
||||
DECOMPOSITION,
|
||||
DECIMAL_VALUE,
|
||||
DIGIT_VALUE,
|
||||
NUMERIC_VALUE,
|
||||
MIRRORED,
|
||||
OLD_NAME,
|
||||
COMMENT,
|
||||
UPPER,
|
||||
LOWER,
|
||||
TITLE,
|
||||
) = range(15)
|
||||
|
||||
# Names of fields in the SpecialCasing table
|
||||
CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)
|
||||
|
||||
upper = {}
|
||||
title = {}
|
||||
lower = {}
|
||||
|
||||
def make_hex(codes):
|
||||
"""Converts a string of white space separated code points encoded as
|
||||
hex values to a Unicode string. Any extra white space is ignored.
|
||||
"""
|
||||
return "".join([chr(int(c, 16)) for c in codes.split()])
|
||||
|
||||
def process_one(code, fields):
|
||||
type_ = fields[CATEGORY]
|
||||
if type_ == "Ll":
|
||||
upper[code] = make_hex(fields[UPPER])
|
||||
lower[code] = chr(code)
|
||||
title[code] = make_hex(fields[TITLE])
|
||||
elif type_ == "Lu":
|
||||
lower[code] = make_hex(fields[LOWER])
|
||||
upper[code] = chr(code)
|
||||
title[code] = make_hex(fields[TITLE])
|
||||
elif type_ == "Lt":
|
||||
upper[code] = make_hex(fields[UPPER])
|
||||
lower[code] = make_hex(fields[LOWER])
|
||||
title[code] = make_hex(fields[LOWER])
|
||||
|
||||
with open(filename_udata, encoding="utf-8") as fileobj:
|
||||
last_code = -1
|
||||
for line in fileobj:
|
||||
line = line.strip()
|
||||
fields = [f.strip() for f in line.split(";")]
|
||||
if len(fields) != 15:
|
||||
raise SystemExit(
|
||||
"Entry for %s has wrong number of fields (%d)"
|
||||
% (fields[CODE], len(fields))
|
||||
)
|
||||
|
||||
code = int(fields[CODE], 16)
|
||||
|
||||
if code > last_code + 1:
|
||||
# Found a gap
|
||||
if fields[NAME].endswith("Last>"):
|
||||
# Fill the gap with the last character read,
|
||||
# since this was a range specified in the char database
|
||||
gfields = fields
|
||||
else:
|
||||
# The gap represents undefined characters. Only the type
|
||||
# matters.
|
||||
gfields = [
|
||||
"",
|
||||
"",
|
||||
"Cn",
|
||||
"0",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
]
|
||||
|
||||
last_code += 1
|
||||
while last_code < code:
|
||||
gfields[CODE] = "%04x" % last_code
|
||||
process_one(last_code, gfields)
|
||||
last_code += 1
|
||||
|
||||
process_one(code, fields)
|
||||
last_code = code
|
||||
|
||||
with open(filename_casing, encoding="utf-8") as fileobj:
|
||||
last_code = -1
|
||||
for line in fileobj:
|
||||
# strip comments and skip empty lines
|
||||
line = line.split("#", 1)[0].strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# all lines end with ";" so just remove it
|
||||
line = line.rstrip(";").rstrip()
|
||||
fields = [f.strip() for f in line.split(";")]
|
||||
if len(fields) not in (4, 5):
|
||||
raise SystemExit(
|
||||
"Entry for %s has wrong number of fields (%d)"
|
||||
% (fields[CASE_CODE], len(fields))
|
||||
)
|
||||
|
||||
if len(fields) == 5:
|
||||
# Ignore conditional special cases - we'll handle them manually
|
||||
continue
|
||||
|
||||
code = int(fields[CASE_CODE], 16)
|
||||
|
||||
upper[code] = make_hex(fields[CASE_UPPER])
|
||||
lower[code] = make_hex(fields[CASE_LOWER])
|
||||
title[code] = make_hex(fields[CASE_TITLE])
|
||||
|
||||
print_tests(version, upper, title, lower)
|
||||
|
||||
|
||||
def print_tests(version, upper, title, lower):
|
||||
print(
|
||||
"""\
|
||||
# Test cases generated from Unicode {} data
|
||||
# by gen-casemap-txt.py. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
#
|
||||
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
# Test reordering of YPOGEGRAMMENI across other accents
|
||||
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
|
||||
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
|
||||
# Handling of final and nonfinal sigma
|
||||
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ \t
|
||||
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ\t
|
||||
\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ\t
|
||||
# Lithuanian rule of i followed by letter with dot. Not at all sure
|
||||
# about the titlecase part here
|
||||
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
|
||||
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
|
||||
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
|
||||
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
|
||||
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
# Special case not at initial position
|
||||
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
|
||||
#
|
||||
# Now the automatic tests
|
||||
#""".format(
|
||||
version
|
||||
)
|
||||
)
|
||||
|
||||
for i in range(0x10FFFF):
|
||||
if i == 0x3A3:
|
||||
# Greek sigma needs special tests
|
||||
continue
|
||||
|
||||
up = upper.get(i, "")
|
||||
lo = lower.get(i, "")
|
||||
ti = title.get(i, "")
|
||||
|
||||
if any([up, lo, ti]):
|
||||
print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv))
|
@@ -32,7 +32,6 @@ tests = {
|
||||
'thread-test' : {},
|
||||
'threadpool-test' : {'suite' : ['slow']},
|
||||
'type-test' : {},
|
||||
'unicode-caseconv' : {},
|
||||
'unicode-encoding' : {},
|
||||
'module-test-library' : {
|
||||
'dependencies' : [libgmodule_dep],
|
||||
@@ -73,8 +72,6 @@ endif
|
||||
if installed_tests_enabled
|
||||
install_data(
|
||||
'iochannel-test-infile',
|
||||
'casemap.txt',
|
||||
'casefold.txt',
|
||||
'utf8.txt',
|
||||
install_dir : installed_tests_execdir,
|
||||
)
|
||||
|
@@ -1,131 +0,0 @@
|
||||
#undef G_DISABLE_ASSERT
|
||||
#undef G_LOG_DOMAIN
|
||||
|
||||
#include <locale.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <glib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main (int argc, char **argv)
|
||||
{
|
||||
FILE *infile;
|
||||
char buffer[1024];
|
||||
char **strings;
|
||||
char *filename;
|
||||
const char *locale;
|
||||
const char *test;
|
||||
const char *expected;
|
||||
char *convert;
|
||||
char *current_locale = setlocale (LC_CTYPE, NULL);
|
||||
gint result = 0;
|
||||
|
||||
g_test_init (&argc, &argv, NULL);
|
||||
|
||||
filename = g_test_build_filename (G_TEST_DIST, "casemap.txt", NULL);
|
||||
|
||||
infile = fopen (filename, "r");
|
||||
if (!infile)
|
||||
{
|
||||
fprintf (stderr, "Failed to open %s\n", filename );
|
||||
exit (1);
|
||||
}
|
||||
|
||||
while (fgets (buffer, sizeof(buffer), infile))
|
||||
{
|
||||
if (buffer[0] == '#')
|
||||
continue;
|
||||
|
||||
strings = g_strsplit (buffer, "\t", -1);
|
||||
|
||||
locale = strings[0];
|
||||
|
||||
if (!locale[0])
|
||||
locale = "C";
|
||||
|
||||
if (strcmp (locale, current_locale) != 0)
|
||||
{
|
||||
setlocale (LC_CTYPE, locale);
|
||||
current_locale = setlocale (LC_CTYPE, NULL);
|
||||
|
||||
if (strncmp (current_locale, locale, 2) != 0)
|
||||
{
|
||||
fprintf (stderr, "Cannot set locale to %s, skipping\n", locale);
|
||||
goto next;
|
||||
}
|
||||
}
|
||||
|
||||
test = strings[1];
|
||||
|
||||
/* gen-casemap-txt.py uses an empty string when a single character
|
||||
* doesn't have an equivalent in a particular case; since that behavior
|
||||
* is nonsense for multicharacter strings, it would make more sense
|
||||
* to put the expected result .. the original character unchanged. But
|
||||
* for now, we just work around it here and take the empty string to mean
|
||||
* "same as original"
|
||||
*/
|
||||
|
||||
convert = g_utf8_strup (test, -1);
|
||||
expected = strings[4][0] ? strings[4] : test;
|
||||
if (strcmp (convert, expected) != 0)
|
||||
{
|
||||
fprintf (stderr, "Failure: toupper(%s) == %s, should have been %s\n",
|
||||
test, convert, expected);
|
||||
result = 1;
|
||||
}
|
||||
g_free (convert);
|
||||
|
||||
convert = g_utf8_strdown (test, -1);
|
||||
expected = strings[2][0] ? strings[2] : test;
|
||||
if (strcmp (convert, expected) != 0)
|
||||
{
|
||||
fprintf (stderr, "Failure: tolower(%s) == %s, should have been %s\n",
|
||||
test, convert, expected);
|
||||
result = 1;
|
||||
}
|
||||
g_free (convert);
|
||||
|
||||
next:
|
||||
g_strfreev (strings);
|
||||
}
|
||||
|
||||
fclose (infile);
|
||||
|
||||
g_free (filename);
|
||||
filename = g_test_build_filename (G_TEST_DIST, "casefold.txt", NULL);
|
||||
|
||||
infile = fopen (filename, "r");
|
||||
if (!infile)
|
||||
{
|
||||
fprintf (stderr, "Failed to open %s\n", filename );
|
||||
g_free (filename);
|
||||
exit (1);
|
||||
}
|
||||
|
||||
while (fgets (buffer, sizeof(buffer), infile))
|
||||
{
|
||||
if (buffer[0] == '#')
|
||||
continue;
|
||||
|
||||
buffer[strlen(buffer) - 1] = '\0';
|
||||
strings = g_strsplit (buffer, "\t", -1);
|
||||
|
||||
test = strings[0];
|
||||
|
||||
convert = g_utf8_casefold (test, -1);
|
||||
if (strcmp (convert, strings[1]) != 0)
|
||||
{
|
||||
fprintf (stderr, "Failure: casefold(%s) == '%s', should have been '%s'\n",
|
||||
test, convert, strings[1]);
|
||||
result = 1;
|
||||
}
|
||||
g_free (convert);
|
||||
|
||||
g_strfreev (strings);
|
||||
}
|
||||
|
||||
fclose (infile);
|
||||
g_free (filename);
|
||||
|
||||
return result;
|
||||
}
|
Reference in New Issue
Block a user