#!/usr/bin/env python3
# Copyright (C) 1998, 1999 Tom Tromey
# Copyright (C) 2001 Red Hat Software
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""
gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.

See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html

Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

I consider the output of this program to be unrestricted.
Use it as you will.
"""

import sys
import argparse

# Names of fields in Unicode data table.
(
    CODE,
    NAME,
    CATEGORY,
    COMBINING_CLASSES,
    BIDI_CATEGORY,
    DECOMPOSITION,
    DECIMAL_VALUE,
    DIGIT_VALUE,
    NUMERIC_VALUE,
    MIRRORED,
    OLD_NAME,
    COMMENT,
    UPPER,
    LOWER,
    TITLE,
) = range(15)

# Names of fields in the SpecialCasing table
CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

# Highest valid Unicode code point (inclusive).
_MAX_CODE_POINT = 0x10FFFF

# GREEK CAPITAL LETTER SIGMA is covered by hand-written rows only.
_GREEK_CAPITAL_SIGMA = 0x3A3


def _make_hex(codes):
    """Convert white space separated hex code points to a string.

    Any extra white space is ignored; an empty field yields "".
    """
    return "".join(chr(int(c, 16)) for c in codes.split())


def _process_one(code, fields, upper, title, lower):
    """Record the case mappings of one UnicodeData.txt entry.

    Only entries whose general category is Ll, Lu or Lt are recorded;
    every other category leaves the three dictionaries untouched.

    Args:
        code: the code point as an integer.
        fields: the 15 stripped fields of the entry.
        upper, title, lower: dictionaries (code point -> mapped string)
            that are updated in place.
    """
    type_ = fields[CATEGORY]
    if type_ == "Ll":
        upper[code] = _make_hex(fields[UPPER])
        lower[code] = chr(code)
        title[code] = _make_hex(fields[TITLE])
    elif type_ == "Lu":
        lower[code] = _make_hex(fields[LOWER])
        upper[code] = chr(code)
        title[code] = _make_hex(fields[TITLE])
    elif type_ == "Lt":
        upper[code] = _make_hex(fields[UPPER])
        lower[code] = _make_hex(fields[LOWER])
        # NOTE(review): the title mapping is taken from the LOWER field,
        # which looks like a copy-paste slip. Kept unchanged so that the
        # generated output does not change - confirm the intent.
        title[code] = _make_hex(fields[LOWER])


def _read_unicode_data(filename, upper, title, lower):
    """Read UnicodeData.txt and fill in the simple case mappings.

    Code points missing between two consecutive entries are passed to
    _process_one() as well: with the fields of the closing entry when
    its name ends in "Last>" (a range), otherwise as category "Cn".

    Raises:
        SystemExit: if a non-blank line does not have exactly 15 fields.
    """
    with open(filename, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of reporting them as
                # entries with a wrong number of fields.
                continue
            fields = [f.strip() for f in line.split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CODE], len(fields))
                )
            code = int(fields[CODE], 16)
            if code > last_code + 1:
                # Found a gap
                if fields[NAME].endswith("Last>"):
                    # Fill the gap with the last character read,
                    # since this was a range specified in the char
                    # database. Work on a copy so that the real entry
                    # is not modified while the gap is filled.
                    gap_fields = list(fields)
                else:
                    # The gap represents undefined characters. Only
                    # the type matters.
                    gap_fields = ["", "", "Cn", "0"] + [""] * 11
                for gap_code in range(last_code + 1, code):
                    gap_fields[CODE] = "%04x" % gap_code
                    _process_one(gap_code, gap_fields, upper, title, lower)
            _process_one(code, fields, upper, title, lower)
            last_code = code


def _read_special_casing(filename, upper, title, lower):
    """Read SpecialCasing.txt and override mappings with the full ones.

    Comments and empty lines are skipped. Entries that carry a condition
    (five fields) are ignored; they are handled by hand-written rows.

    Raises:
        SystemExit: if an entry has neither four nor five fields.
    """
    with open(filename, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue
            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CASE_CODE], len(fields))
                )
            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them
                # manually
                continue
            code = int(fields[CASE_CODE], 16)
            upper[code] = _make_hex(fields[CASE_UPPER])
            lower[code] = _make_hex(fields[CASE_LOWER])
            title[code] = _make_hex(fields[CASE_TITLE])


def main(argv):
    """Parse the command line, read both data files and print the tests.

    Args:
        argv: full argument vector; argv[0] is the program name, followed
            by UNICODE-VERSION, UnicodeData.txt and SpecialCasing.txt.

    Raises:
        SystemExit: on bad arguments (raised by argparse) or on a
            malformed entry in one of the data files.
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data"
    )
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("UnicodeData.txt")
    parser.add_argument("SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    # The argument names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename_udata = getattr(args, "UnicodeData.txt")
    filename_casing = getattr(args, "SpecialCasing.txt")

    upper = {}
    title = {}
    lower = {}

    # SpecialCasing.txt is read second so that its unconditional entries
    # replace the mappings taken from UnicodeData.txt.
    _read_unicode_data(filename_udata, upper, title, lower)
    _read_special_casing(filename_casing, upper, title, lower)

    print_tests(version, upper, title, lower)


def print_tests(version, upper, title, lower):
    """Print the hand-crafted header followed by the generated rows.

    Args:
        version: Unicode version string inserted into the header.
        upper, title, lower: dictionaries mapping an integer code point
            to its upper/title/lower case string.

    One row is printed, in ascending code point order, for every code
    point up to and including U+10FFFF that has at least one non-empty
    mapping. U+03A3 is skipped because sigma has hand-written rows.
    """
    # NOTE(review): the three sigma rows below are space-separated while
    # every other row uses "\t"; verify their separators against the
    # expected output format.
    print(
        """\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
#
# Now the automatic tests
#""".format(
            version
        )
    )

    # Only code points present in at least one table can produce a row,
    # so walk those in ascending order instead of the whole code space.
    for i in sorted(upper.keys() | lower.keys() | title.keys()):
        if not 0 <= i <= _MAX_CODE_POINT:
            continue
        if i == _GREEK_CAPITAL_SIGMA:
            # Greek sigma needs special tests
            continue
        up = upper.get(i, "")
        lo = lower.get(i, "")
        ti = title.get(i, "")
        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))


if __name__ == "__main__":
    sys.exit(main(sys.argv))