glib/glib/update-gtranslit.py

#!/usr/bin/env python3

# Run this script like so:
#
#  ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h

import sys, os

localedir = sys.argv[1]

# returns true if the name looks like a POSIX locale name
def looks_like_locale(name):
    name, _, variant = name.partition('@')

    if '_' not in name:
        return False

    lang, _, land = name.partition('_')

    return len(lang) == 2 or len(lang) == 3 and len(land) == 2

# handles <U1234> style escapes
def unescape(string):
    chunks = []

    n = len(string)
    i = 0

    while i < n:
        start_escape = string.find('<', i)

        if start_escape == -1:
            chunks.append(string[i:])
            break

        assert string[start_escape:start_escape + 2] == '<U'
        start_escape += 2

        end_escape = string.find('>', start_escape)
        assert end_escape != -1

        chunks.append(chr(int(string[start_escape:end_escape], 16)))
        i = end_escape + 1

    return ''.join(chunks)

# Checks if a string is ascii
def is_ascii(string):
    return all(ord(c) < 0x80 for c in string)

# A Mapping is a map from non-ascii strings to ascii strings.
#
# It corresponds to a sequence of one or more mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# in a file.
class Mapping:
    def __init__(self):
        self.serialised = None
        self.mapping = {}

    # Scans a string like
    #
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # and adds the first all-ascii choice (or IGNORE) to the mapping
    # dictionary, with the origin string as the key.  In the case of
    # IGNORE, stores the empty string.
    def consider_mapping_line(self, line):
        key, value, rest = (line + ' % comment').split(maxsplit=2)

        key = unescape(key)

        for alternative in value.split(';'):
            if alternative[0] == '"' and alternative[-1] == '"':
                unescaped = unescape(alternative[1:-1])
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative[0] == '<' and alternative[-1] == '>':
                unescaped = unescape(alternative)
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative == 'IGNORE':
                self.mapping[key] = ''
                break

    # Performs a normal dictionary merge, but ensures that there are no
    # conflicting entries between the original dictionary and the requested
    # changes
    def merge_mapping(self, changes):
        for key in changes.mapping:
            if key in self.mapping:
                assert self.mapping[key] == changes.mapping[key]

        self.mapping.update(changes.mapping)

    # Can't get much flatter...
    def get_flattened(self):
        return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised

# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The order of lookup is reverse: later entries override earlier ones.
class Chain:
    def __init__(self, name):
        self.serialised = None
        self.name = name
        self.chain = []
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    def read_from_file(self, filename):
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        fp = open(filename, encoding='ascii', errors='surrogateescape')

        for line in fp:
            line = line.strip()

            if in_lc_ctype:
                if line == 'END LC_CTYPE':
                    break

                if line.startswith('copy') or line.startswith('include'):
                    if current_mapping:
                        self.chain.append(current_mapping)

                    copyname = unescape(line.split('"', 3)[1])
                    copyfile = get_chain(copyname)
                    self.chain.append(copyfile)
                    copyfile.links += 1

                    current_mapping = None

                elif line == 'translit_start':
                    in_translit = True

                elif line == 'translit_end':
                    in_translit = False

                elif in_translit and line.startswith('<U'):
                    if not current_mapping:
                        current_mapping = Mapping()

                    current_mapping.consider_mapping_line(line)

                elif line == '' or line.startswith('%'):
                    pass

                elif 'default_missing <U003F>':
                    pass

                elif in_translit:
                    print('unknown line:', line)
                    assert False

            elif line == 'LC_CTYPE':
                in_lc_ctype = True

        if current_mapping:
            self.chain.append(current_mapping)

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly.  Otherwise, return ourselves.
    def get_flattened(self):
        if self.links == 1:
            return sum((item.get_flattened() for item in self.chain), [])
        else:
            return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = sum((item.get_flattened() for item in self.chain), [])

            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(self.chain[i + 1], Mapping):
                    # We have two mappings in a row.  Try to merge them.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised

# Chain cache -- allows sharing of common chains
chains = {}
def get_chain(name):
    if not name in chains:
        chains[name] = Chain(name)

    return chains[name]


# Remove the country name from a locale, preserving variant
# eg: 'sr_RS@latin' -> 'sr@latin'
def remove_country(string):
    base, at, variant = string.partition('@')
    lang, _, land = base.partition('_')
    return lang + at + variant

def encode_range(start, end):
    assert start <= end
    length = end - start

    assert start < 0x1000
    assert length < 0x8

    result = 0x8000 + (length << 12) + start

    assert result < 0x10000

    return result

def c_pair_array(array):
    return '{ ' + ', '.join ('{ %u, %u }' % pair for pair in array) + ' };'

class Serialiser:
    def __init__(self):
        self.mappings = []
        self.chains = []
        self.locales = {}

    def add_mapping(self, mapping):
        if mapping in self.mappings:
            mapping_id = self.mappings.index(mapping)
        else:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        assert mapping_id < 128
        return mapping_id

    def add_chain(self, chain):
        if chain in self.chains:
            chain_id = self.chains.index(chain)
        else:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        return 128 + chain_id

    def add_locale(self, name, item_id):
        self.locales[name] = item_id

    def add_default(self, item_id):
        self.default = item_id

    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = list(set(remove_country(locale) for locale in self.locales))

        for language in languages:
            locales = [locale for locale in self.locales if remove_country(locale) == language]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                self.locales[language] = item_id
                for locale in locales:
                    del self.locales[locale]

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in list(locale for locale in self.locales if '@' in locale):
            base, _, _ = variant.partition('@')
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]

    def to_c(self):
        src_table = ''
        ascii_table = ''
        mappings_table = []
        mapping_ranges = []
        chains_table = []
        chain_starts = []
        locale_names = ''
        locale_index = []
        max_lookup = 0
        max_localename = 0

        for mapping in self.mappings:
            mapping_ranges.append ((len(mappings_table), len(mapping)))

            for key in sorted(mapping):
                if len(key) == 1 and ord(key[0]) < 0x8000:
                    src_range = ord(key[0])
                else:
                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10ffff for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                        max_lookup = max(max_lookup, len(key))
                    else:
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                if len(value) == 1 and ord(value[0]) < 0x80:
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append ((src_range, ascii_range))

            mapping_end = len(mappings_table)

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            for item_id in reversed(chain):
                assert item_id < 0xff
                chains_table.append(item_id)
            chains_table.append(0xff)

        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7f for c in locale)
            locale_names += (locale + '\0')

            item_id = self.locales[locale]

            assert name_offset < 256
            assert item_id < 256
            locale_index.append((name_offset, item_id))

        print('/* Generated by update-gtranslit.py */')
        print('#define MAX_KEY_SIZE', max_lookup)
        print('#define MAX_LOCALE_NAME', max_localename)
        print('static const gunichar src_table[] = {', ', '.join(str(ord(c)) for c in src_table), '};')
        # cannot do this in plain ascii because of trigraphs... :(
        print('static const gchar ascii_table[] = {', ', '.join(str(ord(c)) for c in ascii_table), '};')
        print('static const struct mapping_entry mappings_table[] =', c_pair_array (mappings_table))
        print('static const struct mapping_range mapping_ranges[] =', c_pair_array (mapping_ranges))
        print('static const guint8 chains_table[] = {', ', '.join(str(i) for i in chains_table), '};')
        print('static const guint8 chain_starts[] = {', ', '.join(str(i) for i in chain_starts), '};')
        print('static const gchar locale_names[] = "' + locale_names.replace('\0', '\\0') + '";')
        print('static const struct locale_entry locale_index[] = ', c_pair_array (locale_index))
        print('static const guint8 default_item_id = %u;' % (self.default,))

    def dump(self):
        print(self.mappings)
        print(self.chains)
        print(self.locales)

locales = []
for name in os.listdir(localedir):
    if looks_like_locale(name):
        chain = get_chain(name)
        locales.append (chain)
        chain.links += 1

serialiser = Serialiser()

for locale in locales:
    serialiser.add_locale(locale.name, locale.serialise(serialiser))

i18n = get_chain('i18n').serialise(serialiser)
combining = get_chain('translit_combining').serialise(serialiser)
serialiser.add_default(serialiser.add_chain([i18n, combining]))

serialiser.optimise_locales()

serialiser.to_c()