Add locale-sensitive ASCII transliteration API

Add a new function, g_str_to_ascii(), that does locale-dependent ASCII transliteration of UTF-8 strings. This function works off of an internal database. We get the data out of the localedata shipped with glibc, which seems to be just about the best source of locale-sensitive transliteration information available anywhere. We include an update script with this commit that's not used by anything at all -- it will just sit in git. It is intended to be run manually from time to time. https://bugzilla.gnome.org/show_bug.cgi?id=710142
2024-12-25 15:06:14 +01:00 · 2014-02-17 11:54:18 -05:00 · 2014-02-17 11:54:18 -05:00 · 941b8979d0
commit 941b8979d0
parent 436d77f70a
6 changed files with 837 additions and 0 deletions
--- a/docs/reference/glib/glib-sections.txt
+++ b/docs/reference/glib/glib-sections.txt
@ -1282,6 +1282,7 @@ g_strrstr_len
 g_str_has_prefix
 g_str_has_suffix
 g_strcmp0
+g_str_to_ascii
 g_str_tokenize_and_fold
 g_str_match_string

--- a/glib/Makefile.am
+++ b/glib/Makefile.am
@ -172,6 +172,7 @@ libglib_2_0_la_SOURCES = 	\
 	gthreadpool.c		\
 	gtimer.c		\
 	gtimezone.c	 	\
+	gtranslit.c		\
 	gtrashstack.c		\
 	gtree.c			\
 	guniprop.c		\
--- a/glib/gstrfuncs.h
+++ b/glib/gstrfuncs.h
@ -287,6 +287,10 @@ GLIB_AVAILABLE_IN_ALL
 gchar*                g_stpcpy         (gchar        *dest,
                                        const char   *src);

+GLIB_AVAILABLE_IN_2_40
+gchar *                 g_str_to_ascii                                  (const gchar   *str,
+                                                                         const gchar   *from_locale);
+
 GLIB_AVAILABLE_IN_2_40
 gchar **                g_str_tokenize_and_fold                         (const gchar   *string,
                                                                         const gchar   *translit_locale,
--- a/glib/gtranslit-data.h
+++ b/glib/gtranslit-data.h
--- a/glib/gtranslit.c
+++ b/glib/gtranslit.c
@ -0,0 +1,409 @@
+/*
+ * Copyright © 2014 Canonical Limited
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the licence, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ryan Lortie <desrt@desrt.ca>
+ */
+
+#include <config.h>
+
+#include "gstrfuncs.h"
+
+#include <glib.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct mapping_entry   /* One (source key -> ASCII replacement) rule. */
+{
+  guint16 src;         /* Key: the character itself, or (when bit 0x8000 is
+                        * set) an offset/length reference into src_table. */
+  guint16 ascii;       /* Replacement: a single character stored directly, or
+                        * (bit 0x8000 set) a reference into ascii_table. */
+};
+
+struct mapping_range   /* Locates one mapping inside mappings_table. */
+{
+  guint16 start;       /* Index of the mapping's first entry in mappings_table. */
+  guint16 length;      /* Number of entries, as handed to bsearch(). */
+};
+
+struct locale_entry    /* Associates a locale name with a lookup item. */
+{
+  guint8 name_offset;  /* Offset of the NUL-terminated name in locale_names. */
+  guint8 item_id;      /* Item id as understood by lookup_in_item(). */
+};
+
+#include "gtranslit-data.h"
+
+/* Helpers for decoding the 16-bit values stored in struct mapping_entry.
+ *
+ * When bit 0x8000 is clear the value is the character itself.  When it
+ * is set, bits 0-11 are an offset into a side table and bits 12-14 are
+ * the number of items stored there.
+ *
+ * Every macro argument is parenthesised so that compound expressions
+ * expand with the intended precedence.
+ */
+#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
+#define get_length(encoded)                 (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)
+
+/* For a directly stored character this yields a pointer to the byte of
+ * @encoded that holds it, so @encoded must be an lvalue.  Which byte
+ * that is depends on the host byte order.
+ */
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
+#else
+#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
+#endif
+
+static const gchar * lookup_in_item (guint           item_id,
+                                     const gunichar *key,
+                                     gint           *result_len,
+                                     gint           *key_consumed);
+
+/* bsearch() comparator for a mapping.
+ *
+ * @user_data is the key being searched for (MAX_KEY_SIZE code points,
+ * with key[1] set to 0 when only one character is being looked up) and
+ * @data is the struct mapping_entry it is compared against.
+ *
+ * Returns a negative, zero or positive value when the key sorts
+ * before, equal to or after the entry.
+ */
+static gint
+compare_mapping_entry (gconstpointer user_data,
+                       gconstpointer data)
+{
+  const gunichar *wanted = user_data;
+  const struct mapping_entry *candidate = data;
+  gunichar first;
+  gunichar second;
+
+  G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
+
+  first = get_src_char (src_table, candidate->src, 0);
+  if (wanted[0] != first)
+    return (wanted[0] > first) ? 1 : -1;
+
+  /* A one-character entry sorts before any two-character key that
+   * shares its first character.
+   */
+  if (get_length (candidate->src) <= 1)
+    return wanted[1] ? 1 : 0;
+
+  second = get_src_char (src_table, candidate->src, 1);
+  if (wanted[1] != second)
+    return (wanted[1] > second) ? 1 : -1;
+
+  return 0;
+}
+
+/* Binary-searches the @mapping_size entries starting at @mapping for @key.
+ *
+ * On a hit, stores the number of key characters the entry covers in
+ * @key_consumed and the length of the replacement in @result_len, and
+ * returns a pointer to the replacement (which is not NUL-terminated).
+ * Returns NULL, leaving the out-parameters untouched, when there is no
+ * entry for @key.
+ */
+static const gchar *
+lookup_in_mapping (const struct mapping_entry *mapping,
+                   gint                        mapping_size,
+                   const gunichar             *key,
+                   gint                       *result_len,
+                   gint                       *key_consumed)
+{
+  const struct mapping_entry *match;
+
+  match = bsearch (key, mapping, mapping_size, sizeof *mapping, compare_mapping_entry);
+
+  if (match != NULL)
+    {
+      *key_consumed = get_length (match->src);
+      *result_len = get_length (match->ascii);
+
+      return get_ascii_item(ascii_table, match->ascii);
+    }
+
+  return NULL;
+}
+
+/* Tries each item id listed in @chain (terminated by 0xff) in order and
+ * returns the first successful lookup of @key, or NULL if none of the
+ * items has an entry for it.
+ */
+static const gchar *
+lookup_in_chain (const guint8   *chain,
+                 const gunichar *key,
+                 gint           *result_len,
+                 gint           *key_consumed)
+{
+  const guint8 *link;
+
+  for (link = chain; *link != 0xff; link++)
+    {
+      const gchar *found;
+
+      found = lookup_in_item (*link, key, result_len, key_consumed);
+
+      if (found != NULL)
+        return found;
+    }
+
+  return NULL;
+}
+
+/* Looks up @key in the item identified by @item_id.
+ *
+ * Ids with bit 0x80 set name a chain (the low seven bits index
+ * chain_starts); all other ids index mapping_ranges directly.
+ */
+static const gchar *
+lookup_in_item (guint           item_id,
+                const gunichar *key,
+                gint           *result_len,
+                gint           *key_consumed)
+{
+  const struct mapping_range *range;
+
+  if (item_id & 0x80)
+    return lookup_in_chain (chains_table + chain_starts[item_id & 0x7f],
+                            key, result_len, key_consumed);
+
+  range = &mapping_ranges[item_id];
+
+  return lookup_in_mapping (mappings_table + range->start, range->length,
+                            key, result_len, key_consumed);
+}
+
+/* bsearch() comparator for locale_index: @user_data is the locale name
+ * being searched for, @data the struct locale_entry to compare against.
+ */
+static gint
+compare_locale_entry (gconstpointer user_data,
+                      gconstpointer data)
+{
+  const gchar *wanted = user_data;
+  const struct locale_entry *candidate = data;
+  const gchar *candidate_name = locale_names + candidate->name_offset;
+
+  return strcmp (wanted, candidate_name);
+}
+
+/* Searches locale_index for an exact match of @key.
+ *
+ * Returns TRUE and stores the matching id in @item_id on success;
+ * returns FALSE and leaves @item_id untouched otherwise.
+ */
+static gboolean
+lookup_item_id_for_one_locale (const gchar *key,
+                               guint       *item_id)
+{
+  const struct locale_entry *match;
+
+  match = bsearch (key, locale_index, G_N_ELEMENTS (locale_index),
+                   sizeof locale_index[0], compare_locale_entry);
+
+  if (match != NULL)
+    {
+      *item_id = match->item_id;
+      return TRUE;
+    }
+
+  return FALSE;
+}
+
+/* Maps a POSIX locale name to the id of the item to use for
+ * transliteration.
+ *
+ * The codeset part of @locale is ignored.  Returns default_item_id when
+ * @locale is NULL, cannot be parsed, or has no entry in the database.
+ */
+static guint
+lookup_item_id_for_locale (const gchar *locale)
+{
+  gchar key[MAX_LOCALE_NAME + 1];
+  const gchar *language;
+  guint language_len;
+  const gchar *territory = NULL;
+  guint territory_len = 0;
+  const gchar *modifier = NULL;
+  guint modifier_len = 0;
+  const gchar *next_char;
+  guint id;
+
+  /* setlocale() is allowed to return NULL: treat that as "unknown". */
+  if (locale == NULL)
+    return default_item_id;
+
+  /* As per POSIX, a valid locale looks like:
+   *
+   *   language[_territory][.codeset][@modifier]
+   *
+   * territory and modifier are recorded including their leading
+   * separator so that they can be appended to the language verbatim.
+   */
+  language = locale;
+  language_len = strcspn (language, "_.@");
+  next_char = language + language_len;
+
+  if (*next_char == '_')
+    {
+      territory = next_char;
+      territory_len = strcspn (territory + 1, "_.@") + 1;
+      next_char = territory + territory_len;
+    }
+
+  if (*next_char == '.')
+    {
+      /* Skip the codeset.  Only '@' can end it: codeset names such as
+       * "ISO_8859-1" legitimately contain '_' (and may contain '.').
+       */
+      next_char += strcspn (next_char + 1, "@") + 1;
+    }
+
+  if (*next_char == '@')
+    {
+      modifier = next_char;
+      modifier_len = strcspn (modifier + 1, "_.@") + 1;
+      next_char = modifier + modifier_len;
+    }
+
+  /* What madness is this? */
+  if (language_len == 0 || *next_char)
+    return default_item_id;
+
+  /* We are not interested in codeset.
+   *
+   * We have no locales of the form aa_BB@cc in the database, so for
+   * this locale:
+   *
+   *  aa_BB@cc
+   *
+   * try in this order:
+   *
+   *  1. aa@cc
+   *  2. aa_BB
+   *  3. aa
+   *
+   * Candidates that could not fit in the key buffer cannot be in the
+   * database either, so they are skipped.
+   */
+
+  /* 1. */
+  if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
+    {
+      memcpy (key, language, language_len);
+      memcpy (key + language_len, modifier, modifier_len);
+      key[language_len + modifier_len] = '\0';
+
+      if (lookup_item_id_for_one_locale (key, &id))
+        return id;
+    }
+
+  /* 2. */
+  if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
+    {
+      memcpy (key, language, language_len);
+      memcpy (key + language_len, territory, territory_len);
+      key[language_len + territory_len] = '\0';
+
+      if (lookup_item_id_for_one_locale (key, &id))
+        return id;
+    }
+
+  /* 3. */
+  if (language_len <= MAX_LOCALE_NAME)
+    {
+      memcpy (key, language, language_len);
+      key[language_len] = '\0';
+
+      if (lookup_item_id_for_one_locale (key, &id))
+        return id;
+    }
+
+  return default_item_id;
+}
+
+/* Returns the item id for the current LC_CTYPE locale.
+ *
+ * The locale is queried on the first call only; the result is cached
+ * in function-local statics for subsequent calls.
+ */
+static guint
+get_default_item_id (void)
+{
+  static gboolean done;
+  static guint item_id;
+
+  /* Doesn't need to be locked -- no harm in doing it twice. */
+  if (done)
+    return item_id;
+
+  item_id = lookup_item_id_for_locale (setlocale (LC_CTYPE, NULL));
+  done = TRUE;
+
+  return item_id;
+}
+
+/**
+ * g_str_to_ascii:
+ * @str: a string, in UTF-8
+ * @from_locale: (allow-none): the source locale, if known
+ *
+ * Transliterate @str to plain ASCII.
+ *
+ * For best results, @str should be in composed normalised form.
+ *
+ * This function performs a reasonably good set of character
+ * replacements.  The particular set of replacements that is done may
+ * change by version or even by runtime environment.
+ *
+ * If the source language of @str is known, it can be used to improve
+ * the accuracy of the translation by passing it as @from_locale.  It
+ * should be a valid POSIX locale string (of the form
+ * "language[_territory][.codeset][@modifier]").
+ *
+ * If @from_locale is %NULL then the current locale is used.
+ *
+ * If you want to do translation for no specific locale, and you want it
+ * to be done independently of the current locale, specify "C" for
+ * @from_locale.
+ *
+ * Characters for which no replacement is known are replaced by '?'.
+ *
+ * Returns: a newly allocated string in plain ASCII; free it with
+ *   g_free()
+ **/
+gchar *
+g_str_to_ascii (const gchar *str,
+                const gchar *from_locale)
+{
+  GString *result;
+  guint item_id;
+
+  g_return_val_if_fail (str != NULL, NULL);
+
+  /* Fast path: nothing to transliterate */
+  if (g_str_is_ascii (str))
+    return g_strdup (str);
+
+  if (from_locale)
+    item_id = lookup_item_id_for_locale (from_locale);
+  else
+    item_id = get_default_item_id ();
+
+  result = g_string_new (NULL);
+
+  while (*str)
+    {
+      /* We only need to transliterate non-ASCII values... */
+      if (*str & 0x80)
+        {
+          gunichar key[MAX_KEY_SIZE];
+          const gchar *r;
+          gint consumed;
+          gint r_len;
+
+          G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
+
+          key[0] = g_utf8_get_char (str);
+
+          /* This is where it gets evil...
+           *
+           * We know that MAX_KEY_SIZE is 2.  We also know that we
+           * only want to try another character if it's non-ascii.
+           */
+          str = g_utf8_next_char (str);
+
+          if (*str & 0x80)
+            key[1] = g_utf8_get_char (str);
+          else
+            key[1] = 0;
+
+          r = lookup_in_item (item_id, key, &r_len, &consumed);
+
+          /* If we failed to map two characters, try again with one.
+           *
+           * gconv behaviour is a bit weird here -- it seems to
+           * depend on the randomness of the binary search and the
+           * size of the input buffer as to what result we get here.
+           *
+           * Doing it this way is more work, but should be
+           * more-correct.
+           */
+          if (r == NULL && key[1])
+            {
+              key[1] = 0;
+              r = lookup_in_item (item_id, key, &r_len, &consumed);
+            }
+
+          if (r != NULL)
+            {
+              g_string_append_len (result, r, r_len);
+              if (consumed == 2)
+                /* If it took both then skip again */
+                str = g_utf8_next_char (str);
+            }
+          else /* no match found */
+            g_string_append_c (result, '?');
+        }
+      else /* ASCII case */
+        g_string_append_c (result, *str++);
+    }
+
+  return g_string_free (result, FALSE);
+}
--- a/glib/update-gtranslit.py
+++ b/glib/update-gtranslit.py
@ -0,0 +1,410 @@
+#!/usr/bin/env python3
+
+# Run this script like so:
+#
+#  ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h
+
+import sys, os
+
+localedir = sys.argv[1]  # directory holding the glibc locale source files (see usage above)
+
+# returns true if the name looks like a POSIX locale name
+def looks_like_locale(name):
+    name, _, variant = name.partition('@')
+
+    if '_' not in name:
+        return False
+
+    lang, _, land = name.partition('_')
+
+    return len(lang) == 2 or len(lang) == 3 and len(land) == 2
+
+# handles <U1234> style escapes
+def unescape(string):
+    chunks = []
+
+    n = len(string)
+    i = 0
+
+    while i < n:
+        start_escape = string.find('<', i)
+
+        if start_escape == -1:
+            chunks.append(string[i:])
+            break
+
+        assert string[start_escape:start_escape + 2] == '<U'
+        start_escape += 2
+
+        end_escape = string.find('>', start_escape)
+        assert end_escape != -1
+
+        chunks.append(chr(int(string[start_escape:end_escape], 16)))
+        i = end_escape + 1
+
+    return ''.join(chunks)
+
+# Checks if a string is ascii
+def is_ascii(string):
+    return all(ord(c) < 0x80 for c in string)
+
+# A Mapping is a map from non-ascii strings to ascii strings.
+#
+# It corresponds to a sequence of one or more mapping lines:
+#
+#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
+#
+# in a file.
+class Mapping:
+    def __init__(self):
+        self.serialised = None
+        self.mapping = {}
+
+    # Scans a string like
+    #
+    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % LATIN CAPITAL LETTER A WITH DIAERESIS.
+    #
+    # and adds the first all-ascii choice (or IGNORE) to the mapping
+    # dictionary, with the origin string as the key.  In the case of
+    # IGNORE, stores the empty string.
+    def consider_mapping_line(self, line):
+        key, value, rest = (line + ' % comment').split(maxsplit=2)
+
+        key = unescape(key)
+
+        for alternative in value.split(';'):
+            if alternative[0] == '"' and alternative[-1] == '"':
+                unescaped = unescape(alternative[1:-1])
+                if is_ascii(unescaped):
+                    self.mapping[key] = unescaped
+                    break
+
+            elif alternative[0] == '<' and alternative[-1] == '>':
+                unescaped = unescape(alternative)
+                if is_ascii(unescaped):
+                    self.mapping[key] = unescaped
+                    break
+
+            elif alternative == 'IGNORE':
+                self.mapping[key] = ''
+                break
+
+    # Performs a normal dictionary merge, but ensures that there are no
+    # conflicting entries between the original dictionary and the requested
+    # changes
+    def merge_mapping(self, changes):
+        for key in changes.mapping:
+            if key in self.mapping:
+                assert self.mapping[key] == changes.mapping[key]
+
+        self.mapping.update(changes.mapping)
+
+    # Can't get much flatter...
+    def get_flattened(self):
+        return [self]
+
+    def serialise(self, serialiser):
+        if self.serialised == None:
+            self.serialised = serialiser.add_mapping(self.mapping)
+
+        return self.serialised
+
+# A Chain is a sequence of mappings and chains.
+#
+# A chain contains another chain whenever "copy" or "include" is
+# encountered in a source file.
+#
+# A chain contains a mapping whenever a sequence of mapping lines:
+#
+#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
+#
+# is encountered in a file.
+#
+# The order of lookup is reverse: later entries override earlier ones.
+class Chain:
+    def __init__(self, name):
+        self.serialised = None
+        self.name = name
+        self.chain = []
+        self.links = 0
+
+        self.read_from_file(os.path.join(localedir, name))
+
+    def read_from_file(self, filename):
+        current_mapping = None
+        in_lc_ctype = False
+        in_translit = False
+
+        fp = open(filename, encoding='ascii', errors='surrogateescape')
+
+        for line in fp:
+            line = line.strip()
+
+            if in_lc_ctype:
+                if line == 'END LC_CTYPE':
+                    break
+
+                if line.startswith('copy') or line.startswith('include'):
+                    if current_mapping:
+                        self.chain.append(current_mapping)
+
+                    copyname = unescape(line.split('"', 3)[1])
+                    copyfile = get_chain(copyname)
+                    self.chain.append(copyfile)
+                    copyfile.links += 1
+
+                    current_mapping = None
+
+                elif line == 'translit_start':
+                    in_translit = True
+
+                elif line == 'translit_end':
+                    in_translit = False
+
+                elif in_translit and line.startswith('<U'):
+                    if not current_mapping:
+                        current_mapping = Mapping()
+
+                    current_mapping.consider_mapping_line(line)
+
+                elif line == '' or line.startswith('%'):
+                    pass
+
+                elif 'default_missing <U003F>':
+                    pass
+
+                elif in_translit:
+                    print('unknown line:', line)
+                    assert False
+
+            elif line == 'LC_CTYPE':
+                in_lc_ctype = True
+
+        if current_mapping:
+            self.chain.append(current_mapping)
+
+    # If there is only one link to this chain, we may as well just
+    # return the contents of the chain so that they can be merged into
+    # our sole parent directly.  Otherwise, return ourselves.
+    def get_flattened(self):
+        if self.links == 1:
+            return sum((item.get_flattened() for item in self.chain), [])
+        else:
+            return [self]
+
+    def serialise(self, serialiser):
+        if self.serialised == None:
+            # Before we serialise, see if we can optimise a bit
+            self.chain = sum((item.get_flattened() for item in self.chain), [])
+
+            i = 0
+            while i < len(self.chain) - 1:
+                if isinstance(self.chain[i], Mapping) and isinstance(self.chain[i + 1], Mapping):
+                    # We have two mappings in a row.  Try to merge them.
+                    self.chain[i].merge_mapping(self.chain[i + 1])
+                    del self.chain[i + 1]
+                else:
+                    i += 1
+
+            # If all that is left is one item, just serialise that directly
+            if len(self.chain) == 1:
+                self.serialised = self.chain[0].serialise(serialiser)
+            else:
+                ids = [item.serialise(serialiser) for item in self.chain]
+                self.serialised = serialiser.add_chain(ids)
+
+        return self.serialised
+
+# Chain cache -- allows sharing of common chains
+chains = {}
+def get_chain(name):
+    if not name in chains:
+        chains[name] = Chain(name)
+
+    return chains[name]
+
+
+# Remove the country name from a locale, preserving variant
+# eg: 'sr_RS@latin' -> 'sr@latin'
+def remove_country(string):
+    base, at, variant = string.partition('@')
+    lang, _, land = base.partition('_')
+    return lang + at + variant
+
+def encode_range(start, end):
+    assert start <= end
+    length = end - start
+
+    assert start < 0x1000
+    assert length < 0x8
+
+    result = 0x8000 + (length << 12) + start
+
+    assert result < 0x10000
+
+    return result
+
+def c_pair_array(array):
+    return '{ ' + ', '.join ('{ %u, %u }' % pair for pair in array) + ' };'
+
+class Serialiser:
+    def __init__(self):
+        self.mappings = []
+        self.chains = []
+        self.locales = {}
+
+    def add_mapping(self, mapping):
+        if mapping in self.mappings:
+            mapping_id = self.mappings.index(mapping)
+        else:
+            mapping_id = len(self.mappings)
+            self.mappings.append(mapping)
+
+        assert mapping_id < 128
+        return mapping_id
+
+    def add_chain(self, chain):
+        if chain in self.chains:
+            chain_id = self.chains.index(chain)
+        else:
+            chain_id = len(self.chains)
+            self.chains.append(chain)
+
+        assert chain_id < 128
+        return 128 + chain_id
+
+    def add_locale(self, name, item_id):
+        self.locales[name] = item_id
+
+    def add_default(self, item_id):
+        self.default = item_id
+
+    def optimise_locales(self):
+        # Check if all regions of a language/variant agree
+        languages = list(set(remove_country(locale) for locale in self.locales))
+
+        for language in languages:
+            locales = [locale for locale in self.locales if remove_country(locale) == language]
+
+            item_id = self.locales[locales[0]]
+            if all(self.locales[locale] == item_id for locale in locales):
+                self.locales[language] = item_id
+                for locale in locales:
+                    del self.locales[locale]
+
+        # Check if a variant is the same as the non-variant form
+        # eg: 'de@euro' and 'de'
+        for variant in list(locale for locale in self.locales if '@' in locale):
+            base, _, _ = variant.partition('@')
+            if base in self.locales and self.locales[base] == self.locales[variant]:
+                del self.locales[variant]
+
+        # Eliminate any entries that are just the same as the C locale
+        for locale in list(self.locales):
+            if self.locales[locale] == self.default:
+                del self.locales[locale]
+
+    def to_c(self):
+        src_table = ''
+        ascii_table = ''
+        mappings_table = []
+        mapping_ranges = []
+        chains_table = []
+        chain_starts = []
+        locale_names = ''
+        locale_index = []
+        max_lookup = 0
+        max_localename = 0
+
+        for mapping in self.mappings:
+            mapping_ranges.append ((len(mappings_table), len(mapping)))
+
+            for key in sorted(mapping):
+                if len(key) == 1 and ord(key[0]) < 0x8000:
+                    src_range = ord(key[0])
+                else:
+                    existing = src_table.find(key)
+                    if existing == -1:
+                        start = len(src_table)
+                        assert all(ord(c) <= 0x10ffff for c in key)
+                        src_table += key
+                        src_range = encode_range(start, len(src_table))
+                        max_lookup = max(max_lookup, len(key))
+                    else:
+                        src_range = encode_range(existing, existing + len(key))
+
+                value = mapping[key]
+                if len(value) == 1 and ord(value[0]) < 0x80:
+                    ascii_range = ord(value[0])
+                else:
+                    existing = ascii_table.find(value)
+                    if existing == -1:
+                        start = len(ascii_table)
+                        assert all(ord(c) < 0x80 for c in value)
+                        ascii_table += value
+                        ascii_range = encode_range(start, len(ascii_table))
+                    else:
+                        ascii_range = encode_range(existing, existing + len(value))
+
+                mappings_table.append ((src_range, ascii_range))
+
+            mapping_end = len(mappings_table)
+
+        for chain in self.chains:
+            chain_starts.append(len(chains_table))
+
+            for item_id in reversed(chain):
+                assert item_id < 0xff
+                chains_table.append(item_id)
+            chains_table.append(0xff)
+
+        for locale in sorted(self.locales):
+            max_localename = max(max_localename, len(locale))
+            name_offset = len(locale_names)
+            assert all(ord(c) <= 0x7f for c in locale)
+            locale_names += (locale + '\0')
+
+            item_id = self.locales[locale]
+
+            assert name_offset < 256
+            assert item_id < 256
+            locale_index.append((name_offset, item_id))
+
+        print('/* Generated by update-gtranslit.py */')
+        print('#define MAX_KEY_SIZE', max_lookup)
+        print('#define MAX_LOCALE_NAME', max_localename)
+        print('static const gunichar src_table[] = {', ', '.join(str(ord(c)) for c in src_table), '};')
+        # cannot do this in plain ascii because of trigraphs... :(
+        print('static const gchar ascii_table[] = {', ', '.join(str(ord(c)) for c in ascii_table), '};')
+        print('static const struct mapping_entry mappings_table[] =', c_pair_array (mappings_table))
+        print('static const struct mapping_range mapping_ranges[] =', c_pair_array (mapping_ranges))
+        print('static const guint8 chains_table[] = {', ', '.join(str(i) for i in chains_table), '};')
+        print('static const guint8 chain_starts[] = {', ', '.join(str(i) for i in chain_starts), '};')
+        print('static const gchar locale_names[] = "' + locale_names.replace('\0', '\\0') + '";')
+        print('static const struct locale_entry locale_index[] = ', c_pair_array (locale_index))
+        print('static const guint8 default_item_id = %u;' % (self.default,))
+
+    def dump(self):
+        print(self.mappings)
+        print(self.chains)
+        print(self.locales)
+
+locales = []
+for name in os.listdir(localedir):
+    if looks_like_locale(name):
+        chain = get_chain(name)
+        locales.append (chain)
+        chain.links += 1
+
+serialiser = Serialiser()
+
+for locale in locales:
+    serialiser.add_locale(locale.name, locale.serialise(serialiser))
+
+i18n = get_chain('i18n').serialise(serialiser)
+combining = get_chain('translit_combining').serialise(serialiser)
+serialiser.add_default(serialiser.add_chain([i18n, combining]))
+
+serialiser.optimise_locales()
+
+serialiser.to_c()