mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-27 22:46:15 +01:00
Add locale-sensitive ASCII transliteration API
Add a new function, g_str_to_ascii(), that does locale-dependent ASCII transliteration of UTF-8 strings. This function works off of an internal database. We get the data out of the localedata shipped with glibc, which seems to be just about the best source of locale-sensitive transliteration information available anywhere. We include an update script with this commit that's not used by anything at all -- it will just sit in git. It is intended to be run manually from time to time. https://bugzilla.gnome.org/show_bug.cgi?id=710142
This commit is contained in:
parent
436d77f70a
commit
941b8979d0
@ -1282,6 +1282,7 @@ g_strrstr_len
|
|||||||
g_str_has_prefix
|
g_str_has_prefix
|
||||||
g_str_has_suffix
|
g_str_has_suffix
|
||||||
g_strcmp0
|
g_strcmp0
|
||||||
|
g_str_to_ascii
|
||||||
g_str_tokenize_and_fold
|
g_str_tokenize_and_fold
|
||||||
g_str_match_string
|
g_str_match_string
|
||||||
|
|
||||||
|
@ -172,6 +172,7 @@ libglib_2_0_la_SOURCES = \
|
|||||||
gthreadpool.c \
|
gthreadpool.c \
|
||||||
gtimer.c \
|
gtimer.c \
|
||||||
gtimezone.c \
|
gtimezone.c \
|
||||||
|
gtranslit.c \
|
||||||
gtrashstack.c \
|
gtrashstack.c \
|
||||||
gtree.c \
|
gtree.c \
|
||||||
guniprop.c \
|
guniprop.c \
|
||||||
|
@ -287,6 +287,10 @@ GLIB_AVAILABLE_IN_ALL
|
|||||||
gchar* g_stpcpy (gchar *dest,
|
gchar* g_stpcpy (gchar *dest,
|
||||||
const char *src);
|
const char *src);
|
||||||
|
|
||||||
|
GLIB_AVAILABLE_IN_2_40
|
||||||
|
gchar * g_str_to_ascii (const gchar *str,
|
||||||
|
const gchar *from_locale);
|
||||||
|
|
||||||
GLIB_AVAILABLE_IN_2_40
|
GLIB_AVAILABLE_IN_2_40
|
||||||
gchar ** g_str_tokenize_and_fold (const gchar *string,
|
gchar ** g_str_tokenize_and_fold (const gchar *string,
|
||||||
const gchar *translit_locale,
|
const gchar *translit_locale,
|
||||||
|
12
glib/gtranslit-data.h
Normal file
12
glib/gtranslit-data.h
Normal file
File diff suppressed because one or more lines are too long
409
glib/gtranslit.c
Normal file
409
glib/gtranslit.c
Normal file
@ -0,0 +1,409 @@
|
|||||||
|
/*
|
||||||
|
* Copyright © 2014 Canonical Limited
|
||||||
|
*
|
||||||
|
* This library is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2 of the licence, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This library is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* Author: Ryan Lortie <desrt@desrt.ca>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <config.h>
|
||||||
|
|
||||||
|
#include "gstrfuncs.h"
|
||||||
|
|
||||||
|
#include <glib.h>
|
||||||
|
#include <locale.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
struct mapping_entry
|
||||||
|
{
|
||||||
|
guint16 src;
|
||||||
|
guint16 ascii;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mapping_range
|
||||||
|
{
|
||||||
|
guint16 start;
|
||||||
|
guint16 length;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct locale_entry
|
||||||
|
{
|
||||||
|
guint8 name_offset;
|
||||||
|
guint8 item_id;
|
||||||
|
};
|
||||||
|
|
||||||
|
#include "gtranslit-data.h"
|
||||||
|
|
||||||
|
/* Decodes character number @index of an encoded source key.
 *
 * If bit 15 of @encoded is set, the low 12 bits are an offset into
 * @array and the character is read from there; otherwise @encoded is
 * itself the (single) character and @index is ignored.
 *
 * All arguments are parenthesised so that expression arguments expand
 * with the intended precedence.
 */
#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
|
||||||
|
/* Returns the number of characters described by an encoded value.
 *
 * If bit 15 of @encoded is set, the length is stored in bits 12-14;
 * otherwise the value is a single literal character and the length is 1.
 *
 * @encoded is parenthesised so that expression arguments expand with
 * the intended precedence.
 */
#define get_length(encoded) (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)
|
||||||
|
|
||||||
|
/* Returns a pointer to the first byte of the ASCII replacement described
 * by @encoded.
 *
 * If bit 15 is set, the low 12 bits are an offset into @array.  Otherwise
 * the replacement is the single character stored in @encoded itself, so
 * the result points at the byte of @encoded holding that character: the
 * second byte on big-endian hosts, the first byte otherwise.  @encoded
 * must therefore be an lvalue that outlives the returned pointer.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded) (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded) (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
|
||||||
|
|
||||||
|
static const gchar * lookup_in_item (guint item_id,
|
||||||
|
const gunichar *key,
|
||||||
|
gint *result_len,
|
||||||
|
gint *key_consumed);
|
||||||
|
|
||||||
|
static gint
|
||||||
|
compare_mapping_entry (gconstpointer user_data,
|
||||||
|
gconstpointer data)
|
||||||
|
{
|
||||||
|
const struct mapping_entry *entry = data;
|
||||||
|
const gunichar *key = user_data;
|
||||||
|
gunichar src_0;
|
||||||
|
|
||||||
|
G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
|
||||||
|
|
||||||
|
src_0 = get_src_char (src_table, entry->src, 0);
|
||||||
|
|
||||||
|
if (key[0] > src_0)
|
||||||
|
return 1;
|
||||||
|
else if (key[0] < src_0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (get_length (entry->src) > 1)
|
||||||
|
{
|
||||||
|
gunichar src_1;
|
||||||
|
|
||||||
|
src_1 = get_src_char (src_table, entry->src, 1);
|
||||||
|
|
||||||
|
if (key[1] > src_1)
|
||||||
|
return 1;
|
||||||
|
else if (key[1] < src_1)
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
else if (key[1])
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const gchar *
|
||||||
|
lookup_in_mapping (const struct mapping_entry *mapping,
|
||||||
|
gint mapping_size,
|
||||||
|
const gunichar *key,
|
||||||
|
gint *result_len,
|
||||||
|
gint *key_consumed)
|
||||||
|
{
|
||||||
|
const struct mapping_entry *hit;
|
||||||
|
|
||||||
|
hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry);
|
||||||
|
|
||||||
|
if (hit == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
*key_consumed = get_length (hit->src);
|
||||||
|
*result_len = get_length (hit->ascii);
|
||||||
|
|
||||||
|
return get_ascii_item(ascii_table, hit->ascii);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const gchar *
|
||||||
|
lookup_in_chain (const guint8 *chain,
|
||||||
|
const gunichar *key,
|
||||||
|
gint *result_len,
|
||||||
|
gint *key_consumed)
|
||||||
|
{
|
||||||
|
const gchar *result;
|
||||||
|
|
||||||
|
while (*chain != 0xff)
|
||||||
|
{
|
||||||
|
result = lookup_in_item (*chain, key, result_len, key_consumed);
|
||||||
|
|
||||||
|
if (result)
|
||||||
|
return result;
|
||||||
|
|
||||||
|
chain++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const gchar *
|
||||||
|
lookup_in_item (guint item_id,
|
||||||
|
const gunichar *key,
|
||||||
|
gint *result_len,
|
||||||
|
gint *key_consumed)
|
||||||
|
{
|
||||||
|
if (item_id & 0x80)
|
||||||
|
{
|
||||||
|
const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
|
||||||
|
|
||||||
|
return lookup_in_chain (chain, key, result_len, key_consumed);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const struct mapping_range *range = &mapping_ranges[item_id];
|
||||||
|
|
||||||
|
return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static gint
|
||||||
|
compare_locale_entry (gconstpointer user_data,
|
||||||
|
gconstpointer data)
|
||||||
|
{
|
||||||
|
const struct locale_entry *entry = data;
|
||||||
|
const gchar *key = user_data;
|
||||||
|
|
||||||
|
return strcmp (key, &locale_names[entry->name_offset]);
|
||||||
|
}
|
||||||
|
|
||||||
|
static gboolean
|
||||||
|
lookup_item_id_for_one_locale (const gchar *key,
|
||||||
|
guint *item_id)
|
||||||
|
{
|
||||||
|
const struct locale_entry *hit;
|
||||||
|
|
||||||
|
hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry);
|
||||||
|
|
||||||
|
if (hit == NULL)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
*item_id = hit->item_id;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static guint
|
||||||
|
lookup_item_id_for_locale (const gchar *locale)
|
||||||
|
{
|
||||||
|
gchar key[MAX_LOCALE_NAME + 1];
|
||||||
|
const gchar *language;
|
||||||
|
guint language_len;
|
||||||
|
const gchar *territory = NULL;
|
||||||
|
guint territory_len = 0;
|
||||||
|
const gchar *modifier = NULL;
|
||||||
|
guint modifier_len = 0;
|
||||||
|
const gchar *next_char;
|
||||||
|
guint id;
|
||||||
|
|
||||||
|
/* As per POSIX, a valid locale looks like:
|
||||||
|
*
|
||||||
|
* language[_territory][.codeset][@modifier]
|
||||||
|
*/
|
||||||
|
language = locale;
|
||||||
|
language_len = strcspn (language, "_.@");
|
||||||
|
next_char = language + language_len;
|
||||||
|
|
||||||
|
if (*next_char == '_')
|
||||||
|
{
|
||||||
|
territory = next_char;
|
||||||
|
territory_len = strcspn (territory + 1, "_.@") + 1;
|
||||||
|
next_char = territory + territory_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*next_char == '.')
|
||||||
|
{
|
||||||
|
const gchar *codeset;
|
||||||
|
guint codeset_len;
|
||||||
|
|
||||||
|
codeset = next_char;
|
||||||
|
codeset_len = strcspn (codeset + 1, "_.@") + 1;
|
||||||
|
next_char = codeset + codeset_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*next_char == '@')
|
||||||
|
{
|
||||||
|
modifier = next_char;
|
||||||
|
modifier_len = strcspn (modifier + 1, "_.@") + 1;
|
||||||
|
next_char = modifier + modifier_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* What madness is this? */
|
||||||
|
if (language_len == 0 || *next_char)
|
||||||
|
return default_item_id;
|
||||||
|
|
||||||
|
/* We are not interested in codeset.
|
||||||
|
*
|
||||||
|
* For this locale:
|
||||||
|
*
|
||||||
|
* aa_BB@cc
|
||||||
|
*
|
||||||
|
* try in this order:
|
||||||
|
*
|
||||||
|
* Note: we have no locales of the form aa_BB@cc in the database.
|
||||||
|
*
|
||||||
|
* 1. aa@cc
|
||||||
|
* 2. aa_BB
|
||||||
|
* 3. aa
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* 1. */
|
||||||
|
if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
|
||||||
|
{
|
||||||
|
memcpy (key, language, language_len);
|
||||||
|
memcpy (key + language_len, modifier, modifier_len);
|
||||||
|
key[language_len + modifier_len] = '\0';
|
||||||
|
|
||||||
|
if (lookup_item_id_for_one_locale (key, &id))
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 2. */
|
||||||
|
if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
|
||||||
|
{
|
||||||
|
memcpy (key, language, language_len);
|
||||||
|
memcpy (key + language_len, territory, territory_len);
|
||||||
|
key[language_len + territory_len] = '\0';
|
||||||
|
|
||||||
|
if (lookup_item_id_for_one_locale (key, &id))
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 3. */
|
||||||
|
if (language_len <= MAX_LOCALE_NAME)
|
||||||
|
{
|
||||||
|
memcpy (key, language, language_len);
|
||||||
|
key[language_len] = '\0';
|
||||||
|
|
||||||
|
if (lookup_item_id_for_one_locale (key, &id))
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
return default_item_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
static guint
|
||||||
|
get_default_item_id (void)
|
||||||
|
{
|
||||||
|
static guint item_id;
|
||||||
|
static gboolean done;
|
||||||
|
|
||||||
|
/* Doesn't need to be locked -- no harm in doing it twice. */
|
||||||
|
if (!done)
|
||||||
|
{
|
||||||
|
const gchar *locale;
|
||||||
|
|
||||||
|
locale = setlocale (LC_CTYPE, NULL);
|
||||||
|
item_id = lookup_item_id_for_locale (locale);
|
||||||
|
done = TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return item_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* g_str_to_ascii:
|
||||||
|
* @str: a string, in UTF-8
|
||||||
|
* @from_locale: (allow-none): the source locale, if known
|
||||||
|
*
|
||||||
|
* Transliterate @str to plain ASCII.
|
||||||
|
*
|
||||||
|
* For best results, @str should be in composed normalised form.
|
||||||
|
*
|
||||||
|
* This function performs a reasonably good set of character
|
||||||
|
* replacements. The particular set of replacements that is done may
|
||||||
|
* change by version or even by runtime environment.
|
||||||
|
*
|
||||||
|
* If the source language of @str is known, it can used to improve the
|
||||||
|
* accuracy of the translation by passing it as @from_locale. It should
|
||||||
|
* be a valid POSIX locale string (of the form
|
||||||
|
* "language[_territory][.codeset][@modifier]").
|
||||||
|
*
|
||||||
|
* If @from_locale is %NULL then the current locale is used.
|
||||||
|
*
|
||||||
|
* If you want to do translation for no specific locale, and you want it
|
||||||
|
* to be done independently of the currently locale, specify "C" for
|
||||||
|
* @from_locale.
|
||||||
|
*
|
||||||
|
* Returns: a string in plain ASCII
|
||||||
|
**/
|
||||||
|
gchar *
|
||||||
|
g_str_to_ascii (const gchar *str,
|
||||||
|
const gchar *from_locale)
|
||||||
|
{
|
||||||
|
GString *result;
|
||||||
|
guint item_id;
|
||||||
|
|
||||||
|
if (g_str_is_ascii (str))
|
||||||
|
return g_strdup (str);
|
||||||
|
|
||||||
|
if (from_locale)
|
||||||
|
item_id = lookup_item_id_for_locale (from_locale);
|
||||||
|
else
|
||||||
|
item_id = get_default_item_id ();
|
||||||
|
|
||||||
|
result = g_string_new (NULL);
|
||||||
|
|
||||||
|
while (*str)
|
||||||
|
{
|
||||||
|
/* We only need to transliterate non-ASCII values... */
|
||||||
|
if (*str & 0x80)
|
||||||
|
{
|
||||||
|
gunichar key[MAX_KEY_SIZE];
|
||||||
|
const gchar *r;
|
||||||
|
gint consumed;
|
||||||
|
gint r_len;
|
||||||
|
gunichar c;
|
||||||
|
|
||||||
|
G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
|
||||||
|
|
||||||
|
c = g_utf8_get_char (str);
|
||||||
|
|
||||||
|
/* This is where it gets evil...
|
||||||
|
*
|
||||||
|
* We know that MAX_KEY_SIZE is 2. We also know that we
|
||||||
|
* only want to try another character if it's non-ascii.
|
||||||
|
*/
|
||||||
|
str = g_utf8_next_char (str);
|
||||||
|
|
||||||
|
key[0] = c;
|
||||||
|
if (*str & 0x80)
|
||||||
|
key[1] = g_utf8_get_char (str);
|
||||||
|
else
|
||||||
|
key[1] = 0;
|
||||||
|
|
||||||
|
r = lookup_in_item (item_id, key, &r_len, &consumed);
|
||||||
|
|
||||||
|
/* If we failed to map two characters, try again with one.
|
||||||
|
*
|
||||||
|
* gconv behaviour is a bit weird here -- it seems to
|
||||||
|
* depend in the randomness of the binary search and the
|
||||||
|
* size of the input buffer as to what result we get here.
|
||||||
|
*
|
||||||
|
* Doing it this way is more work, but should be
|
||||||
|
* more-correct.
|
||||||
|
*/
|
||||||
|
if (r == NULL && key[1])
|
||||||
|
{
|
||||||
|
key[1] = 0;
|
||||||
|
r = lookup_in_item (item_id, key, &r_len, &consumed);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (r != NULL)
|
||||||
|
{
|
||||||
|
g_string_append_len (result, r, r_len);
|
||||||
|
if (consumed == 2)
|
||||||
|
/* If it took both then skip again */
|
||||||
|
str = g_utf8_next_char (str);
|
||||||
|
}
|
||||||
|
else /* no match found */
|
||||||
|
g_string_append_c (result, '?');
|
||||||
|
}
|
||||||
|
else if (*str & 0x80) /* Out-of-range non-ASCII case */
|
||||||
|
{
|
||||||
|
g_string_append_c (result, '?');
|
||||||
|
str = g_utf8_next_char (str);
|
||||||
|
}
|
||||||
|
else /* ASCII case */
|
||||||
|
g_string_append_c (result, *str++);
|
||||||
|
}
|
||||||
|
|
||||||
|
return g_string_free (result, FALSE);
|
||||||
|
}
|
410
glib/update-gtranslit.py
Executable file
410
glib/update-gtranslit.py
Executable file
@ -0,0 +1,410 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Run this script like so:
|
||||||
|
#
|
||||||
|
# ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h
|
||||||
|
|
||||||
|
import sys, os
|
||||||
|
|
||||||
|
# Directory holding the glibc locale source files, taken from the first
# command-line argument.  Fail with a usage message (on stderr, so the
# generated header on stdout stays clean) instead of a bare IndexError.
if len(sys.argv) < 2:
    sys.exit('usage: %s /path/to/glibc/localedata/locales > gtranslit-data.h' % sys.argv[0])

localedir = sys.argv[1]
|
||||||
|
|
||||||
|
# Returns True if the name looks like a POSIX locale name of the form
# ll_CC or lll_CC, optionally followed by an '@variant' suffix: a two- or
# three-letter language, an underscore and a two-letter territory.
#
# The length checks are grouped explicitly; 'a or b and c' would accept
# any territory length for two-letter languages.
def looks_like_locale(name):
    base, _, _variant = name.partition('@')

    lang, separator, land = base.partition('_')

    if not separator:
        return False

    return len(lang) in (2, 3) and len(land) == 2
|
||||||
|
|
||||||
|
# Expands <U1234> style escapes: each '<Uhhhh>' sequence is replaced by
# the character with that hexadecimal code point.  Text outside of
# escapes is copied through unchanged.
#
# Asserts that every '<' starts a well-formed '<U...>' escape.
def unescape(string):
    chunks = []

    n = len(string)
    i = 0

    while i < n:
        start_escape = string.find('<', i)

        if start_escape == -1:
            chunks.append(string[i:])
            break

        # Keep any literal text that precedes this escape.
        chunks.append(string[i:start_escape])

        assert string[start_escape:start_escape + 2] == '<U'
        start_escape += 2

        end_escape = string.find('>', start_escape)
        assert end_escape != -1

        chunks.append(chr(int(string[start_escape:end_escape], 16)))
        i = end_escape + 1

    return ''.join(chunks)
|
||||||
|
|
||||||
|
# Returns True when every character of the string has a code point
# below 0x80 (the empty string counts as ASCII).
def is_ascii(string):
    for char in string:
        if ord(char) >= 0x80:
            return False

    return True
|
||||||
|
|
||||||
|
# A Mapping is a dictionary from non-ascii source strings to their ascii
# replacements.
#
# It is built from a run of one or more consecutive mapping lines such as
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# found in a locale file.
class Mapping:
    def __init__(self):
        # Id handed out by the serialiser; None until serialise() has run.
        self.serialised = None
        # Source string -> ascii replacement.
        self.mapping = {}

    # Parses one line of the form
    #
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # The alternatives are examined from left to right and the first one
    # that is entirely ascii (or is the keyword IGNORE, which is stored
    # as the empty string) is recorded under the unescaped source string.
    # If no alternative qualifies, nothing is recorded.
    def consider_mapping_line(self, line):
        # Appending a dummy comment guarantees a third field to unpack.
        key, value, _rest = (line + ' % comment').split(maxsplit=2)

        key = unescape(key)

        for alternative in value.split(';'):
            first, last = alternative[0], alternative[-1]

            if first == '"' and last == '"':
                candidate = unescape(alternative[1:-1])
            elif first == '<' and last == '>':
                candidate = unescape(alternative)
            elif alternative == 'IGNORE':
                candidate = ''
            else:
                continue

            if is_ascii(candidate):
                self.mapping[key] = candidate
                break

    # Merges the entries of another Mapping into this one, after
    # asserting that no key present in both maps to different values.
    def merge_mapping(self, changes):
        for key, new_value in changes.mapping.items():
            if key in self.mapping:
                assert self.mapping[key] == new_value

        self.mapping.update(changes.mapping)

    # A mapping is already as flat as it gets.
    def get_flattened(self):
        return [self]

    # Registers the dictionary with the serialiser on first use and
    # returns the id it was given; later calls return the cached id.
    def serialise(self, serialiser):
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised
|
||||||
|
|
||||||
|
# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The order of lookup is reverse: later entries override earlier ones.
class Chain:
    # Reads the locale file called 'name' from the locale directory and
    # builds the chain from its LC_CTYPE section.
    def __init__(self, name):
        # Id handed out by the serialiser; None until serialise() has run.
        self.serialised = None
        self.name = name
        # Mapping and Chain objects, in file order.
        self.chain = []
        # Number of users of this chain (parent chains plus locales).
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    # Parses the LC_CTYPE section of the given file, appending a Chain for
    # every copy/include directive and a Mapping for every run of mapping
    # lines found inside translit_start/translit_end.
    def read_from_file(self, filename):
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        # The context manager closes the file even when parsing stops early.
        with open(filename, encoding='ascii', errors='surrogateescape') as fp:
            for line in fp:
                line = line.strip()

                if in_lc_ctype:
                    if line == 'END LC_CTYPE':
                        break

                    if line.startswith('copy') or line.startswith('include'):
                        # A copy/include ends the current run of mapping lines.
                        if current_mapping:
                            self.chain.append(current_mapping)

                        copyname = unescape(line.split('"', 3)[1])
                        copyfile = get_chain(copyname)
                        self.chain.append(copyfile)
                        copyfile.links += 1

                        current_mapping = None

                    elif line == 'translit_start':
                        in_translit = True

                    elif line == 'translit_end':
                        in_translit = False

                    elif in_translit and line.startswith('<U'):
                        if not current_mapping:
                            current_mapping = Mapping()

                        current_mapping.consider_mapping_line(line)

                    elif line == '' or line.startswith('%'):
                        pass

                    # This must compare against the line: a bare string
                    # literal is always true and would hide the
                    # unknown-line check below.
                    elif line.startswith('default_missing'):
                        pass

                    elif in_translit:
                        # stdout carries the generated header, so report on stderr.
                        print('unknown line:', line, file=sys.stderr)
                        assert False

                elif line == 'LC_CTYPE':
                    in_lc_ctype = True

        if current_mapping:
            self.chain.append(current_mapping)

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly.  Otherwise, return ourselves.
    def get_flattened(self):
        if self.links == 1:
            return sum((item.get_flattened() for item in self.chain), [])
        else:
            return [self]

    # Serialises the chain on first use and returns its item id; later
    # calls return the cached id.
    def serialise(self, serialiser):
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = sum((item.get_flattened() for item in self.chain), [])

            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(self.chain[i + 1], Mapping):
                    # We have two mappings in a row.  Try to merge them.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised
|
||||||
|
|
||||||
|
# Cache of Chain objects keyed by locale file name, so that a file that
# is referenced from several places is only read and represented once.
chains = {}

# Returns the Chain for the given file name, creating and caching it on
# first request.
def get_chain(name):
    if name in chains:
        return chains[name]

    created = Chain(name)
    chains[name] = created

    return created
|
||||||
|
|
||||||
|
|
||||||
|
# Strips the territory from a locale name while keeping any variant,
# eg: 'sr_RS@latin' -> 'sr@latin' and 'de_DE' -> 'de'.
def remove_country(string):
    base, at, variant = string.partition('@')
    lang = base.split('_', 1)[0]

    return ''.join((lang, at, variant))
|
||||||
|
|
||||||
|
# Packs the half-open table range [start, end) into a 16-bit value:
# the top bit is set, the next three bits hold the length and the low
# twelve bits hold the start offset.  Asserts that both fit.
def encode_range(start, end):
    span = end - start
    assert span >= 0

    assert start < 0x1000
    assert span < 0x8

    encoded = 0x8000 + (span << 12) + start

    assert encoded < 0x10000

    return encoded
|
||||||
|
|
||||||
|
# Formats a sequence of (unsigned, unsigned) pairs as the body of a C
# array-of-structs initialiser, including the trailing semicolon.
def c_pair_array(array):
    rendered = ['{ %u, %u }' % pair for pair in array]

    return '{ ' + ', '.join(rendered) + ' };'
|
||||||
|
|
||||||
|
# Collects mappings, chains and locale assignments, hands out the small
# integer item ids that refer to them, and finally writes everything out
# as the C tables of gtranslit-data.h.
#
# Item ids below 128 name a mapping; ids from 128 up name a chain.
class Serialiser:
    def __init__(self):
        # Mapping dictionaries; the list index is the mapping id.
        self.mappings = []
        # Lists of item ids; the list index plus 128 is the chain id.
        self.chains = []
        # Locale name -> item id.
        self.locales = {}
        # Item id used when no locale matches; set by add_default().
        self.default = None

    # Registers a mapping dictionary, reusing the id of an equal one that
    # was added earlier, and returns its id.
    def add_mapping(self, mapping):
        if mapping in self.mappings:
            mapping_id = self.mappings.index(mapping)
        else:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        assert mapping_id < 128
        return mapping_id

    # Registers a chain (a list of item ids), reusing the id of an equal
    # one that was added earlier, and returns its id.
    def add_chain(self, chain):
        if chain in self.chains:
            chain_id = self.chains.index(chain)
        else:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        return 128 + chain_id

    # Records which item a locale uses.
    def add_locale(self, name, item_id):
        self.locales[name] = item_id

    # Records the item to use when no locale matches.
    def add_default(self, item_id):
        self.default = item_id

    # Shrinks the locale table by dropping entries that the lookup code
    # would resolve to the same item anyway.
    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = list(set(remove_country(locale) for locale in self.locales))

        for language in languages:
            locales = [locale for locale in self.locales if remove_country(locale) == language]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                for locale in locales:
                    del self.locales[locale]

                # Store the collapsed entry only after the deletions, so
                # that it survives even when the language name itself was
                # one of the names just removed.
                self.locales[language] = item_id

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in list(locale for locale in self.locales if '@' in locale):
            base, _, _ = variant.partition('@')
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]

    # Prints the C source of the generated tables to stdout.
    def to_c(self):
        # The default item is written out below; fail before any output
        # is produced if add_default() was never called.
        assert self.default is not None

        src_table = ''
        ascii_table = ''
        mappings_table = []
        mapping_ranges = []
        chains_table = []
        chain_starts = []
        locale_names = ''
        locale_index = []
        max_lookup = 0
        max_localename = 0

        for mapping in self.mappings:
            mapping_ranges.append((len(mappings_table), len(mapping)))

            # Entries are written in sorted key order, one run per mapping.
            for key in sorted(mapping):
                # Every key counts towards the longest key, whether it is
                # stored inline, newly added to src_table or found there.
                max_lookup = max(max_lookup, len(key))

                if len(key) == 1 and ord(key[0]) < 0x8000:
                    # Small single characters are stored inline.
                    src_range = ord(key[0])
                else:
                    # Otherwise refer to a range of src_table, reusing an
                    # existing occurrence of the key when there is one.
                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10ffff for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                    else:
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                if len(value) == 1 and ord(value[0]) < 0x80:
                    # Single ascii characters are stored inline.
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append((src_range, ascii_range))

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            # Lookup order is the reverse of file order; 0xff ends a chain.
            for item_id in reversed(chain):
                assert item_id < 0xff
                chains_table.append(item_id)
            chains_table.append(0xff)

        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7f for c in locale)
            locale_names += (locale + '\0')

            item_id = self.locales[locale]

            assert name_offset < 256
            assert item_id < 256
            locale_index.append((name_offset, item_id))

        print('/* Generated by update-gtranslit.py */')
        print('#define MAX_KEY_SIZE', max_lookup)
        print('#define MAX_LOCALE_NAME', max_localename)
        print('static const gunichar src_table[] = {', ', '.join(str(ord(c)) for c in src_table), '};')
        # cannot do this in plain ascii because of trigraphs... :(
        print('static const gchar ascii_table[] = {', ', '.join(str(ord(c)) for c in ascii_table), '};')
        print('static const struct mapping_entry mappings_table[] =', c_pair_array (mappings_table))
        print('static const struct mapping_range mapping_ranges[] =', c_pair_array (mapping_ranges))
        print('static const guint8 chains_table[] = {', ', '.join(str(i) for i in chains_table), '};')
        print('static const guint8 chain_starts[] = {', ', '.join(str(i) for i in chain_starts), '};')
        print('static const gchar locale_names[] = "' + locale_names.replace('\0', '\\0') + '";')
        print('static const struct locale_entry locale_index[] = ', c_pair_array (locale_index))
        print('static const guint8 default_item_id = %u;' % (self.default,))

    # Debugging aid: prints the raw collections.
    def dump(self):
        print(self.mappings)
        print(self.chains)
        print(self.locales)
|
||||||
|
|
||||||
|
# Load a chain for every file in the locale directory whose name looks
# like a locale.  Being a locale counts as one link to the chain.
locales = []
for entry in os.listdir(localedir):
    if not looks_like_locale(entry):
        continue

    locale_chain = get_chain(entry)
    locale_chain.links += 1
    locales.append(locale_chain)

serialiser = Serialiser()

# Serialise every locale; item ids are handed out in this order.
for locale_chain in locales:
    locale_item_id = locale_chain.serialise(serialiser)
    serialiser.add_locale(locale_chain.name, locale_item_id)

# The fallback for unknown locales is a chain of 'i18n' followed by
# 'translit_combining'.
i18n = get_chain('i18n').serialise(serialiser)
combining = get_chain('translit_combining').serialise(serialiser)
default_chain_id = serialiser.add_chain([i18n, combining])
serialiser.add_default(default_chain_id)

serialiser.optimise_locales()

serialiser.to_c()
|
Loading…
Reference in New Issue
Block a user