mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2024-11-14 05:16:18 +01:00
b4734d3b5b
That should format these character classes correctly in the documentation, and prevent them being interpreted by gtk-doc as (broken) Markdown hyperlinks. Signed-off-by: Philip Withnall <withnall@endlessm.com>
414 lines
10 KiB
C
414 lines
10 KiB
C
/*
 * Copyright © 2014 Canonical Limited
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Ryan Lortie <desrt@desrt.ca>
 */
|
|
|
|
#include <config.h>
|
|
|
|
#include "gstrfuncs.h"
|
|
|
|
#include <glib.h>
|
|
#include <locale.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
struct mapping_entry
|
|
{
|
|
guint16 src;
|
|
guint16 ascii;
|
|
};
|
|
|
|
struct mapping_range
|
|
{
|
|
guint16 start;
|
|
guint16 length;
|
|
};
|
|
|
|
struct locale_entry
|
|
{
|
|
guint8 name_offset;
|
|
guint8 item_id;
|
|
};
|
|
|
|
#include "gtranslit-data.h"
|
|
|
|
/* Decoders for the 16-bit "encoded" words stored in struct mapping_entry.
 *
 * When bit 0x8000 is clear, the word is the value itself (a single source
 * character, or a single ASCII byte) and its length is 1.
 *
 * When bit 0x8000 is set, the low 12 bits are an offset into a side table
 * (@array) and bits 12..14 hold the number of items stored there.
 *
 * Every macro argument is parenthesised so that compound expressions are
 * decoded correctly.  @encoded is still evaluated more than once, and
 * get_ascii_item() takes its address, so pass a side-effect-free lvalue.
 */
#define get_src_char(array, encoded, index) \
  (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
#define get_length(encoded) \
  (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

/* A directly-stored ASCII byte occupies the low-order byte of the 16-bit
 * word, so the address handed back depends on the host byte order.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
|
|
|
|
static const gchar * lookup_in_item (guint item_id,
|
|
const gunichar *key,
|
|
gint *result_len,
|
|
gint *key_consumed);
|
|
|
|
static gint
|
|
compare_mapping_entry (gconstpointer user_data,
|
|
gconstpointer data)
|
|
{
|
|
const struct mapping_entry *entry = data;
|
|
const gunichar *key = user_data;
|
|
gunichar src_0;
|
|
|
|
G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
|
|
|
|
src_0 = get_src_char (src_table, entry->src, 0);
|
|
|
|
if (key[0] > src_0)
|
|
return 1;
|
|
else if (key[0] < src_0)
|
|
return -1;
|
|
|
|
if (get_length (entry->src) > 1)
|
|
{
|
|
gunichar src_1;
|
|
|
|
src_1 = get_src_char (src_table, entry->src, 1);
|
|
|
|
if (key[1] > src_1)
|
|
return 1;
|
|
else if (key[1] < src_1)
|
|
return -1;
|
|
}
|
|
else if (key[1])
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const gchar *
|
|
lookup_in_mapping (const struct mapping_entry *mapping,
|
|
gint mapping_size,
|
|
const gunichar *key,
|
|
gint *result_len,
|
|
gint *key_consumed)
|
|
{
|
|
const struct mapping_entry *hit;
|
|
|
|
hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry);
|
|
|
|
if (hit == NULL)
|
|
return NULL;
|
|
|
|
*key_consumed = get_length (hit->src);
|
|
*result_len = get_length (hit->ascii);
|
|
|
|
return get_ascii_item(ascii_table, hit->ascii);
|
|
}
|
|
|
|
static const gchar *
|
|
lookup_in_chain (const guint8 *chain,
|
|
const gunichar *key,
|
|
gint *result_len,
|
|
gint *key_consumed)
|
|
{
|
|
const gchar *result;
|
|
|
|
while (*chain != 0xff)
|
|
{
|
|
result = lookup_in_item (*chain, key, result_len, key_consumed);
|
|
|
|
if (result)
|
|
return result;
|
|
|
|
chain++;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static const gchar *
|
|
lookup_in_item (guint item_id,
|
|
const gunichar *key,
|
|
gint *result_len,
|
|
gint *key_consumed)
|
|
{
|
|
if (item_id & 0x80)
|
|
{
|
|
const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
|
|
|
|
return lookup_in_chain (chain, key, result_len, key_consumed);
|
|
}
|
|
else
|
|
{
|
|
const struct mapping_range *range = &mapping_ranges[item_id];
|
|
|
|
return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed);
|
|
}
|
|
}
|
|
|
|
static gint
|
|
compare_locale_entry (gconstpointer user_data,
|
|
gconstpointer data)
|
|
{
|
|
const struct locale_entry *entry = data;
|
|
const gchar *key = user_data;
|
|
|
|
return strcmp (key, &locale_names[entry->name_offset]);
|
|
}
|
|
|
|
static gboolean
|
|
lookup_item_id_for_one_locale (const gchar *key,
|
|
guint *item_id)
|
|
{
|
|
const struct locale_entry *hit;
|
|
|
|
hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry);
|
|
|
|
if (hit == NULL)
|
|
return FALSE;
|
|
|
|
*item_id = hit->item_id;
|
|
return TRUE;
|
|
}
|
|
|
|
static guint
|
|
lookup_item_id_for_locale (const gchar *locale)
|
|
{
|
|
gchar key[MAX_LOCALE_NAME + 1];
|
|
const gchar *language;
|
|
guint language_len;
|
|
const gchar *territory = NULL;
|
|
guint territory_len = 0;
|
|
const gchar *modifier = NULL;
|
|
guint modifier_len = 0;
|
|
const gchar *next_char;
|
|
guint id;
|
|
|
|
/* As per POSIX, a valid locale looks like:
|
|
*
|
|
* language[_territory][.codeset][@modifier]
|
|
*/
|
|
language = locale;
|
|
language_len = strcspn (language, "_.@");
|
|
next_char = language + language_len;
|
|
|
|
if (*next_char == '_')
|
|
{
|
|
territory = next_char;
|
|
territory_len = strcspn (territory + 1, "_.@") + 1;
|
|
next_char = territory + territory_len;
|
|
}
|
|
|
|
if (*next_char == '.')
|
|
{
|
|
const gchar *codeset;
|
|
guint codeset_len;
|
|
|
|
codeset = next_char;
|
|
codeset_len = strcspn (codeset + 1, "_.@") + 1;
|
|
next_char = codeset + codeset_len;
|
|
}
|
|
|
|
if (*next_char == '@')
|
|
{
|
|
modifier = next_char;
|
|
modifier_len = strcspn (modifier + 1, "_.@") + 1;
|
|
next_char = modifier + modifier_len;
|
|
}
|
|
|
|
/* What madness is this? */
|
|
if (language_len == 0 || *next_char)
|
|
return default_item_id;
|
|
|
|
/* We are not interested in codeset.
|
|
*
|
|
* For this locale:
|
|
*
|
|
* aa_BB@cc
|
|
*
|
|
* try in this order:
|
|
*
|
|
* Note: we have no locales of the form aa_BB@cc in the database.
|
|
*
|
|
* 1. aa@cc
|
|
* 2. aa_BB
|
|
* 3. aa
|
|
*/
|
|
|
|
/* 1. */
|
|
if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
|
|
{
|
|
memcpy (key, language, language_len);
|
|
memcpy (key + language_len, modifier, modifier_len);
|
|
key[language_len + modifier_len] = '\0';
|
|
|
|
if (lookup_item_id_for_one_locale (key, &id))
|
|
return id;
|
|
}
|
|
|
|
/* 2. */
|
|
if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
|
|
{
|
|
memcpy (key, language, language_len);
|
|
memcpy (key + language_len, territory, territory_len);
|
|
key[language_len + territory_len] = '\0';
|
|
|
|
if (lookup_item_id_for_one_locale (key, &id))
|
|
return id;
|
|
}
|
|
|
|
/* 3. */
|
|
if (language_len <= MAX_LOCALE_NAME)
|
|
{
|
|
memcpy (key, language, language_len);
|
|
key[language_len] = '\0';
|
|
|
|
if (lookup_item_id_for_one_locale (key, &id))
|
|
return id;
|
|
}
|
|
|
|
return default_item_id;
|
|
}
|
|
|
|
static guint
|
|
get_default_item_id (void)
|
|
{
|
|
static guint item_id;
|
|
static gboolean done;
|
|
|
|
/* Doesn't need to be locked -- no harm in doing it twice. */
|
|
if (!done)
|
|
{
|
|
const gchar *locale;
|
|
|
|
locale = setlocale (LC_CTYPE, NULL);
|
|
item_id = lookup_item_id_for_locale (locale);
|
|
done = TRUE;
|
|
}
|
|
|
|
return item_id;
|
|
}
|
|
|
|
/**
|
|
* g_str_to_ascii:
|
|
* @str: a string, in UTF-8
|
|
* @from_locale: (nullable): the source locale, if known
|
|
*
|
|
* Transliterate @str to plain ASCII.
|
|
*
|
|
* For best results, @str should be in composed normalised form.
|
|
*
|
|
* This function performs a reasonably good set of character
|
|
* replacements. The particular set of replacements that is done may
|
|
* change by version or even by runtime environment.
|
|
*
|
|
* If the source language of @str is known, it can used to improve the
|
|
* accuracy of the translation by passing it as @from_locale. It should
|
|
* be a valid POSIX locale string (of the form
|
|
* `language[_territory][.codeset][@modifier]`).
|
|
*
|
|
* If @from_locale is %NULL then the current locale is used.
|
|
*
|
|
* If you want to do translation for no specific locale, and you want it
|
|
* to be done independently of the currently locale, specify `"C"` for
|
|
* @from_locale.
|
|
*
|
|
* Returns: a string in plain ASCII
|
|
*
|
|
* Since: 2.40
|
|
**/
|
|
gchar *
|
|
g_str_to_ascii (const gchar *str,
|
|
const gchar *from_locale)
|
|
{
|
|
GString *result;
|
|
guint item_id;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (g_str_is_ascii (str))
|
|
return g_strdup (str);
|
|
|
|
if (from_locale)
|
|
item_id = lookup_item_id_for_locale (from_locale);
|
|
else
|
|
item_id = get_default_item_id ();
|
|
|
|
result = g_string_sized_new (strlen (str));
|
|
|
|
while (*str)
|
|
{
|
|
/* We only need to transliterate non-ASCII values... */
|
|
if (*str & 0x80)
|
|
{
|
|
gunichar key[MAX_KEY_SIZE];
|
|
const gchar *r;
|
|
gint consumed;
|
|
gint r_len;
|
|
gunichar c;
|
|
|
|
G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
|
|
|
|
c = g_utf8_get_char (str);
|
|
|
|
/* This is where it gets evil...
|
|
*
|
|
* We know that MAX_KEY_SIZE is 2. We also know that we
|
|
* only want to try another character if it's non-ascii.
|
|
*/
|
|
str = g_utf8_next_char (str);
|
|
|
|
key[0] = c;
|
|
if (*str & 0x80)
|
|
key[1] = g_utf8_get_char (str);
|
|
else
|
|
key[1] = 0;
|
|
|
|
r = lookup_in_item (item_id, key, &r_len, &consumed);
|
|
|
|
/* If we failed to map two characters, try again with one.
|
|
*
|
|
* gconv behaviour is a bit weird here -- it seems to
|
|
* depend in the randomness of the binary search and the
|
|
* size of the input buffer as to what result we get here.
|
|
*
|
|
* Doing it this way is more work, but should be
|
|
* more-correct.
|
|
*/
|
|
if (r == NULL && key[1])
|
|
{
|
|
key[1] = 0;
|
|
r = lookup_in_item (item_id, key, &r_len, &consumed);
|
|
}
|
|
|
|
if (r != NULL)
|
|
{
|
|
g_string_append_len (result, r, r_len);
|
|
if (consumed == 2)
|
|
/* If it took both then skip again */
|
|
str = g_utf8_next_char (str);
|
|
}
|
|
else /* no match found */
|
|
g_string_append_c (result, '?');
|
|
}
|
|
else if (*str & 0x80) /* Out-of-range non-ASCII case */
|
|
{
|
|
g_string_append_c (result, '?');
|
|
str = g_utf8_next_char (str);
|
|
}
|
|
else /* ASCII case */
|
|
g_string_append_c (result, *str++);
|
|
}
|
|
|
|
return g_string_free (result, FALSE);
|
|
}
|