glib/glib/gcharset.c
Frederic Martinsons 782eb1f7af Add private functions to correctly convert datetime when LC_TIME is not UTF8
Functions (_g_get_time_charset and _g_get_ctype_charset) to get the LC_TIME and LC_CTYPE charsets
(by using nl_langinfo with _NL_TIME_CODESET and CODESET).
Other functions (_g_locale_time_to_utf8 and _g_locale_ctype_to_utf8) which use them and format
the input string accordingly.
Add new test cases with mixing UTF8 and non UTF8 LC_TIME along with UTF8
and non UTF8 LC_MESSAGES.

Closed #2055

Signed-off-by: Frederic Martinsons <frederic.martinsons@sigfox.com>
2021-03-27 09:28:10 +01:00

833 lines
22 KiB
C

/* gcharset.c - Charset information
*
* Copyright (C) 2011 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
#include "gcharset.h"
#include "gcharsetprivate.h"
#include "garray.h"
#include "genviron.h"
#include "ghash.h"
#include "gmessages.h"
#include "gstrfuncs.h"
#include "gthread.h"
#include "gthreadprivate.h"
#ifdef G_OS_WIN32
#include "gwin32.h"
#endif
#include "libcharset/libcharset.h"
#include <string.h>
#include <stdio.h>
#if (HAVE_LANGINFO_TIME_CODESET || HAVE_LANGINFO_CODESET)
#include <langinfo.h>
#endif
#include <locale.h>
#ifdef G_OS_WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
G_LOCK_DEFINE_STATIC (aliases);
static GHashTable *
get_alias_hash (void)
{
static GHashTable *alias_hash = NULL;
const char *aliases;
G_LOCK (aliases);
if (!alias_hash)
{
alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
aliases = _g_locale_get_charset_aliases ();
while (*aliases != '\0')
{
const char *canonical;
const char *alias;
const char **alias_array;
int count = 0;
alias = aliases;
aliases += strlen (aliases) + 1;
canonical = aliases;
aliases += strlen (aliases) + 1;
alias_array = g_hash_table_lookup (alias_hash, canonical);
if (alias_array)
{
while (alias_array[count])
count++;
}
alias_array = g_renew (const char *, alias_array, count + 2);
alias_array[count] = alias;
alias_array[count + 1] = NULL;
g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
}
}
G_UNLOCK (aliases);
return alias_hash;
}
/* As an abuse of the alias table, the following routine gets
 * the charsets that are aliases for the canonical name.
 *
 * The result is whatever the alias table stores for @canonical_name:
 * a NULL-terminated array of alias names, or NULL when the name has
 * no entry.
 */
const char **
_g_charset_get_aliases (const char *canonical_name)
{
  return g_hash_table_lookup (get_alias_hash (), canonical_name);
}
/*
 * g_utf8_get_charset_internal:
 * @raw_data: raw charset name to resolve through the alias table
 * @a: return location for the resolved charset name (not copied)
 *
 * Picks the charset name to report: a non-empty `CHARSET` environment
 * variable takes precedence, otherwise @raw_data is unaliased. If neither
 * yields a non-empty name, "US-ASCII" is reported.
 *
 * Returns: %TRUE if the reported name contains "UTF-8"
 */
static gboolean
g_utf8_get_charset_internal (const char  *raw_data,
                             const char **a)
{
  const char *name = g_getenv ("CHARSET");

  if (name == NULL || *name == '\0')
    {
      /* The libcharset code tries to be thread-safe without
       * a lock, but has a memory leak and a missing memory
       * barrier, so we lock for it
       */
      G_LOCK (aliases);
      name = _g_locale_charset_unalias (raw_data);
      G_UNLOCK (aliases);
    }

  if (name != NULL && *name != '\0')
    {
      *a = name;
      return (strstr (name, "UTF-8") != NULL) ? TRUE : FALSE;
    }

  /* Assume this for compatibility at present. */
  *a = "US-ASCII";
  return FALSE;
}
/* Per-lookup cache record used by the charset getters in this file. */
typedef struct _GCharsetCache {
  gboolean is_utf8;  /* value returned by g_utf8_get_charset_internal() for @raw */
  gchar *raw;        /* copy of the raw charset name the record was computed from */
  gchar *charset;    /* copy of the resolved charset name handed out to callers */
} GCharsetCache;
/* Destroy notify for a GCharsetCache: releases both owned strings
 * and then the record itself.
 */
static void
charset_cache_free (gpointer data)
{
  GCharsetCache *self = data;

  g_free (self->charset);
  g_free (self->raw);
  g_free (self);
}
/**
 * g_get_charset:
 * @charset: (out) (optional) (transfer none): return location for character set
 *   name, or %NULL.
 *
 * Obtains the character set for the [current locale][setlocale]; you
 * might use this character set as an argument to g_convert(), to convert
 * from the current locale's encoding to some other encoding. (Frequently
 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
 *
 * On Windows the character set returned by this function is the
 * so-called system default ANSI code-page. That is the character set
 * used by the "narrow" versions of C library and Win32 functions that
 * handle file names. It might be different from the character set
 * used by the C library's current locale.
 *
 * On Linux, the character set is found by consulting nl_langinfo() if
 * available. If not, the environment variables `LC_ALL`, `LC_CTYPE`, `LANG`
 * and `CHARSET` are queried in order.
 *
 * The return value is %TRUE if the locale's encoding is UTF-8, in that
 * case you can perhaps avoid calling g_convert().
 *
 * The string returned in @charset is not allocated, and should not be
 * freed.
 *
 * Returns: %TRUE if the returned charset is UTF-8
 */
gboolean
g_get_charset (const char **charset)
{
  static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
  GCharsetCache *entry;
  const gchar *current_raw;
  gboolean stale;

  entry = g_private_get (&cache_private);
  if (entry == NULL)
    entry = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));

  G_LOCK (aliases);
  current_raw = _g_locale_charset_raw ();
  G_UNLOCK (aliases);

  /* Only resolve again when the raw name differs from the one the
   * cached answer was computed for.
   */
  stale = (entry->raw == NULL || strcmp (entry->raw, current_raw) != 0);
  if (stale)
    {
      const gchar *resolved;

      g_free (entry->raw);
      g_free (entry->charset);
      entry->raw = g_strdup (current_raw);
      entry->is_utf8 = g_utf8_get_charset_internal (current_raw, &resolved);
      entry->charset = g_strdup (resolved);
    }

  if (charset != NULL)
    *charset = entry->charset;

  return entry->is_utf8;
}
/*
 * _g_get_time_charset:
 * @charset: return location for the charset name, or NULL
 *
 * Does the same as g_get_charset(), but for the charset that applies to
 * time formatting. No locale is changed: when HAVE_LANGINFO_TIME_CODESET
 * is defined the raw charset name is read with
 * nl_langinfo (_NL_TIME_CODESET); otherwise the same raw locale charset
 * as in g_get_charset() is used.
 *
 * The answer is kept in its own GPrivate cache and recomputed only when
 * the raw name changes. The string stored in @charset belongs to that
 * cache and must not be freed.
 *
 * Returns: %TRUE if the returned charset is UTF-8
 */
gboolean
_g_get_time_charset (const char **charset)
{
  static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
  GCharsetCache *entry = g_private_get (&cache_private);
  const gchar *current_raw;

  if (entry == NULL)
    entry = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));

#ifdef HAVE_LANGINFO_TIME_CODESET
  current_raw = nl_langinfo (_NL_TIME_CODESET);
#else
  G_LOCK (aliases);
  current_raw = _g_locale_charset_raw ();
  G_UNLOCK (aliases);
#endif

  if (entry->raw == NULL || strcmp (entry->raw, current_raw) != 0)
    {
      const gchar *resolved;

      g_free (entry->raw);
      g_free (entry->charset);
      entry->raw = g_strdup (current_raw);
      entry->is_utf8 = g_utf8_get_charset_internal (current_raw, &resolved);
      entry->charset = g_strdup (resolved);
    }

  if (charset != NULL)
    *charset = entry->charset;

  return entry->is_utf8;
}
/*
 * _g_get_ctype_charset:
 * @charset: return location for the charset name, or NULL
 *
 * Does the same as g_get_charset(), but for the LC_CTYPE charset.
 * No locale is changed: when HAVE_LANGINFO_CODESET is defined the raw
 * charset name is read with nl_langinfo (CODESET); otherwise the same
 * raw locale charset as in g_get_charset() is used.
 *
 * The answer is kept in its own GPrivate cache and recomputed only when
 * the raw name changes. The string stored in @charset belongs to that
 * cache and must not be freed.
 *
 * Returns: %TRUE if the returned charset is UTF-8
 */
gboolean
_g_get_ctype_charset (const char **charset)
{
  static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
  GCharsetCache *entry = g_private_get (&cache_private);
  const gchar *current_raw;

  if (entry == NULL)
    entry = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));

#ifdef HAVE_LANGINFO_CODESET
  current_raw = nl_langinfo (CODESET);
#else
  G_LOCK (aliases);
  current_raw = _g_locale_charset_raw ();
  G_UNLOCK (aliases);
#endif

  if (entry->raw == NULL || strcmp (entry->raw, current_raw) != 0)
    {
      const gchar *resolved;

      g_free (entry->raw);
      g_free (entry->charset);
      entry->raw = g_strdup (current_raw);
      entry->is_utf8 = g_utf8_get_charset_internal (current_raw, &resolved);
      entry->charset = g_strdup (resolved);
    }

  if (charset != NULL)
    *charset = entry->charset;

  return entry->is_utf8;
}
/**
 * g_get_codeset:
 *
 * Gets the character set for the current locale, as reported by
 * g_get_charset(), and hands back a private copy of its name.
 *
 * Returns: a newly allocated string containing the name
 *   of the character set. This string must be freed with g_free().
 */
gchar *
g_get_codeset (void)
{
  const gchar *current;

  g_get_charset (&current);

  return g_strdup (current);
}
/**
 * g_get_console_charset:
 * @charset: (out) (optional) (transfer none): return location for character set
 *   name, or %NULL.
 *
 * Obtains the character set used by the console attached to the process,
 * which is suitable for printing output to the terminal.
 *
 * Usually this matches the result returned by g_get_charset(), but in
 * environments where the locale's character set does not match the encoding
 * of the console this function tries to guess a more suitable value instead.
 *
 * On Windows the character set returned by this function is the
 * output code page used by the console associated with the calling process.
 * If the codepage can't be determined (for example because there is no
 * console attached) UTF-8 is assumed.
 *
 * The return value is %TRUE if the locale's encoding is UTF-8, in that
 * case you can perhaps avoid calling g_convert().
 *
 * The string returned in @charset is not allocated, and should not be
 * freed.
 *
 * Returns: %TRUE if the returned charset is UTF-8
 *
 * Since: 2.62
 */
gboolean
g_get_console_charset (const char **charset)
{
#ifdef G_OS_WIN32
  static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
  GCharsetCache *entry = g_private_get (&cache_private);
  const gchar *lang_env;
  unsigned int codepage;
  char buf[2 + 20 + 1]; /* "CP" + G_MAXUINT64 (to be safe) in decimal form (20 bytes) + "\0" */
  const gchar *raw = NULL;

  if (entry == NULL)
    entry = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));

  /* Step 1: $LANG (works for Cygwin/MSYS/MSYS2 and others using mintty).
   * Use the encoding that follows the dot, without any "@modifier" trailer.
   */
  lang_env = g_getenv ("LANG");
  if (lang_env != NULL && lang_env[0] != '\0')
    {
      const char *encoding = strchr (lang_env, '.');

      if (encoding != NULL)
        {
          const char *at_sign;

          encoding++;
          at_sign = strchr (encoding, '@');

          if (at_sign == NULL)
            {
              raw = encoding;
            }
          else if (at_sign - encoding < sizeof (buf))
            {
              /* Copy the part before '@' so it can be NUL-terminated. */
              memcpy (buf, encoding, at_sign - encoding);
              buf[at_sign - encoding] = '\0';
              raw = buf;
            }
        }
    }

  /* Step 2: ask the native win32 API for the console output code page. */
  if (raw == NULL)
    {
      codepage = GetConsoleOutputCP ();
      if (codepage != 0)
        {
          sprintf (buf, "CP%u", codepage);
          raw = buf;
        }
      else if (GetLastError () != ERROR_INVALID_HANDLE)
        {
          gchar *emsg = g_win32_error_message (GetLastError ());

          g_warning ("Failed to determine console output code page: %s. "
                     "Falling back to UTF-8", emsg);
          g_free (emsg);
        }
    }

  /* Step 3: nothing worked, fall back to UTF-8 (it's a universal default). */
  if (raw == NULL)
    raw = "UTF-8";

  if (entry->raw == NULL || strcmp (entry->raw, raw) != 0)
    {
      const gchar *resolved;

      g_free (entry->raw);
      g_free (entry->charset);
      entry->raw = g_strdup (raw);
      entry->is_utf8 = g_utf8_get_charset_internal (raw, &resolved);
      entry->charset = g_strdup (resolved);
    }

  if (charset != NULL)
    *charset = entry->charset;

  return entry->is_utf8;
#else
  /* assume the locale settings match the console encoding on non-Windows OSs */
  return g_get_charset (charset);
#endif
}
#ifndef G_OS_WIN32
/*
 * read_aliases:
 * @file: path of the locale alias file to read
 * @alias_table: table receiving newly allocated (alias, value) string pairs
 *
 * Reads @file line by line. Blank lines and lines starting with '#' are
 * skipped. On the remaining lines the first column ends at the first tab,
 * space or ':' and the second column is the following run of characters
 * up to the next tab or space. A pair is added only if the first column
 * is not already a key of @alias_table. A file that cannot be opened is
 * silently ignored.
 */
static void
read_aliases (const gchar *file,
              GHashTable  *alias_table)
{
  FILE *fp;
  char buf[256];

  fp = fopen (file, "r");
  if (fp == NULL)
    return;

  while (fgets (buf, sizeof (buf), fp) != NULL)
    {
      char *key_end;
      char *value = NULL;
      char *value_end;

      g_strstrip (buf);

      /* Comment or empty line */
      if (buf[0] == '#' || buf[0] == '\0')
        continue;

      /* Find where the first column stops. */
      key_end = buf;
      while (*key_end != '\0' &&
             *key_end != '\t' && *key_end != ' ' && *key_end != ':')
        key_end++;

      if (*key_end != '\0')
        {
          /* Terminate the key and skip the blanks before the value. */
          *key_end = '\0';
          value = key_end + 1;
          while (*value == '\t' || *value == ' ')
            value++;
        }

      /* The line only had one column */
      if (value == NULL || *value == '\0')
        continue;

      /* Cut the second column at its first blank, if any. */
      value_end = value;
      while (*value_end != '\0' && *value_end != '\t' && *value_end != ' ')
        value_end++;
      *value_end = '\0';

      /* First definition wins. */
      if (g_hash_table_lookup (alias_table, buf) == NULL)
        g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (value));
    }

  fclose (fp);
}
#endif
/*
 * unalias_lang:
 * @lang: a locale name
 *
 * On non-Windows builds, follows the entries read once from
 * /usr/share/locale/locale.alias until @lang has no entry or maps to
 * itself. The walk is abandoned, with a one-time warning, once more than
 * 30 levels have been followed. On Windows @lang is returned as is.
 *
 * Returns: @lang itself or a string owned by the alias table
 */
static char *
unalias_lang (char *lang)
{
#ifndef G_OS_WIN32
  static GHashTable *alias_table = NULL;
  char *target;
  int hops = 0;

  if (g_once_init_enter (&alias_table))
    {
      GHashTable *table = g_hash_table_new (g_str_hash, g_str_equal);
      read_aliases ("/usr/share/locale/locale.alias", table);
      g_once_init_leave (&alias_table, table);
    }

  for (;;)
    {
      target = g_hash_table_lookup (alias_table, lang);
      if (target == NULL || strcmp (target, lang) == 0)
        break;

      lang = target;

      if (hops++ == 30)
        {
          static gboolean said_before = FALSE;

          if (!said_before)
            g_warning ("Too many alias levels for a locale, "
                       "may indicate a loop");
          said_before = TRUE;
          break;
        }
    }
#endif

  return lang;
}
/* Bit flags naming the optional parts of a locale specification.
 * The bits are ordered from least significant (codeset) to most
 * significant (modifier).
 */
enum
{
  COMPONENT_CODESET   = 1 << 0,  /* ".codeset" part */
  COMPONENT_TERRITORY = 1 << 1,  /* "_territory" part */
  COMPONENT_MODIFIER  = 1 << 2   /* "@modifier" part */
};
/* Break an X/Open style locale specification into components.
 *
 * @language always receives a newly allocated string. @territory,
 * @codeset and @modifier are written only when the matching part is
 * present, and then include their leading '_', '.' or '@' separator.
 *
 * Returns a mask of COMPONENT_* flags telling which optional parts
 * were found.
 */
static guint
explode_locale (const gchar *locale,
                gchar      **language,
                gchar      **territory,
                gchar      **codeset,
                gchar      **modifier)
{
  const gchar *territory_start;
  const gchar *codeset_start;
  const gchar *modifier_start;
  const gchar *search_from;
  const gchar *end;
  guint found = 0;

  /* Each separator is only searched for after the previous one. */
  territory_start = strchr (locale, '_');

  search_from = (territory_start != NULL) ? territory_start : locale;
  codeset_start = strchr (search_from, '.');

  if (codeset_start != NULL)
    search_from = codeset_start;
  modifier_start = strchr (search_from, '@');

  /* Peel components off from right to left; `end` is where the
   * component currently being extracted stops.
   */
  if (modifier_start != NULL)
    {
      found |= COMPONENT_MODIFIER;
      *modifier = g_strdup (modifier_start);
      end = modifier_start;
    }
  else
    {
      end = locale + strlen (locale);
    }

  if (codeset_start != NULL)
    {
      found |= COMPONENT_CODESET;
      *codeset = g_strndup (codeset_start, end - codeset_start);
      end = codeset_start;
    }

  if (territory_start != NULL)
    {
      found |= COMPONENT_TERRITORY;
      *territory = g_strndup (territory_start, end - territory_start);
      end = territory_start;
    }

  *language = g_strndup (locale, end - locale);

  return found;
}
/*
 * Compute all interesting variants for a given locale name -
 * by stripping off different components of the value - and append
 * them, as newly allocated strings, to @array.
 *
 * For simplicity, we assume that the locale is in
 * X/Open format: language[_territory][.codeset][@modifier]
 *
 * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
 *       as well. We could just copy the code from glibc wholesale
 *       but it is big, ugly, and complicated, so I'm reluctant
 *       to do so when this should handle 99% of the time...
 */
static void
append_locale_variants (GPtrArray   *array,
                        const gchar *locale)
{
  gchar *language = NULL;
  gchar *territory = NULL;
  gchar *codeset = NULL;
  gchar *modifier = NULL;
  guint present;
  guint subset;

  g_return_if_fail (locale != NULL);

  present = explode_locale (locale, &language, &territory, &codeset, &modifier);

  /* Walk every value from the full mask down to 0 and keep those that
   * only use components actually present: the first variant is the
   * locale as given, the last one is the bare language.
   */
  for (subset = present + 1; subset-- > 0; )
    {
      gchar *variant;

      if ((subset & ~present) != 0)
        continue;

      variant = g_strconcat (language,
                             (subset & COMPONENT_TERRITORY) ? territory : "",
                             (subset & COMPONENT_CODESET) ? codeset : "",
                             (subset & COMPONENT_MODIFIER) ? modifier : "",
                             NULL);
      g_ptr_array_add (array, variant);
    }

  /* Components that were not found are still NULL, which g_free() accepts. */
  g_free (language);
  g_free (codeset);
  g_free (territory);
  g_free (modifier);
}
/**
 * g_get_locale_variants:
 * @locale: a locale identifier
 *
 * Returns a list of derived variants of @locale, which can be used to
 * e.g. construct locale-dependent filenames or search paths. The returned
 * list is sorted from most desirable to least desirable.
 * This function handles territory, charset and extra locale modifiers. See
 * [`setlocale(3)`](man:setlocale) for information about locales and their format.
 *
 * @locale itself is guaranteed to be returned in the output.
 *
 * For example, if @locale is `fr_BE`, then the returned list
 * is `fr_BE`, `fr`. If @locale is `en_GB.UTF-8@euro`, then the returned list
 * is `en_GB.UTF-8@euro`, `en_GB@euro`, `en.UTF-8@euro`, `en@euro`,
 * `en_GB.UTF-8`, `en_GB`, `en.UTF-8`, `en`. (The modifier is the most
 * significant component, then the territory, then the codeset.)
 *
 * If you need the list of variants for the current locale,
 * use g_get_language_names().
 *
 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
 *   allocated array of newly allocated strings with the locale variants. Free with
 *   g_strfreev().
 *
 * Since: 2.28
 */
gchar **
g_get_locale_variants (const gchar *locale)
{
  GPtrArray *variants;

  g_return_val_if_fail (locale != NULL, NULL);

  variants = g_ptr_array_sized_new (8);
  append_locale_variants (variants, locale);

  /* NULL-terminate so the result can be used as a string vector. */
  g_ptr_array_add (variants, NULL);

  return (gchar **) g_ptr_array_free (variants, FALSE);
}
/* The following is (partly) taken from the gettext package.
Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
/*
 * guess_category_value:
 * @category_name: name of the locale category variable, e.g. "LC_MESSAGES"
 *
 * Returns the first non-empty value among the environment variables
 * `LANGUAGE`, `LC_ALL`, @category_name and `LANG`, checked in that order.
 * On G_PLATFORM_WIN32 builds, g_win32_getlocale() is consulted when none
 * of them is set; otherwise NULL is returned.
 */
static const gchar *
guess_category_value (const gchar *category_name)
{
  /* Lookup order:
   *  - 'LANGUAGE' has the highest priority. This is a GNU extension.
   *  - 'LC_ALL' overrides all the other POSIX settings.
   *  - then comes the name of the desired category,
   *  - and the last possibility is 'LANG'.
   */
  const gchar *variables[4];
  size_t idx;

  variables[0] = "LANGUAGE";
  variables[1] = "LC_ALL";
  variables[2] = category_name;
  variables[3] = "LANG";

  for (idx = 0; idx < sizeof (variables) / sizeof (variables[0]); idx++)
    {
      const gchar *value = g_getenv (variables[idx]);

      if (value != NULL && value[0] != '\0')
        return value;
    }

#ifdef G_PLATFORM_WIN32
  /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
   * LANG, which we already did above. Oh well. The main point of
   * calling g_win32_getlocale() is to get the thread's locale as used
   * by Windows and the Microsoft C runtime (in the "English_United
   * States" format) translated into the Unixish format.
   */
  {
    char *win_locale = g_win32_getlocale ();
    const gchar *interned = g_intern_string (win_locale);

    g_free (win_locale);
    return interned;
  }
#endif

  return NULL;
}
/* Cached language list for one locale category. */
typedef struct _GLanguageNamesCache {
  gchar *languages;        /* copy of the language specification the list was built from */
  gchar **language_names;  /* NULL-terminated list of names derived from @languages */
} GLanguageNamesCache;
/* Destroy notify for a GLanguageNamesCache: frees the string vector,
 * the specification string and the record itself.
 */
static void
language_names_cache_free (gpointer data)
{
  GLanguageNamesCache *self = data;

  g_strfreev (self->language_names);
  g_free (self->languages);
  g_free (self);
}
/**
 * g_get_language_names:
 *
 * Computes a list of applicable locale names, which can be used to
 * e.g. construct locale-dependent filenames or search paths. The returned
 * list is sorted from most desirable to least desirable and always contains
 * the default locale "C".
 *
 * For example, if LANGUAGE=de:en_US, then the returned list is
 * "de", "en_US", "en", "C".
 *
 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
 * user. It is a shorthand for calling
 * g_get_language_names_with_category() with "LC_MESSAGES".
 *
 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
 *    that must not be modified or freed.
 *
 * Since: 2.6
 */
const gchar * const *
g_get_language_names (void)
{
  const gchar * const *names;

  names = g_get_language_names_with_category ("LC_MESSAGES");

  return names;
}
/**
 * g_get_language_names_with_category:
 * @category_name: a locale category name
 *
 * Computes a list of applicable locale names with a locale category name,
 * which can be used to construct the fallback locale-dependent filenames
 * or search paths. The returned list is sorted from most desirable to
 * least desirable and always contains the default locale "C".
 *
 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
 * @category_name, and `LANG` to find the list of locales specified by the
 * user.
 *
 * g_get_language_names() returns g_get_language_names_with_category("LC_MESSAGES").
 *
 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by
 *    the thread g_get_language_names_with_category was called from.
 *    It must not be modified or freed. It must be copied if planned to be used in another thread.
 *
 * Since: 2.58
 */
const gchar * const *
g_get_language_names_with_category (const gchar *category_name)
{
  static GPrivate cache_private = G_PRIVATE_INIT ((void (*)(gpointer)) g_hash_table_unref);
  GHashTable *per_category = g_private_get (&cache_private);
  const gchar *spec;
  GLanguageNamesCache *entry;

  g_return_val_if_fail (category_name != NULL, NULL);

  /* The table maps a category name to its cached language list. */
  if (per_category == NULL)
    {
      per_category = g_hash_table_new_full (g_str_hash, g_str_equal,
                                            g_free, language_names_cache_free);
      g_private_set (&cache_private, per_category);
    }

  spec = guess_category_value (category_name);
  if (spec == NULL)
    spec = "C";

  entry = (GLanguageNamesCache *) g_hash_table_lookup (per_category, category_name);

  /* Rebuild when there is no entry yet or the specification changed. */
  if (entry == NULL ||
      entry->languages == NULL ||
      strcmp (entry->languages, spec) != 0)
    {
      GPtrArray *names;
      gchar **parts;
      gchar **part;

      g_hash_table_remove (per_category, category_name);

      names = g_ptr_array_sized_new (8);

      /* The specification is a ':'-separated list of locale names. */
      parts = g_strsplit (spec, ":", 0);
      for (part = parts; *part != NULL; part++)
        append_locale_variants (names, unalias_lang (*part));
      g_strfreev (parts);

      g_ptr_array_add (names, g_strdup ("C"));
      g_ptr_array_add (names, NULL);

      entry = g_new0 (GLanguageNamesCache, 1);
      entry->languages = g_strdup (spec);
      entry->language_names = (gchar **) g_ptr_array_free (names, FALSE);
      g_hash_table_insert (per_category, g_strdup (category_name), entry);
    }

  return (const gchar * const *) entry->language_names;
}