gutf8: Drop ifunc code and always call strlen() when validating UTF-8

This fixes a heap buffer overflow read in `g_utf8_validate()` and
`g_str_is_ascii()`, at the cost of always calling `strlen()` on the
input string if its length isn’t known already.

The overflow read was not a security vulnerability, but getting valgrind
and asan to understand that, across all platforms and build
configurations, doesn’t seem to be possible with the resources available
to us. In particular, the `ifunc` approach doesn’t work on muslc, and
doesn’t work when statically linked.

The UTF-8 validation code should still be faster than the old approach
(GLib 2.82 and older), as `strlen()` is SIMD-accelerated in glibc, and
UTF-8 validation is SIMD accelerated in GLib. The combination of the two
should still be faster than the bytewise read loop we used to have.

Unfortunately, correctness and testability have to be prioritised over
absolute performance.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Fixes: #3493
Fixes: #3511
Fixes: #3526
This commit is contained in:
Philip Withnall 2024-11-19 13:49:34 +00:00
parent dfe252429c
commit 96205fc7fe
No known key found for this signature in database
GPG Key ID: C5C42CFB268637CA

View File

@ -42,10 +42,6 @@
#include "glibintl.h"
#include "gvalgrind.h"
#if g_macro__has_attribute(ifunc) && !defined(G_OS_WIN32)
#define HAVE_WORKING_IFUNC_ATTRIBUTE 1
#endif
#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
@ -1607,12 +1603,6 @@ load_u8 (gconstpointer memory,
# define _attribute_aligned(n)
#endif
/* See the HAVE_WORKING_IFUNC_ATTRIBUTE definition for an explanation of the
* no_sanitize_address annotation.
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
static inline gsize
load_word (gconstpointer memory,
gsize offset)
@ -1648,7 +1638,7 @@ utf8_verify_ascii (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
gsize len = lenp ? *lenp : strlen (str);
while (len > 0 && load_u8 (str, 0) < 128)
{
@ -1697,7 +1687,7 @@ utf8_verify (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
gsize len = lenp ? *lenp : strlen (str);
/* See Unicode 10.0.0, Chapter 3, Section D92 */
@ -1835,77 +1825,6 @@ out:
*lenp = len;
}
static gboolean
g_utf8_validate_native (const char *str,
gssize max_len,
const char **end)
{
if (max_len >= 0)
return g_utf8_validate_len (str, max_len, end);
utf8_verify (&str, NULL);
if (end != NULL)
*end = str;
return *str == 0;
}
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
/* The fast implementation of UTF-8 validation in `utf8_verify()` technically
* uses undefined behaviour when the string length is not provided (i.e. when
* its looking for a trailing nul terminator): when doing word-sized reads of
* the string, it can read up to the word size (minus one byte) beyond the end
* of the string in order to find the nul terminator.
*
* While this is guaranteed to not cause a page fault (at worst, the nul
* terminator could be in the final word of the page, and the code wont read
* any further than that), it is still technically undefined behaviour in C,
* because were reading off the end of an array.
*
* We dont *think* this can cause any bugs due to compiler optimisations,
* because glibc does exactly the same thing in its string handling code, and
* that code has been extensively tested. For example:
* https://github.com/bminor/glibc/blob/2c1903cbbac0022153a67776f474c221250ad6ed/string/strchrnul.c
*
* However, both valgrind and asan warn about the read beyond the end of the
* array (a heap buffer overflow read). Theyre right to do this (they cant
* know the read is bounded to the word size minus one, and guaranteed to not
* cross a page boundary), but its annoying for any application which calls
* `g_utf8_validate()`.
*
* Use an [indirect function (`ifunc`)](https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-ifunc-function-attribute)
* to use a fallback implementation of `g_utf8_validate()` when running under
* valgrind. This is resolved at load time using `resolve_g_utf8_validate()`.
*
* Similarly, mark the real implementation so that its not instrumented by asan
* using `no_sanitize_address`.
*/
static gboolean
g_utf8_validate_valgrind (const char *str,
gssize max_len,
const char **end)
{
if (max_len < 0)
max_len = strlen (str);
return g_utf8_validate_len (str, max_len, end);
}
typedef gboolean (*GUtf8ValidateFunc) (const char *str,
gssize max_len,
const char **end);
static GUtf8ValidateFunc
resolve_g_utf8_validate (void)
{
if (RUNNING_ON_VALGRIND)
return g_utf8_validate_valgrind;
else
return g_utf8_validate_native;
}
#endif /* HAVE_WORKING_IFUNC_ATTRIBUTE */
/**
* g_utf8_validate:
* @str: (array length=max_len) (element-type guint8): a pointer to character data
@ -1932,20 +1851,15 @@ resolve_g_utf8_validate (void)
*
* Returns: `TRUE` if the text was valid UTF-8
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
gboolean
g_utf8_validate (const char *str,
gssize max_len,
const gchar **end)
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
__attribute__((ifunc ("resolve_g_utf8_validate")));
#else
{
return g_utf8_validate_native (str, max_len, end);
size_t max_len_unsigned = (max_len >= 0) ? (size_t) max_len : strlen (str);
return g_utf8_validate_len (str, max_len_unsigned, end);
}
#endif
/**
* g_utf8_validate_len:
@ -1975,38 +1889,6 @@ g_utf8_validate_len (const char *str,
return max_len == 0;
}
static gboolean
g_str_is_ascii_native (const char *str)
{
utf8_verify_ascii (&str, NULL);
return *str == 0;
}
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
/* See above comment about `ifunc` use for g_utf8_validate(). */
static gboolean
g_str_is_ascii_valgrind (const char *str)
{
size_t len = strlen (str);
utf8_verify_ascii (&str, &len);
return *str == 0;
}
typedef gboolean (*GStrIsAsciiFunc) (const char *str);
static GStrIsAsciiFunc
resolve_g_str_is_ascii (void)
{
if (RUNNING_ON_VALGRIND)
return g_str_is_ascii_valgrind;
else
return g_str_is_ascii_native;
}
#endif /* HAVE_WORKING_IFUNC_ATTRIBUTE */
/**
* g_str_is_ascii:
* @str: a string
@ -2018,18 +1900,13 @@ resolve_g_str_is_ascii (void)
*
* Since: 2.40
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
gboolean
g_str_is_ascii (const gchar *str)
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
__attribute__((ifunc ("resolve_g_str_is_ascii")));
#else
{
return g_str_is_ascii_native (str);
utf8_verify_ascii (&str, NULL);
return *str == 0;
}
#endif
/**
* g_unichar_validate: