Merge branch 'wip/chergert/valgrind-utf8-check' into 'main'

glib/gutf8: use ifunc to check for valgrind

See merge request GNOME/glib!4344
2025-08-22 17:08:53 +02:00 · 2024-10-17 17:57:48 +00:00
parent 85b53d6317 ec7cf334db
commit 6cabc7bbf8
2 changed files with 79 additions and 12 deletions
--- a/glib/gmacros.h
+++ b/glib/gmacros.h
@@ -190,6 +190,8 @@
 #define g_macro__has_attribute_fallthrough G_GNUC_CHECK_VERSION (6, 0)
 #define g_macro__has_attribute_may_alias G_GNUC_CHECK_VERSION (3, 3)
 #define g_macro__has_attribute_warn_unused_result G_GNUC_CHECK_VERSION (3, 4)
+#define g_macro__has_attribute_no_sanitize_address 0 /* reported as unavailable in this table; NOTE(review): presumably the fallback used when __has_attribute is missing — confirm */
+#define g_macro__has_attribute_ifunc 0 /* likewise 0, so g_macro__has_attribute(ifunc) guards evaluate false when this table is in effect */

 #endif

--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -40,6 +40,7 @@
 #include "gtypes.h"
 #include "gthread.h"
 #include "glibintl.h"
+#include "gvalgrind.h"

 #define UTF8_COMPUTE(Char, Mask, Len)					      \
  if (Char < 128)							      \
@@ -1824,6 +1825,72 @@ out:
    *lenp = len;
 }

+static gboolean
+g_utf8_validate_native (const char  *str,
+                        gssize       max_len,
+                        const char **end)
+{ /* Default implementation behind g_utf8_validate(). */
+  if (max_len >= 0) /* explicit length given: use the length-bounded validator */
+    return g_utf8_validate_len (str, max_len, end);
+  /* Negative max_len: str is nul-terminated; utf8_verify() is handed &str and presumably advances it to where validation stopped. */
+  utf8_verify (&str, NULL);
+  /* Report the stopping position if the caller asked for it. */
+  if (end != NULL)
+    *end = str;
+  /* TRUE only when validation stopped on the nul terminator. */
+  return *str == 0;
+}
+
+#if g_macro__has_attribute(ifunc) && !defined(G_OS_WIN32)
+/* The fast implementation of UTF-8 validation in `utf8_verify()` technically
+ * uses undefined behaviour when the string length is not provided (i.e. when
+ * it’s looking for a trailing nul terminator): when doing word-sized reads of
+ * the string, it can read up to the word size (minus one byte) beyond the end
+ * of the string in order to find the nul terminator.
+ *
+ * While this is guaranteed to not cause a page fault (at worst, the nul
+ * terminator could be in the final word of the page, and the code won’t read
+ * any further than that), it is still technically undefined behaviour in C,
+ * because we’re reading off the end of an array.
+ *
+ * We don’t *think* this can cause any bugs due to compiler optimisations,
+ * because glibc does exactly the same thing in its string handling code, and
+ * that code has been extensively tested. For example:
+ * https://github.com/bminor/glibc/blob/2c1903cbbac0022153a67776f474c221250ad6ed/string/strchrnul.c
+ *
+ * However, both valgrind and asan warn about the read beyond the end of the
+ * array (a ‘heap buffer overflow read’). They’re right to do this (they can’t
+ * know the read is bounded to the word size minus one, and guaranteed to not
+ * cross a page boundary), but it’s annoying for any application which calls
+ * `g_utf8_validate()`.
+ *
+ * Use an [indirect function (`ifunc`)](https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-ifunc-function-attribute)
+ * to use a fallback implementation of `g_utf8_validate()` when running under
+ * valgrind. This is resolved at load time using `resolve_g_utf8_validate()`.
+ *
+ * Similarly, use `no_sanitize_address` to mark the real implementation so
+ * that it’s not instrumented by asan.
+ */
+static gboolean
+g_utf8_validate_valgrind (const char  *str,
+                          gssize       max_len,
+                          const char **end)
+{ /* Fallback selected under valgrind: never calls utf8_verify() directly. */
+  if (max_len < 0) /* nul-terminated input: measure it with strlen() first */
+    max_len = strlen (str);
+  /* Always validate with an explicit length. */
+  return g_utf8_validate_len (str, max_len, end);
+}
+
+static gboolean (*resolve_g_utf8_validate (void)) (const char *, gssize, const char **)
+{ /* ifunc resolver: returns the function that g_utf8_validate() should bind to. */
+  if (RUNNING_ON_VALGRIND) /* presumably provided by gvalgrind.h; true when running under valgrind */
+    return g_utf8_validate_valgrind;
+  else
+    return g_utf8_validate_native;
+}
+#endif
+
 /**
 * g_utf8_validate:
 * @str: (array length=max_len) (element-type guint8): a pointer to character data
@@ -1850,22 +1917,20 @@ out:
 * 
 * Returns: `TRUE` if the text was valid UTF-8
 */
+#if g_macro__has_attribute(no_sanitize_address)
+  __attribute__((no_sanitize_address)) /* NOTE(review): in the ifunc branch below this decorates a bodiless declaration — confirm the over-read in utf8_verify() is actually exempt from asan */
+#endif
 gboolean
 g_utf8_validate (const char   *str,
-		 gssize        max_len,    
-		 const gchar **end)
-
+                 gssize        max_len,
+                 const gchar **end)
+#if g_macro__has_attribute(ifunc) && !defined(G_OS_WIN32)
+  __attribute__((ifunc ("resolve_g_utf8_validate"))); /* implementation is chosen by the resolver named here */
+#else /* no ifunc support (or Windows): plain wrapper around the native implementation */
 {
-  if (max_len >= 0)
-    return g_utf8_validate_len (str, max_len, end);
-
-  utf8_verify (&str, NULL);
-
-  if (end != NULL)
-    *end = str;
-
-  return *str == 0;
+  return g_utf8_validate_native (str, max_len, end);
 }
+#endif /* ifunc */

 /**
 * g_utf8_validate_len: