From ec7cf334db8d4ea722413e6050cc92ce553dc4f7 Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Thu, 17 Oct 2024 18:26:19 +0100
Subject: [PATCH] gutf8: Add a comment explaining the ifunc and asan annotation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Why they’re necessary, why we _think_ the optimised implementation of
`g_utf8_validate()` is OK despite what valgrind and asan are telling us,
and how they work.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Helps: #3493
---
 glib/gutf8.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/glib/gutf8.c b/glib/gutf8.c
index 0f75c6b28..ba0a0e731 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -1842,6 +1842,35 @@ g_utf8_validate_native (const char  *str,
 }
 
 #if g_macro__has_attribute(ifunc) && !defined(G_OS_WIN32)
+/* The fast implementation of UTF-8 validation in `utf8_verify()` technically
+ * uses undefined behaviour when the string length is not provided (i.e. when
+ * it’s looking for a trailing nul terminator): when doing word-sized reads of
+ * the string, it can read up to the word size (minus one byte) beyond the end
+ * of the string in order to find the nul terminator.
+ *
+ * While this is guaranteed to not cause a page fault (at worst, the nul
+ * terminator could be in the final word of the page, and the code won’t read
+ * any further than that), it is still technically undefined behaviour in C,
+ * because we’re reading off the end of an array.
+ *
+ * We don’t *think* this can cause any bugs due to compiler optimisations,
+ * because glibc does exactly the same thing in its string handling code, and
+ * that code has been extensively tested. For example:
+ * https://github.com/bminor/glibc/blob/2c1903cbbac0022153a67776f474c221250ad6ed/string/strchrnul.c
+ *
+ * However, both valgrind and asan warn about the read beyond the end of the
+ * array (a ‘heap buffer overflow read’). They’re right to do this (they can’t
+ * know the read is bounded to the word size minus one, and guaranteed to not
+ * cross a page boundary), but it’s annoying for any application which calls
+ * `g_utf8_validate()`.
+ *
+ * Use an [indirect function (`ifunc`)](https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-ifunc-function-attribute)
+ * to use a fallback implementation of `g_utf8_validate()` when running under
+ * valgrind. This is resolved at load time using `resolve_g_utf8_validate()`.
+ *
+ * Similarly, mark the real implementation so that it’s not instrumented by asan
+ * using `no_sanitize_address`.
+ */
 static gboolean
 g_utf8_validate_valgrind (const char  *str,
                           gssize       max_len,