From b555119ca3cf8f4e9dd49e2c6585c96ef74e649d Mon Sep 17 00:00:00 2001
From: nightuser <22158-nightuser@users.noreply.gitlab.gnome.org>
Date: Thu, 14 Nov 2019 18:38:03 +0000
Subject: [PATCH] gunicode: Fix UB in gutf8.c and utf8-pointer test

In glib/gutf8.c there was an UB in function g_utf8_find_prev_char when
p == str. In this case we substract one from p and now p points to a
location outside of the boundary of str. It's a UB by the standard.
Since this function are meant to be fast, we don't check the boundary
conditions.

Fix glib/tests/utf8-pointer test. It failed due to the UB described
above and aggressive optimisation when -O2 and LTO are enabled. Some
compilers (e.g. GCC with major version >= 8) create an optimised version
of g_utf8_find_prev_char with the first argument fixed and stored
somewhere else (with a different pointer). It can be solved with either
marking str as volatile or creating a copy of str in memory. We choose
the second approach since it's more explicit solution.

Add additional checks to glib/tests/utf8-pointer test.

Closes #1917
---
 glib/gutf8.c              |   7 +--
 glib/tests/utf8-pointer.c | 107 +++++++++++++++++++++++++-------------
 2 files changed, 76 insertions(+), 38 deletions(-)

diff --git a/glib/gutf8.c b/glib/gutf8.c
index b67d09362..d1e8879ae 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -139,11 +139,12 @@ const gchar * const g_utf8_skip = utf8_skip_data;
  * Returns: (transfer none) (nullable): a pointer to the found character or %NULL.
  */
 gchar *
-g_utf8_find_prev_char (const char *str,
-		       const char *p)
+g_utf8_find_prev_char (const gchar *str,
+		       const gchar *p)
 {
-  for (--p; p >= str; --p)
+  while (p > str)
     {
+      --p;
       if ((*p & 0xc0) != 0x80)
 	return (gchar *)p;
     }
diff --git a/glib/tests/utf8-pointer.c b/glib/tests/utf8-pointer.c
index c29ea3e95..58350960b 100644
--- a/glib/tests/utf8-pointer.c
+++ b/glib/tests/utf8-pointer.c
@@ -97,46 +97,83 @@ test_find (void)
    * U+0041 Latin Capital Letter A (\101)
    * U+1EB6 Latin Capital Letter A With Breve And Dot Below (\341\272\266)
    */
-  const gchar *str = "\340\254\213\360\220\244\200\101\341\272\266\0\101";
-  const gchar *p = str + strlen (str);
+#define TEST_STR "\340\254\213\360\220\244\200\101\341\272\266\0\101"
+  const gsize str_size = sizeof TEST_STR;
+  const gchar *str = TEST_STR;
+  const gchar str_array[] = TEST_STR;
+  const gchar * volatile str_volatile = TEST_STR;
+#undef TEST_STR
+  gchar *str_copy = g_malloc (str_size);
+  const gchar *p;
   const gchar *q;
+  memcpy (str_copy, str, str_size);
 
-  q = g_utf8_find_prev_char (str, p);
-  g_assert (q == str + 8);
-  q = g_utf8_find_prev_char (str, q);
-  g_assert (q == str + 7);
-  q = g_utf8_find_prev_char (str, q);
-  g_assert (q == str + 3);
-  q = g_utf8_find_prev_char (str, q);
-  g_assert (q == str);
-  q = g_utf8_find_prev_char (str, q);
-  g_assert (q == NULL);
+#define TEST_SET(STR)  \
+  G_STMT_START { \
+    p = STR + (str_size - 1); \
+    \
+    q = g_utf8_find_prev_char (STR, p); \
+    g_assert (q == STR + 12); \
+    q = g_utf8_find_prev_char (STR, q); \
+    g_assert (q == STR + 11); \
+    q = g_utf8_find_prev_char (STR, q); \
+    g_assert (q == STR + 8); \
+    q = g_utf8_find_prev_char (STR, q); \
+    g_assert (q == STR + 7); \
+    q = g_utf8_find_prev_char (STR, q); \
+    g_assert (q == STR + 3); \
+    q = g_utf8_find_prev_char (STR, q); \
+    g_assert (q == STR); \
+    q = g_utf8_find_prev_char (STR, q); \
+    g_assert_null (q); \
+    \
+    p = STR + 4; \
+    q = g_utf8_find_prev_char (STR, p); \
+    g_assert (q == STR + 3); \
+    \
+    p = STR + 2; \
+    q = g_utf8_find_prev_char (STR, p); \
+    g_assert (q == STR); \
+    \
+    p = STR + 2; \
+    q = g_utf8_find_next_char (p, NULL); \
+    g_assert (q == STR + 3); \
+    q = g_utf8_find_next_char (q, NULL); \
+    g_assert (q == STR + 7); \
+    \
+    q = g_utf8_find_next_char (p, STR + 6); \
+    g_assert (q == STR + 3); \
+    q = g_utf8_find_next_char (q, STR + 6); \
+    g_assert_null (q); \
+    \
+    q = g_utf8_find_next_char (STR, STR); \
+    g_assert_null (q); \
+    \
+    q = g_utf8_find_next_char (STR + strlen (STR), NULL); \
+    g_assert (q == STR + strlen (STR) + 1); \
+    \
+    /* Check return values when reaching the end of the string, \
+     * with @end set and unset. */ \
+    q = g_utf8_find_next_char (STR + 10, NULL); \
+    g_assert_nonnull (q); \
+    g_assert (*q == '\0'); \
+    \
+    q = g_utf8_find_next_char (STR + 10, STR + 11); \
+    g_assert_null (q); \
+  } G_STMT_END
 
-  p = str + 2;
-  q = g_utf8_find_next_char (p, NULL);
-  g_assert (q == str + 3);
-  q = g_utf8_find_next_char (q, NULL);
-  g_assert (q == str + 7);
+  TEST_SET(str_array);
+  TEST_SET(str_copy);
+  TEST_SET(str_volatile);
+  /* Starting with GCC 8 tests on @str with "-O2 -flto" in CFLAGS fail due to
+   * (incorrect?) constant propagation of @str into @g_utf8_find_prev_char. It
+   * doesn't happen if @TEST_STR doesn't contain \0 in the middle but the tests
+   * should cover all corner cases.
+   * For instance, see https://gitlab.gnome.org/GNOME/glib/issues/1917 */
 
-  q = g_utf8_find_next_char (p, str + 6);
-  g_assert (q == str + 3);
-  q = g_utf8_find_next_char (q, str + 6);
-  g_assert (q == NULL);
+#undef TEST_SET
 
-  q = g_utf8_find_next_char (str, str);
-  g_assert (q == NULL);
-
-  q = g_utf8_find_next_char (str + strlen (str), NULL);
-  g_assert (q == str + strlen (str) + 1);
-
-  /* Check return values when reaching the end of the string,
-   * with @end set and unset. */
-  q = g_utf8_find_next_char (str + 10, NULL);
-  g_assert_nonnull (q);
-  g_assert (*q == '\0');
-
-  q = g_utf8_find_next_char (str + 10, str + 11);
-  g_assert_null (q);
+  g_free (str_copy);
 }
 
 int main (int argc, char *argv[])