From b555119ca3cf8f4e9dd49e2c6585c96ef74e649d Mon Sep 17 00:00:00 2001 From: nightuser <22158-nightuser@users.noreply.gitlab.gnome.org> Date: Thu, 14 Nov 2019 18:38:03 +0000 Subject: [PATCH] gunicode: Fix UB in gutf8.c and utf8-pointer test In glib/gutf8.c there was an UB in function g_utf8_find_prev_char when p == str. In this case we substract one from p and now p points to a location outside of the boundary of str. It's a UB by the standard. Since this function are meant to be fast, we don't check the boundary conditions. Fix glib/tests/utf8-pointer test. It failed due to the UB described above and aggressive optimisation when -O2 and LTO are enabled. Some compilers (e.g. GCC with major version >= 8) create an optimised version of g_utf8_find_prev_char with the first argument fixed and stored somewhere else (with a different pointer). It can be solved with either marking str as volatile or creating a copy of str in memory. We choose the second approach since it's more explicit solution. Add additional checks to glib/tests/utf8-pointer test. Closes #1917 --- glib/gutf8.c | 7 +-- glib/tests/utf8-pointer.c | 107 +++++++++++++++++++++++++------------- 2 files changed, 76 insertions(+), 38 deletions(-) diff --git a/glib/gutf8.c b/glib/gutf8.c index b67d09362..d1e8879ae 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -139,11 +139,12 @@ const gchar * const g_utf8_skip = utf8_skip_data; * Returns: (transfer none) (nullable): a pointer to the found character or %NULL. */ gchar * -g_utf8_find_prev_char (const char *str, - const char *p) +g_utf8_find_prev_char (const gchar *str, + const gchar *p) { - for (--p; p >= str; --p) + while (p > str) { + --p; if ((*p & 0xc0) != 0x80) return (gchar *)p; } diff --git a/glib/tests/utf8-pointer.c b/glib/tests/utf8-pointer.c index c29ea3e95..58350960b 100644 --- a/glib/tests/utf8-pointer.c +++ b/glib/tests/utf8-pointer.c @@ -97,46 +97,83 @@ test_find (void) * U+0041 Latin Capital Letter A (\101) * U+1EB6 Latin Capital Letter A With Breve And Dot Below (\341\272\266) */ - const gchar *str = "\340\254\213\360\220\244\200\101\341\272\266\0\101"; - const gchar *p = str + strlen (str); +#define TEST_STR "\340\254\213\360\220\244\200\101\341\272\266\0\101" + const gsize str_size = sizeof TEST_STR; + const gchar *str = TEST_STR; + const gchar str_array[] = TEST_STR; + const gchar * volatile str_volatile = TEST_STR; +#undef TEST_STR + gchar *str_copy = g_malloc (str_size); + const gchar *p; const gchar *q; + memcpy (str_copy, str, str_size); - q = g_utf8_find_prev_char (str, p); - g_assert (q == str + 8); - q = g_utf8_find_prev_char (str, q); - g_assert (q == str + 7); - q = g_utf8_find_prev_char (str, q); - g_assert (q == str + 3); - q = g_utf8_find_prev_char (str, q); - g_assert (q == str); - q = g_utf8_find_prev_char (str, q); - g_assert (q == NULL); +#define TEST_SET(STR) \ + G_STMT_START { \ + p = STR + (str_size - 1); \ + \ + q = g_utf8_find_prev_char (STR, p); \ + g_assert (q == STR + 12); \ + q = g_utf8_find_prev_char (STR, q); \ + g_assert (q == STR + 11); \ + q = g_utf8_find_prev_char (STR, q); \ + g_assert (q == STR + 8); \ + q = g_utf8_find_prev_char (STR, q); \ + g_assert (q == STR + 7); \ + q = g_utf8_find_prev_char (STR, q); \ + g_assert (q == STR + 3); \ + q = g_utf8_find_prev_char (STR, q); \ + g_assert (q == STR); \ + q = g_utf8_find_prev_char (STR, q); \ + g_assert_null (q); \ + \ + p = STR + 4; \ + q = g_utf8_find_prev_char (STR, p); \ + g_assert (q == STR + 3); \ + \ + p = STR + 2; \ + q = g_utf8_find_prev_char (STR, p); \ + g_assert (q == STR); \ + \ + p = STR + 2; \ + q = g_utf8_find_next_char (p, NULL); \ + g_assert (q == STR + 3); \ + q = g_utf8_find_next_char (q, NULL); \ + g_assert (q == STR + 7); \ + \ + q = g_utf8_find_next_char (p, STR + 6); \ + g_assert (q == STR + 3); \ + q = g_utf8_find_next_char (q, STR + 6); \ + g_assert_null (q); \ + \ + q = g_utf8_find_next_char (STR, STR); \ + g_assert_null (q); \ + \ + q = g_utf8_find_next_char (STR + strlen (STR), NULL); \ + g_assert (q == STR + strlen (STR) + 1); \ + \ + /* Check return values when reaching the end of the string, \ + * with @end set and unset. */ \ + q = g_utf8_find_next_char (STR + 10, NULL); \ + g_assert_nonnull (q); \ + g_assert (*q == '\0'); \ + \ + q = g_utf8_find_next_char (STR + 10, STR + 11); \ + g_assert_null (q); \ + } G_STMT_END - p = str + 2; - q = g_utf8_find_next_char (p, NULL); - g_assert (q == str + 3); - q = g_utf8_find_next_char (q, NULL); - g_assert (q == str + 7); + TEST_SET(str_array); + TEST_SET(str_copy); + TEST_SET(str_volatile); + /* Starting with GCC 8 tests on @str with "-O2 -flto" in CFLAGS fail due to + * (incorrect?) constant propagation of @str into @g_utf8_find_prev_char. It + * doesn't happen if @TEST_STR doesn't contain \0 in the middle but the tests + * should cover all corner cases. + * For instance, see https://gitlab.gnome.org/GNOME/glib/issues/1917 */ - q = g_utf8_find_next_char (p, str + 6); - g_assert (q == str + 3); - q = g_utf8_find_next_char (q, str + 6); - g_assert (q == NULL); +#undef TEST_SET - q = g_utf8_find_next_char (str, str); - g_assert (q == NULL); - - q = g_utf8_find_next_char (str + strlen (str), NULL); - g_assert (q == str + strlen (str) + 1); - - /* Check return values when reaching the end of the string, - * with @end set and unset. */ - q = g_utf8_find_next_char (str + 10, NULL); - g_assert_nonnull (q); - g_assert (*q == '\0'); - - q = g_utf8_find_next_char (str + 10, str + 11); - g_assert_null (q); + g_free (str_copy); } int main (int argc, char *argv[])