Merge branch 'normalize-utf8-bounds-checking' into 'main'

g_utf8_normalize: don't read past the end of the buffer

See merge request GNOME/glib!3341
2025-01-27 22:46:15 +01:00 · 2023-04-14 16:14:54 +00:00 · 2023-04-14 16:14:54 +00:00 · 353f2e4b3c
commit 353f2e4b3c
parent 8b45b81150 c82f9adb19
2 changed files with 71 additions and 6 deletions
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@ -388,9 +388,33 @@ _g_utf8_normalize_wc (const gchar    *str,
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
-      gunichar wc = g_utf8_get_char (p);
+      const char *next, *between;
      gunichar wc;
-      if (wc >= SBase && wc < SBase + SCount)
+      next = g_utf8_next_char (p);
      /* Avoid reading truncated multibyte characters
         which run past the end of the buffer */
      if (max_len < 0)
        {
          /* Does the character contain a NUL terminator? */
          for (between = &p[1]; between < next; between++)
            {
              if (G_UNLIKELY (!*between))
                return NULL;
            }
        }
      else
        {
          if (G_UNLIKELY (next > str + max_len))
            return NULL;
        }
      wc = g_utf8_get_char (p);
      if (G_UNLIKELY (wc == (gunichar) -1))
        {
          return NULL;
        }
      else if (wc >= SBase && wc < SBase + SCount)
        {
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
@ -406,7 +430,7 @@ _g_utf8_normalize_wc (const gchar    *str,
            n_wc++;
        }
-      p = g_utf8_next_char (p);
+      p = next;
    }
  wc_buffer = g_new (gunichar, n_wc + 1);
@ -548,10 +572,13 @@ g_utf8_normalize (const gchar    *str,
 		  GNormalizeMode  mode)
 {
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
-  gchar *result;
+  gchar *result = NULL;
-  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
+  if (G_LIKELY (result_wc != NULL))
-  g_free (result_wc);
+    {
      result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
      g_free (result_wc);
    }
  return result;
 }
--- a/glib/tests/unicode-normalize.c
+++ b/glib/tests/unicode-normalize.c
@ -146,12 +146,50 @@ test_unicode_normalize (void)
  g_string_free (buffer, TRUE);
 }
 static void
 test_unicode_normalize_invalid (void)
 {
  /* Table of malformed inputs.  Each row is a length limit handed to
   * g_utf8_normalize() together with the raw bytes to normalize; a
   * negative limit is passed for rows that rely on the NUL terminator
   * instead of an explicit length.  Every row must yield NULL. */
  const struct
  {
    gssize max_len;
    const gchar *str;
  } test_vectors[] = {
    /* A multibyte sequence is cut short, either by the terminating NUL
     * (max_len == -1) or by the explicit length limit */
    { -1, "\xC0" },
    { 1, "\xC0\x80" },
    { -1, "\xE0\x80" },
    { 2, "\xE0\x80\x80" },
    { -1, "\xF0\x80\x80" },
    { 3, "\xF0\x80\x80\x80" },
    { -1, "\xF8\x80\x80\x80" },
    { 4, "\xF8\x80\x80\x80\x80" },
    /* Same, but the truncated sequence follows other bytes */
    { 3, "\x20\xE2\x84\xAA" },
    { -1, "\x20\xE2\x00\xAA" },
    { -1, "\xC0\x80\xE0\x80" },
    { 4, "\xC0\x80\xE0\x80\x80" },
    /* A multibyte sequence in the middle of the input is malformed */
    { -1, "\xED\x85\x9C\xED\x15\x9C\xED\x85\x9C" },
  };
  gsize i = 0;

  /* Walk the table in order; the logged index identifies the failing row */
  while (i < G_N_ELEMENTS (test_vectors))
    {
      g_test_message ("Invalid UTF-8 vector %" G_GSIZE_FORMAT, i);

      g_assert_null (g_utf8_normalize (test_vectors[i].str,
                                       test_vectors[i].max_len,
                                       G_NORMALIZE_ALL));

      i++;
    }
 }
 int
 main (int argc, char **argv)
 {
  int status;

  g_test_init (&argc, &argv, NULL);

  /* Register the normalization test cases under the /unicode/ prefix */
  g_test_add_func ("/unicode/normalize",
                   test_unicode_normalize);
  g_test_add_func ("/unicode/normalize-invalid",
                   test_unicode_normalize_invalid);

  /* Run everything registered above; the result becomes the exit code */
  status = g_test_run ();
  return status;
 }