Merge branch 'normalize-utf8-bounds-checking' into 'main'

g_utf8_normalize: don't read past the end of the buffer. See merge request GNOME/glib!3341
2025-01-13 07:56:17 +01:00 · 2023-04-14 16:14:54 +00:00 · 2023-04-14 16:14:54 +00:00 · 353f2e4b3c
commit 353f2e4b3c
parent 8b45b81150 c82f9adb19
2 changed files with 71 additions and 6 deletions
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@ -388,9 +388,33 @@ _g_utf8_normalize_wc (const gchar    *str,
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
-      gunichar wc = g_utf8_get_char (p);
+      const char *next, *between;
+      gunichar wc;

-      if (wc >= SBase && wc < SBase + SCount)
+      next = g_utf8_next_char (p);
+      /* Avoid reading truncated multibyte characters
+         which run past the end of the buffer */
+      if (max_len < 0)
+        {
+          /* Does the character contain a NUL terminator? */
+          for (between = &p[1]; between < next; between++)
+            {
+              if (G_UNLIKELY (!*between))
+                return NULL;
+            }
+        }
+      else
+        {
+          if (G_UNLIKELY (next > str + max_len))
+            return NULL;
+        }
+      wc = g_utf8_get_char (p);
+
+      if (G_UNLIKELY (wc == (gunichar) -1))
+        {
+          return NULL;
+        }
+      else if (wc >= SBase && wc < SBase + SCount)
        {
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
@ -406,7 +430,7 @@ _g_utf8_normalize_wc (const gchar    *str,
            n_wc++;
        }

-      p = g_utf8_next_char (p);
+      p = next;
    }

  wc_buffer = g_new (gunichar, n_wc + 1);
@ -548,10 +572,13 @@ g_utf8_normalize (const gchar    *str,
 		  GNormalizeMode  mode)
 {
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
-  gchar *result;
+  gchar *result = NULL;

-  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
-  g_free (result_wc);
+  if (G_LIKELY (result_wc != NULL))
+    {
+      result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
+      g_free (result_wc);
+    }

  return result;
 }
--- a/glib/tests/unicode-normalize.c
+++ b/glib/tests/unicode-normalize.c
@ -146,12 +146,50 @@ test_unicode_normalize (void)
  g_string_free (buffer, TRUE);
 }

+static void
+test_unicode_normalize_invalid (void)
+{
+  /* Every vector below is malformed UTF-8; the loop asserts that g_utf8_normalize() returns NULL for each one */
+  const struct
+  {
+    gssize max_len; /* passed as the length argument of g_utf8_normalize(); -1 means rely on the NUL terminator */
+    const gchar *str; /* raw input bytes handed to g_utf8_normalize() */
+  } test_vectors[] = {
+    /* input ending with a truncated multibyte sequence, cut off either by max_len or by a NUL byte */
+    { -1, "\xC0" },
+    { 1, "\xC0\x80" },
+    { -1, "\xE0\x80" },
+    { 2, "\xE0\x80\x80" },
+    { -1, "\xF0\x80\x80" },
+    { 3, "\xF0\x80\x80\x80" },
+    { -1, "\xF8\x80\x80\x80" },
+    { 4, "\xF8\x80\x80\x80\x80" },
+    { 3, "\x20\xE2\x84\xAA" },
+    { -1, "\x20\xE2\x00\xAA" },
+    { -1, "\xC0\x80\xE0\x80" },
+    { 4, "\xC0\x80\xE0\x80\x80" },
+    /* input containing an invalid multibyte encoding (\x15 is not a UTF-8 continuation byte) */
+    { -1, "\xED\x85\x9C\xED\x15\x9C\xED\x85\x9C" },
+  };
+  gsize i;
+
+  for (i = 0; i < G_N_ELEMENTS (test_vectors); i++)
+    {
+      g_test_message ("Invalid UTF-8 vector %" G_GSIZE_FORMAT, i); /* report the vector index before checking it */
+      g_assert_null (g_utf8_normalize (test_vectors[i].str,
+                                       test_vectors[i].max_len,
+                                       G_NORMALIZE_ALL));
+    }
+}
+
 int
 main (int argc, char **argv) /* Test program entry point: registers the normalization tests and runs them */
 {
  g_test_init (&argc, &argv, NULL); /* receives pointers to argc/argv, so it may adjust them */

  g_test_add_func ("/unicode/normalize", test_unicode_normalize);
+  g_test_add_func ("/unicode/normalize-invalid", /* the invalid-input checks get their own test path */
+                   test_unicode_normalize_invalid);

  return g_test_run (); /* the process exit status is whatever g_test_run() returns */
 }