Merge branch 'normalize-utf8-bounds-checking' into 'main'

g_utf8_normalize: don't read past the end of the buffer

See merge request GNOME/glib!3341
This commit is contained in:
Philip Withnall 2023-04-14 16:14:54 +00:00
commit 353f2e4b3c
2 changed files with 71 additions and 6 deletions

View File

@ -388,9 +388,33 @@ _g_utf8_normalize_wc (const gchar *str,
while ((max_len < 0 || p < str + max_len) && *p)
{
const gchar *decomp;
gunichar wc = g_utf8_get_char (p);
const char *next, *between;
gunichar wc;
if (wc >= SBase && wc < SBase + SCount)
next = g_utf8_next_char (p);
/* Avoid reading truncated multibyte characters
which run past the end of the buffer */
if (max_len < 0)
{
/* Does the character contain a NUL terminator? */
for (between = &p[1]; between < next; between++)
{
if (G_UNLIKELY (!*between))
return NULL;
}
}
else
{
if (G_UNLIKELY (next > str + max_len))
return NULL;
}
wc = g_utf8_get_char (p);
if (G_UNLIKELY (wc == (gunichar) -1))
{
return NULL;
}
else if (wc >= SBase && wc < SBase + SCount)
{
gsize result_len;
decompose_hangul (wc, NULL, &result_len);
@ -406,7 +430,7 @@ _g_utf8_normalize_wc (const gchar *str,
n_wc++;
}
p = g_utf8_next_char (p);
p = next;
}
wc_buffer = g_new (gunichar, n_wc + 1);
@ -548,10 +572,13 @@ g_utf8_normalize (const gchar *str,
GNormalizeMode mode)
{
gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
gchar *result;
gchar *result = NULL;
result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
g_free (result_wc);
if (G_LIKELY (result_wc != NULL))
{
result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
g_free (result_wc);
}
return result;
}

View File

@ -146,12 +146,50 @@ test_unicode_normalize (void)
g_string_free (buffer, TRUE);
}
/* Check that g_utf8_normalize() returns NULL for malformed UTF-8 input.
 *
 * Every vector pairs a byte string with the length limit handed to
 * g_utf8_normalize(). A negative limit makes the normalizer scan up to the
 * NUL terminator; a non-negative limit cuts the input off after that many
 * bytes, which is how several vectors end up with a truncated multibyte
 * sequence at the end of the readable range.
 */
static void
test_unicode_normalize_invalid (void)
{
  typedef struct
  {
    gssize max_len;     /* length limit passed to g_utf8_normalize() */
    const gchar *str;   /* raw input bytes */
  } InvalidVector;

  const InvalidVector test_vectors[] = {
    /* Inputs whose final multibyte sequence is cut short, either by the
     * NUL terminator or by max_len. */
    { -1, "\xC0" },
    { 1, "\xC0\x80" },
    { -1, "\xE0\x80" },
    { 2, "\xE0\x80\x80" },
    { -1, "\xF0\x80\x80" },
    { 3, "\xF0\x80\x80\x80" },
    { -1, "\xF8\x80\x80\x80" },
    { 4, "\xF8\x80\x80\x80\x80" },
    { 3, "\x20\xE2\x84\xAA" },
    { -1, "\x20\xE2\x00\xAA" },
    { -1, "\xC0\x80\xE0\x80" },
    { 4, "\xC0\x80\xE0\x80\x80" },
    /* Input with an invalid multibyte sequence in the middle. */
    { -1, "\xED\x85\x9C\xED\x15\x9C\xED\x85\x9C" },
  };
  const gsize n_vectors = G_N_ELEMENTS (test_vectors);
  gsize i = 0;

  while (i < n_vectors)
    {
      /* Log the index first so a failing vector can be identified. */
      g_test_message ("Invalid UTF-8 vector %" G_GSIZE_FORMAT, i);

      g_assert_null (g_utf8_normalize (test_vectors[i].str,
                                       test_vectors[i].max_len,
                                       G_NORMALIZE_ALL));
      i++;
    }
}
/* Test program entry point: initialise the GLib test framework, register
 * the Unicode normalization test cases, and return the result of the run. */
int
main (int argc, char *argv[])
{
  int status;

  g_test_init (&argc, &argv, NULL);

  /* Register each test case under its path. */
  g_test_add_func ("/unicode/normalize",
                   test_unicode_normalize);
  g_test_add_func ("/unicode/normalize-invalid",
                   test_unicode_normalize_invalid);

  status = g_test_run ();

  return status;
}