Merge branch 'wip/chergert/c-utf8' into 'main'

glib/utf8: Use SIMD for UTF-8 validation

Closes #3481

See merge request GNOME/glib!4319
This commit is contained in:
Philip Withnall 2024-10-03 15:07:17 +00:00
commit fb4f2e5578
4 changed files with 308 additions and 186 deletions

View File

@ -1604,29 +1604,6 @@ g_ascii_strup (const gchar *str,
return result;
}
/**
 * g_str_is_ascii:
 * @str: a string
 *
 * Checks whether @str consists solely of ASCII bytes, that is, whether
 * no byte before the terminating nul has its high bit set.
 *
 * Returns: true if @str is ASCII
 *
 * Since: 2.40
 */
gboolean
g_str_is_ascii (const gchar *str)
{
  const gchar *cursor;

  /* Walk up to the terminating nul; the first byte with bit 7 set decides. */
  for (cursor = str; *cursor != '\0'; cursor++)
    {
      if ((*cursor & 0x80) != 0)
        return FALSE;
    }

  return TRUE;
}
/**
* g_strdown:
* @string: the string to convert

View File

@ -1,7 +1,8 @@
/* gutf8.c - Operations on UTF-8 strings.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
* Copyright (C) 2000, 2015-2022 Red Hat, Inc.
* Copyright (C) 2022-2023 David Rheinsberg
*
* SPDX-License-Identifier: LGPL-2.1-or-later
*
@ -1574,166 +1575,255 @@ g_ucs4_to_utf16 (const gunichar *str,
return result;
}
/* Jumps to the `error` label of the enclosing function unless the byte that
 * `p` currently points at, masked with @mask, equals @expect. Both `p` and
 * `error` must exist at the place where the macro is expanded. */
#define VALIDATE_BYTE(mask, expect)                               \
  G_STMT_START {                                                  \
    const guchar validate_byte_ = *(const guchar *) p;            \
    if (G_UNLIKELY ((validate_byte_ & (mask)) != (expect)))       \
      goto error;                                                 \
  } G_STMT_END
/* SIMD-based UTF-8 validation originates in the c-utf8 project from
* https://github.com/c-util/c-utf8/ from the following authors:
*
* David Rheinsberg <david@readahead.eu>
* Evgeny Vereshchagin <evvers@ya.ru>
* Jan Engelhardt <jengelh@inai.de>
* Tom Gundersen <teg@jklm.no>
*
* It has been adapted for portability and integration.
* The original code is dual-licensed Apache-2.0 or LGPLv2.1+
*/
/* NOTE(review): this span comes from a rendered diff without +/- markers.
 * It appears to interleave the removed fast_validate() implementation with
 * the newly added align_to() macro and load_u8() helper, which share the
 * opening and closing braces below. It is not compilable as shown; confirm
 * against the real file before editing. */
/* see IETF RFC 3629 Section 4 */
static const gchar *
fast_validate (const char *str)
/* Rounds _val up to a multiple of _to. The mask arithmetic only gives that
 * result when _to is a power of two. */
#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1))
/* load_u8: returns the byte located @offset bytes past @memory, read as an
 * unsigned 8-bit value (see the final return statement of this span). */
static inline guint8
load_u8 (gconstpointer memory,
gsize offset)
{
const gchar *p;
for (p = str; *p; p++)
{
if (*(guchar *)p < 128)
/* done */;
else
{
const gchar *last;
last = p;
if (*(guchar *)p < 0xe0) /* 110xxxxx */
{
if (G_UNLIKELY (*(guchar *)p < 0xc2))
goto error;
}
else
{
if (*(guchar *)p < 0xf0) /* 1110xxxx */
{
switch (*(guchar *)p++ & 0x0f)
{
case 0:
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
break;
case 0x0d:
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
}
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
{
switch (*(guchar *)p++ & 0x07)
{
case 0:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
goto error;
break;
case 4:
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
else
goto error;
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
continue;
error:
return last;
}
}
return p;
return ((const guint8 *)memory)[offset];
}
static const gchar *
fast_validate_len (const char *str,
gssize max_len)
/* _attribute_aligned(n): alignment annotation spelled for the compiler in
 * use. GCC >= 4.8 and clang get __attribute__((aligned(n))), MSVC gets
 * __declspec(align(n)), and any other compiler gets an empty expansion.
 * The GCC/clang test comes first, so a compiler that defines both __clang__
 * and _MSC_VER takes the __attribute__ spelling. */
#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__)
# define _attribute_aligned(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
# define _attribute_aligned(n) __declspec(align(n))
#else
# define _attribute_aligned(n)
#endif
/* load_word: builds one gsize from the bytes starting @offset bytes past
 * @memory, placing byte 0 in the least significant bits. Eight bytes are
 * combined when GLIB_SIZEOF_VOID_P == 8, four otherwise.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers.
 * The lines that mention `p`, `str` and `max_len` do not match this
 * function's parameters and appear to be leftovers of the removed
 * fast_validate_len(); confirm against the real file. */
static inline gsize
load_word (gconstpointer memory,
gsize offset)
{
const gchar *p;
#if GLIB_SIZEOF_VOID_P == 8
_attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset;
g_assert (max_len >= 0);
return ((guint64)m[0] << 0) | ((guint64)m[1] << 8) |
((guint64)m[2] << 16) | ((guint64)m[3] << 24) |
((guint64)m[4] << 32) | ((guint64)m[5] << 40) |
((guint64)m[6] << 48) | ((guint64)m[7] << 56);
#else
_attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset;
for (p = str; ((p - str) < max_len) && *p; p++)
return ((guint)m[0] << 0) | ((guint)m[1] << 8) |
((guint)m[2] << 16) | ((guint)m[3] << 24);
#endif
}
/* Byte patterns replicated across a whole machine word. On 32-bit machines
 * the cast to gsize truncates the constants. */
#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L)
#define UTF8_ASCII_SUB ((gsize)0x0101010101010101L)

/* Returns non-zero only if no byte of @word is NUL and no byte of @word has
 * its most significant bit set. */
static inline int
utf8_word_is_ascii (gsize word)
{
  /* Subtracting one from every byte sets the top bit of a zero byte;
   * OR-ing the word back in also keeps top bits that were already set. */
  const gsize minus_one = word - UTF8_ASCII_SUB;
  const gsize high_bits = (minus_one | word) & UTF8_ASCII_MASK;

  return high_bits == 0;
}
/* utf8_verify_ascii: advances *@strp over a run of bytes below 128.
 *
 * The scan stops at the first NUL byte, at the first byte >= 128, or once
 * the length budget is used up. The budget is *@lenp, or (gsize)-1 when
 * @lenp is NULL. On return *@strp points at the stopping position and, if
 * @lenp is non-NULL, *@lenp holds the bytes left in the budget.
 *
 * When the cursor is aligned to sizeof (gsize), two words are tested per
 * iteration with utf8_word_is_ascii(); the remainder is handled one byte at
 * a time.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers.
 * The lines that mention `p`, `last`, `max_len` or `error` do not match this
 * function's variables and appear to be leftovers of the removed
 * fast_validate_len(); confirm against the real file. */
static void
utf8_verify_ascii (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
while (len > 0 && load_u8 (str, 0) < 128)
{
if (*(guchar *)p < 128)
/* done */;
else
if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str)
{
const gchar *last;
last = p;
if (*(guchar *)p < 0xe0) /* 110xxxxx */
while (len >= 2 * sizeof (gsize))
{
if (G_UNLIKELY (max_len - (p - str) < 2))
goto error;
if (!utf8_word_is_ascii (load_word (str, 0)) ||
!utf8_word_is_ascii (load_word (str, sizeof (gsize))))
break;
if (G_UNLIKELY (*(guchar *)p < 0xc2))
goto error;
str += 2 * sizeof(gsize);
len -= 2 * sizeof(gsize);
}
while (len > 0 && load_u8 (str, 0) < 128)
{
if G_UNLIKELY (load_u8 (str, 0) == 0x00)
goto out;
++str;
--len;
}
}
else
{
if (*(guchar *)p < 0xf0) /* 1110xxxx */
if G_UNLIKELY (load_u8 (str, 0) == 0x00)
goto out;
++str;
--len;
}
}
out:
*strp = str;
if (lenp)
*lenp = len;
}
/* True when _x has the bit pattern 10xxxxxx, i.e. lies in 0x80..0xBF. */
#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80)
/* utf8_verify: advances *@strp over as many complete byte sequences as the
 * checks below accept.
 *
 * The length budget is *@lenp, or (gsize)-1 when @lenp is NULL. The scan
 * stops at the first NUL byte, at the first sequence that fails a check or
 * does not fit in the remaining budget, or when the budget reaches zero.
 * On return *@strp points at the stopping position and, if @lenp is
 * non-NULL, *@lenp holds the bytes left in the budget.
 *
 * Accepted lead bytes and the range required of the byte that follows:
 *   0x01..0x7F  handed to utf8_verify_ascii()
 *   0xC2..0xDF  tail
 *   0xE0        0xA0..0xBF, then tail
 *   0xE1..0xEC  tail, tail
 *   0xED        0x80..0x9F, then tail
 *   0xEE..0xEF  tail, tail
 *   0xF0        0x90..0xBF, then tail, tail
 *   0xF1..0xF3  tail, tail, tail
 *   0xF4        0x80..0x8F, then tail, tail
 * Any other lead byte ends the scan.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers.
 * The lines that mention `p`, `last`, `max_len`, `error` or VALIDATE_BYTE
 * do not match this function's variables and appear to be leftovers of the
 * removed fast_validate_len(); confirm against the real file. */
static void
utf8_verify (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
/* See Unicode 10.0.0, Chapter 3, Section D92 */
while (len > 0)
{
if (G_UNLIKELY (max_len - (p - str) < 3))
goto error;
guint8 b = load_u8 (str, 0);
switch (*(guchar *)p++ & 0x0f)
if (b == 0x00)
goto out;
else if (b <= 0x7F)
{
case 0:
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
break;
case 0x0d:
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
/*
* Special-case and optimize the ASCII case.
*/
utf8_verify_ascii ((const char **)&str, &len);
}
}
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
else if (b >= 0xC2 && b <= 0xDF)
{
if (G_UNLIKELY (max_len - (p - str) < 4))
goto error;
if G_UNLIKELY (len < 2)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
switch (*(guchar *)p++ & 0x07)
str += 2;
len -= 2;
}
else if (b == 0xE0)
{
case 0:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
goto error;
break;
case 4:
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
else
goto error;
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
str += 3;
len -= 3;
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
else if (b >= 0xE1 && b <= 0xEC)
{
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
continue;
error:
return last;
}
str += 3;
len -= 3;
}
return p;
else if (b == 0xED)
{
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
str += 3;
len -= 3;
}
else if (b >= 0xEE && b <= 0xEF)
{
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
str += 3;
len -= 3;
}
else if (b == 0xF0)
{
if G_UNLIKELY (len < 4)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
goto out;
str += 4;
len -= 4;
}
else if (b >= 0xF1 && b <= 0xF3)
{
if G_UNLIKELY (len < 4)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
goto out;
str += 4;
len -= 4;
}
else if (b == 0xF4)
{
if G_UNLIKELY (len < 4)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
goto out;
str += 4;
len -= 4;
}
else goto out;
}
out:
*strp = str;
if (lenp)
*lenp = len;
}
/**
@ -1768,20 +1858,15 @@ g_utf8_validate (const char *str,
const gchar **end)
{
const gchar *p;
if (max_len >= 0)
return g_utf8_validate_len (str, max_len, end);
p = fast_validate (str);
utf8_verify (&str, NULL);
if (end)
*end = p;
if (end != NULL)
*end = str;
if (*p != '\0')
return FALSE;
else
return TRUE;
return *str == 0;
}
/**
@ -1804,17 +1889,31 @@ g_utf8_validate_len (const char *str,
const gchar **end)
{
const gchar *p;
utf8_verify (&str, &max_len);
p = fast_validate_len (str, max_len);
if (end != NULL)
*end = str;
if (end)
*end = p;
return max_len == 0;
}
if (p != str + max_len)
return FALSE;
else
return TRUE;
/**
 * g_str_is_ascii:
 * @str: a string
 *
 * Checks whether @str is pure ASCII, meaning that none of its bytes
 * has the high bit set.
 *
 * Returns: true if @str is ASCII
 *
 * Since: 2.40
 */
gboolean
g_str_is_ascii (const gchar *str)
{
  const gchar *cursor = str;

  /* The scanner leaves the cursor on the first byte it did not accept;
   * the string is ASCII only if that byte is the terminating nul. */
  utf8_verify_ascii (&cursor, NULL);

  return *cursor == 0;
}
/**

View File

@ -2719,6 +2719,27 @@ test_set_str (void)
g_free (str);
}
/* Checks g_str_is_ascii() against strings that are pure ASCII and against
 * strings that each contain at least one byte with the high bit set. */
static void
test_str_is_ascii (void)
{
  /* Every byte is below 0x80; the empty string is included on purpose. */
  const char *ascii_strings[] = {
    "",
    "hello",
    "is it me you're looking for",
  };
  /* Each entry contains at least one multi-byte UTF-8 character. The first
   * entry differs from its ASCII twin above only in the apostrophe, which
   * must be U+2019 RIGHT SINGLE QUOTATION MARK; with that character missing
   * the string is plain ASCII and the assertion below fails. */
  const char *non_ascii_strings[] = {
    "is it me you’re looking for",
    "áccents",
    "☺️",
  };

  for (size_t i = 0; i < G_N_ELEMENTS (ascii_strings); i++)
    g_assert_true (g_str_is_ascii (ascii_strings[i]));

  for (size_t i = 0; i < G_N_ELEMENTS (non_ascii_strings); i++)
    g_assert_false (g_str_is_ascii (non_ascii_strings[i]));
}
int
main (int argc,
char *argv[])
@ -2775,6 +2796,7 @@ main (int argc,
g_test_add_func ("/strfuncs/test-is-to-digit", test_is_to_digit);
g_test_add_func ("/strfuncs/transliteration", test_transliteration);
g_test_add_func ("/strfuncs/str-equal", test_str_equal);
g_test_add_func ("/strfuncs/str-is-ascii", test_str_is_ascii);
return g_test_run();
}

View File

@ -81,8 +81,9 @@ static Test global_test[] = {
{ "\xed\x9f\xbf", -1, 3, TRUE },
{ "\xee\x80\x80", -1, 3, TRUE },
{ "\xef\xbf\xbd", -1, 3, TRUE },
{ "\xf1\x80\x80\x80", -1, 4, TRUE },
{ "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
{ "\xf4\x90\x80\x80", -1, 0, FALSE },
{ "\xf4\x90\x80\x80", -1, 0, FALSE }, /* bigger than U+10FFFF */
/* malformed sequences */
/* continuation bytes */
{ "\x80", -1, 0, FALSE },
@ -94,6 +95,18 @@ static Test global_test[] = {
{ "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
{ "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
{ "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
{ "\xe0\xa0\x20", -1, 0, FALSE },
{ "\xe1\x80\x20", -1, 0, FALSE },
{ "\xed\x80\x20", -1, 0, FALSE },
{ "\xf0\xc0\x80\x80", -1, 0, FALSE },
{ "\xf0\x90\x20\x80", -1, 0, FALSE },
{ "\xf0\x90\x80\x20", -1, 0, FALSE },
{ "\xf1\x20\x80\x80", -1, 0, FALSE },
{ "\xf1\x80\x20\x80", -1, 0, FALSE },
{ "\xf1\x80\x80\x20", -1, 0, FALSE },
{ "\xf4\x7f\x80\x80", -1, 0, FALSE },
{ "\xf4\x80\x20\x80", -1, 0, FALSE },
{ "\xf4\x80\x80\x20", -1, 0, FALSE },
/* all possible continuation byte */
{ "\x80", -1, 0, FALSE },
@ -253,6 +266,9 @@ static Test global_test[] = {
{ "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
{ "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
{ "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
{ "\xe0\x9f\x80", -1, 0, FALSE },
{ "\xe0\xc0\x80", -1, 0, FALSE },
{ "\xf0\x8f\x80\x80", -1, 0, FALSE },
/* illegal code positions */
{ "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
{ "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
@ -270,6 +286,14 @@ static Test global_test[] = {
{ "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
{ "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
/* ASCII boundaries */
{ "\x00", 1, 0, FALSE },
{ "\x01", -1, 1, TRUE },
{ "\x02", -1, 1, TRUE },
{ "\x7d", -1, 1, TRUE },
{ "\x7e", -1, 1, TRUE },
{ "\x7f", -1, 1, TRUE },
{ NULL, 0, 0, 0 }
};