glib/utf8: Use SIMD for UTF-8 validation

This is based on the https://github.com/c-util/c-utf8 project and has
been adapted for portability and integration into GLib. c-utf8 is dual
licensed Apache-2.0 and LGPLv2.1+, the latter matching GLib.

Notably, `case 0x01 ... 0x7F:` style switch/case labels have been
converted to if/else which is more portable to non-GCC/Clang platforms
while generating the same assembly, at least on x86_64 with GCC.

Additionally, `__attribute__((aligned(n)))` is used in favor of
`__builtin_assume_aligned(n)` because it is more portable to MSVC's
`__declspec(align(n))` and also generates the same assembly as GCC's
`__builtin_assume_aligned(n)`.

For GCC x86_64 Linux on a Xeon 4214 this improved the throughput of
g_utf8_validate() for ASCII from 750MB/s to around 10,000MB/s (13x).

On GCC aarch64 Linux with an Apple Silicon M2 Pro we go from about
2,200 MB/s to 26,700 MB/s (12x).

Closes: #3481
This commit is contained in:
Christian Hergert 2024-09-30 11:28:21 -07:00
parent c96cd22cf9
commit 1d3d7336ed

View File

@ -1,7 +1,8 @@
/* gutf8.c - Operations on UTF-8 strings.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
* Copyright (C) 2000, 2015-2022 Red Hat, Inc.
* Copyright (C) 2022-2023 David Rheinsberg
*
* SPDX-License-Identifier: LGPL-2.1-or-later
*
@ -1565,166 +1566,255 @@ g_ucs4_to_utf16 (const gunichar *str,
return result;
}
#define VALIDATE_BYTE(mask, expect) \
G_STMT_START { \
if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
goto error; \
} G_STMT_END
/* SIMD-based UTF-8 validation originates in the c-utf8 project from
* https://github.com/c-util/c-utf8/ from the following authors:
*
* David Rheinsberg <david@readahead.eu>
* Evgeny Vereshchagin <evvers@ya.ru>
* Jan Engelhardt <jengelh@inai.de>
* Tom Gundersen <teg@jklm.no>
*
* It has been adapted for portability and integration.
* The original code is dual-licensed Apache-2.0 or LGPLv2.1+
*/
/* see IETF RFC 3629 Section 4 */
static const gchar *
fast_validate (const char *str)
#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1))
static inline guint8
load_u8 (gconstpointer memory,
gsize offset)
{
const gchar *p;
for (p = str; *p; p++)
{
if (*(guchar *)p < 128)
/* done */;
else
{
const gchar *last;
last = p;
if (*(guchar *)p < 0xe0) /* 110xxxxx */
{
if (G_UNLIKELY (*(guchar *)p < 0xc2))
goto error;
}
else
{
if (*(guchar *)p < 0xf0) /* 1110xxxx */
{
switch (*(guchar *)p++ & 0x0f)
{
case 0:
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
break;
case 0x0d:
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
}
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
{
switch (*(guchar *)p++ & 0x07)
{
case 0:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
goto error;
break;
case 4:
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
else
goto error;
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
continue;
error:
return last;
}
}
return p;
return ((const guint8 *)memory)[offset];
}
static const gchar *
fast_validate_len (const char *str,
gssize max_len)
#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__)
# define _attribute_aligned(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
# define _attribute_aligned(n) __declspec(align(n))
#else
# define _attribute_aligned(n)
#endif
static inline gsize
load_word (gconstpointer memory,
gsize offset)
{
const gchar *p;
#if GLIB_SIZEOF_VOID_P == 8
_attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset;
g_assert (max_len >= 0);
return ((guint64)m[0] << 0) | ((guint64)m[1] << 8) |
((guint64)m[2] << 16) | ((guint64)m[3] << 24) |
((guint64)m[4] << 32) | ((guint64)m[5] << 40) |
((guint64)m[6] << 48) | ((guint64)m[7] << 56);
#else
_attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset;
for (p = str; ((p - str) < max_len) && *p; p++)
return ((guint)m[0] << 0) | ((guint)m[1] << 8) |
((guint)m[2] << 16) | ((guint)m[3] << 24);
#endif
}
/* The following constants are truncated on 32-bit machines */
#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L)
#define UTF8_ASCII_SUB ((gsize)0x0101010101010101L)
static inline int
utf8_word_is_ascii (gsize word)
{
/* True unless any byte is NULL or has the MSB set. */
return ((((word - UTF8_ASCII_SUB) | word) & UTF8_ASCII_MASK) == 0);
}
static void
utf8_verify_ascii (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
while (len > 0 && load_u8 (str, 0) < 128)
{
if (*(guchar *)p < 128)
/* done */;
else
if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str)
{
const gchar *last;
last = p;
if (*(guchar *)p < 0xe0) /* 110xxxxx */
while (len >= 2 * sizeof (gsize))
{
if (G_UNLIKELY (max_len - (p - str) < 2))
goto error;
if (!utf8_word_is_ascii (load_word (str, 0)) ||
!utf8_word_is_ascii (load_word (str, sizeof (gsize))))
break;
if (G_UNLIKELY (*(guchar *)p < 0xc2))
goto error;
str += 2 * sizeof(gsize);
len -= 2 * sizeof(gsize);
}
while (len > 0 && load_u8 (str, 0) < 128)
{
if G_UNLIKELY (load_u8 (str, 0) == 0x00)
goto out;
++str;
--len;
}
}
else
{
if (*(guchar *)p < 0xf0) /* 1110xxxx */
if G_UNLIKELY (load_u8 (str, 0) == 0x00)
goto out;
++str;
--len;
}
}
out:
*strp = str;
if (lenp)
*lenp = len;
}
#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80)
static void
utf8_verify (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
/* See Unicode 10.0.0, Chapter 3, Section D92 */
while (len > 0)
{
if (G_UNLIKELY (max_len - (p - str) < 3))
goto error;
guint8 b = load_u8 (str, 0);
switch (*(guchar *)p++ & 0x0f)
if (b == 0x00)
goto out;
else if (b >= 0x01 && b <= 0x7F)
{
case 0:
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
break;
case 0x0d:
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
/*
* Special-case and optimize the ASCII case.
*/
utf8_verify_ascii ((const char **)&str, &len);
}
}
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
else if (b >= 0xC2 && b <= 0xDF)
{
if (G_UNLIKELY (max_len - (p - str) < 4))
goto error;
if G_UNLIKELY (len < 2)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
switch (*(guchar *)p++ & 0x07)
str += 2;
len -= 2;
}
else if (b == 0xE0)
{
case 0:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
goto error;
break;
case 4:
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
break;
default:
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
else
goto error;
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
str += 3;
len -= 3;
}
p++;
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
else if (b >= 0xE1 && b <= 0xEC)
{
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
continue;
error:
return last;
}
str += 3;
len -= 3;
}
return p;
else if (b == 0xED)
{
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
str += 3;
len -= 3;
}
else if (b >= 0xEE && b <= 0xEF)
{
if G_UNLIKELY (len < 3)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
str += 3;
len -= 3;
}
else if (b == 0xF0)
{
if G_UNLIKELY (len < 4)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
goto out;
str += 4;
len -= 4;
}
else if (b >= 0xF1 && b <= 0xF3)
{
if G_UNLIKELY (len < 4)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
goto out;
str += 4;
len -= 4;
}
else if (b == 0xF4)
{
if G_UNLIKELY (len < 4)
goto out;
if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F)
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
goto out;
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
goto out;
str += 4;
len -= 4;
}
else goto out;
}
out:
*strp = str;
if (lenp)
*lenp = len;
}
/**
@ -1757,20 +1847,15 @@ g_utf8_validate (const char *str,
const gchar **end)
{
const gchar *p;
if (max_len >= 0)
return g_utf8_validate_len (str, max_len, end);
p = fast_validate (str);
utf8_verify (&str, NULL);
if (end)
*end = p;
if (end != NULL)
*end = str;
if (*p != '\0')
return FALSE;
else
return TRUE;
return *str == 0;
}
/**
@ -1793,17 +1878,12 @@ g_utf8_validate_len (const char *str,
const gchar **end)
{
const gchar *p;
utf8_verify (&str, &max_len);
p = fast_validate_len (str, max_len);
if (end != NULL)
*end = str;
if (end)
*end = p;
if (p != str + max_len)
return FALSE;
else
return TRUE;
return max_len == 0;
}
/**