mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-02-02 17:26:17 +01:00
glib/utf8: Use SIMD for UTF-8 validation
This is based on the https://github.com/c-util/c-utf8 project and has been adapted for portability and integration into GLib. c-utf8 is dual-licensed under Apache-2.0 and LGPLv2.1+, the latter matching GLib. Notably, `case 0x01 ... 0x7F:` style switch/case labels (case ranges are a GNU extension) have been converted to if/else chains, which are more portable to compilers other than GCC/Clang while generating the same assembly, at least on x86_64 with GCC. Additionally, `__attribute__((aligned(n)))` is used instead of `__builtin_assume_aligned(n)` because it has a direct MSVC counterpart, `__declspec(align(n))`, and generates the same assembly as GCC's `__builtin_assume_aligned(n)`. For GCC x86_64 Linux on a Xeon 4214 this improved the throughput of g_utf8_validate() for ASCII from 750 MB/s to around 10,000 MB/s (13x). On GCC aarch64 Linux with an Apple Silicon M2 Pro we go from about 2,200 MB/s to 26,700 MB/s (12x). Closes: #3481
This commit is contained in:
parent
c96cd22cf9
commit
1d3d7336ed
382
glib/gutf8.c
382
glib/gutf8.c
@ -1,7 +1,8 @@
|
||||
/* gutf8.c - Operations on UTF-8 strings.
|
||||
*
|
||||
* Copyright (C) 1999 Tom Tromey
|
||||
* Copyright (C) 2000 Red Hat, Inc.
|
||||
* Copyright (C) 2000, 2015-2022 Red Hat, Inc.
|
||||
* Copyright (C) 2022-2023 David Rheinsberg
|
||||
*
|
||||
* SPDX-License-Identifier: LGPL-2.1-or-later
|
||||
*
|
||||
@ -1565,166 +1566,255 @@ g_ucs4_to_utf16 (const gunichar *str,
|
||||
return result;
|
||||
}
|
||||
|
||||
#define VALIDATE_BYTE(mask, expect) \
|
||||
G_STMT_START { \
|
||||
if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
|
||||
goto error; \
|
||||
} G_STMT_END
|
||||
/* SIMD-based UTF-8 validation originates in the c-utf8 project from
|
||||
* https://github.com/c-util/c-utf8/ from the following authors:
|
||||
*
|
||||
* David Rheinsberg <david@readahead.eu>
|
||||
* Evgeny Vereshchagin <evvers@ya.ru>
|
||||
* Jan Engelhardt <jengelh@inai.de>
|
||||
* Tom Gundersen <teg@jklm.no>
|
||||
*
|
||||
* It has been adapted for portability and integration.
|
||||
* The original code is dual-licensed Apache-2.0 or LGPLv2.1+
|
||||
*/
|
||||
|
||||
/* see IETF RFC 3629 Section 4 */
|
||||
|
||||
static const gchar *
|
||||
fast_validate (const char *str)
|
||||
#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1))
|
||||
|
||||
static inline guint8
|
||||
load_u8 (gconstpointer memory,
|
||||
gsize offset)
|
||||
{
|
||||
const gchar *p;
|
||||
|
||||
for (p = str; *p; p++)
|
||||
{
|
||||
if (*(guchar *)p < 128)
|
||||
/* done */;
|
||||
else
|
||||
{
|
||||
const gchar *last;
|
||||
|
||||
last = p;
|
||||
if (*(guchar *)p < 0xe0) /* 110xxxxx */
|
||||
{
|
||||
if (G_UNLIKELY (*(guchar *)p < 0xc2))
|
||||
goto error;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*(guchar *)p < 0xf0) /* 1110xxxx */
|
||||
{
|
||||
switch (*(guchar *)p++ & 0x0f)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
|
||||
break;
|
||||
case 0x0d:
|
||||
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
}
|
||||
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
|
||||
{
|
||||
switch (*(guchar *)p++ & 0x07)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
|
||||
goto error;
|
||||
break;
|
||||
case 4:
|
||||
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
}
|
||||
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
|
||||
continue;
|
||||
|
||||
error:
|
||||
return last;
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
return ((const guint8 *)memory)[offset];
|
||||
}
|
||||
|
||||
static const gchar *
|
||||
fast_validate_len (const char *str,
|
||||
gssize max_len)
|
||||
#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__)
|
||||
# define _attribute_aligned(n) __attribute__((aligned(n)))
|
||||
#elif defined(_MSC_VER)
|
||||
# define _attribute_aligned(n) __declspec(align(n))
|
||||
#else
|
||||
# define _attribute_aligned(n)
|
||||
#endif
|
||||
|
||||
static inline gsize
|
||||
load_word (gconstpointer memory,
|
||||
gsize offset)
|
||||
{
|
||||
const gchar *p;
|
||||
#if GLIB_SIZEOF_VOID_P == 8
|
||||
_attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset;
|
||||
|
||||
g_assert (max_len >= 0);
|
||||
return ((guint64)m[0] << 0) | ((guint64)m[1] << 8) |
|
||||
((guint64)m[2] << 16) | ((guint64)m[3] << 24) |
|
||||
((guint64)m[4] << 32) | ((guint64)m[5] << 40) |
|
||||
((guint64)m[6] << 48) | ((guint64)m[7] << 56);
|
||||
#else
|
||||
_attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset;
|
||||
|
||||
for (p = str; ((p - str) < max_len) && *p; p++)
|
||||
return ((guint)m[0] << 0) | ((guint)m[1] << 8) |
|
||||
((guint)m[2] << 16) | ((guint)m[3] << 24);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* The following constants are truncated on 32-bit machines */
|
||||
#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L)
|
||||
#define UTF8_ASCII_SUB ((gsize)0x0101010101010101L)
|
||||
|
||||
static inline int
|
||||
utf8_word_is_ascii (gsize word)
|
||||
{
|
||||
/* True only if every byte of word is in 0x01..0x7F: a NUL (0x00) byte underflows in the subtraction and so gets its MSB set, while a non-ASCII byte already has its MSB set in word itself. */
|
||||
return ((((word - UTF8_ASCII_SUB) | word) & UTF8_ASCII_MASK) == 0);
|
||||
}
|
||||
|
||||
static void
|
||||
utf8_verify_ascii (const char **strp,
|
||||
gsize *lenp)
|
||||
{
|
||||
const char *str = *strp;
|
||||
gsize len = lenp ? *lenp : (gsize)-1;
|
||||
|
||||
while (len > 0 && load_u8 (str, 0) < 128)
|
||||
{
|
||||
if (*(guchar *)p < 128)
|
||||
/* done */;
|
||||
else
|
||||
if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str)
|
||||
{
|
||||
const gchar *last;
|
||||
|
||||
last = p;
|
||||
if (*(guchar *)p < 0xe0) /* 110xxxxx */
|
||||
while (len >= 2 * sizeof (gsize))
|
||||
{
|
||||
if (G_UNLIKELY (max_len - (p - str) < 2))
|
||||
goto error;
|
||||
if (!utf8_word_is_ascii (load_word (str, 0)) ||
|
||||
!utf8_word_is_ascii (load_word (str, sizeof (gsize))))
|
||||
break;
|
||||
|
||||
if (G_UNLIKELY (*(guchar *)p < 0xc2))
|
||||
goto error;
|
||||
str += 2 * sizeof(gsize);
|
||||
len -= 2 * sizeof(gsize);
|
||||
}
|
||||
|
||||
while (len > 0 && load_u8 (str, 0) < 128)
|
||||
{
|
||||
if G_UNLIKELY (load_u8 (str, 0) == 0x00)
|
||||
goto out;
|
||||
|
||||
++str;
|
||||
--len;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*(guchar *)p < 0xf0) /* 1110xxxx */
|
||||
if G_UNLIKELY (load_u8 (str, 0) == 0x00)
|
||||
goto out;
|
||||
|
||||
++str;
|
||||
--len;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
*strp = str;
|
||||
|
||||
if (lenp)
|
||||
*lenp = len;
|
||||
}
|
||||
|
||||
#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80)
|
||||
|
||||
static void
|
||||
utf8_verify (const char **strp,
|
||||
gsize *lenp)
|
||||
{
|
||||
const char *str = *strp;
|
||||
gsize len = lenp ? *lenp : (gsize)-1;
|
||||
|
||||
/* See Unicode 10.0.0, Chapter 3, Section D92 */
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
if (G_UNLIKELY (max_len - (p - str) < 3))
|
||||
goto error;
|
||||
guint8 b = load_u8 (str, 0);
|
||||
|
||||
switch (*(guchar *)p++ & 0x0f)
|
||||
if (b == 0x00)
|
||||
goto out;
|
||||
|
||||
else if (b >= 0x01 && b <= 0x7F)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
|
||||
break;
|
||||
case 0x0d:
|
||||
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
/*
|
||||
* Special-case and optimize the ASCII case.
|
||||
*/
|
||||
utf8_verify_ascii ((const char **)&str, &len);
|
||||
}
|
||||
}
|
||||
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
|
||||
|
||||
else if (b >= 0xC2 && b <= 0xDF)
|
||||
{
|
||||
if (G_UNLIKELY (max_len - (p - str) < 4))
|
||||
goto error;
|
||||
if G_UNLIKELY (len < 2)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
|
||||
goto out;
|
||||
|
||||
switch (*(guchar *)p++ & 0x07)
|
||||
str += 2;
|
||||
len -= 2;
|
||||
|
||||
}
|
||||
|
||||
else if (b == 0xE0)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
|
||||
goto error;
|
||||
break;
|
||||
case 4:
|
||||
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
if G_UNLIKELY (len < 3)
|
||||
goto out;
|
||||
if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
|
||||
str += 3;
|
||||
len -= 3;
|
||||
}
|
||||
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
else if (b >= 0xE1 && b <= 0xEC)
|
||||
{
|
||||
if G_UNLIKELY (len < 3)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
|
||||
continue;
|
||||
|
||||
error:
|
||||
return last;
|
||||
}
|
||||
str += 3;
|
||||
len -= 3;
|
||||
}
|
||||
|
||||
return p;
|
||||
else if (b == 0xED)
|
||||
{
|
||||
if G_UNLIKELY (len < 3)
|
||||
goto out;
|
||||
if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
|
||||
str += 3;
|
||||
len -= 3;
|
||||
}
|
||||
|
||||
else if (b >= 0xEE && b <= 0xEF)
|
||||
{
|
||||
if G_UNLIKELY (len < 3)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
|
||||
str += 3;
|
||||
len -= 3;
|
||||
}
|
||||
|
||||
else if (b == 0xF0)
|
||||
{
|
||||
if G_UNLIKELY (len < 4)
|
||||
goto out;
|
||||
if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
|
||||
goto out;
|
||||
|
||||
str += 4;
|
||||
len -= 4;
|
||||
}
|
||||
|
||||
else if (b >= 0xF1 && b <= 0xF3)
|
||||
{
|
||||
if G_UNLIKELY (len < 4)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
|
||||
goto out;
|
||||
|
||||
str += 4;
|
||||
len -= 4;
|
||||
}
|
||||
|
||||
else if (b == 0xF4)
|
||||
{
|
||||
if G_UNLIKELY (len < 4)
|
||||
goto out;
|
||||
if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F)
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
|
||||
goto out;
|
||||
if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
|
||||
goto out;
|
||||
|
||||
str += 4;
|
||||
len -= 4;
|
||||
}
|
||||
|
||||
else goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
*strp = str;
|
||||
|
||||
if (lenp)
|
||||
*lenp = len;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1757,20 +1847,15 @@ g_utf8_validate (const char *str,
|
||||
const gchar **end)
|
||||
|
||||
{
|
||||
const gchar *p;
|
||||
|
||||
if (max_len >= 0)
|
||||
return g_utf8_validate_len (str, max_len, end);
|
||||
|
||||
p = fast_validate (str);
|
||||
utf8_verify (&str, NULL);
|
||||
|
||||
if (end)
|
||||
*end = p;
|
||||
if (end != NULL)
|
||||
*end = str;
|
||||
|
||||
if (*p != '\0')
|
||||
return FALSE;
|
||||
else
|
||||
return TRUE;
|
||||
return *str == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1793,17 +1878,12 @@ g_utf8_validate_len (const char *str,
|
||||
const gchar **end)
|
||||
|
||||
{
|
||||
const gchar *p;
|
||||
utf8_verify (&str, &max_len);
|
||||
|
||||
p = fast_validate_len (str, max_len);
|
||||
if (end != NULL)
|
||||
*end = str;
|
||||
|
||||
if (end)
|
||||
*end = p;
|
||||
|
||||
if (p != str + max_len)
|
||||
return FALSE;
|
||||
else
|
||||
return TRUE;
|
||||
return max_len == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user