diff --git a/glib/gstrfuncs.c b/glib/gstrfuncs.c index 33faa80c2..f2c80e4e9 100644 --- a/glib/gstrfuncs.c +++ b/glib/gstrfuncs.c @@ -1604,29 +1604,6 @@ g_ascii_strup (const gchar *str, return result; } -/** - * g_str_is_ascii: - * @str: a string - * - * Determines if a string is pure ASCII. A string is pure ASCII if it - * contains no bytes with the high bit set. - * - * Returns: true if @str is ASCII - * - * Since: 2.40 - */ -gboolean -g_str_is_ascii (const gchar *str) -{ - gsize i; - - for (i = 0; str[i]; i++) - if (str[i] & 0x80) - return FALSE; - - return TRUE; -} - /** * g_strdown: * @string: the string to convert diff --git a/glib/gutf8.c b/glib/gutf8.c index d26a63094..3aeca3d2f 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -1,7 +1,8 @@ /* gutf8.c - Operations on UTF-8 strings. * * Copyright (C) 1999 Tom Tromey - * Copyright (C) 2000 Red Hat, Inc. + * Copyright (C) 2000, 2015-2022 Red Hat, Inc. + * Copyright (C) 2022-2023 David Rheinsberg * * SPDX-License-Identifier: LGPL-2.1-or-later * @@ -1574,166 +1575,255 @@ g_ucs4_to_utf16 (const gunichar *str, return result; } -#define VALIDATE_BYTE(mask, expect) \ - G_STMT_START { \ - if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \ - goto error; \ - } G_STMT_END +/* SIMD-based UTF-8 validation originates in the c-utf8 project from + * https://github.com/c-util/c-utf8/ from the following authors: + * + * David Rheinsberg + * Evgeny Vereshchagin + * Jan Engelhardt + * Tom Gundersen + * + * It has been adapted for portability and integration. 
+ * The original code is dual-licensed Apache-2.0 or LGPLv2.1+ + */ -/* see IETF RFC 3629 Section 4 */ - -static const gchar * -fast_validate (const char *str) +#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1)) +static inline guint8 +load_u8 (gconstpointer memory, + gsize offset) { - const gchar *p; - - for (p = str; *p; p++) - { - if (*(guchar *)p < 128) - /* done */; - else - { - const gchar *last; - - last = p; - if (*(guchar *)p < 0xe0) /* 110xxxxx */ - { - if (G_UNLIKELY (*(guchar *)p < 0xc2)) - goto error; - } - else - { - if (*(guchar *)p < 0xf0) /* 1110xxxx */ - { - switch (*(guchar *)p++ & 0x0f) - { - case 0: - VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ - break; - case 0x0d: - VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - } - else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ - { - switch (*(guchar *)p++ & 0x07) - { - case 0: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) - goto error; - break; - case 4: - VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 
0x8f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - else - goto error; - } - - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - - continue; - - error: - return last; - } - } - - return p; + return ((const guint8 *)memory)[offset]; } -static const gchar * -fast_validate_len (const char *str, - gssize max_len) +#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__) +# define _attribute_aligned(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _attribute_aligned(n) __declspec(align(n)) +#else +# define _attribute_aligned(n) +#endif +static inline gsize +load_word (gconstpointer memory, + gsize offset) { - const gchar *p; +#if GLIB_SIZEOF_VOID_P == 8 + _attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset; - g_assert (max_len >= 0); + return ((guint64)m[0] << 0) | ((guint64)m[1] << 8) | + ((guint64)m[2] << 16) | ((guint64)m[3] << 24) | + ((guint64)m[4] << 32) | ((guint64)m[5] << 40) | + ((guint64)m[6] << 48) | ((guint64)m[7] << 56); +#else + _attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset; - for (p = str; ((p - str) < max_len) && *p; p++) + return ((guint)m[0] << 0) | ((guint)m[1] << 8) | + ((guint)m[2] << 16) | ((guint)m[3] << 24); +#endif +} + +/* The following constants are truncated on 32-bit machines */ +#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L) +#define UTF8_ASCII_SUB ((gsize)0x0101010101010101L) + +static inline int +utf8_word_is_ascii (gsize word) +{ + /* True unless any byte is NUL or has the MSB set. */ + return ((((word - UTF8_ASCII_SUB) | word) & UTF8_ASCII_MASK) == 0); +} + +static void +utf8_verify_ascii (const char **strp, + gsize *lenp) +{ + const char *str = *strp; + gsize len = lenp ?
*lenp : (gsize)-1; + + while (len > 0 && load_u8 (str, 0) < 128) { - if (*(guchar *)p < 128) - /* done */; - else - { - const gchar *last; + if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str) + { + while (len >= 2 * sizeof (gsize)) + { + if (!utf8_word_is_ascii (load_word (str, 0)) || + !utf8_word_is_ascii (load_word (str, sizeof (gsize)))) + break; - last = p; - if (*(guchar *)p < 0xe0) /* 110xxxxx */ - { - if (G_UNLIKELY (max_len - (p - str) < 2)) - goto error; - - if (G_UNLIKELY (*(guchar *)p < 0xc2)) - goto error; - } - else - { - if (*(guchar *)p < 0xf0) /* 1110xxxx */ - { - if (G_UNLIKELY (max_len - (p - str) < 3)) - goto error; + str += 2 * sizeof(gsize); + len -= 2 * sizeof(gsize); + } - switch (*(guchar *)p++ & 0x0f) - { - case 0: - VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ - break; - case 0x0d: - VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - } - else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ - { - if (G_UNLIKELY (max_len - (p - str) < 4)) - goto error; + while (len > 0 && load_u8 (str, 0) < 128) + { + if G_UNLIKELY (load_u8 (str, 0) == 0x00) + goto out; - switch (*(guchar *)p++ & 0x07) - { - case 0: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) - goto error; - break; - case 4: - VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 
0x8f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - else - goto error; - } + ++str; + --len; + } + } + else + { + if G_UNLIKELY (load_u8 (str, 0) == 0x00) + goto out; - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - - continue; - - error: - return last; - } + ++str; + --len; + } } - return p; +out: + *strp = str; + + if (lenp) + *lenp = len; +} + +#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80) + +static void +utf8_verify (const char **strp, + gsize *lenp) +{ + const char *str = *strp; + gsize len = lenp ? *lenp : (gsize)-1; + + /* See Unicode 10.0.0, Chapter 3, Section D92 */ + + while (len > 0) + { + guint8 b = load_u8 (str, 0); + + if (b == 0x00) + goto out; + + else if (b <= 0x7F) + { + /* + * Special-case and optimize the ASCII case. + */ + utf8_verify_ascii ((const char **)&str, &len); + } + + else if (b >= 0xC2 && b <= 0xDF) + { + if G_UNLIKELY (len < 2) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + + str += 2; + len -= 2; + + } + + else if (b == 0xE0) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b >= 0xE1 && b <= 0xEC) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b == 0xED) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b >= 0xEE && b <= 0xEF) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL 
(load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b == 0xF0) + { + if G_UNLIKELY (len < 4) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3))) + goto out; + + str += 4; + len -= 4; + } + + else if (b >= 0xF1 && b <= 0xF3) + { + if G_UNLIKELY (len < 4) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3))) + goto out; + + str += 4; + len -= 4; + } + + else if (b == 0xF4) + { + if G_UNLIKELY (len < 4) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3))) + goto out; + + str += 4; + len -= 4; + } + + else goto out; + } + +out: + *strp = str; + + if (lenp) + *lenp = len; } /** @@ -1768,20 +1858,15 @@ g_utf8_validate (const char *str, const gchar **end) { - const gchar *p; - if (max_len >= 0) return g_utf8_validate_len (str, max_len, end); - p = fast_validate (str); + utf8_verify (&str, NULL); - if (end) - *end = p; + if (end != NULL) + *end = str; - if (*p != '\0') - return FALSE; - else - return TRUE; + return *str == 0; } /** @@ -1804,17 +1889,31 @@ g_utf8_validate_len (const char *str, const gchar **end) { - const gchar *p; + utf8_verify (&str, &max_len); - p = fast_validate_len (str, max_len); + if (end != NULL) + *end = str; - if (end) - *end = p; + return max_len == 0; +} - if (p != str + max_len) - return FALSE; - else - return TRUE; +/** + * g_str_is_ascii: + * @str: a string + * + * Determines if a string is pure ASCII. A string is pure ASCII if it + * contains no bytes with the high bit set. 
+ * + * Returns: true if @str is ASCII + * + * Since: 2.40 + */ +gboolean +g_str_is_ascii (const gchar *str) +{ + utf8_verify_ascii (&str, NULL); + + return *str == 0; } /** diff --git a/glib/tests/strfuncs.c b/glib/tests/strfuncs.c index 5d4dfa3aa..05f08c0ac 100644 --- a/glib/tests/strfuncs.c +++ b/glib/tests/strfuncs.c @@ -2719,6 +2719,27 @@ test_set_str (void) g_free (str); } +static void +test_str_is_ascii (void) +{ + const char *ascii_strings[] = { + "", + "hello", + "is it me you're looking for", + }; + const char *non_ascii_strings[] = { + "is it me you’re looking for", + "áccents", + "☺️", + }; + + for (size_t i = 0; i < G_N_ELEMENTS (ascii_strings); i++) + g_assert_true (g_str_is_ascii (ascii_strings[i])); + + for (size_t i = 0; i < G_N_ELEMENTS (non_ascii_strings); i++) + g_assert_false (g_str_is_ascii (non_ascii_strings[i])); +} + int main (int argc, char *argv[]) @@ -2775,6 +2796,7 @@ main (int argc, g_test_add_func ("/strfuncs/test-is-to-digit", test_is_to_digit); g_test_add_func ("/strfuncs/transliteration", test_transliteration); g_test_add_func ("/strfuncs/str-equal", test_str_equal); + g_test_add_func ("/strfuncs/str-is-ascii", test_str_is_ascii); return g_test_run(); } diff --git a/glib/tests/utf8-validate.c b/glib/tests/utf8-validate.c index 6c230452c..b77f0e18d 100644 --- a/glib/tests/utf8-validate.c +++ b/glib/tests/utf8-validate.c @@ -81,8 +81,9 @@ static Test global_test[] = { { "\xed\x9f\xbf", -1, 3, TRUE }, { "\xee\x80\x80", -1, 3, TRUE }, { "\xef\xbf\xbd", -1, 3, TRUE }, + { "\xf1\x80\x80\x80", -1, 4, TRUE }, { "\xf4\x8f\xbf\xbf", -1, 4, TRUE }, - { "\xf4\x90\x80\x80", -1, 0, FALSE }, + { "\xf4\x90\x80\x80", -1, 0, FALSE }, /* bigger than U+10FFFF */ /* malformed sequences */ /* continuation bytes */ { "\x80", -1, 0, FALSE }, @@ -94,6 +95,18 @@ static Test global_test[] = { { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE }, { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE }, { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE }, + { "\xe0\xa0\x20", -1, 0, 
FALSE }, + { "\xe1\x80\x20", -1, 0, FALSE }, + { "\xed\x80\x20", -1, 0, FALSE }, + { "\xf0\xc0\x80\x80", -1, 0, FALSE }, + { "\xf0\x90\x20\x80", -1, 0, FALSE }, + { "\xf0\x90\x80\x20", -1, 0, FALSE }, + { "\xf1\x20\x80\x80", -1, 0, FALSE }, + { "\xf1\x80\x20\x80", -1, 0, FALSE }, + { "\xf1\x80\x80\x20", -1, 0, FALSE }, + { "\xf4\x7f\x80\x80", -1, 0, FALSE }, + { "\xf4\x80\x20\x80", -1, 0, FALSE }, + { "\xf4\x80\x80\x20", -1, 0, FALSE }, /* all possible continuation byte */ { "\x80", -1, 0, FALSE }, @@ -253,6 +266,9 @@ static Test global_test[] = { { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE }, { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE }, { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE }, + { "\xe0\x9f\x80", -1, 0, FALSE }, + { "\xe0\xc0\x80", -1, 0, FALSE }, + { "\xf0\x8f\x80\x80", -1, 0, FALSE }, /* illegal code positions */ { "\x20\xed\xa0\x80\x20", -1, 1, FALSE }, { "\x20\xed\xad\xbf\x20", -1, 1, FALSE }, @@ -270,6 +286,14 @@ static Test global_test[] = { { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE }, { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE }, + /* ASCII boundaries */ + { "\x00", 1, 0, FALSE }, + { "\x01", -1, 1, TRUE }, + { "\x02", -1, 1, TRUE }, + { "\x7d", -1, 1, TRUE }, + { "\x7e", -1, 1, TRUE }, + { "\x7f", -1, 1, TRUE }, + { NULL, 0, 0, 0 } };