Merge branch 'wip/chergert/c-utf8' into 'main'

glib/utf8: Use SIMD for UTF-8 validation. Closes #3481. See merge request GNOME/glib!4319.
2024-11-06 09:26:17 +01:00 · 2024-10-03 15:07:17 +00:00 · 2024-10-03 15:07:17 +00:00 · fb4f2e5578
commit fb4f2e5578
parent 7f8f0842d0 72384894b8
4 changed files with 308 additions and 186 deletions
--- a/glib/gstrfuncs.c
+++ b/glib/gstrfuncs.c
@ -1604,29 +1604,6 @@ g_ascii_strup (const gchar *str,
  return result;
 }

-/**
- * g_str_is_ascii:
- * @str: a string
- *
- * Determines if a string is pure ASCII. A string is pure ASCII if it
- * contains no bytes with the high bit set.
- *
- * Returns: true if @str is ASCII
- *
- * Since: 2.40
- */
-gboolean
-g_str_is_ascii (const gchar *str)
-{
-  gsize i;
-
-  for (i = 0; str[i]; i++)
-    if (str[i] & 0x80)
-      return FALSE;
-
-  return TRUE;
-}
-
 /**
 * g_strdown:
 * @string: the string to convert
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@ -1,7 +1,8 @@
 /* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
- * Copyright (C) 2000 Red Hat, Inc.
+ * Copyright (C) 2000, 2015-2022 Red Hat, Inc.
+ * Copyright (C) 2022-2023 David Rheinsberg
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 *
@ -1574,166 +1575,255 @@ g_ucs4_to_utf16 (const gunichar  *str,
  return result;
 }

-#define VALIDATE_BYTE(mask, expect)                      \
-  G_STMT_START {                                         \
-    if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
-      goto error;                                        \
-  } G_STMT_END
+/* SIMD-based UTF-8 validation originates in the c-utf8 project
+ * (https://github.com/c-util/c-utf8/), written by the following authors:
+ *
+ *   David Rheinsberg <david@readahead.eu>
+ *   Evgeny Vereshchagin <evvers@ya.ru>
+ *   Jan Engelhardt <jengelh@inai.de>
+ *   Tom Gundersen <teg@jklm.no>
+ *
+ * It has been adapted for portability and integration.
+ * The original code is dual-licensed Apache-2.0 or LGPLv2.1+
+ */

-/* see IETF RFC 3629 Section 4 */
-
-static const gchar *
-fast_validate (const char *str)
+#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1))

+static inline guint8
+load_u8 (gconstpointer memory,
+         gsize         offset)
 {
-  const gchar *p;
+  return ((const guint8 *)memory)[offset];
+}

-  for (p = str; *p; p++)
-    {
-      if (*(guchar *)p < 128)
-	/* done */;
-      else 
-	{
-	  const gchar *last;
+#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__)
+# define _attribute_aligned(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+# define _attribute_aligned(n) __declspec(align(n))
+#else /* no alignment attribute known for this compiler: expand to nothing */
+# define _attribute_aligned(n)
+#endif /* _attribute_aligned */

-	  last = p;
-	  if (*(guchar *)p < 0xe0) /* 110xxxxx */
+static inline gsize
+load_word (gconstpointer memory,
+           gsize         offset)
 {
-	      if (G_UNLIKELY (*(guchar *)p < 0xc2))
-		goto error;
+#if GLIB_SIZEOF_VOID_P == 8
+  _attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset;
+
+  return ((guint64)m[0] <<  0) | ((guint64)m[1] <<  8) |
+         ((guint64)m[2] << 16) | ((guint64)m[3] << 24) |
+         ((guint64)m[4] << 32) | ((guint64)m[5] << 40) |
+         ((guint64)m[6] << 48) | ((guint64)m[7] << 56);
+#else
+  _attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset;
+
+  return ((guint)m[0] <<  0) | ((guint)m[1] <<  8) |
+         ((guint)m[2] << 16) | ((guint)m[3] << 24);
+#endif
+}
+
+/* Per-byte patterns used to test every byte of a machine word at once.
+ * The (gsize) casts truncate these constants on 32-bit machines. */
+#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L)
+#define UTF8_ASCII_SUB  ((gsize)0x0101010101010101L)
+
+static inline int
+utf8_word_is_ascii (gsize word)
+{
+  /* True unless any byte is NUL (0x00) or has the MSB set. Subtracting
+   * 0x01 from every byte makes a 0x00 byte wrap around so its MSB becomes
+   * set, OR-ing with the original word keeps the MSB of any byte >= 0x80,
+   * and the mask then isolates those per-byte MSBs. */
+  return ((((word - UTF8_ASCII_SUB) | word) & UTF8_ASCII_MASK) == 0);
+}
+
+static void
+utf8_verify_ascii (const char **strp,
+                   gsize       *lenp)
+{
+  const char *str = *strp;
+  gsize len = lenp ? *lenp : (gsize)-1;
+
+  while (len > 0 && load_u8 (str, 0) < 128)
+    {
+      if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str)
+        {
+          while (len >= 2 * sizeof (gsize))
+            {
+              if (!utf8_word_is_ascii (load_word (str, 0)) ||
+                  !utf8_word_is_ascii (load_word (str, sizeof (gsize))))
+                break;
+
+              str += 2 * sizeof(gsize);
+              len -= 2 * sizeof(gsize);
+            }
+
+          while (len > 0 && load_u8 (str, 0) < 128)
+            {
+              if G_UNLIKELY (load_u8 (str, 0) == 0x00)
+                goto out;
+
+              ++str;
+              --len;
+            }
        }
      else
        {
-	      if (*(guchar *)p < 0xf0) /* 1110xxxx */
+          if G_UNLIKELY (load_u8 (str, 0) == 0x00)
+            goto out;
+
+          ++str;
+          --len;
+        }
+    }
+
+out:
+  *strp = str;
+
+  if (lenp)
+    *lenp = len;
+}
+
+#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80) /* top two bits are 10: a UTF-8 continuation byte */
+
+static void
+utf8_verify (const char **strp,
+             gsize       *lenp)
 {
-		  switch (*(guchar *)p++ & 0x0f)
+  const char *str = *strp;
+  gsize len = lenp ? *lenp : (gsize)-1;
+
+  /* See Unicode 10.0.0, Chapter 3, Section D92 */
+
+  while (len > 0)
    {
-		    case 0:
-		      VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
-		      break;
-		    case 0x0d:
-		      VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		}
-	      else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
+      guint8 b = load_u8 (str, 0);
+
+      if (b == 0x00)
+        goto out;
+
+      else if (b <= 0x7F)
        {
-		  switch (*(guchar *)p++ & 0x07)
+          /*
+           * Special-case and optimize the ASCII case.
+           */
+          utf8_verify_ascii ((const char **)&str, &len);
+        }
+
+      else if (b >= 0xC2 && b <= 0xDF)
        {
-		    case 0:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		      if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
-			goto error;
-		      break;
-		    case 4:
-		      VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		  p++;
-		  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		}
-	      else
-		goto error;
+          if G_UNLIKELY (len < 2)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+
+          str += 2;
+          len -= 2;
+
        }

-	  p++;
-	  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-
-	  continue;
-
-	error:
-	  return last;
-	}
-    }
-
-  return p;
-}
-
-static const gchar *
-fast_validate_len (const char *str,
-		   gssize      max_len)
-
+      else if (b == 0xE0)
        {
-  const gchar *p;
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;

-  g_assert (max_len >= 0);
-
-  for (p = str; ((p - str) < max_len) && *p; p++)
-    {
-      if (*(guchar *)p < 128)
-	/* done */;
-      else 
-	{
-	  const gchar *last;
-
-	  last = p;
-	  if (*(guchar *)p < 0xe0) /* 110xxxxx */
-	    {
-	      if (G_UNLIKELY (max_len - (p - str) < 2))
-		goto error;
-	      
-	      if (G_UNLIKELY (*(guchar *)p < 0xc2))
-		goto error;
-	    }
-	  else
-	    {
-	      if (*(guchar *)p < 0xf0) /* 1110xxxx */
-		{
-		  if (G_UNLIKELY (max_len - (p - str) < 3))
-		    goto error;
-
-		  switch (*(guchar *)p++ & 0x0f)
-		    {
-		    case 0:
-		      VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
-		      break;
-		    case 0x0d:
-		      VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		}
-	      else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
-		{
-		  if (G_UNLIKELY (max_len - (p - str) < 4))
-		    goto error;
-
-		  switch (*(guchar *)p++ & 0x07)
-		    {
-		    case 0:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		      if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
-			goto error;
-		      break;
-		    case 4:
-		      VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		  p++;
-		  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		}
-	      else
-		goto error;
+          str += 3;
+          len -= 3;
        }

-	  p++;
-	  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+      else if (b >= 0xE1 && b <= 0xEC)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;

-	  continue;
-
-	error:
-	  return last;
-	}
+          str += 3;
+          len -= 3;
        }

-  return p;
+      else if (b == 0xED)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+
+          str += 3;
+          len -= 3;
+        }
+
+      else if (b >= 0xEE && b <= 0xEF)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+
+          str += 3;
+          len -= 3;
+        }
+
+      else if (b == 0xF0)
+        {
+          if G_UNLIKELY (len < 4)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
+            goto out;
+
+          str += 4;
+          len -= 4;
+        }
+
+      else if (b >= 0xF1 && b <= 0xF3)
+        {
+          if G_UNLIKELY (len < 4)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
+            goto out;
+
+          str += 4;
+          len -= 4;
+        }
+
+      else if (b == 0xF4)
+        {
+          if G_UNLIKELY (len < 4)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
+            goto out;
+
+          str += 4;
+          len -= 4;
+        }
+
+      else goto out;
+    }
+
+out:
+  *strp = str;
+
+  if (lenp)
+    *lenp = len;
 }

 /**
@ -1768,20 +1858,15 @@ g_utf8_validate (const char   *str,
 		 const gchar **end)

 {
-  const gchar *p;
-
  if (max_len >= 0)
    return g_utf8_validate_len (str, max_len, end);

-  p = fast_validate (str);
+  utf8_verify (&str, NULL);

-  if (end)
-    *end = p;
+  if (end != NULL)
+    *end = str;

-  if (*p != '\0')
-    return FALSE;
-  else
-    return TRUE;
+  return *str == 0;
 }

 /**
@ -1804,17 +1889,31 @@ g_utf8_validate_len (const char   *str,
                     const gchar **end)

 {
-  const gchar *p;
+  utf8_verify (&str, &max_len);

-  p = fast_validate_len (str, max_len);
+  if (end != NULL)
+    *end = str;

-  if (end)
-    *end = p;
+  return max_len == 0;
+}

-  if (p != str + max_len)
-    return FALSE;
-  else
-    return TRUE;
+/**
+ * g_str_is_ascii:
+ * @str: a nul-terminated string
+ *
+ * Determines if a string is pure ASCII. A string is pure ASCII if it
+ * contains no bytes with the high bit set.
+ *
+ * No length is passed to the scanner, so the scan is bounded only by the
+ * first nul byte or the first byte with the high bit set; @str must
+ * therefore be nul-terminated.
+ *
+ * Returns: true if @str is ASCII
+ *
+ * Since: 2.40
+ */
+gboolean
+g_str_is_ascii (const gchar *str)
+{
+  utf8_verify_ascii (&str, NULL); /* advances str to where the scan stopped */
+
+  return *str == 0; /* stopped on the terminator: no high-bit byte seen */
 }

 /**
--- a/glib/tests/strfuncs.c
+++ b/glib/tests/strfuncs.c
@ -2719,6 +2719,27 @@ test_set_str (void)
  g_free (str);
 }

+static void
+test_str_is_ascii (void) /* checks g_str_is_ascii() on known ASCII and non-ASCII inputs */
+{
+  const char *ascii_strings[] = { /* every byte is below 0x80; includes the empty string */
+    "",
+    "hello",
+    "is it me you're looking for",
+  };
+  const char *non_ascii_strings[] = { /* each contains at least one non-ASCII character */
+    "is it me you’re looking for",
+    "áccents",
+    "☺️",
+  };
+
+  for (size_t i = 0; i < G_N_ELEMENTS (ascii_strings); i++)
+    g_assert_true (g_str_is_ascii (ascii_strings[i]));
+
+  for (size_t i = 0; i < G_N_ELEMENTS (non_ascii_strings); i++)
+    g_assert_false (g_str_is_ascii (non_ascii_strings[i]));
+}
+
 int
 main (int   argc,
      char *argv[])
@ -2775,6 +2796,7 @@ main (int   argc,
  g_test_add_func ("/strfuncs/test-is-to-digit", test_is_to_digit);
  g_test_add_func ("/strfuncs/transliteration", test_transliteration);
  g_test_add_func ("/strfuncs/str-equal", test_str_equal);
+  g_test_add_func ("/strfuncs/str-is-ascii", test_str_is_ascii);

  return g_test_run();
 }
--- a/glib/tests/utf8-validate.c
+++ b/glib/tests/utf8-validate.c
@ -81,8 +81,9 @@ static Test global_test[] = {
  { "\xed\x9f\xbf", -1, 3, TRUE },
  { "\xee\x80\x80", -1, 3, TRUE },
  { "\xef\xbf\xbd", -1, 3, TRUE },
+  { "\xf1\x80\x80\x80", -1, 4, TRUE },
  { "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
-  { "\xf4\x90\x80\x80", -1, 0, FALSE },
+  { "\xf4\x90\x80\x80", -1, 0, FALSE }, /* bigger than U+10FFFF */
  /* malformed sequences */
  /* continuation bytes */
  { "\x80", -1, 0, FALSE },
@ -94,6 +95,18 @@ static Test global_test[] = {
  { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
  { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
  { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
+  { "\xe0\xa0\x20", -1, 0, FALSE },
+  { "\xe1\x80\x20", -1, 0, FALSE },
+  { "\xed\x80\x20", -1, 0, FALSE },
+  { "\xf0\xc0\x80\x80", -1, 0, FALSE },
+  { "\xf0\x90\x20\x80", -1, 0, FALSE },
+  { "\xf0\x90\x80\x20", -1, 0, FALSE },
+  { "\xf1\x20\x80\x80", -1, 0, FALSE },
+  { "\xf1\x80\x20\x80", -1, 0, FALSE },
+  { "\xf1\x80\x80\x20", -1, 0, FALSE },
+  { "\xf4\x7f\x80\x80", -1, 0, FALSE },
+  { "\xf4\x80\x20\x80", -1, 0, FALSE },
+  { "\xf4\x80\x80\x20", -1, 0, FALSE },

  /* all possible continuation byte */
  { "\x80", -1, 0, FALSE },
@ -253,6 +266,9 @@ static Test global_test[] = {
  { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
  { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
  { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
+  { "\xe0\x9f\x80", -1, 0, FALSE },
+  { "\xe0\xc0\x80", -1, 0, FALSE },
+  { "\xf0\x8f\x80\x80", -1, 0, FALSE },
  /* illegal code positions */
  { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
@ -270,6 +286,14 @@ static Test global_test[] = {
  { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },

+  /* ASCII boundaries */
+  { "\x00", 1, 0, FALSE },
+  { "\x01", -1, 1, TRUE },
+  { "\x02", -1, 1, TRUE },
+  { "\x7d", -1, 1, TRUE },
+  { "\x7e", -1, 1, TRUE },
+  { "\x7f", -1, 1, TRUE },
+
  { NULL, 0, 0, 0 }
 };