Unrolled implementation of g_utf8_to_ucs4_fast()

Unrolling the branches and expressions for all expected cases of UTF-8 sequences facilitates the work of both an optimizing compiler and the branch prediction logic in the CPU. This speeds up decoding noticeably on text composed primarily of longer sequences. See https://bugzilla.gnome.org/show_bug.cgi?id=738504
2025-08-19 07:08:54 +02:00 · 2014-10-13 21:31:02 +03:00
parent 3188b8ee79
commit b963565125
1 changed files with 43 additions and 28 deletions
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -677,6 +677,8 @@ g_utf8_get_char_validated (const gchar *p,
    return result;
 }
 /* Yields the low six bits of the byte that p points to (the payload bits
  * of a UTF-8 continuation byte) and then advances p by one byte.
  *
  * No validation is performed: the byte is not checked for the 10xxxxxx
  * form. The argument is modified, so it must be a modifiable pointer
  * lvalue; it is parenthesized in the expansion so that lvalue expressions
  * such as *pp advance the intended object. */
 #define CONT_BYTE_FAST(p) ((guchar)*(p)++ & 0x3f)
 /**
 * g_utf8_to_ucs4_fast:
 * @str: a UTF-8 encoded string
@@ -729,39 +731,52 @@ g_utf8_to_ucs4_fast (const gchar *str,
  p = str;
  for (i=0; i < n_chars; i++)
    {
-      gunichar wc = (guchar)*p++;
+      guchar first = (guchar)*p++;
      gunichar wc;
-      if (wc < 0x80)
+      if (first < 0xc0)
 	{
-	  result[i] = wc;
+          /* We really hope first < 0x80, but we don't want to test an
           * extra branch for invalid input, which this function
           * does not care about. Handling unexpected continuation bytes
           * here will do the least damage. */
 	  wc = first;
 	}
      else
-	{ 
+	{
-	  gunichar mask = 0x40;
+          gunichar c1 = CONT_BYTE_FAST(p);
-
+          if (first < 0xe0)
-	  if (G_UNLIKELY ((wc & mask) == 0))
+            {
-	    {
+              wc = ((first & 0x1f) << 6) | c1;
-	      /* It's an out-of-sequence 10xxxxxxx byte.
+            }
-	       * Rather than making an ugly hash of this and the next byte
+          else
-	       * and overrunning the buffer, it's more useful to treat it
+            {
-	       * with a replacement character
+              gunichar c2 = CONT_BYTE_FAST(p);
-	       */
+              if (first < 0xf0)
-	      result[i] = 0xfffd;
+                {
-	      continue;
+                  wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;
-	    }
+                }
-
+              else
-	  do
+                {
-	    {
+                  gunichar c3 = CONT_BYTE_FAST(p);
-	      wc <<= 6;
+                  wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
-	      wc |= (guchar)(*p++) & 0x3f;
+                  if (G_UNLIKELY (first >= 0xf8))
-	      mask <<= 5;
+                    {
-	    }
+                      /* This can't be valid UTF-8, but g_utf8_next_char()
-	  while((wc & mask) != 0);
+                       * and company allow out-of-range sequences */
-
+                      gunichar mask = 1 << 20;
-	  wc &= mask - 1;
+                      while ((wc & mask) != 0)
-
+                        {
-	  result[i] = wc;
+                          wc <<= 6;
                          wc |= CONT_BYTE_FAST(p);
                          mask <<= 5;
                        }
                      wc &= mask - 1;
                    }
                }
            }
 	}
      result[i] = wc;
    }
  result[i] = 0;