Made g_utf8_to_ucs4_fast() even faster

https://bugzilla.gnome.org/show_bug.cgi?id=619435
2025-07-14 22:26:46 +02:00 · 2010-03-17 02:34:51 +02:00
parent 1b101a3873
commit 69efeee287
1 changed files with 16 additions and 29 deletions
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@ -851,7 +851,6 @@ g_utf8_to_ucs4_fast (const gchar *str,
 		     glong        len,              
 		     glong       *items_written)    
 {
  gint j, charlen;
  gunichar *result;
  gint n_chars, i;
  const gchar *p;
@ -882,49 +881,37 @@ g_utf8_to_ucs4_fast (const gchar *str,
  p = str;
  for (i=0; i < n_chars; i++)
    {
-      gunichar wc = ((unsigned char *)p)[0];
+      gunichar wc = (guchar)*p++;
      if (wc < 0x80)
 	{
 	  result[i] = wc;
 	  p++;
 	}
      else
 	{ 
-	  if (wc < 0xe0)
+	  gunichar mask = 0x40;
 	  if (G_UNLIKELY ((wc & mask) == 0))
 	    {
-	      charlen = 2;
+	      /* It's an out-of-sequence 10xxxxxx byte.
-	      wc &= 0x1f;
+	       * Rather than making an ugly hash of this and the next byte
-	    }
+	       * and overrunning the buffer, it's more useful to treat it
-	  else if (wc < 0xf0)
+	       * with a replacement character */
-	    {
+	      result[i] = 0xfffd;
-	      charlen = 3;
+	      continue;
 	      wc &= 0x0f;
 	    }
 	  else if (wc < 0xf8)
 	    {
 	      charlen = 4;
 	      wc &= 0x07;
 	    }
 	  else if (wc < 0xfc)
 	    {
 	      charlen = 5;
 	      wc &= 0x03;
 	    }
 	  else
 	    {
 	      charlen = 6;
 	      wc &= 0x01;
 	    }
-	  for (j = 1; j < charlen; j++)
+	  do
 	    {
 	      wc <<= 6;
-	      wc |= ((unsigned char *)p)[j] & 0x3f;
+	      wc |= (guchar)(*p++) & 0x3f;
 	      mask <<= 5;
 	    }
 	  while((wc & mask) != 0);
 	  wc &= mask - 1;
 	  result[i] = wc;
 	  p += charlen;
 	}
    }
  result[i] = 0;