Replace g_utf8_validate() with an optimized version, and clarify the docs

2004-11-24 Matthias Clasen <mclasen@redhat.com> * glib/gutf8.c: Replace g_utf8_validate() with an optimized version, and clarify the docs a bit. (#159131, Owen Taylor)
2025-11-04 01:58:54 +01:00 · 2004-11-24 17:58:21 +00:00
parent b8d9e050a4
commit 40fb4cff10
6 changed files with 187 additions and 52 deletions
--- a/4
+++ b/4
@@ -1,5 +1,9 @@
 2004-11-24  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gutf8.c: Replace g_utf8_validate() with an
 	optimized version, and clarify the docs a bit.  (#159131,
 	Owen Taylor)
 	* tests/Makefile.am (test_programs): Add utf8-validate.
 	* tests/utf8-validate.c: Unit tests for g_utf8_validate().
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,5 +1,9 @@
 2004-11-24  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gutf8.c: Replace g_utf8_validate() with an
 	optimized version, and clarify the docs a bit.  (#159131,
 	Owen Taylor)
 	* tests/Makefile.am (test_programs): Add utf8-validate.
 	* tests/utf8-validate.c: Unit tests for g_utf8_validate().
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,5 +1,9 @@
 2004-11-24  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gutf8.c: Replace g_utf8_validate() with an
 	optimized version, and clarify the docs a bit.  (#159131,
 	Owen Taylor)
 	* tests/Makefile.am (test_programs): Add utf8-validate.
 	* tests/utf8-validate.c: Unit tests for g_utf8_validate().
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,5 +1,9 @@
 2004-11-24  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gutf8.c: Replace g_utf8_validate() with an
 	optimized version, and clarify the docs a bit.  (#159131,
 	Owen Taylor)
 	* tests/Makefile.am (test_programs): Add utf8-validate.
 	* tests/utf8-validate.c: Unit tests for g_utf8_validate().
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,5 +1,9 @@
 2004-11-24  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gutf8.c: Replace g_utf8_validate() with an
 	optimized version, and clarify the docs a bit.  (#159131,
 	Owen Taylor)
 	* tests/Makefile.am (test_programs): Add utf8-validate.
 	* tests/utf8-validate.c: Unit tests for g_utf8_validate().
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -1511,19 +1511,171 @@ g_ucs4_to_utf16 (const gunichar  *str,
  return result;
 }
 /* Consume one UTF-8 continuation byte at the current position.
  *
  * This macro is deliberately non-hygienic: it expects the expanding
  * function to provide a byte pointer named p, an accumulator named val,
  * and a label named error.  If the byte at p has the form 10xxxxxx its
  * low six bits are appended to val; otherwise control jumps to error.
  * The macro does not advance p; the caller does that before each use.
  */
 #define CONTINUATION_CHAR                                  \
  G_STMT_START {                                            \
   if ((*(guchar *)p & 0xc0) == 0x80) /* 10xxxxxx */        \
     val = (val << 6) | (*(guchar *)p & 0x3f);              \
   else                                                     \
     goto error;                                            \
  } G_STMT_END
 static const gchar *
 fast_validate (const char *str)
 {
  gunichar val = 0;
  gunichar min = 0;
  const gchar *p;
  for (p = str; *p; p++)
    {
      if (*(guchar *)p < 128)
 	/* done */;
      else 
 	{
 	  const gchar *last;
 	  last = p;
 	  if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
 	    {
 	      if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
 		goto error;
 	      p++;
 	      if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
 		goto error;
 	    }
 	  else
 	    {
 	      if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
 		{
 		  min = (1 << 11);
 		  val = *(guchar *)p & 0x0f;
 		  goto TWO_REMAINING;
 		}
 	      else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
 		{
 		  min = (1 << 16);
 		  val = *(guchar *)p & 0x07;
 		}
 	      else
 		goto error;
 	      p++;
 	      CONTINUATION_CHAR;
 	    TWO_REMAINING:
 	      p++;
 	      CONTINUATION_CHAR;
 	      p++;
 	      CONTINUATION_CHAR;
 	      if (G_UNLIKELY (val < min))
 		goto error;
 	      if (G_UNLIKELY (!UNICODE_VALID(val)))
 		goto error;
 	    } 
 	  continue;
 	error:
 	  return last;
 	}
    }
  return p;
 }
 /* Validate at most max_len bytes of str as UTF-8.
  *
  * A negative max_len means "no limit"; in every case the scan also
  * stops at the first NUL byte.  Returns the position where scanning
  * stopped when everything seen was well formed, or a pointer to the
  * first byte of the first malformed character otherwise.
  *
  * A multi-byte character whose lead byte announces more bytes than
  * remain inside the max_len window is treated as malformed, so no byte
  * at or beyond str + max_len is examined.
  */
 static const gchar *
 fast_validate_len (const char *str,
                    gssize      max_len)
 {
   gunichar val = 0;          /* code point being assembled (3/4-byte forms) */
   gunichar min = 0;          /* smallest value the current form may encode */
   const gchar *last = str;   /* start of the multi-byte character in progress */
   const gchar *p = str;
   while ((max_len < 0 || (p - str) < max_len) && *p)
     {
       const guchar lead = *(guchar *)p;
       if (lead >= 128)
         {
           last = p;
           if ((lead & 0xe0) == 0xc0) /* 110xxxxx: two-byte form */
             {
               if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 2))
                 goto error;
               /* Payload bits 1..4 all zero means the value is below 0x80,
                * i.e. an overlong encoding of an ASCII character.
                */
               if (G_UNLIKELY ((lead & 0x1e) == 0))
                 goto error;
               p++;
               if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
                 goto error;
             }
           else
             {
               if ((lead & 0xf0) == 0xe0) /* 1110xxxx: three-byte form */
                 {
                   if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 3))
                     goto error;
                   min = (1 << 11);
                   val = lead & 0x0f;
                 }
               else if ((lead & 0xf8) == 0xf0) /* 11110xxx: four-byte form */
                 {
                   if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 4))
                     goto error;
                   min = (1 << 16);
                   val = lead & 0x07;
                   /* The four-byte form has one more continuation byte
                    * than the three-byte form; consume it here so both
                    * forms share the two reads below.
                    */
                   p++;
                   CONTINUATION_CHAR;
                 }
               else
                 goto error;
               p++;
               CONTINUATION_CHAR;
               p++;
               CONTINUATION_CHAR;
               /* Reject overlong encodings (val < min) and whatever
                * UNICODE_VALID, defined elsewhere in this file, refuses.
                */
               if (G_UNLIKELY (val < min || !UNICODE_VALID (val)))
                 goto error;
             }
         }
       p++;
     }
   return p;
  error:
   return last;
 }
 /**
 * g_utf8_validate:
 * @str: a pointer to character data
- * @max_len: max bytes to validate, or -1 to go until nul
+ * @max_len: max bytes to validate, or -1 to go until NUL
 * @end: return location for end of valid data
 * 
 * Validates UTF-8 encoded text. @str is the text to validate;
 * if @str is nul-terminated, then @max_len can be -1, otherwise
 * @max_len should be the number of bytes to validate.
 * If @end is non-%NULL, then the end of the valid range
- * will be stored there (i.e. the address of the first invalid byte
+ * will be stored there (i.e. the start of the first invalid 
- * if some bytes were invalid, or the end of the text being validated
+ * character if some bytes were invalid, or the end of the text 
- * otherwise).
+ * being validated otherwise).
 *
 * Note that g_utf8_validate() returns %FALSE if @max_len is 
 * positive and NUL is met before @max_len bytes have been read.
 *
 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
 * routines <emphasis>require</emphasis> valid UTF-8 as input;
@@ -1533,66 +1685,29 @@ g_ucs4_to_utf16 (const gunichar  *str,
 * Return value: %TRUE if the text was valid UTF-8
 **/
 gboolean
-g_utf8_validate (const gchar  *str,
+g_utf8_validate (const char   *str,
-                 gssize        max_len,    
+		 gssize        max_len,    
-                 const gchar **end)
+		 const gchar **end)
 {
 {
  const gchar *p;
-  g_return_val_if_fail (str != NULL, FALSE);
+  if (max_len < 0)
-  
+    p = fast_validate (str);
-  if (end)
+  else
-    *end = str;
+    p = fast_validate_len (str, max_len);
  p = str;
  while ((max_len < 0 || (p - str) < max_len) && *p)
    {
      int i, mask = 0, len;
      gunichar result;
      unsigned char c = (unsigned char) *p;
      UTF8_COMPUTE (c, mask, len);
      if (len == -1)
        break;
      /* check that the expected number of bytes exists in str */
      if (max_len >= 0 &&
          ((max_len - (p - str)) < len))
        break;
      UTF8_GET (result, p, i, mask, len);
      if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
 	break;
      if (result == (gunichar)-1)
        break;
      if (!UNICODE_VALID (result))
 	break;
      p += len;
    }
  if (end)
    *end = p;
-  /* See that we covered the entire length if a length was
+  if ((max_len >= 0 && p != str + max_len) ||
-   * passed in, or that we ended on a nul if not
+      (max_len < 0 && *p != '\0'))
   */
  if (max_len >= 0 &&
      p != (str + max_len))
    return FALSE;
  else if (max_len < 0 &&
           *p != '\0')
    return FALSE;
  else
    return TRUE;
 }
 /**
 * g_unichar_validate:
 * @ch: a Unicode character