diff --git a/ChangeLog b/ChangeLog
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-0 b/ChangeLog.pre-2-0
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-0
+++ b/ChangeLog.pre-2-0
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-10 b/ChangeLog.pre-2-10
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-12 b/ChangeLog.pre-2-12
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-2 b/ChangeLog.pre-2-2
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-2
+++ b/ChangeLog.pre-2-2
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-4 b/ChangeLog.pre-2-4
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-4
+++ b/ChangeLog.pre-2-4
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-6 b/ChangeLog.pre-2-6
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/ChangeLog.pre-2-8 b/ChangeLog.pre-2-8
index 5a19170f7..1abdfe7ad 100644
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,7 @@
+2000-09-10 Havoc Pennington
+
+	* gutf8.c (g_utf8_validate): Add this function.
+
 Sat Sep 9 18:50:42 2000 Owen Taylor
 
 	* gstrfuncs.c (g_strescape): Add a missing g_return_if_fail().
diff --git a/glib/gunicode.h b/glib/gunicode.h
index d58e31667..396c6bcd1 100644
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -185,6 +185,14 @@ gchar * g_ucs4_to_utf8 (const gunichar *str,
 gint g_unichar_to_utf8 (gunichar c,
                         char *outbuf);
 
+/* Validate a UTF-8 string; return TRUE if valid. If end is non-NULL,
+ * *end receives the end of the valid data (first invalid byte, if any).
+ */
+
+gboolean g_utf8_validate (const gchar  *str,
+                          gint          max_len,
+                          const gchar **end);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/glib/gutf8.c b/glib/gutf8.c
index 8bf95ebc3..f98f1372a 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -487,3 +487,90 @@ g_utf8_to_ucs4 (const char *str, int len)
   return result;
 }
 
+/**
+ * g_utf8_validate:
+ * @str: a pointer to character data
+ * @max_len: max bytes to validate, or -1 to go until nul
+ * @end: return location for end of valid data, or NULL
+ *
+ * Validates UTF-8 encoded text. @str is the text to validate;
+ * if @str is nul-terminated, then @max_len can be -1, otherwise
+ * @max_len should be the number of bytes to validate. Validation
+ * stops at the first nul byte even when @max_len is non-negative.
+ *
+ * Besides the structural checks done while decoding, sequences
+ * longer than four bytes, overlong encodings, UTF-16 surrogates
+ * (U+D800..U+DFFF) and values above U+10FFFF are rejected.
+ *
+ * If @end is non-NULL, then the end of the valid range
+ * will be stored there (i.e. the address of the first invalid byte
+ * if some bytes were invalid, or the end of the text being validated
+ * otherwise).
+ *
+ * Many GLib and GTK+ routines require valid UTF8 as input;
+ * so data read from a file or the network should be checked
+ * with g_utf8_validate() before doing anything else with it.
+ *
+ * Return value: TRUE if the text was valid UTF-8.
+ **/
+gboolean
+g_utf8_validate (const gchar  *str,
+                 gint          max_len,
+                 const gchar **end)
+{
+  /* Smallest code point that needs a sequence of the given byte
+   * length (the index); a smaller decoded value is overlong.
+   */
+  static const gunichar min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
+  const gchar *p = str;
+  gboolean retval = TRUE;
+
+  while ((max_len < 0 || (p - str) < max_len) && *p)
+    {
+      int i, mask = 0, len;
+      gunichar result;
+      unsigned char c = (unsigned char) *p;
+
+      UTF8_COMPUTE (c, mask, len);
+
+      /* No valid sequence is longer than four bytes; this test also
+       * catches the -1 that UTF8_COMPUTE leaves in len on failure.
+       */
+      if (len < 1 || len > 4)
+        {
+          retval = FALSE;
+          break;
+        }
+
+      /* check that the expected number of bytes exists in str */
+      if (max_len >= 0 &&
+          ((max_len - (p - str)) < len))
+        {
+          retval = FALSE;
+          break;
+        }
+
+      UTF8_GET (result, p, i, mask, len);
+
+      /* (gunichar)-1 is the failure value of UTF8_GET; the other
+       * tests reject overlong forms, surrogates and out-of-range values.
+       */
+      if (result == (gunichar)-1 ||
+          result < min_code[len] ||
+          (result >= 0xD800 && result <= 0xDFFF) ||
+          result > 0x10FFFF)
+        {
+          retval = FALSE;
+          break;
+        }
+
+      p += len;
+    }
+
+  if (end)
+    *end = p;
+
+  return retval;
+}
+
+
diff --git a/gunicode.h b/gunicode.h
index d58e31667..396c6bcd1 100644
--- a/gunicode.h
+++ b/gunicode.h
@@ -185,6 +185,14 @@ gchar * g_ucs4_to_utf8 (const gunichar *str,
 gint g_unichar_to_utf8 (gunichar c,
                         char *outbuf);
 
+/* Validate a UTF-8 string; return TRUE if valid. If end is non-NULL,
+ * *end receives the end of the valid data (first invalid byte, if any).
+ */
+
+gboolean g_utf8_validate (const gchar  *str,
+                          gint          max_len,
+                          const gchar **end);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gutf8.c b/gutf8.c
index 8bf95ebc3..f98f1372a 100644
--- a/gutf8.c
+++ b/gutf8.c
@@ -487,3 +487,90 @@ g_utf8_to_ucs4 (const char *str, int len)
   return result;
 }
 
+/**
+ * g_utf8_validate:
+ * @str: a pointer to character data
+ * @max_len: max bytes to validate, or -1 to go until nul
+ * @end: return location for end of valid data, or NULL
+ *
+ * Validates UTF-8 encoded text. @str is the text to validate;
+ * if @str is nul-terminated, then @max_len can be -1, otherwise
+ * @max_len should be the number of bytes to validate. Validation
+ * stops at the first nul byte even when @max_len is non-negative.
+ *
+ * Besides the structural checks done while decoding, sequences
+ * longer than four bytes, overlong encodings, UTF-16 surrogates
+ * (U+D800..U+DFFF) and values above U+10FFFF are rejected.
+ *
+ * If @end is non-NULL, then the end of the valid range
+ * will be stored there (i.e. the address of the first invalid byte
+ * if some bytes were invalid, or the end of the text being validated
+ * otherwise).
+ *
+ * Many GLib and GTK+ routines require valid UTF8 as input;
+ * so data read from a file or the network should be checked
+ * with g_utf8_validate() before doing anything else with it.
+ *
+ * Return value: TRUE if the text was valid UTF-8.
+ **/
+gboolean
+g_utf8_validate (const gchar  *str,
+                 gint          max_len,
+                 const gchar **end)
+{
+  /* Smallest code point that needs a sequence of the given byte
+   * length (the index); a smaller decoded value is overlong.
+   */
+  static const gunichar min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
+  const gchar *p = str;
+  gboolean retval = TRUE;
+
+  while ((max_len < 0 || (p - str) < max_len) && *p)
+    {
+      int i, mask = 0, len;
+      gunichar result;
+      unsigned char c = (unsigned char) *p;
+
+      UTF8_COMPUTE (c, mask, len);
+
+      /* No valid sequence is longer than four bytes; this test also
+       * catches the -1 that UTF8_COMPUTE leaves in len on failure.
+       */
+      if (len < 1 || len > 4)
+        {
+          retval = FALSE;
+          break;
+        }
+
+      /* check that the expected number of bytes exists in str */
+      if (max_len >= 0 &&
+          ((max_len - (p - str)) < len))
+        {
+          retval = FALSE;
+          break;
+        }
+
+      UTF8_GET (result, p, i, mask, len);
+
+      /* (gunichar)-1 is the failure value of UTF8_GET; the other
+       * tests reject overlong forms, surrogates and out-of-range values.
+       */
+      if (result == (gunichar)-1 ||
+          result < min_code[len] ||
+          (result >= 0xD800 && result <= 0xDFFF) ||
+          result > 0x10FFFF)
+        {
+          retval = FALSE;
+          break;
+        }
+
+      p += len;
+    }
+
+  if (end)
+    *end = p;
+
+  return retval;
+}
+
+