From f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9 Mon Sep 17 00:00:00 2001 From: Owen Taylor Date: Thu, 19 Jul 2001 14:35:48 +0000 Subject: [PATCH] Add functions to insert a unichar as UTF-8, since this is reasonably Fri Jul 13 19:20:06 2001 Owen Taylor * glib/gstring.c (g_string_insert/append/prepend_unichar): Add functions to insert a unichar as UTF-8, since this is reasonably common. * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): New function exposing iterating through possibly invalid/incomplete UTF-8 to unicode to the outside world. * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument to be gssize, not gsize. --- ChangeLog | 13 ++++++++ ChangeLog.pre-2-0 | 13 ++++++++ ChangeLog.pre-2-10 | 13 ++++++++ ChangeLog.pre-2-12 | 13 ++++++++ ChangeLog.pre-2-2 | 13 ++++++++ ChangeLog.pre-2-4 | 13 ++++++++ ChangeLog.pre-2-6 | 13 ++++++++ ChangeLog.pre-2-8 | 13 ++++++++ glib/gstring.c | 68 +++++++++++++++++++++++++++++++++++++++ glib/gstring.h | 8 +++++ glib/gunicode.h | 5 ++- glib/guniprop.c | 6 +--- glib/gutf8.c | 36 +++++++++++++++++++-- tests/unicode-normalize.c | 5 +-- 14 files changed, 220 insertions(+), 12 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9387011ec..6339de26a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. 
diff --git a/ChangeLog.pre-2-0 b/ChangeLog.pre-2-0 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-0 +++ b/ChangeLog.pre-2-0 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/ChangeLog.pre-2-10 b/ChangeLog.pre-2-10 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-10 +++ b/ChangeLog.pre-2-10 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/ChangeLog.pre-2-12 b/ChangeLog.pre-2-12 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-12 +++ b/ChangeLog.pre-2-12 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. 
+ 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/ChangeLog.pre-2-2 b/ChangeLog.pre-2-2 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-2 +++ b/ChangeLog.pre-2-2 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/ChangeLog.pre-2-4 b/ChangeLog.pre-2-4 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-4 +++ b/ChangeLog.pre-2-4 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/ChangeLog.pre-2-6 b/ChangeLog.pre-2-6 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-6 +++ b/ChangeLog.pre-2-6 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. 
+ + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/ChangeLog.pre-2-8 b/ChangeLog.pre-2-8 index 9387011ec..6339de26a 100644 --- a/ChangeLog.pre-2-8 +++ b/ChangeLog.pre-2-8 @@ -1,3 +1,16 @@ +Fri Jul 13 19:20:06 2001 Owen Taylor + + * glib/gstring.c (g_string_insert/append/prepend_unichar): + Add functions to insert a unichar as UTF-8, since this + is reasonably common. + + * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): + New function exposing iterating through possibly invalid/incomplete + UTF-8 to unicode to the outside world. + + * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument + to be gssize, not gsize. + 2001-07-17 Kjartan Maraas * configure.in: Added "nn" to ALL_LINGUAS. diff --git a/glib/gstring.c b/glib/gstring.c index 58909cbdc..532275bf9 100644 --- a/glib/gstring.c +++ b/glib/gstring.c @@ -465,6 +465,25 @@ g_string_append_c (GString *fstring, return g_string_insert_c (fstring, -1, c); } +/** + * g_string_append_unichar: + * @string: a #GString + * @wc: a Unicode character + * + * Converts a Unicode character into UTF-8, and appends it + * to the string. + * + * Return value: @string + **/ +GString* +g_string_append_unichar (GString *string, + gunichar wc) +{ + g_return_val_if_fail (string != NULL, NULL); + + return g_string_insert_unichar (string, -1, wc); +} + GString* g_string_prepend (GString *fstring, const gchar *val) @@ -495,6 +514,25 @@ g_string_prepend_c (GString *fstring, return g_string_insert_c (fstring, 0, c); } +/** + * g_string_prepend_unichar: + * @string: a #GString + * @wc: a Unicode character + * + * Converts a Unicode character into UTF-8, and prepends it + * to the string. 
+ * + * Return value: @string + **/ +GString* +g_string_prepend_unichar (GString *string, + gunichar wc) +{ + g_return_val_if_fail (string != NULL, NULL); + + return g_string_insert_unichar (string, 0, wc); +} + GString* g_string_insert (GString *fstring, gssize pos, @@ -537,6 +575,36 @@ g_string_insert_c (GString *fstring, return fstring; } +/** + * g_string_insert_unichar: + * @string: a #GString + * @pos: the position at which to insert character, or -1 to + * append at the end of the string. + * @wc: a Unicode character + * + * Converts a Unicode character into UTF-8, and inserts it + * into the string at the given position. + * + * Return value: @string + **/ +GString* +g_string_insert_unichar (GString *string, + gssize pos, + gunichar wc) +{ + gchar buf[6]; + gint charlen; + + /* We could be somewhat more efficient here by computing + * the length, adding the space, then converting into that + * space, by cut-and-pasting the internals of g_unichar_to_utf8. + */ + g_return_val_if_fail (string != NULL, NULL); + + charlen = g_unichar_to_utf8 (wc, buf); + return g_string_insert_len (string, pos, buf, charlen); +} + GString* g_string_erase (GString *fstring, gsize pos, diff --git a/glib/gstring.h b/glib/gstring.h index fbdee1d69..7241273b6 100644 --- a/glib/gstring.h +++ b/glib/gstring.h @@ -28,6 +28,7 @@ #define __G_STRING_H__ #include +#include G_BEGIN_DECLS @@ -79,10 +80,14 @@ GString* g_string_append_len (GString *string, gssize len); GString* g_string_append_c (GString *string, gchar c); +GString* g_string_append_unichar (GString *string, + gunichar wc); GString* g_string_prepend (GString *string, const gchar *val); GString* g_string_prepend_c (GString *string, gchar c); +GString* g_string_prepend_unichar (GString *string, + gunichar wc); GString* g_string_prepend_len (GString *string, const gchar *val, gssize len); @@ -92,6 +97,9 @@ GString* g_string_insert (GString *string, GString* g_string_insert_c (GString *string, gssize pos, gchar c); +GString* 
g_string_insert_unichar (GString *string, + gssize pos, + gunichar wc); GString* g_string_erase (GString *string, gsize pos, gsize len); diff --git a/glib/gunicode.h b/glib/gunicode.h index b7f5e344a..c1309559f 100644 --- a/glib/gunicode.h +++ b/glib/gunicode.h @@ -167,7 +167,10 @@ GLIB_VAR char g_utf8_skip[256]; #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)]) -gunichar g_utf8_get_char (const gchar *p); +gunichar g_utf8_get_char (const gchar *p); +gunichar g_utf8_get_char_validated (const gchar *p, + gssize max_len); + gchar* g_utf8_offset_to_pointer (const gchar *str, glong offset); glong g_utf8_pointer_to_offset (const gchar *str, diff --git a/glib/guniprop.c b/glib/guniprop.c index 4195aea8a..de1b37660 100644 --- a/glib/guniprop.c +++ b/glib/guniprop.c @@ -872,8 +872,6 @@ g_utf8_casefold (const gchar *str, { GString *result = g_string_new (NULL); const char *p; - gchar buf[6]; - int charlen; p = str; while ((len < 0 || p < str + len) && *p) @@ -903,9 +901,7 @@ g_utf8_casefold (const gchar *str, } } - ch = g_unichar_tolower (ch); - charlen = g_unichar_to_utf8 (ch, buf); - g_string_append_len (result, buf, charlen); + g_string_append_unichar (result, g_unichar_tolower (ch)); next: p = g_utf8_next_char (p); diff --git a/glib/gutf8.c b/glib/gutf8.c index d1863e927..912d5b599 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -245,7 +245,9 @@ g_utf8_strlen (const gchar *p, * * Convert a sequence of bytes encoded as UTF-8 to a unicode character. * If @p does not point to a valid UTF-8 encoded character, results are - * undefined. + * undefined. If you are not sure that the bytes are complete + * valid unicode characters, you should use g_utf8_get_char_validated() + * instead. 
* * Return value: the resulting character **/ @@ -550,7 +552,8 @@ g_utf8_strrchr (const char *p, * and return (gunichar)-2 on incomplete trailing character */ static inline gunichar -g_utf8_get_char_extended (const gchar *p, gsize max_len) +g_utf8_get_char_extended (const gchar *p, + gssize max_len) { guint i, len; gunichar wc = (guchar) *p; @@ -625,6 +628,35 @@ g_utf8_get_char_extended (const gchar *p, gsize max_len) return wc; } +/** + * g_utf8_get_char_validated: + * @p: a pointer to unicode character encoded as UTF-8 + * @max_len: the maximum number of bytes to read, or -1, for no maximum. + * + * Convert a sequence of bytes encoded as UTF-8 to a unicode character. + * This function checks for incomplete characters, for invalid characters + * such as characters that are out of the range of Unicode, and for + * overlong encodings of valid characters. + * + * Return value: the resulting character. If @p points to a partial + * sequence at the end of a string that could begin a valid character, + * returns (gunichar)-2; otherwise, if @p does not point to a valid + * UTF-8 encoded unicode character, returns (gunichar)-1. 
+ **/ +gunichar +g_utf8_get_char_validated (const gchar *p, + gssize max_len) +{ + gunichar result = g_utf8_get_char_extended (p, max_len); + + if (result & 0x80000000) + return result; + else if (!UNICODE_VALID (result)) + return (gunichar)-1; + else + return result; +} + /** * g_utf8_to_ucs4_fast: * @str: a UTF-8 encoded string diff --git a/tests/unicode-normalize.c b/tests/unicode-normalize.c index 2107932f9..7ea46edec 100644 --- a/tests/unicode-normalize.c +++ b/tests/unicode-normalize.c @@ -11,8 +11,6 @@ decode (const gchar *input) unsigned ch; int offset = 0; GString *result = g_string_new (NULL); - int len; - char buf[6]; do { @@ -30,8 +28,7 @@ decode (const gchar *input) return NULL; } - len = g_unichar_to_utf8 (ch, buf); - g_string_append_len (result, buf, len); + g_string_append_unichar (result, ch); while (input[offset] && input[offset] != ' ') offset++;