Initial pass at adding unicode support functions. A few things still need

Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode.
2025-08-07 17:54:05 +02:00 · 2000-06-21 16:11:21 +00:00
parent 876a6767eb
commit 0891c64816
24 changed files with 16676 additions and 4 deletions
--- a/9
+++ b/9
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-0
+++ b/ChangeLog.pre-2-0
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-2
+++ b/ChangeLog.pre-2-2
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-4
+++ b/ChangeLog.pre-2-4
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,12 @@
+Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>
+
+	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
+	Makefile.am glib.h: Initial pass at adding unicode support
+	functions. A few things still need to be implemented, a bit
+	of cleanup needs to be done, tests need to be added, and 
+	the docs need to be finished, but this should allow replacing
+	most or all use of libunicode.
+
 2000-06-06  Tor Lillqvist  <tml@iki.fi>

 	* giowin32.c (g_io_channel_win32_pipe_readable): If we are
--- a/Makefile.am
+++ b/Makefile.am
@@ -62,9 +62,14 @@ libglib_la_SOURCES = \
 	gthreadpool.c   \
 	gtimer.c	\
 	gtree.c		\
+	guniprop.c	\
+	gutf8.c		\
+	gunichartables.h	\
+	gunidecomp.h	\
+	gunidecomp.c	\
 	gutils.c

-include_HEADERS = glib.h glib-object.h
+include_HEADERS = glib.h glib-object.h gunicode.h

 configexecincludedir = $(pkglibdir)/include
 #configexecinclude_DATA = glibconfig.h
--- a/glib.h
+++ b/glib.h
@@ -3321,9 +3321,10 @@ guint           g_thread_pool_get_num_unused_threads (void);
 /* Stop all currently unused threads, but leave the limit untouched */
 void            g_thread_pool_stop_unused_threads    (void);

+#include <gunicode.h>
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

-
 #endif /* __G_LIB_H__ */
--- a/glib/Makefile.am
+++ b/glib/Makefile.am
@@ -62,9 +62,14 @@ libglib_la_SOURCES = \
 	gthreadpool.c   \
 	gtimer.c	\
 	gtree.c		\
+	guniprop.c	\
+	gutf8.c		\
+	gunichartables.h	\
+	gunidecomp.h	\
+	gunidecomp.c	\
 	gutils.c

-include_HEADERS = glib.h glib-object.h
+include_HEADERS = glib.h glib-object.h gunicode.h

 configexecincludedir = $(pkglibdir)/include
 #configexecinclude_DATA = glibconfig.h
--- a/glib/glib.h
+++ b/glib/glib.h
@@ -3321,9 +3321,10 @@ guint           g_thread_pool_get_num_unused_threads (void);
 /* Stop all currently unused threads, but leave the limit untouched */
 void            g_thread_pool_stop_unused_threads    (void);

+#include <gunicode.h>
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

-
 #endif /* __G_LIB_H__ */
--- a/glib/gunichartables.h
+++ b/glib/gunichartables.h
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -0,0 +1,178 @@
+/* gunicode.h - Unicode manipulation functions
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000 Red Hat, Inc.
+ *
+ * The Gnome Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The Gnome Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ *   Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __GUNICODE_H__
+#define __GUNICODE_H__
+
+#include <stdlib.h>      /* For size_t */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef guint32 gunichar;
+typedef guint16 gunichar2;
+
+/* These are the possible character classifications.  */
+typedef enum {
+  G_UNICODE_CONTROL,
+  G_UNICODE_FORMAT,
+  G_UNICODE_UNASSIGNED,
+  G_UNICODE_PRIVATE_USE,
+  G_UNICODE_SURROGATE,
+  G_UNICODE_LOWERCASE_LETTER,
+  G_UNICODE_MODIFIER_LETTER,
+  G_UNICODE_OTHER_LETTER,
+  G_UNICODE_TITLECASE_LETTER,
+  G_UNICODE_UPPERCASE_LETTER,
+  G_UNICODE_COMBINING_MARK,
+  G_UNICODE_ENCLOSING_MARK,
+  G_UNICODE_NON_SPACING_MARK,
+  G_UNICODE_DECIMAL_NUMBER,
+  G_UNICODE_LETTER_NUMBER,
+  G_UNICODE_OTHER_NUMBER,
+  G_UNICODE_CONNECT_PUNCTUATION,
+  G_UNICODE_DASH_PUNCTUATION,
+  G_UNICODE_CLOSE_PUNCTUATION,
+  G_UNICODE_FINAL_PUNCTUATION,
+  G_UNICODE_INITIAL_PUNCTUATION,
+  G_UNICODE_OTHER_PUNCTUATION,
+  G_UNICODE_OPEN_PUNCTUATION,
+  G_UNICODE_CURRENCY_SYMBOL,
+  G_UNICODE_MODIFIER_SYMBOL,
+  G_UNICODE_MATH_SYMBOL,
+  G_UNICODE_OTHER_SYMBOL,
+  G_UNICODE_LINE_SEPARATOR,
+  G_UNICODE_PARAGRAPH_SEPARATOR,
+  G_UNICODE_SPACE_SEPARATOR
+} GUnicodeType;
+
+/* Returns TRUE if current locale uses UTF-8 charset.  If CHARSET is
+ * not null, sets *CHARSET to the name of the current locale's
+ * charset.  This value is statically allocated.
+ */
+gboolean g_get_charset (char **charset);
+
+/* These are all analogs of the <ctype.h> functions.
+ */
+gboolean g_unichar_isalnum   (gunichar c);
+gboolean g_unichar_isalpha   (gunichar c);
+gboolean g_unichar_iscntrl   (gunichar c);
+gboolean g_unichar_isdigit   (gunichar c);
+gboolean g_unichar_isgraph   (gunichar c);
+gboolean g_unichar_islower   (gunichar c);
+gboolean g_unichar_isprint   (gunichar c);
+gboolean g_unichar_ispunct   (gunichar c);
+gboolean g_unichar_isspace   (gunichar c);
+gboolean g_unichar_isupper   (gunichar c);
+gboolean g_unichar_isxdigit  (gunichar c);
+gboolean g_unichar_istitle   (gunichar c);
+gboolean g_unichar_isdefined (gunichar c);
+gboolean g_unichar_iswide    (gunichar c);
+
+/* More <ctype.h> functions.  These convert between the three cases.
+ * See the Unicode book to understand title case.  */
+gunichar g_unichar_toupper (gunichar c);
+gunichar g_unichar_tolower (gunichar c);
+gunichar g_unichar_totitle (gunichar c);
+
+/* If C is a digit (according to `g_unichar_isdigit'), then return its
+   numeric value.  Otherwise return -1.  */
+gint g_unichar_digit_value (gunichar c);
+
+gint g_unichar_xdigit_value (gunichar c);
+
+/* Return the Unicode character type of a given character.  */
+GUnicodeType g_unichar_type (gunichar c);
+
+
+
+/* Compute canonical ordering of a string in-place.  This rearranges
+   decomposed characters in the string according to their combining
+   classes.  See the Unicode manual for more information.  */
+void g_unicode_canonical_ordering (gunichar *string,
+				   size_t   len);
+
+/* Compute canonical decomposition of a character.  Returns g_malloc()d
+   string of Unicode characters.  RESULT_LEN is set to the resulting
+   length of the string.  */
+gunichar *g_unicode_canonical_decomposition (gunichar  ch,
+					     size_t   *result_len);
+
+/* Array of skip-bytes-per-initial character
+ */
+extern char g_utf8_skip[256];
+
+#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
+
+gunichar g_utf8_get_char          (const gchar *p);
+gchar *  g_utf8_offset_to_pointer  (const gchar *str,
+				    gint         offset);
+gint     g_utf8_pointer_to_offset (const gchar *str,
+				   const gchar *pos);
+gchar *  g_utf8_prev_char         (const gchar *p);
+gchar *  g_utf8_find_next_char    (const gchar *p,
+				   const gchar *bound);
+gchar *  g_utf8_find_prev_char    (const gchar *str,
+				   const gchar *p);
+
+gint g_utf8_strlen (const gchar *p,
+		    gint         max);
+
+/* Copies n characters from src to dest */
+gchar *g_utf8_strncpy (gchar       *dest,
+		       const gchar *src,
+		       size_t       n);
+
+/* Find the UTF-8 character corresponding to ch, in string p. These
+   functions are the equivalents of strchr and strrchr */
+
+gchar *g_utf8_strchr  (const gchar *p,
+		       gunichar     ch);
+gchar *g_utf8_strrchr (const gchar *p,
+		       gunichar     ch);
+
+gunichar2 *g_utf8_to_utf16 (const gchar     *str,
+			    gint             len);
+gunichar * g_utf8_to_ucs4  (const gchar     *str,
+			    gint             len);
+gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
+			    gint             len);
+gchar *    g_utf16_to_utf8 (const gunichar2 *str,
+			    gint             len);
+gunichar * g_ucs4_to_utf16 (const gunichar  *str,
+			    gint             len);
+gchar *    g_ucs4_to_utf8  (const gunichar  *str,
+			    gint             len);
+
+/* Convert a single character into UTF-8. outbuf must have at
+ * least 6 bytes of space. Returns the number of bytes in the
+ * result.
+ */
+gint      g_unichar_to_utf8 (gunichar    c,
+			     char       *outbuf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __GUNICODE_H__ */
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -0,0 +1,133 @@
+/* gunidecomp.c - Character decomposition.
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000 Red Hat, Inc.
+ *
+ * The Gnome Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The Gnome Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ *   Boston, MA 02111-1307, USA.
+ */
+
+#include "glib.h"
+#include "gunidecomp.h"
+
+#include <config.h>
+
+#include <stdlib.h>
+
+/* A page entry unchanged by &0xff is not a table but the class value itself
+   (cast to a pointer) for every character on that page.  */
+#define CC(Page, Char) \
+  (((((int) (combining_class_table[Page])) & 0xff) \
+    == ((int) combining_class_table[Page])) \
+   ? ((int) combining_class_table[Page]) \
+   : (combining_class_table[Page][Char]))
+
+#define COMBINING_CLASS(Char) \
+     (((Char) > (UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
+
+/**
+ * g_unicode_canonical_ordering:
+ * @string: an array of unicode characters, reordered in place
+ * @len: the number of characters in @string
+ *
+ * Computes the canonical ordering of a string in-place.  A character
+ * with a non-zero combining class is moved leftward past every
+ * preceding character whose combining class is greater; characters
+ * of combining class 0 are never crossed.
+ **/
+void
+g_unicode_canonical_ordering (gunichar *string,
+                              size_t len)
+{
+  size_t i;
+  int swap = 1;
+
+  /* Nothing to reorder; also keeps "len - 1" below from wrapping
+     around (and string[0] from being read) when len is 0.  */
+  if (len < 2)
+    return;
+
+  while (swap)
+    {
+      int last;
+      swap = 0;
+      last = COMBINING_CLASS (string[0]);
+      for (i = 0; i < len - 1; ++i)
+        {
+          int next = COMBINING_CLASS (string[i + 1]);
+          if (next != 0 && last > next)
+            {
+              size_t j;
+              /* Percolate item leftward through string.  J is the
+                 item's current index, so the neighbour at J - 1 is
+                 examined all the way down to index 0.  */
+              for (j = i + 1; j > 0; --j)
+                {
+                  gunichar t;
+                  if (COMBINING_CLASS (string[j - 1]) <= next)
+                    break;
+                  t = string[j];
+                  string[j] = string[j - 1];
+                  string[j - 1] = t;
+                  swap = 1;
+                }
+              /* We're re-entering the loop looking at the old
+                 character again.  */
+              next = last;
+            }
+          last = next;
+        }
+    }
+}
+
+/**
+ * g_unicode_canonical_decomposition:
+ * @ch: a unicode character
+ * @result_len: location to store the length of the returned string
+ *
+ * Computes the canonical decomposition of a character by looking it
+ * up in decomp_table.  A character that has no entry in the table
+ * decomposes to itself.
+ *
+ * Return value: a g_malloc()ed array of *@result_len unicode
+ *               characters (not nul-terminated).
+ **/
+gunichar *
+g_unicode_canonical_decomposition (gunichar ch,
+                                   size_t *result_len)
+{
+  gunichar *r = NULL;
+
+  if (ch <= 0xffff)
+    {
+      /* Binary search over the half-open range [start, end); this
+         assumes decomp_table is sorted by its ch field.  */
+      int start = 0;
+      int end = G_N_ELEMENTS (decomp_table);
+      while (start != end)
+        {
+          int half = (start + end) / 2;
+          if (ch == decomp_table[half].ch)
+            {
+              /* Found it.  */
+              int i, len;
+              /* We store as a double-nul terminated string.  */
+              for (len = 0; (decomp_table[half].expansion[len]
+                             || decomp_table[half].expansion[len + 1]);
+                   len += 2)
+                ;
+
+              /* We've counted twice as many bytes as there are
+                 characters.  */
+              *result_len = len / 2;
+              r = g_malloc (len / 2 * sizeof (gunichar));
+
+              for (i = 0; i < len; i += 2)
+                {
+                  r[i / 2] = (decomp_table[half].expansion[i] << 8
+                              | decomp_table[half].expansion[i + 1]);
+                }
+              break;
+            }
+          else if (ch > decomp_table[half].ch)
+            /* Entry HALF is already ruled out.  Without the + 1 the
+               range stops shrinking once end == start + 1 and the
+               loop never terminates.  */
+            start = half + 1;
+          else
+            end = half;
+        }
+    }
+
+  if (r == NULL)
+    {
+      /* Not in our table.  */
+      r = g_malloc (sizeof (gunichar));
+      *r = ch;
+      *result_len = 1;
+    }
+
+  /* Supposedly following the Unicode 2.1.9 table means that the
+     decompositions come out in canonical order.  I haven't tested
+     this, but we rely on it here.  */
+  return r;
+}
--- a/glib/gunidecomp.h
+++ b/glib/gunidecomp.h
--- a/glib/guniprop.c
+++ b/glib/guniprop.c
@@ -0,0 +1,355 @@
+/* guniprop.c - Unicode character properties.
+ *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "glib.h"
+#include "gunichartables.h"
+
+#include <config.h>
+
+#include <stddef.h>
+
+#define asize(x)  ((sizeof (x)) / sizeof (x[0]))
+
+#define ATTTABLE(Page, Char) \
+  ((attr_table[Page] == 0) ? 0 : (attr_table[Page][Char]))
+
+/* A page entry unchanged by &0xff is not a table but the type value itself
+   (cast to a pointer) for every character on that page.  */
+#define TTYPE(Page, Char) \
+  (((((int) type_table[Page]) & 0xff) == ((int) type_table[Page])) \
+   ? ((int) (type_table[Page])) \
+   : (type_table[Page][Char]))
+
+#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
+
+#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
+		       || (Type) == G_UNICODE_LETTER_NUMBER \
+		       || (Type) == G_UNICODE_OTHER_NUMBER)
+
+#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
+		       || (Type) == G_UNICODE_UPPERCASE_LETTER \
+		       || (Type) == G_UNICODE_TITLECASE_LETTER \
+		       || (Type) == G_UNICODE_MODIFIER_LETTER \
+		       || (Type) == G_UNICODE_OTHER_LETTER)
+
+/**
+ * g_unichar_isalnum:
+ * @c: a unicode character
+ *
+ * Determines whether a character is alphanumeric, i.e. whether its
+ * type is one of the letter types accepted by ISALPHA or one of the
+ * number types accepted by ISDIGIT.
+ *
+ * Return value: %TRUE if @c is a letter or a number.
+ **/
+gboolean
+g_unichar_isalnum (gunichar c)
+{
+  const int type = TYPE (c);
+
+  if (ISDIGIT (type))
+    return TRUE;
+  return ISALPHA (type);
+}
+
+/**
+ * g_unichar_isalpha:
+ * @c: a unicode character
+ *
+ * Determines whether a character is a letter: lowercase, uppercase,
+ * titlecase, modifier or other letter.
+ *
+ * Return value: %TRUE if @c is a letter.
+ **/
+gboolean
+g_unichar_isalpha (gunichar c)
+{
+  const int type = TYPE (c);
+
+  return ISALPHA (type);
+}
+
+/**
+ * g_unichar_iscntrl:
+ * @c: a unicode character
+ *
+ * Determines whether a character has the type %G_UNICODE_CONTROL.
+ *
+ * Return value: %TRUE if @c is a control character.
+ **/
+gboolean
+g_unichar_iscntrl (gunichar c)
+{
+  const int type = TYPE (c);
+
+  return type == G_UNICODE_CONTROL;
+}
+
+/**
+ * g_unichar_isdigit:
+ * @c: a unicode character
+ *
+ * Determines whether a character has the type
+ * %G_UNICODE_DECIMAL_NUMBER.  Letter numbers and other numbers are
+ * not considered digits by this function.
+ *
+ * Return value: %TRUE if @c is a decimal digit.
+ **/
+gboolean
+g_unichar_isdigit (gunichar c)
+{
+  const int type = TYPE (c);
+
+  return type == G_UNICODE_DECIMAL_NUMBER;
+}
+
+/**
+ * g_unichar_isgraph:
+ * @c: a unicode character
+ *
+ * Determines whether a character is graphic.  Every type counts as
+ * graphic except control, format, unassigned, private use,
+ * surrogate and space separator.
+ *
+ * Return value: %TRUE if @c is a graphic character.
+ **/
+gboolean
+g_unichar_isgraph (gunichar c)
+{
+  switch (TYPE (c))
+    {
+    case G_UNICODE_CONTROL:
+    case G_UNICODE_FORMAT:
+    case G_UNICODE_UNASSIGNED:
+    case G_UNICODE_PRIVATE_USE:
+    case G_UNICODE_SURROGATE:
+    case G_UNICODE_SPACE_SEPARATOR:
+      return FALSE;
+    default:
+      return TRUE;
+    }
+}
+
+/**
+ * g_unichar_islower:
+ * @c: a unicode character
+ *
+ * Determines whether a character has the type
+ * %G_UNICODE_LOWERCASE_LETTER.
+ *
+ * Return value: %TRUE if @c is a lowercase letter.
+ **/
+gboolean
+g_unichar_islower (gunichar c)
+{
+  const int type = TYPE (c);
+
+  return type == G_UNICODE_LOWERCASE_LETTER;
+}
+
+/**
+ * g_unichar_isprint:
+ * @c: a unicode character
+ *
+ * Determines whether a character is printable.  Every type counts
+ * as printable except control, format, unassigned, private use and
+ * surrogate; unlike g_unichar_isgraph(), space separators are
+ * accepted.
+ *
+ * Return value: %TRUE if @c is printable.
+ **/
+gboolean
+g_unichar_isprint (gunichar c)
+{
+  switch (TYPE (c))
+    {
+    case G_UNICODE_CONTROL:
+    case G_UNICODE_FORMAT:
+    case G_UNICODE_UNASSIGNED:
+    case G_UNICODE_PRIVATE_USE:
+    case G_UNICODE_SURROGATE:
+      return FALSE;
+    default:
+      return TRUE;
+    }
+}
+
+/**
+ * g_unichar_ispunct:
+ * @c: a unicode character
+ *
+ * Determines whether a character belongs to one of the seven
+ * punctuation types: connect, dash, close, final, initial, other
+ * or open punctuation.
+ *
+ * Return value: %TRUE if @c is a punctuation character.
+ **/
+gboolean
+g_unichar_ispunct (gunichar c)
+{
+  switch (TYPE (c))
+    {
+    case G_UNICODE_CONNECT_PUNCTUATION:
+    case G_UNICODE_DASH_PUNCTUATION:
+    case G_UNICODE_CLOSE_PUNCTUATION:
+    case G_UNICODE_FINAL_PUNCTUATION:
+    case G_UNICODE_INITIAL_PUNCTUATION:
+    case G_UNICODE_OTHER_PUNCTUATION:
+    case G_UNICODE_OPEN_PUNCTUATION:
+      return TRUE;
+    default:
+      return FALSE;
+    }
+}
+
+/**
+ * g_unichar_isspace:
+ * @c: a unicode character
+ *
+ * Determines whether a character is white space: a tab, newline,
+ * carriage return or form feed, or any character whose type is
+ * space separator, line separator or paragraph separator.
+ *
+ * Return value: %TRUE if @c is a white space character.
+ **/
+gboolean
+g_unichar_isspace (gunichar c)
+{
+  switch (c)
+    {
+      /* Special-cased: Unicode classifies these as control
+         characters rather than separators, but a <ctype.h> analog
+         has to treat them as white space.  */
+    case '\t':
+    case '\n':
+    case '\r':
+    case '\f':
+      return TRUE;
+
+    default:
+      {
+        int t = TYPE (c);
+        return (t == G_UNICODE_SPACE_SEPARATOR
+                || t == G_UNICODE_LINE_SEPARATOR
+                || t == G_UNICODE_PARAGRAPH_SEPARATOR);
+      }
+    }
+}
+
+/**
+ * g_unichar_isupper:
+ * @c: a unicode character
+ * 
+ * Determines whether a character has the type
+ * %G_UNICODE_UPPERCASE_LETTER.
+ * 
+ * Return value: %TRUE if @c is an uppercase letter.
+ **/
+gboolean
+g_unichar_isupper (gunichar c)
+{
+  const int type = TYPE (c);
+
+  return type == G_UNICODE_UPPERCASE_LETTER;
+}
+
+/**
+ * g_unichar_istitle:
+ * @c: a unicode character
+ * 
+ * Determines if a character is titlecase. Some characters in
+ * Unicode which are composites, such as the DZ digraph
+ * have three case variants instead of just two. The titlecase
+ * form is used at the beginning of a word where only the
+ * first letter is capitalized. The titlecase form of the DZ
+ * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z
+ *
+ * The test is a linear search of the first column of title_table.
+ * 
+ * Return value: %TRUE if the character is titlecase.
+ **/
+gboolean
+g_unichar_istitle (gunichar c)
+{
+  unsigned int row = 0;
+
+  while (row < asize (title_table))
+    {
+      if (title_table[row][0] == c)
+        return TRUE;
+      ++row;
+    }
+
+  return FALSE;
+}
+
+/**
+ * g_unichar_isxdigit:
+ * @c: a unicode character.
+ * 
+ * Determines if a character is a hexadecimal digit: one of the
+ * letters a-f or A-F, or a character of type
+ * %G_UNICODE_DECIMAL_NUMBER.  These are exactly the characters for
+ * which g_unichar_xdigit_value() yields a value.
+ * 
+ * Return value: %TRUE if the character is a hexadecimal digit.
+ **/
+gboolean
+g_unichar_isxdigit (gunichar c)
+{
+  /* Only decimal digits qualify; letter numbers and other numbers
+     (which ISDIGIT also accepts) have no hexadecimal value.  */
+  return ((c >= 'a' && c <= 'f')
+          || (c >= 'A' && c <= 'F')
+          || TYPE (c) == G_UNICODE_DECIMAL_NUMBER);
+}
+
+/**
+ * g_unichar_isdefined:
+ * @c: a unicode character
+ * 
+ * Determines if a given character is assigned in the Unicode
+ * standard, i.e. whether its type is anything other than
+ * %G_UNICODE_UNASSIGNED.
+ *
+ * Return value: %TRUE if the character has an assigned value.
+ **/
+gboolean
+g_unichar_isdefined (gunichar c)
+{
+  return TYPE (c) != G_UNICODE_UNASSIGNED;
+}
+
+/**
+ * g_unichar_iswide:
+ * @c: a unicode character
+ * 
+ * Determines if a character is typically rendered in a double-width
+ * cell.  The decision is made purely from fixed code point ranges.
+ * 
+ * Return value: %TRUE if the character is wide.
+ **/
+/* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
+gboolean
+g_unichar_iswide (gunichar c)
+{
+  /* Nothing below Hangul Jamo is wide.  */
+  if (c < 0x1100)
+    return FALSE;
+
+  /* Hangul Jamo */
+  if (c <= 0x115f)
+    return TRUE;
+
+  /* CJK ... Yi, minus a few excluded code points */
+  if (c >= 0x2e80 && c <= 0xa4cf)
+    return (c & ~0x0011) != 0x300a && c != 0x303f;
+
+  return ((c >= 0xac00 && c <= 0xd7a3)     /* Hangul Syllables */
+          || (c >= 0xf900 && c <= 0xfaff)  /* CJK Compatibility Ideographs */
+          || (c >= 0xfe30 && c <= 0xfe6f)  /* CJK Compatibility Forms */
+          || (c >= 0xff00 && c <= 0xff5f)  /* Fullwidth Forms */
+          || (c >= 0xffe0 && c <= 0xffe6));
+}
+
+/**
+ * g_unichar_toupper:
+ * @c: a unicode character
+ * 
+ * Convert a character to uppercase.  Lowercase letters are mapped
+ * through the attribute table; titlecase letters are mapped through
+ * the second column of title_table.
+ * 
+ * Return value: the result of converting @c to uppercase.
+ *               If @c is not a lowercase or titlecase character,
+ *               @c is returned unchanged.
+ **/
+gunichar
+g_unichar_toupper (gunichar c)
+{
+  const int type = TYPE (c);
+  unsigned int row;
+
+  if (type == G_UNICODE_LOWERCASE_LETTER)
+    return ATTTABLE (c >> 8, c & 0xff);
+
+  if (type != G_UNICODE_TITLECASE_LETTER)
+    return c;
+
+  for (row = 0; row < asize (title_table); ++row)
+    if (title_table[row][0] == c)
+      return title_table[row][1];
+
+  /* Titlecase letter without a title_table entry.  */
+  return c;
+}
+
+/**
+ * g_unichar_tolower:
+ * @c: a unicode character.
+ * 
+ * Convert a character to lower case.  Uppercase letters are mapped
+ * through the attribute table; titlecase letters are mapped through
+ * the third column of title_table.
+ * 
+ * Return value: the result of converting @c to lower case.
+ *               If @c is not an uppercase or titlecase character,
+ *               @c is returned unchanged.
+ **/
+gunichar
+g_unichar_tolower (gunichar c)
+{
+  const int type = TYPE (c);
+  unsigned int row;
+
+  if (type == G_UNICODE_UPPERCASE_LETTER)
+    return ATTTABLE (c >> 8, c & 0xff);
+
+  if (type != G_UNICODE_TITLECASE_LETTER)
+    return c;
+
+  for (row = 0; row < asize (title_table); ++row)
+    if (title_table[row][0] == c)
+      return title_table[row][2];
+
+  /* Titlecase letter without a title_table entry.  */
+  return c;
+}
+
+/**
+ * g_unichar_totitle:
+ * @c: a unicode character
+ * 
+ * Convert a character to titlecase.  A character found in any
+ * column of title_table maps to the first column of its row.  Any
+ * other lowercase letter is mapped through the attribute table.
+ * 
+ * Return value: the result of converting @c to titlecase.
+ *               If neither rule above applies,
+ *               @c is returned unchanged.
+ **/
+gunichar
+g_unichar_totitle (gunichar c)
+{
+  unsigned int row;
+
+  for (row = 0; row < asize (title_table); ++row)
+    {
+      const gboolean in_row = (title_table[row][0] == c
+                               || title_table[row][1] == c
+                               || title_table[row][2] == c);
+      if (in_row)
+        return title_table[row][0];
+    }
+
+  if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER)
+    return ATTTABLE (c >> 8, c & 0xff);
+
+  return c;
+}
+
+/**
+ * g_unichar_digit_value:
+ * @c: a unicode character
+ *
+ * Determines the numeric value of a character as a decimal
+ * digit.
+ *
+ * Return value: If @c is a decimal digit (according to
+ * `g_unichar_isdigit'), its numeric value. Otherwise, -1.
+ **/
+int
+g_unichar_digit_value (gunichar c)
+{
+  if (TYPE (c) != G_UNICODE_DECIMAL_NUMBER)
+    return -1;
+
+  return ATTTABLE (c >> 8, c & 0xff);
+}
+
+/**
+ * g_unichar_xdigit_value:
+ * @c: a unicode character
+ *
+ * Determines the numeric value of a character as a hexadecimal
+ * digit.  The letters A-F and a-f have the values 10 to 15; a
+ * character of type %G_UNICODE_DECIMAL_NUMBER has the value stored
+ * for it in the attribute table.
+ *
+ * Return value: If @c is a hex digit (according to
+ * `g_unichar_isxdigit'), its numeric value. Otherwise, -1.
+ **/
+int
+g_unichar_xdigit_value (gunichar c)
+{
+  /* 'A' and 'a' stand for ten, not one.  */
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
+    return ATTTABLE (c >> 8, c & 0xff);
+  return -1;
+}
+
+/**
+ * g_unichar_type:
+ * @c: a unicode character
+ * 
+ * Classifies a unicode character by type.  Characters above
+ * G_UNICODE_LAST_CHAR are reported as %G_UNICODE_UNASSIGNED.
+ * 
+ * Return value: the type of the character.
+ **/
+GUnicodeType
+g_unichar_type (gunichar c)
+{
+  const int type = TYPE (c);
+
+  return type;
+}
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -0,0 +1,483 @@
+/* gutf8.c - Operations on UTF-8 strings.
+ *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#ifdef HAVE_LANGINFO_H
+#include <langinfo.h>
+#endif
+#include <string.h>
+
+#include "glib.h"
+
+/* Classify the lead byte Char of a UTF-8 sequence.  Sets Len to the
+ * length of the sequence in bytes (1 to 6) and Mask to the bits of the
+ * lead byte that carry character data.  If Char cannot start a
+ * sequence, Len is set to -1 and Mask is left untouched.
+ *
+ * This expands to a bare if/else chain: use it only as a complete
+ * statement, and pass plain variables, since the arguments are not
+ * parenthesized and Char is evaluated several times.
+ */
+#define UTF8_COMPUTE(Char, Mask, Len)					      \
+  if (Char < 128)							      \
+    {									      \
+      Len = 1;								      \
+      Mask = 0x7f;							      \
+    }									      \
+  else if ((Char & 0xe0) == 0xc0)					      \
+    {									      \
+      Len = 2;								      \
+      Mask = 0x1f;							      \
+    }									      \
+  else if ((Char & 0xf0) == 0xe0)					      \
+    {									      \
+      Len = 3;								      \
+      Mask = 0x0f;							      \
+    }									      \
+  else if ((Char & 0xf8) == 0xf0)					      \
+    {									      \
+      Len = 4;								      \
+      Mask = 0x07;							      \
+    }									      \
+  else if ((Char & 0xfc) == 0xf8)					      \
+    {									      \
+      Len = 5;								      \
+      Mask = 0x03;							      \
+    }									      \
+  else if ((Char & 0xfe) == 0xfc)					      \
+    {									      \
+      Len = 6;								      \
+      Mask = 0x01;							      \
+    }									      \
+  else									      \
+    Len = -1;
+
+/* Decode the Len-byte UTF-8 sequence at Chars into Result, using Mask
+ * (as produced by UTF8_COMPUTE) to extract the data bits of the lead
+ * byte.  Count is a scratch loop variable.  Each following byte must
+ * have the form 10xxxxxx and contributes six bits; on the first byte
+ * that does not, Result is set to -1 and decoding stops.
+ *
+ * Expands to two statements (an assignment and a for loop), so it must
+ * not be used as the unbraced body of an if/else/loop.
+ */
+#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
+  (Result) = (Chars)[0] & (Mask);					      \
+  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
+    {									      \
+      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
+	{								      \
+	  (Result) = -1;						      \
+	  break;							      \
+	}								      \
+      (Result) <<= 6;							      \
+      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
+    }
+
+/* Number of bytes to skip for each possible lead byte of a UTF-8
+ * sequence.  Bytes that cannot start a sequence (continuation bytes
+ * 0x80-0xbf and the never-valid 0xfe/0xff) get a skip of 1, so that
+ * g_utf8_next_char() always advances and loops built on it terminate
+ * even on malformed input.
+ */
+gchar g_utf8_skip[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+/**
+ * g_utf8_find_prev_char:
+ * @str: pointer to the beginning of a UTF-8 string
+ * @p: pointer to some position within @str
+ * 
+ * Given a position @p within a UTF-8 encoded string @str, find the start
+ * of the previous UTF-8 character starting before @p. Returns %NULL if no
+ * UTF-8 characters are present in @str before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte.
+ *
+ * Return value: a pointer to the found character or %NULL.
+ **/
+gchar *
+g_utf8_find_prev_char (const char *str,
+                       const char *p)
+{
+  /* Step back one byte at a time, down to and including @str itself,
+     until a byte that is not a continuation byte (10xxxxxx) is found.  */
+  while (p > str)
+    {
+      --p;
+      if ((*p & 0xc0) != 0x80)
+        return (gchar *)p;
+    }
+  return NULL;
+}
+
+/**
+ * g_utf8_find_next_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ * @end: a pointer to the end of the string, or %NULL to indicate
+ *        that the string is nul terminated, in which case
+ *        the returned value will never be %NULL (it may point
+ *        at the terminating nul byte)
+ *
+ * Find the start of the next UTF-8 character in the string after @p.
+ * If @p already points at a nul byte, no scanning is done.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte.
+ * 
+ * Return value: a pointer to the found character, or %NULL if the
+ *               scan stopped exactly at @end
+ **/
+gchar *
+g_utf8_find_next_char (const gchar *p,
+                       const gchar *end)
+{
+  const gchar *next = p;
+
+  if (*next != '\0')
+    {
+      /* Move past the current byte, then past any continuation bytes.  */
+      ++next;
+      if (end != NULL)
+        {
+          while (next < end && (*next & 0xc0) == 0x80)
+            ++next;
+        }
+      else
+        {
+          while ((*next & 0xc0) == 0x80)
+            ++next;
+        }
+    }
+
+  if (next == end)
+    return NULL;
+  return (gchar *)next;
+}
+
+/**
+ * g_utf8_prev_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Find the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. There is no lower bound on the
+ * backwards scan, so if @p might be the first character of the string,
+ * you must use g_utf8_find_prev_char instead.
+ * 
+ * Return value: a pointer to the found character.
+ **/
+gchar *
+g_utf8_prev_char (const gchar *p)
+{
+  /* Back up at least one byte, then keep going while we are still
+     looking at a continuation byte (10xxxxxx).  */
+  do
+    p--;
+  while ((*p & 0xc0) == 0x80);
+
+  return (gchar *)p;
+}
+
+/**
+ * g_utf8_strlen:
+ * @p: pointer to the start of a UTF-8 string.
+ * @max: the maximum number of bytes to examine. If @max
+ *       is less than 0, then the string is assumed to be
+ *       nul-terminated.
+ * 
+ * Counts the characters in a UTF-8 string. Counting stops at the
+ * first nul byte or after @max bytes, whichever comes first. A
+ * character whose encoding would extend past @max bytes is not
+ * counted.
+ * 
+ * Return value: the length of the string in characters
+ */
+gint
+g_utf8_strlen (const gchar *p, gint max)
+{
+  gint len = 0;
+  const gchar *start = p;
+
+  /* Nothing may be examined, not even the first byte.  */
+  if (max == 0)
+    return 0;
+
+  if (max < 0)
+    {
+      /* Nul-terminated: count lead bytes until the terminator.  */
+      while (*p)
+        {
+          p = g_utf8_next_char (p);
+          ++len;
+        }
+      return len;
+    }
+
+  while (p - start < max && *p)
+    {
+      p = g_utf8_next_char (p);
+      /* The buffer ended in the middle of this character: it is
+         incomplete, so do not count it.  */
+      if (p - start > max)
+        break;
+      ++len;
+    }
+  return len;
+}
+
+/**
+ * g_utf8_get_char:
+ * @p: a pointer to unicode character encoded as UTF-8
+ * 
+ * Convert a sequence of bytes encoded as UTF-8 to a unicode character.
+ * 
+ * Return value: the resulting character or (gunichar)-1 if @p does
+ *               not point to a valid UTF-8 encoded unicode character
+ *               (bad lead byte, or a following byte that is not a
+ *               continuation byte)
+ **/
+gunichar
+g_utf8_get_char (const gchar *p)
+{
+  unsigned char lead = (unsigned char) *p;
+  int seq_len, payload_mask = 0, idx;
+  gunichar wc;
+
+  /* Work out the sequence length and lead-byte mask first ...  */
+  UTF8_COMPUTE (lead, payload_mask, seq_len);
+  if (seq_len == -1)
+    return (gunichar)-1;
+
+  /* ... then fold in the continuation bytes.  */
+  UTF8_GET (wc, p, idx, payload_mask, seq_len);
+
+  return wc;
+}
+
+/**
+ * g_utf8_offset_to_pointer:
+ * @str: a UTF-8 encoded string
+ * @offset: a character offset within the string.
+ * 
+ * Converts from an integer character offset to a pointer to a position
+ * within the string. The string is walked forward @offset characters;
+ * no check is made for the end of the string.
+ * 
+ * Return value: the resulting pointer
+ **/
+gchar *
+g_utf8_offset_to_pointer  (const gchar *str,
+                           gint         offset)
+{
+  const gchar *cursor = str;
+  gint remaining;
+
+  for (remaining = offset; remaining != 0; remaining--)
+    cursor = g_utf8_next_char (cursor);
+
+  return (gchar *)cursor;
+}
+
+/**
+ * g_utf8_pointer_to_offset:
+ * @str: a UTF-8 encoded string
+ * @pos: a pointer to a position within @str
+ * 
+ * Converts from a pointer to a position within a string to an integer
+ * character offset, by counting the characters that start before @pos.
+ * 
+ * Return value: the resulting character offset
+ **/
+gint
+g_utf8_pointer_to_offset (const gchar *str,
+                          const gchar *pos)
+{
+  const gchar *cursor;
+  gint n_chars = 0;
+
+  for (cursor = str; cursor < pos; cursor = g_utf8_next_char (cursor))
+    n_chars++;
+
+  return n_chars;
+}
+
+
+/**
+ * g_utf8_strncpy:
+ * @dest: buffer to receive the copy
+ * @src: a nul-terminated UTF-8 encoded string
+ * @n: the maximum number of characters (not bytes) to copy
+ *
+ * Copies up to @n characters of @src into @dest and appends a nul
+ * byte. @dest must have room for the bytes of those characters
+ * plus the terminator.
+ *
+ * Return value: @dest
+ **/
+gchar *
+g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
+{
+  const gchar *stop;
+  size_t n_bytes;
+
+  /* Locate the byte just past the last character to be copied.  */
+  for (stop = src; n != 0 && *stop != '\0'; n--)
+    stop = g_utf8_next_char (stop);
+
+  n_bytes = stop - src;
+  strncpy (dest, src, n_bytes);
+  dest[n_bytes] = 0;
+  return dest;
+}
+
+/* Work out the character set of the current locale.
+ *
+ * The CHARSET environment variable is consulted first, then
+ * nl_langinfo() where a codeset item is available.  If A is non-NULL
+ * and *A is still NULL, *A is set to the first charset name found,
+ * or to "US-ASCII" if none was found.  The stored pointer refers to
+ * the environment, to nl_langinfo()'s storage or to a string literal,
+ * so it must not be freed.
+ *
+ * Returns TRUE if the charset is UTF-8, FALSE otherwise.
+ */
+static gboolean
+g_utf8_get_charset_internal (char **a)
+{
+  char *charset = getenv("CHARSET");
+
+  if (charset && a && ! *a)
+    *a = charset;
+
+  if (charset && strstr (charset, "UTF-8"))
+      return TRUE;
+
+  /* CODESET may be an enumeration constant that is also #defined to
+     itself; testing its value in #if/#elif would evaluate to 0, so
+     test whether it is defined instead.  */
+#if defined (_NL_CTYPE_CODESET_NAME)
+  charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
+#elif defined (CODESET)
+  charset = nl_langinfo (CODESET);
+#else
+  charset = NULL;
+#endif
+
+  if (charset)
+    {
+      if (a && ! *a)
+        *a = charset;
+      if (strcmp (charset, "UTF-8") == 0)
+        return TRUE;
+    }
+
+  if (a && ! *a) 
+    *a = "US-ASCII";
+  /* Assume this for compatibility at present.  */
+  return FALSE;
+}
+
+/* Cached result of the charset lookup: -1 until the first call to
+   g_get_charset(), afterwards the gboolean it returned.  */
+static int utf8_locale_cache = -1;
+/* Cached charset name that goes with utf8_locale_cache.  */
+static char *utf8_charset_cache = NULL;
+
+/**
+ * g_get_charset:
+ * @charset: return location for the charset name, or %NULL
+ *
+ * Obtains the character set of the current locale. The lookup is
+ * done once and cached. The string stored in @charset is not a copy
+ * and must not be freed.
+ *
+ * Return value: %TRUE if the charset is UTF-8
+ **/
+gboolean
+g_get_charset (char **charset) 
+{
+  /* First call: do the real lookup and remember the answer.  */
+  if (utf8_locale_cache == -1)
+    utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);
+
+  if (charset)
+    *charset = utf8_charset_cache;
+
+  return utf8_locale_cache;
+}
+
+/* unicode_strchr */
+
+/**
+ * g_unichar_to_utf8:
+ * @c: a ISO10646 character code
+ * @outbuf: output buffer, must have at least 6 bytes of space.
+ * 
+ * Convert a single character to UTF-8. No nul terminator is written.
+ * 
+ * Return value: number of bytes written
+ **/
+int
+g_unichar_to_utf8 (gunichar c, gchar *outbuf)
+{
+  /* limits[k] is the first code that no longer fits in k + 1 bytes;
+     lead[k] holds the marker bits of the first byte of a sequence
+     of k + 1 bytes.  */
+  static const gunichar limits[] = {
+    0x80, 0x800, 0x10000, 0x200000, 0x4000000
+  };
+  static const int lead[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
+  int len = 1;
+  int pos;
+
+  while (len < 6 && c >= limits[len - 1])
+    ++len;
+
+  /* Fill in the continuation bytes from the last one backwards, six
+     bits at a time; what remains of c goes into the first byte.  */
+  for (pos = len - 1; pos > 0; --pos)
+    {
+      outbuf[pos] = (c & 0x3f) | 0x80;
+      c >>= 6;
+    }
+  outbuf[0] = c | lead[len - 1];
+
+  return len;
+}
+
+/**
+ * g_utf8_strchr:
+ * @p: a nul-terminated UTF-8 string
+ * @c: a ISO10646 character
+ * 
+ * Find the leftmost occurrence of the given ISO10646 character
+ * in a UTF-8 string.
+ * 
+ * Return value: %NULL if the string does not contain the character,
+ *               otherwise a pointer to the start of the leftmost
+ *               occurrence of the character in the string.
+ **/
+gchar *
+g_utf8_strchr (const char *p, gunichar c)
+{
+  gchar needle[10];
+  gint n_bytes;
+
+  /* Encode the character, then let strstr() do a byte-wise search.  */
+  n_bytes = g_unichar_to_utf8 (c, needle);
+  needle[n_bytes] = '\0';
+
+  return strstr (p, needle);
+}
+
+#if 0
+/**
+ * g_utf8_strrchr:
+ * @p: a nul-terminated utf-8 string
+ * @c: a iso-10646 character
+ * 
+ * Find the rightmost occurrence of the given iso-10646 character
+ * in a UTF-8 string.
+ * 
+ * Return value: NULL if the string does not contain the character, otherwise,
+ *               a pointer to the start of the rightmost of the character in the string.
+ **/
+
+/* This is ifdefed out atm as there is no strrstr function in libc.
+ *
+ * NOTE(review): as written this would not compile if enabled: `len'
+ * is used without a declaration, and the function is named
+ * unicode_strrchr although the doc comment above documents
+ * g_utf8_strrchr.
+ */
+gchar *
+unicode_strrchr (const char *p, gunichar c)
+{
+  gchar ch[10];
+
+  len = g_unichar_to_utf8 (c, ch);
+  ch[len] = '\0';
+  
+  return strrstr(p, ch);
+}
+#endif
+
+
+/**
+ * g_utf8_to_ucs4:
+ * @str: a UTF-8 encoded string
+ * @len: the maximum number of bytes of @str to convert, or a
+ *       negative value if @str is nul-terminated
+ * 
+ * Convert a string from UTF-8 to a 32-bit fixed width
+ * representation as UCS-4. The result is terminated with a
+ * zero character so that its end can be found.
+ * 
+ * Return value: a pointer to a newly allocated UCS-4 string.
+ *               This value must be freed with g_free()
+ **/
+gunichar *
+g_utf8_to_ucs4 (const char *str, int len)
+{
+  gunichar *result;
+  gint n_chars, i;
+  const gchar *p;
+  
+  n_chars = g_utf8_strlen (str, len);
+  /* One extra element for the terminating zero.  */
+  result = g_new (gunichar, n_chars + 1);
+  
+  p = str;
+  for (i=0; i < n_chars; i++)
+    {
+      result[i] = g_utf8_get_char (p);
+      p = g_utf8_next_char (p);
+    }
+  result[n_chars] = 0;
+
+  return result;
+}
+
--- a/gunichartables.h
+++ b/gunichartables.h
--- a/gunicode.h
+++ b/gunicode.h
@@ -0,0 +1,178 @@
+/* gunicode.h - Unicode manipulation functions
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000 Red Hat, Inc.
+ *
+ * The Gnome Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The Gnome Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ *   Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __GUNICODE_H__
+#define __GUNICODE_H__
+
+#include <stdlib.h>      /* For size_t */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* A single character as a 32-bit value; arrays of these are the
+   UCS-4 strings used by the conversion functions below.  */
+typedef guint32 gunichar;
+/* A 16-bit unit, the element type of the UTF-16 strings used by
+   the conversion functions below.  */
+typedef guint16 gunichar2;
+
+/* These are the possible character classifications, as returned by
+ * g_unichar_type().
+ * NOTE(review): the numeric values are presumably also stored in the
+ * generated tables in gunichartables.h, so the order of the
+ * enumerators should not be changed without regenerating them --
+ * confirm against the table generator.
+ */
+typedef enum {
+  G_UNICODE_CONTROL,
+  G_UNICODE_FORMAT,
+  G_UNICODE_UNASSIGNED,
+  G_UNICODE_PRIVATE_USE,
+  G_UNICODE_SURROGATE,
+  G_UNICODE_LOWERCASE_LETTER,
+  G_UNICODE_MODIFIER_LETTER,
+  G_UNICODE_OTHER_LETTER,
+  G_UNICODE_TITLECASE_LETTER,
+  G_UNICODE_UPPERCASE_LETTER,
+  G_UNICODE_COMBINING_MARK,
+  G_UNICODE_ENCLOSING_MARK,
+  G_UNICODE_NON_SPACING_MARK,
+  G_UNICODE_DECIMAL_NUMBER,
+  G_UNICODE_LETTER_NUMBER,
+  G_UNICODE_OTHER_NUMBER,
+  G_UNICODE_CONNECT_PUNCTUATION,
+  G_UNICODE_DASH_PUNCTUATION,
+  G_UNICODE_CLOSE_PUNCTUATION,
+  G_UNICODE_FINAL_PUNCTUATION,
+  G_UNICODE_INITIAL_PUNCTUATION,
+  G_UNICODE_OTHER_PUNCTUATION,
+  G_UNICODE_OPEN_PUNCTUATION,
+  G_UNICODE_CURRENCY_SYMBOL,
+  G_UNICODE_MODIFIER_SYMBOL,
+  G_UNICODE_MATH_SYMBOL,
+  G_UNICODE_OTHER_SYMBOL,
+  G_UNICODE_LINE_SEPARATOR,
+  G_UNICODE_PARAGRAPH_SEPARATOR,
+  G_UNICODE_SPACE_SEPARATOR
+} GUnicodeType;
+
+/* Returns TRUE if current locale uses UTF-8 charset.  If CHARSET is
+ * not null, sets *CHARSET to the name of the current locale's
+ * charset.  This value is statically allocated.
+ */
+gboolean g_get_charset (char **charset);
+
+/* These are all analogs of the <ctype.h> functions.
+ */
+gboolean g_unichar_isalnum   (gunichar c);
+gboolean g_unichar_isalpha   (gunichar c);
+gboolean g_unichar_iscntrl   (gunichar c);
+gboolean g_unichar_isdigit   (gunichar c);
+gboolean g_unichar_isgraph   (gunichar c);
+gboolean g_unichar_islower   (gunichar c);
+gboolean g_unichar_isprint   (gunichar c);
+gboolean g_unichar_ispunct   (gunichar c);
+gboolean g_unichar_isspace   (gunichar c);
+gboolean g_unichar_isupper   (gunichar c);
+gboolean g_unichar_isxdigit  (gunichar c);
+gboolean g_unichar_istitle   (gunichar c);
+gboolean g_unichar_isdefined (gunichar c);
+gboolean g_unichar_iswide    (gunichar c);
+
+/* More <ctype.h> functions.  These convert between the three cases.
+ * See the Unicode book to understand title case.  */
+gunichar g_unichar_toupper (gunichar c);
+gunichar g_unichar_tolower (gunichar c);
+gunichar g_unichar_totitle (gunichar c);
+
+/* If C is a digit (according to `g_unichar_isdigit'), then return its
+   numeric value.  Otherwise return -1.  */
+gint g_unichar_digit_value (gunichar c);
+
+gint g_unichar_xdigit_value (gunichar c);
+
+/* Return the Unicode character type of a given character.  */
+GUnicodeType g_unichar_type (gunichar c);
+
+
+
+/* Compute canonical ordering of a string in-place.  This rearranges
+   decomposed characters in the string according to their combining
+   classes.  See the Unicode manual for more information.  */
+void g_unicode_canonical_ordering (gunichar *string,
+				   size_t   len);
+
+/* Compute canonical decomposition of a character.  Returns g_malloc()d
+   string of Unicode characters.  RESULT_LEN is set to the resulting
+   length of the string.  */
+gunichar *g_unicode_canonical_decomposition (gunichar  ch,
+					     size_t   *result_len);
+
+/* Array of skip-bytes-per-initial character
+ */
+extern char g_utf8_skip[256];
+
+/* Advance P past the UTF-8 sequence it points at, using the first
+   byte to look up the sequence length in g_utf8_skip.  P is evaluated
+   twice, so it must not have side effects.  */
+#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
+
+gunichar g_utf8_get_char          (const gchar *p);
+gchar *  g_utf8_offset_to_pointer  (const gchar *str,
+				    gint         offset);
+gint     g_utf8_pointer_to_offset (const gchar *str,
+				   const gchar *pos);
+gchar *  g_utf8_prev_char         (const gchar *p);
+gchar *  g_utf8_find_next_char    (const gchar *p,
+				   const gchar *bound);
+gchar *  g_utf8_find_prev_char    (const gchar *str,
+				   const gchar *p);
+
+gint g_utf8_strlen (const gchar *p,
+		    gint         max);
+
+/* Copies n characters from src to dest */
+gchar *g_utf8_strncpy (gchar       *dest,
+		       const gchar *src,
+		       size_t       n);
+
+/* Find the UTF-8 character corresponding to ch, in string p. These
+   functions are equivalents of strchr and strrchr */
+
+gchar *g_utf8_strchr  (const gchar *p,
+		       gunichar     ch);
+gchar *g_utf8_strrchr (const gchar *p,
+		       gunichar     ch);
+
+gunichar2 *g_utf8_to_utf16 (const gchar     *str,
+			    gint             len);
+gunichar * g_utf8_to_ucs4  (const gchar     *str,
+			    gint             len);
+gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
+			    gint             len);
+gchar *    g_utf16_to_utf8 (const gunichar2 *str,
+			    gint             len);
+gunichar * g_ucs4_to_utf16 (const gunichar  *str,
+			    gint             len);
+gchar *    g_ucs4_to_utf8  (const gunichar  *str,
+			    gint             len);
+
+/* Convert a single character into UTF-8. outbuf must have at
+ * least 6 bytes of space. Returns the number of bytes in the
+ * result.
+ */
+gint      g_unichar_to_utf8 (gunichar    c,
+			     char       *outbuf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __GUNICODE_H__ */
--- a/gunidecomp.c
+++ b/gunidecomp.c
@@ -0,0 +1,133 @@
+/* decomp.c - Character decomposition.
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000 Red Hat, Inc.
+ *
+ * The Gnome Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The Gnome Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ *   Boston, MA 02111-1307, USA.
+ */
+
+#include "glib.h"
+#include "gunidecomp.h"
+
+#include <config.h>
+
+#include <stdlib.h>
+
+/* Look up the combining class of the character Char on page Page.
+   We cheat a bit and cast type values to (char *).  We detect these
+   using the &0xff trick: a table entry whose value fits in the low
+   eight bits is taken to be the class shared by the whole page,
+   anything else is taken to be a pointer to a per-character table.
+   NOTE(review): the (int) casts assume a pointer survives conversion
+   to int -- confirm on targets where pointers are wider than int.  */
+#define CC(Page, Char) \
+  (((((int) (combining_class_table[Page])) & 0xff) \
+    == ((int) combining_class_table[Page])) \
+   ? ((int) combining_class_table[Page]) \
+   : (combining_class_table[Page][Char]))
+
+/* Combining class of Char; characters above UNICODE_LAST_CHAR have
+   class 0.  NOTE(review): guniprop.c spells the corresponding limit
+   G_UNICODE_LAST_CHAR -- confirm which name the tables define.  */
+#define COMBINING_CLASS(Char) \
+     (((Char) > (UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
+
+/* Compute the canonical ordering of a string in-place.  STRING holds
+   LEN characters.  A character with a non-zero combining class is
+   moved left past neighbours of a higher combining class; characters
+   of class 0 are never crossed.  The pass is repeated until nothing
+   moves.  */
+void
+g_unicode_canonical_ordering (gunichar *string,
+                              size_t len)
+{
+  size_t i;
+  int swap = 1;
+
+  /* Fewer than two characters are trivially ordered.  Returning here
+     also keeps `len - 1' below from wrapping around when LEN is 0.  */
+  if (len < 2)
+    return;
+
+  while (swap)
+    {
+      int last;
+      swap = 0;
+      last = COMBINING_CLASS (string[0]);
+      for (i = 0; i < len - 1; ++i)
+        {
+          int next = COMBINING_CLASS (string[i + 1]);
+          if (next != 0 && last > next)
+            {
+              size_t j;
+              /* Percolate item leftward through string.  J is the
+                 index of the item being moved, so that the character
+                 at index 0 can be swapped as well.  */
+              for (j = i + 1; j > 0; --j)
+                {
+                  gunichar t;
+                  if (COMBINING_CLASS (string[j - 1]) <= next)
+                    break;
+                  t = string[j];
+                  string[j] = string[j - 1];
+                  string[j - 1] = t;
+                  swap = 1;
+                }
+              /* We're re-entering the loop looking at the old
+                 character again.  */
+              next = last;
+            }
+          last = next;
+        }
+    }
+}
+
+/* Compute the canonical decomposition of CH.  Returns a g_malloc()ed
+   array of characters, to be released with g_free(), and stores the
+   number of characters in *RESULT_LEN.  A character that has no entry
+   in decomp_table decomposes to itself.  */
+gunichar *
+g_unicode_canonical_decomposition (gunichar ch,
+                                   size_t *result_len)
+{
+  gunichar *r = NULL;
+
+  if (ch <= 0xffff)
+    {
+      /* Binary search over the half-open range [start, end); the
+         range shrinks on every iteration, so the search terminates
+         when CH is absent from the table.  */
+      int start = 0;
+      int end = G_N_ELEMENTS (decomp_table);
+      while (start < end)
+        {
+          int half = start + (end - start) / 2;
+          if (ch == decomp_table[half].ch)
+            {
+              /* Found it.  */
+              int i, len;
+              /* We store as a double-nul terminated string.  */
+              for (len = 0; (decomp_table[half].expansion[len]
+                             || decomp_table[half].expansion[len + 1]);
+                   len += 2)
+                ;
+
+              /* We've counted twice as many bytes as there are
+                 characters.  */
+              *result_len = len / 2;
+              r = g_malloc (len / 2 * sizeof (gunichar));
+
+              /* Each character is stored as two bytes, high byte
+                 first.  */
+              for (i = 0; i < len; i += 2)
+                {
+                  r[i / 2] = (decomp_table[half].expansion[i] << 8
+                              | decomp_table[half].expansion[i + 1]);
+                }
+              break;
+            }
+          else if (ch > decomp_table[half].ch)
+            start = half + 1;
+          else
+            end = half;
+        }
+    }
+
+  if (r == NULL)
+    {
+      /* Not in our table.  */
+      r = g_malloc (sizeof (gunichar));
+      *r = ch;
+      *result_len = 1;
+    }
+
+  /* Supposedly following the Unicode 2.1.9 table means that the
+     decompositions come out in canonical order.  I haven't tested
+     this, but we rely on it here.  */
+  return r;
+}
--- a/gunidecomp.h
+++ b/gunidecomp.h
--- a/guniprop.c
+++ b/guniprop.c
@@ -0,0 +1,355 @@
+/* guniprop.c - Unicode character properties.
+ *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "glib.h"
+#include "gunichartables.h"
+
+#include <config.h>
+
+#include <stddef.h>
+
+/* Number of elements in the array x (x must be an array, not a
+   pointer).  */
+#define asize(x)  ((sizeof (x)) / sizeof (x[0]))
+
+/* Attribute stored for character Char of page Page, or 0 if the page
+   has no attribute table.  The callers below read it as a case
+   conversion or as a digit value, depending on the character's type.  */
+#define ATTTABLE(Page, Char) \
+  ((attr_table[Page] == 0) ? 0 : (attr_table[Page][Char]))
+
+/* We cheat a bit and cast type values to (char *).  We detect these
+   using the &0xff trick: an entry whose value fits in the low eight
+   bits is taken to be the type of every character on the page,
+   anything else is taken to be a pointer to a per-character table.
+   NOTE(review): the (int) casts assume a pointer survives conversion
+   to int -- confirm on targets where pointers are wider than int.  */
+#define TTYPE(Page, Char) \
+  (((((int) type_table[Page]) & 0xff) == ((int) type_table[Page])) \
+   ? ((int) (type_table[Page])) \
+   : (type_table[Page][Char]))
+
+/* Type of Char; anything above G_UNICODE_LAST_CHAR is unassigned.  */
+#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
+
+/* True if Type is one of the three number types.  */
+#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
+		       || (Type) == G_UNICODE_LETTER_NUMBER \
+		       || (Type) == G_UNICODE_OTHER_NUMBER)
+
+/* True if Type is one of the five letter types.  */
+#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
+		       || (Type) == G_UNICODE_UPPERCASE_LETTER \
+		       || (Type) == G_UNICODE_TITLECASE_LETTER \
+		       || (Type) == G_UNICODE_MODIFIER_LETTER \
+		       || (Type) == G_UNICODE_OTHER_LETTER)
+
+/**
+ * g_unichar_isalnum:
+ * @c: a unicode character
+ * 
+ * Determines if a character is a letter or a number.
+ * 
+ * Return value: %TRUE if @c has one of the letter or number types.
+ **/
+gboolean
+g_unichar_isalnum (gunichar c)
+{
+  int category = TYPE (c);
+
+  if (ISDIGIT (category))
+    return 1;
+  return ISALPHA (category);
+}
+
+/**
+ * g_unichar_isalpha:
+ * @c: a unicode character
+ * 
+ * Determines if a character is a letter.
+ * 
+ * Return value: %TRUE if @c has one of the letter types.
+ **/
+gboolean
+g_unichar_isalpha (gunichar c)
+{
+  int category;
+
+  category = TYPE (c);
+  return ISALPHA (category);
+}
+
+/**
+ * g_unichar_iscntrl:
+ * @c: a unicode character
+ * 
+ * Determines if a character is a control character.
+ * 
+ * Return value: %TRUE if the type of @c is %G_UNICODE_CONTROL.
+ **/
+gboolean
+g_unichar_iscntrl (gunichar c)
+{
+  int category = TYPE (c);
+
+  return category == G_UNICODE_CONTROL;
+}
+
+/**
+ * g_unichar_isdigit:
+ * @c: a unicode character
+ * 
+ * Determines if a character is a decimal digit.
+ * 
+ * Return value: %TRUE if the type of @c is %G_UNICODE_DECIMAL_NUMBER.
+ **/
+gboolean
+g_unichar_isdigit (gunichar c)
+{
+  int category = TYPE (c);
+
+  return category == G_UNICODE_DECIMAL_NUMBER;
+}
+
+/**
+ * g_unichar_isgraph:
+ * @c: a unicode character
+ * 
+ * Determines if a character is printable and not a space.
+ * 
+ * Return value: %FALSE for control, format, unassigned, private use,
+ *               surrogate and space separator characters, %TRUE
+ *               for everything else.
+ **/
+gboolean
+g_unichar_isgraph (gunichar c)
+{
+  switch (TYPE (c))
+    {
+    case G_UNICODE_CONTROL:
+    case G_UNICODE_FORMAT:
+    case G_UNICODE_UNASSIGNED:
+    case G_UNICODE_PRIVATE_USE:
+    case G_UNICODE_SURROGATE:
+    case G_UNICODE_SPACE_SEPARATOR:
+      return 0;
+    default:
+      return 1;
+    }
+}
+
+/**
+ * g_unichar_islower:
+ * @c: a unicode character
+ * 
+ * Determines if a character is a lowercase letter.
+ * 
+ * Return value: %TRUE if the type of @c is %G_UNICODE_LOWERCASE_LETTER.
+ **/
+gboolean
+g_unichar_islower (gunichar c)
+{
+  int category = TYPE (c);
+
+  return category == G_UNICODE_LOWERCASE_LETTER;
+}
+
+/**
+ * g_unichar_isprint:
+ * @c: a unicode character
+ * 
+ * Determines if a character is printable. Unlike
+ * g_unichar_isgraph(), space separators count as printable.
+ * 
+ * Return value: %FALSE for control, format, unassigned, private use
+ *               and surrogate characters, %TRUE for everything else.
+ **/
+gboolean
+g_unichar_isprint (gunichar c)
+{
+  switch (TYPE (c))
+    {
+    case G_UNICODE_CONTROL:
+    case G_UNICODE_FORMAT:
+    case G_UNICODE_UNASSIGNED:
+    case G_UNICODE_PRIVATE_USE:
+    case G_UNICODE_SURROGATE:
+      return 0;
+    default:
+      return 1;
+    }
+}
+
+/**
+ * g_unichar_ispunct:
+ * @c: a unicode character
+ * 
+ * Determines if a character is punctuation.
+ * 
+ * Return value: %TRUE if @c has one of the punctuation types.
+ **/
+gboolean
+g_unichar_ispunct (gunichar c)
+{
+  switch (TYPE (c))
+    {
+    case G_UNICODE_CONNECT_PUNCTUATION:
+    case G_UNICODE_DASH_PUNCTUATION:
+    case G_UNICODE_CLOSE_PUNCTUATION:
+    case G_UNICODE_FINAL_PUNCTUATION:
+    case G_UNICODE_INITIAL_PUNCTUATION:
+    case G_UNICODE_OTHER_PUNCTUATION:
+    case G_UNICODE_OPEN_PUNCTUATION:
+      return 1;
+    default:
+      return 0;
+    }
+}
+
+/**
+ * g_unichar_isspace:
+ * @c: a unicode character
+ * 
+ * Determines if a character is a space, tab, or line separator
+ * (newline, carriage return, etc.).
+ * 
+ * Return value: %TRUE if @c is a space character.
+ **/
+gboolean
+g_unichar_isspace (gunichar c)
+{
+  switch (c)
+    {
+      /* Special-case these: Unicode classifies them as control
+         characters, but as an analog of isspace() we must accept
+         them.  */
+    case '\t':
+    case '\n':
+    case '\r':
+    case '\f':
+      return 1;
+
+    default:
+      {
+        int t = TYPE (c);
+        return (t == G_UNICODE_SPACE_SEPARATOR
+                || t == G_UNICODE_LINE_SEPARATOR
+                || t == G_UNICODE_PARAGRAPH_SEPARATOR);
+      }
+    }
+}
+
+/**
+ * g_unichar_isupper:
+ * @c: a unicode character
+ * 
+ * Determines if a character is uppercase.
+ * 
+ * Return value: %TRUE if the type of @c is %G_UNICODE_UPPERCASE_LETTER.
+ **/
+gboolean
+g_unichar_isupper (gunichar c)
+{
+  int category = TYPE (c);
+
+  return category == G_UNICODE_UPPERCASE_LETTER;
+}
+
+/**
+ * g_unichar_istitle:
+ * @c: a unicode character
+ * 
+ * Determines if a character is titlecase. Some characters in
+ * Unicode which are composites, such as the DZ digraph
+ * have three case variants instead of just two. The titlecase
+ * form is used at the beginning of a word where only the
+ * first letter is capitalized. The titlecase form of the DZ
+ * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z
+ * 
+ * Return value: %TRUE if the character is titlecase.
+ **/
+gboolean
+g_unichar_istitle (gunichar c)
+{
+  unsigned int row = 0;
+
+  /* The first entry of each title_table row is a titlecase character.  */
+  while (row < asize (title_table))
+    {
+      if (title_table[row][0] == c)
+        return 1;
+      ++row;
+    }
+  return 0;
+}
+
+/**
+ * g_unichar_isxdigit:
+ * @c: a unicode character.
+ * 
+ * Determines if a character is a hexadecimal digit, that is, one of
+ * the letters a-f or A-F, or a decimal digit.
+ * 
+ * Return value: %TRUE if the character is a hexadecimal digit.
+ **/
+gboolean
+g_unichar_isxdigit (gunichar c)
+{
+  /* Only decimal digits qualify besides the letters: letter numbers
+     and other numbers have no value as a hex digit, and
+     g_unichar_xdigit_value() returns -1 for them.  */
+  return ((c >= 'a' && c <= 'f')
+          || (c >= 'A' && c <= 'F')
+          || TYPE (c) == G_UNICODE_DECIMAL_NUMBER);
+}
+
+/**
+ * g_unichar_isdefined:
+ * @c: a unicode character
+ * 
+ * Determines if a given character is assigned in the Unicode
+ * standard.
+ *
+ * Return value: %TRUE if the character has an assigned value.
+ **/
+gboolean
+g_unichar_isdefined (gunichar c)
+{
+  return TYPE (c) != G_UNICODE_UNASSIGNED;
+}
+
+/**
+ * g_unichar_iswide:
+ * @c: a unicode character
+ * 
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ * 
+ * Return value: %TRUE if the character is wide.
+ **/
+/* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
+gboolean
+g_unichar_iswide (gunichar c)
+{
+  /* Nothing below Hangul Jamo is wide.  */
+  if (c < 0x1100)
+    return 0;
+
+  if (c <= 0x115f)                      /* Hangul Jamo */
+    return 1;
+
+  if (c >= 0x2e80 && c <= 0xa4cf)       /* CJK ... Yi */
+    return (c & ~0x0011) != 0x300a && c != 0x303f;
+
+  if (c >= 0xac00 && c <= 0xd7a3)       /* Hangul Syllables */
+    return 1;
+
+  if (c >= 0xf900 && c <= 0xfaff)       /* CJK Compatibility Ideographs */
+    return 1;
+
+  if (c >= 0xfe30 && c <= 0xfe6f)       /* CJK Compatibility Forms */
+    return 1;
+
+  if (c >= 0xff00 && c <= 0xff5f)       /* Fullwidth Forms */
+    return 1;
+
+  return c >= 0xffe0 && c <= 0xffe6;
+}
+
+/**
+ * g_unichar_toupper:
+ * @c: a unicode character
+ * 
+ * Convert a character to uppercase.
+ * 
+ * Return value: the result of converting @c to uppercase.
+ *               If @c is not a lowercase or titlecase character,
+ *               or no conversion is recorded for it, @c is
+ *               returned unchanged.
+ **/
+gunichar
+g_unichar_toupper (gunichar c)
+{
+  int t = TYPE (c);
+  if (t == G_UNICODE_LOWERCASE_LETTER)
+    {
+      /* ATTTABLE yields 0 when nothing is recorded for the character;
+         hand the character back rather than turning it into NUL.  */
+      gunichar upper = ATTTABLE (c >> 8, c & 0xff);
+      return upper ? upper : c;
+    }
+  else if (t == G_UNICODE_TITLECASE_LETTER)
+    {
+      /* Titlecase characters are converted through title_table, whose
+         rows hold the titlecase form first and the uppercase form
+         second.  */
+      unsigned int i;
+      for (i = 0; i < asize (title_table); ++i)
+        {
+          if (title_table[i][0] == c)
+            return title_table[i][1];
+        }
+    }
+  return c;
+}
+
+/**
+ * g_unichar_tolower:
+ * @c: a unicode character.
+ * 
+ * Convert a character to lower case.
+ * 
+ * Return value: the result of converting @c to lower case.
+ *               If @c is not an uppercase or titlecase character,
+ *               or no conversion is recorded for it, @c is
+ *               returned unchanged.
+ **/
+gunichar
+g_unichar_tolower (gunichar c)
+{
+  int t = TYPE (c);
+  if (t == G_UNICODE_UPPERCASE_LETTER)
+    {
+      /* ATTTABLE yields 0 when nothing is recorded for the character;
+         hand the character back rather than turning it into NUL.  */
+      gunichar lower = ATTTABLE (c >> 8, c & 0xff);
+      return lower ? lower : c;
+    }
+  else if (t == G_UNICODE_TITLECASE_LETTER)
+    {
+      /* Titlecase characters are converted through title_table, whose
+         rows hold the titlecase form first and the lowercase form
+         third.  */
+      unsigned int i;
+      for (i = 0; i < asize (title_table); ++i)
+        {
+          if (title_table[i][0] == c)
+            return title_table[i][2];
+        }
+    }
+  return c;
+}
+
+/**
+ * g_unichar_totitle:
+ * @c: a unicode character
+ * 
+ * Converts a character to titlecase.
+ * 
+ * Return value: the result of converting @c to titlecase.
+ *               If @c is not an uppercase or lowercase character,
+ *               or no conversion is recorded for it, @c is
+ *               returned unchanged.
+ **/
+gunichar
+g_unichar_totitle (gunichar c)
+{
+  unsigned int i;
+
+  /* Any member of a title_table row maps to that row's first entry,
+     the titlecase form.  */
+  for (i = 0; i < asize (title_table); ++i)
+    {
+      if (title_table[i][0] == c || title_table[i][1] == c
+          || title_table[i][2] == c)
+        return title_table[i][0];
+    }
+
+  if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER)
+    {
+      /* ATTTABLE yields 0 when nothing is recorded for the character;
+         hand the character back rather than turning it into NUL.  */
+      gunichar upper = ATTTABLE (c >> 8, c & 0xff);
+      return upper ? upper : c;
+    }
+  return c;
+}
+
+/**
+ * g_unichar_digit_value:
+ * @c: a unicode character
+ *
+ * Determines the numeric value of a character as a decimal
+ * digit.
+ *
+ * Return value: If @c is a decimal digit (according to
+ * `g_unichar_isdigit'), its numeric value. Otherwise, -1.
+ **/
+int
+g_unichar_digit_value (gunichar c)
+{
+  int value = -1;
+
+  /* For decimal digits the attribute table holds the digit's value.  */
+  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
+    value = ATTTABLE (c >> 8, c & 0xff);
+
+  return value;
+}
+
+/**
+ * g_unichar_xdigit_value:
+ * @c: a unicode character
+ *
+ * Determines the numeric value of a character as a hexadecimal
+ * digit.
+ *
+ * Return value: If @c is a hex digit (according to
+ * `g_unichar_isxdigit'), its numeric value. Otherwise, -1.
+ **/
+int
+g_unichar_xdigit_value (gunichar c)
+{
+  /* The letters A-F / a-f stand for the values 10..15.  */
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
+    return ATTTABLE (c >> 8, c & 0xff);
+  return -1;
+}
+
+/**
+ * g_unichar_type:
+ * @c: a unicode character
+ * 
+ * Classifies a unicode character by type.
+ * 
+ * Return value: the type of the character.
+ **/
+GUnicodeType
+g_unichar_type (gunichar c)
+{
+  GUnicodeType category = TYPE (c);
+
+  return category;
+}
--- a/gutf8.c
+++ b/gutf8.c
@@ -0,0 +1,483 @@
+/* gutf8.c - Operations on UTF-8 strings.
+ *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#ifdef HAVE_LANGINFO_H
+#include <langinfo.h>
+#endif
+#include <string.h>
+
+#include "glib.h"
+
+/* UTF8_COMPUTE:
+ * Classifies Char, the first byte of a UTF-8 sequence. On a match,
+ * Len is set to the total length of the sequence in bytes (1 to 6)
+ * and Mask to the bits of the first byte that carry character data.
+ * If Char matches none of the lead-byte patterns, Len is set to -1
+ * and Mask is left untouched.
+ *
+ * This expands to a bare if/else chain (a statement, not an
+ * expression) and evaluates Char several times, so pass a plain
+ * variable holding an unsigned byte value.
+ */
+#define UTF8_COMPUTE(Char, Mask, Len)					      \
+  if (Char < 128)							      \
+    {									      \
+      Len = 1;								      \
+      Mask = 0x7f;							      \
+    }									      \
+  else if ((Char & 0xe0) == 0xc0)					      \
+    {									      \
+      Len = 2;								      \
+      Mask = 0x1f;							      \
+    }									      \
+  else if ((Char & 0xf0) == 0xe0)					      \
+    {									      \
+      Len = 3;								      \
+      Mask = 0x0f;							      \
+    }									      \
+  else if ((Char & 0xf8) == 0xf0)					      \
+    {									      \
+      Len = 4;								      \
+      Mask = 0x07;							      \
+    }									      \
+  else if ((Char & 0xfc) == 0xf8)					      \
+    {									      \
+      Len = 5;								      \
+      Mask = 0x03;							      \
+    }									      \
+  else if ((Char & 0xfe) == 0xfc)					      \
+    {									      \
+      Len = 6;								      \
+      Mask = 0x01;							      \
+    }									      \
+  else									      \
+    Len = -1;
+
+/* UTF8_GET:
+ * Decodes the Len-byte sequence starting at Chars into Result.
+ * The first byte contributes the bits selected by Mask; each
+ * following byte contributes its low six bits. Count is a scratch
+ * lvalue used as the byte index.
+ *
+ * If a following byte does not have the form 10xxxxxx, Result is
+ * set to -1 and decoding stops at that byte.
+ *
+ * This expands to two statements (an assignment and a for loop),
+ * so it must not be used as the unbraced body of an if or loop.
+ */
+#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
+  (Result) = (Chars)[0] & (Mask);					      \
+  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
+    {									      \
+      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
+	{								      \
+	  (Result) = -1;						      \
+	  break;							      \
+	}								      \
+      (Result) <<= 6;							      \
+      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
+    }
+
+/* Number of bytes to skip to get past a sequence, indexed by the value
+ * of its first byte:
+ *   0x00-0xbf  1   (single bytes; 0x80-0xbf would be continuation bytes)
+ *   0xc0-0xdf  2
+ *   0xe0-0xef  3
+ *   0xf0-0xf7  4
+ *   0xf8-0xfb  5
+ *   0xfc-0xfd  6
+ *   0xfe-0xff  1   (not valid in UTF-8)
+ *
+ * The last two entries are 1 rather than 0 so that a cursor advanced
+ * by this table always makes progress; with 0 any loop stepping through
+ * a buffer by skip amounts would spin forever on a 0xfe or 0xff byte.
+ */
+gchar g_utf8_skip[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+/**
+ * g_utf8_find_prev_char:
+ * @str: pointer to the beginning of a UTF-8 string
+ * @p: pointer to some position within @str
+ *
+ * Given a position @p within a UTF-8 encoded string @str, find the start
+ * of the previous UTF-8 character starting before @p. Returns %NULL if no
+ * UTF-8 characters are present in @str before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte.
+ *
+ * Return value: a pointer to the found character or %NULL.
+ **/
+gchar *
+g_utf8_find_prev_char (const char *str,
+		       const char *p)
+{
+  /* Walk backwards down to and including the first byte of @str: a
+   * character that starts on the very first byte is a valid result.
+   * The pointer is only decremented while it is still above @str, so
+   * no pointer before the start of the string is ever formed. */
+  while (p > str)
+    {
+      --p;
+      if ((*p & 0xc0) != 0x80)
+	return (gchar *)p;
+    }
+  return NULL;
+}
+
+/**
+ * g_utf8_find_next_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ * @end: a pointer to the end of the string, or %NULL to indicate
+ *        that the string is nul-terminated, in which case the
+ *        search is not bounded by an end pointer.
+ *
+ * Find the start of the next UTF-8 character in the string after @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. If @p points at a nul byte it is
+ * not advanced.
+ *
+ * Return value: a pointer to the found character, or %NULL if the
+ *               resulting position is equal to @end.
+ **/
+gchar *
+g_utf8_find_next_char (const gchar *p,
+		       const gchar *end)
+{
+  if (*p != '\0')
+    {
+      /* Step off the current byte, then skip continuation bytes
+       * (10xxxxxx), stopping at @end when one was supplied. */
+      ++p;
+      if (end != NULL)
+	{
+	  while (p < end && (*p & 0xc0) == 0x80)
+	    ++p;
+	}
+      else
+	{
+	  while ((*p & 0xc0) == 0x80)
+	    ++p;
+	}
+    }
+
+  if (p == end)
+    return NULL;
+
+  return (gchar *)p;
+}
+
+/**
+ * g_utf8_prev_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Find the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. There is no lower bound on the
+ * backwards scan, so if @p might be the first character of the string,
+ * you must use g_utf8_find_prev_char instead.
+ *
+ * Return value: a pointer to the found character.
+ **/
+gchar *
+g_utf8_prev_char (const gchar *p)
+{
+  /* Always step back at least one byte, then keep going while the
+   * byte under the cursor is a continuation byte (10xxxxxx). */
+  do
+    p--;
+  while ((*p & 0xc0) == 0x80);
+
+  return (gchar *)p;
+}
+
+/**
+ * g_utf8_strlen:
+ * @p: pointer to the start of a UTF-8 string.
+ * @max: the maximum number of bytes to examine. If @max
+ *       is less than 0, then the string is assumed to be
+ *       nul-terminated.
+ *
+ * Counts the characters in a UTF-8 string by stepping from one
+ * character start to the next with g_utf8_next_char(). When @max is
+ * non-negative, no byte at offset @max or beyond is read as a
+ * character start, and a character that begins within the first
+ * @max bytes but extends past them is not counted.
+ *
+ * Return value: the length of the string in characters
+ **/
+gint
+g_utf8_strlen (const gchar *p, gint max)
+{
+  int len = 0;
+  const gchar *start = p;
+
+  if (max < 0)
+    {
+      /* Unbounded: run to the terminating nul. */
+      while (*p)
+	{
+	  p = g_utf8_next_char (p);
+	  ++len;
+	}
+    }
+  else
+    {
+      if (max == 0 || !*p)
+	return 0;
+
+      /* The bounds test comes before the dereference so that the byte
+       * at offset @max is never read. */
+      p = g_utf8_next_char (p);
+      while (p - start < max && *p)
+	{
+	  ++len;
+	  p = g_utf8_next_char (p);
+	}
+
+      /* Count the last character only if all of it lies within the
+       * first @max bytes. */
+      if (p - start <= max)
+	++len;
+    }
+
+  return len;
+}
+
+/**
+ * g_utf8_get_char:
+ * @p: a pointer to unicode character encoded as UTF-8
+ *
+ * Convert a sequence of bytes encoded as UTF-8 to a unicode character.
+ * The first byte determines how many bytes are decoded.
+ *
+ * Return value: the resulting character or (gunichar)-1 if @p does
+ *               not point to a valid UTF-8 encoded unicode character
+ **/
+gunichar
+g_utf8_get_char (const gchar *p)
+{
+  unsigned char lead = (unsigned char) *p;
+  int mask = 0;
+  int len;
+  int idx;
+  gunichar wc;
+
+  /* Derive the sequence length and first-byte payload mask. */
+  UTF8_COMPUTE (lead, mask, len);
+  if (len != -1)
+    {
+      /* Leaves wc at (gunichar)-1 if a following byte is not a
+       * continuation byte. */
+      UTF8_GET (wc, p, idx, mask, len);
+      return wc;
+    }
+
+  return (gunichar)-1;
+}
+
+/**
+ * g_utf8_offset_to_pointer:
+ * @str: a UTF-8 encoded string
+ * @offset: a character offset within the string.
+ *
+ * Converts from an integer character offset to a pointer to a position
+ * within the string, by stepping forward @offset characters from @str.
+ * No check is made for the end of the string.
+ *
+ * Return value: the resulting pointer
+ **/
+gchar *
+g_utf8_offset_to_pointer  (const gchar *str,
+			   gint         offset)
+{
+  const gchar *cursor;
+
+  for (cursor = str; offset != 0; offset--)
+    cursor = g_utf8_next_char (cursor);
+
+  return (gchar *)cursor;
+}
+
+/**
+ * g_utf8_pointer_to_offset:
+ * @str: a UTF-8 encoded string
+ * @pos: a pointer to a position within @str
+ *
+ * Converts from a pointer to position within a string to a integer
+ * character offset, by counting how many character steps from @str
+ * are needed to reach or pass @pos.
+ *
+ * Return value: the resulting character offset
+ **/
+gint
+g_utf8_pointer_to_offset (const gchar *str,
+			  const gchar *pos)
+{
+  const gchar *cursor;
+  gint n_chars = 0;
+
+  for (cursor = str; cursor < pos; cursor = g_utf8_next_char (cursor))
+    n_chars++;
+
+  return n_chars;
+}
+
+
+/**
+ * g_utf8_strncpy:
+ * @dest: buffer to fill with characters from @src
+ * @src: UTF-8 encoded string
+ * @n: maximum number of characters (not bytes) to copy
+ *
+ * Copies at most @n characters of @src into @dest and then writes a
+ * terminating nul byte. Unlike strncpy(), the limit is counted in
+ * characters and the result is always nul-terminated, so @dest needs
+ * room for the copied bytes plus one.
+ *
+ * Return value: @dest
+ **/
+gchar *
+g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
+{
+  const gchar *end = src;
+  size_t n_bytes;
+
+  /* Find the byte position that lies @n characters in, or the end
+   * of the string if that comes first. */
+  for (; n != 0 && *end != '\0'; n--)
+    end = g_utf8_next_char (end);
+
+  n_bytes = end - src;
+  strncpy (dest, src, n_bytes);
+  dest[n_bytes] = 0;
+
+  return dest;
+}
+
+/* Works out the character set in use and whether it is UTF-8.
+ *
+ * The CHARSET environment variable is consulted first, then
+ * nl_langinfo() when a codeset item is available at compile time.
+ * If @a is non-NULL and *@a is still NULL, the first charset name
+ * found is stored in *@a; "US-ASCII" is stored when nothing was found.
+ * The stored pointer refers to memory owned by getenv(), nl_langinfo()
+ * or a string literal and must not be freed.
+ *
+ * Returns TRUE if the charset is UTF-8, FALSE otherwise.
+ */
+static gboolean
+g_utf8_get_charset_internal (char **a)
+{
+  char *charset = getenv("CHARSET");
+
+  if (charset && a && ! *a)
+    *a = charset;
+
+  if (charset && strstr (charset, "UTF-8"))
+      return TRUE;
+
+  /* The codeset items must be tested with defined(): in some C
+   * libraries CODESET is an enumerator paired with a self-referential
+   * macro, which a plain "#elif CODESET" evaluates as 0. */
+#if defined (_NL_CTYPE_CODESET_NAME)
+  charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
+#elif defined (CODESET)
+  charset = nl_langinfo (CODESET);
+#else
+  charset = NULL;
+#endif
+
+  if (charset)
+    {
+      if (a && ! *a)
+	*a = charset;
+      if (strcmp (charset, "UTF-8") == 0)
+	return TRUE;
+    }
+
+  if (a && ! *a) 
+    *a = "US-ASCII";
+  /* Assume this for compatibility at present.  */
+  return FALSE;
+}
+
+/* Result of the charset lookup; -1 means it has not been done yet. */
+static int utf8_locale_cache = -1;
+/* Charset name found by the lookup. */
+static char *utf8_charset_cache = NULL;
+
+/**
+ * g_get_charset:
+ * @charset: return location for the character set name, or %NULL
+ *
+ * Obtains the character set in use. The lookup is performed on the
+ * first call and its result is cached for later calls. The string
+ * stored in @charset is not a copy and must not be freed.
+ *
+ * Return value: %TRUE if the character set is UTF-8
+ **/
+gboolean
+g_get_charset (char **charset) 
+{
+  if (utf8_locale_cache == -1)
+    utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);
+
+  if (charset != NULL)
+    *charset = utf8_charset_cache;
+
+  return utf8_locale_cache;
+}
+
+/* unicode_strchr */
+
+/**
+ * g_unichar_to_utf8:
+ * @c: a ISO10646 character code
+ * @outbuf: output buffer, must have at least 6 bytes of space.
+ *
+ * Convert a single character to UTF-8. No terminating nul byte
+ * is written.
+ *
+ * Return value: number of bytes written
+ **/
+int
+g_unichar_to_utf8 (gunichar c, gchar *outbuf)
+{
+  int lead_bits;	/* marker bits of the first byte */
+  int n_bytes;		/* total length of the encoding */
+  int pos;
+
+  /* Choose the shortest form that can hold the value. */
+  if (c < 0x80)
+    {
+      lead_bits = 0;
+      n_bytes = 1;
+    }
+  else if (c < 0x800)
+    {
+      lead_bits = 0xc0;
+      n_bytes = 2;
+    }
+  else if (c < 0x10000)
+    {
+      lead_bits = 0xe0;
+      n_bytes = 3;
+    }
+  else if (c < 0x200000)
+    {
+      lead_bits = 0xf0;
+      n_bytes = 4;
+    }
+  else if (c < 0x4000000)
+    {
+      lead_bits = 0xf8;
+      n_bytes = 5;
+    }
+  else
+    {
+      lead_bits = 0xfc;
+      n_bytes = 6;
+    }
+
+  /* Fill the continuation bytes from the back, six bits at a time;
+   * what is left of c then goes into the first byte. */
+  pos = n_bytes;
+  while (--pos > 0)
+    {
+      outbuf[pos] = (c & 0x3f) | 0x80;
+      c >>= 6;
+    }
+  outbuf[0] = c | lead_bits;
+
+  return n_bytes;
+}
+
+/**
+ * g_utf8_strchr:
+ * @p: a nul-terminated UTF-8 string
+ * @c: a ISO-10646 character
+ *
+ * Find the leftmost occurrence of the given ISO-10646 character
+ * in a UTF-8 string. The character is encoded as UTF-8 and the
+ * resulting byte sequence is searched for with strstr().
+ *
+ * Return value: %NULL if the string does not contain the character,
+ *               otherwise a pointer to the start of the leftmost
+ *               occurrence of the character in the string.
+ **/
+gchar *
+g_utf8_strchr (const char *p, gunichar c)
+{
+  gchar needle[10];
+  gint needle_len;
+
+  /* Build a nul-terminated search string holding just @c. */
+  needle_len = g_unichar_to_utf8 (c, needle);
+  needle[needle_len] = '\0';
+
+  return strstr(p, needle);
+}
+
+#if 0
+/**
+ * g_utf8_strrchr:
+ * @p: a nul-terminated UTF-8 string
+ * @c: a ISO-10646 character
+ *
+ * Find the rightmost occurrence of the given ISO-10646 character
+ * in a UTF-8 string.
+ *
+ * Return value: %NULL if the string does not contain the character,
+ *               otherwise a pointer to the start of the rightmost
+ *               occurrence of the character in the string.
+ **/
+
+/* This is ifdefed out atm as there is no strrstr function in libc.
+ *
+ * NOTE(review): before enabling this, `len' needs a declaration and
+ * the function name (unicode_strrchr) should be reconciled with the
+ * g_utf8_strrchr name used in the doc comment above.
+ */
+gchar *
+unicode_strrchr (const char *p, gunichar c)
+{
+  gchar ch[10];
+
+  len = g_unichar_to_utf8 (c, ch);
+  ch[len] = '\0';
+  
+  return strrstr(p, ch);
+}
+#endif
+
+
+/**
+ * g_utf8_to_ucs4:
+ * @str: a UTF-8 encoded string
+ * @len: the maximum number of bytes of @str to examine, as
+ *       understood by g_utf8_strlen()
+ *
+ * Convert a string from UTF-8 to a 32-bit fixed width
+ * representation as UCS-4. The result holds exactly as many
+ * elements as g_utf8_strlen() reports for (@str, @len); no
+ * terminating element is appended.
+ *
+ * Return value: a pointer to a newly allocated UCS-4 string.
+ *               This value must be freed with g_free()
+ **/
+gunichar *
+g_utf8_to_ucs4 (const char *str, int len)
+{
+  gint n_chars = g_utf8_strlen (str, len);
+  gunichar *result = g_new (gunichar, n_chars);
+  const gchar *in = str;
+  gint i = 0;
+
+  /* Decode one character per output slot. */
+  while (i < n_chars)
+    {
+      result[i++] = g_utf8_get_char (in);
+      in = g_utf8_next_char (in);
+    }
+
+  return result;
+}
+