glib/gutf8.c

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <config.h>

#include <stdlib.h>
#ifdef HAVE_CODESET
#include <langinfo.h>
#endif
#include <string.h>

#include "glib.h"

/*
 * UTF8_COMPUTE:
 * @Char: the first byte of a UTF-8 sequence, as an unsigned value
 * @Mask: lvalue that receives the bit mask selecting the payload bits
 *        of the first byte
 * @Len:  lvalue that receives the sequence length in bytes (1-6), or -1
 *        if @Char cannot start a sequence (a continuation byte, 0xfe
 *        or 0xff)
 *
 * Classifies the lead byte of a UTF-8 sequence.  @Mask is left untouched
 * when @Len is set to -1.  @Char is evaluated several times, so it must
 * not have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely inside an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)

/*
 * UTF8_GET:
 * @Result: lvalue that receives the decoded value, or -1 if one of the
 *          trailing bytes is not a continuation byte (10xxxxxx)
 * @Chars:  pointer to the first byte of the sequence
 * @Count:  integer lvalue used as the loop index
 * @Mask:   payload mask for the first byte
 * @Len:    total length of the sequence in bytes
 *
 * Decodes a sequence of @Len bytes starting at @Chars, taking the bits
 * selected by @Mask from the first byte and six bits from each of the
 * following bytes.  Decoding stops at the first byte that is not a
 * continuation byte, so no bytes past it are read.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement; the inner break only leaves the for loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)

/*
 * g_utf8_skip:
 *
 * Table indexed by the value of a byte, giving the number of bytes to
 * advance to get past the UTF-8 sequence that the byte would start.
 *
 * The invalid lead bytes 0xfe and 0xff map to 1 rather than 0: with a
 * skip of 0, a loop that advances a pointer by the table value would
 * never move past such a byte and would not terminate.
 */
gchar g_utf8_skip[256] = {
  /* 0x00 - 0xbf: ASCII and continuation bytes, one byte each */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xc0 - 0xdf: two-byte sequences */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xe0 - 0xef: 3, 0xf0 - 0xf7: 4, 0xf8 - 0xfb: 5, 0xfc - 0xfd: 6,
   * 0xfe - 0xff: invalid, skipped one byte at a time */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/**
 * g_utf8_find_prev_char:
 * @str: pointer to the beginning of a UTF-8 string
 * @p: pointer to some position within @str
 * 
 * Given a position @p within a UTF-8 encoded string @str, find the start
 * of the previous UTF-8 character starting before @p. Returns %NULL if no
 * UTF-8 characters are present in @str before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * The byte at @str itself is examined, so the first character of the
 * string is found when @p points just past it.
 *
 * Return value: a pointer to the found character or %NULL.
 **/
gchar *
g_utf8_find_prev_char (const char *str,
                       const char *p)
{
  /* Step back only while there is something before p; this also avoids
   * ever forming a pointer in front of str. */
  while (p > str)
    {
      --p;
      /* Anything that is not a continuation byte (10xxxxxx) starts a
       * character. */
      if ((*p & 0xc0) != 0x80)
        return (gchar *)p;
    }
  return NULL;
}

/**
 * g_utf8_find_next_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 * @end: a pointer to the end of the string, or %NULL to indicate
 *        that the string is nul-terminated, in which case
 *        the returned value will never be %NULL
 *
 * Find the start of the next UTF-8 character in the string after @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * If @p points at a nul byte the position is not advanced. When @end
 * is %NULL the result may point at the terminating nul byte.
 * 
 * Return value: a pointer to the found character, or %NULL if the
 *               search stopped exactly at @end
 **/
gchar *
g_utf8_find_next_char (const gchar *p,
                       const gchar *end)
{
  const gchar *cursor = p;

  if (*cursor != '\0')
    {
      /* Step over the current byte, then over every continuation byte
       * (10xxxxxx), staying below end when a bound was given. */
      ++cursor;
      while ((end == NULL || cursor < end) && (*cursor & 0xc0) == 0x80)
        ++cursor;
    }

  if (cursor == end)
    return NULL;

  return (gchar *)cursor;
}

/**
 * g_utf8_prev_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 *
 * Find the start of the UTF-8 character that precedes @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte. The search is not bounded by the
 * start of the string: if @p might be the first character of the
 * string, you must use g_utf8_find_prev_char() instead.
 * 
 * Return value: a pointer to the found character.
 **/
gchar *
g_utf8_prev_char (const gchar *p)
{
  const gchar *cursor = p;

  /* Always move back at least one byte, then keep going while we are
   * on a continuation byte (10xxxxxx). */
  do
    cursor--;
  while ((*cursor & 0xc0) == 0x80);

  return (gchar *)cursor;
}

/**
 * g_utf8_strlen:
 * @p: pointer to the start of a UTF-8 string.
 * @max: the maximum number of bytes to examine. If @max
 *       is less than 0, then the string is assumed to be
 *       nul-terminated.
 * 
 * Counts the characters in a UTF-8 string. Counting stops at the
 * first nul byte or, when @max is not negative, once @max bytes
 * have been covered. A character whose encoding would extend beyond
 * @max bytes is not counted, so that callers decoding the counted
 * characters do not read past the limit.
 *
 * Return value: the length of the string in characters
 */
gint
g_utf8_strlen (const gchar *p, gint max)
{
  gint len = 0;
  const gchar *start = p;

  /* Nothing to examine, or the empty string. */
  if (max == 0 || !*p)
    return 0;

  if (max < 0)
    {
      /* Unbounded: walk character by character up to the terminator. */
      while (*p)
        {
          p = g_utf8_next_char (p);
          ++len;
        }
      return len;
    }

  while (p - start < max && *p)
    {
      p = g_utf8_next_char (p);
      /* The buffer ends in the middle of this character: only complete
       * characters are counted. */
      if (p - start > max)
        break;
      ++len;
    }

  return len;
}

/**
 * g_utf8_get_char:
 * @p: a pointer to a unicode character encoded as UTF-8
 * 
 * Decode the UTF-8 byte sequence starting at @p into a unicode
 * character. The first byte determines how many bytes are read; each
 * following byte must be a continuation byte.
 * 
 * Return value: the resulting character or (gunichar)-1 if @p does
 *               not point to a valid UTF-8 encoded unicode character
 **/
gunichar
g_utf8_get_char (const gchar *p)
{
  unsigned char lead = (unsigned char) *p;
  gunichar wc = (gunichar)-1;
  int lead_mask = 0;
  int seq_len;
  int idx;

  /* Work out the sequence length and payload mask from the lead byte. */
  UTF8_COMPUTE (lead, lead_mask, seq_len);

  if (seq_len != -1)
    {
      /* Leaves wc at -1 if a trailing byte is not a continuation byte. */
      UTF8_GET (wc, p, idx, lead_mask, seq_len);
    }

  return wc;
}

/**
 * g_utf8_offset_to_pointer:
 * @str: a UTF-8 encoded string
 * @offset: a character offset within the string.
 * 
 * Converts from an integer character offset to a pointer to a position
 * within the string.
 *
 * A negative @offset steps backwards from @str, one character per
 * unit, by skipping back over continuation bytes. No bounds checking
 * is done in either direction: the caller must make sure that the
 * requested position lies within the string.
 * 
 * Return value: the resulting pointer
 **/
gchar *
g_utf8_offset_to_pointer  (const gchar *str,
                           gint         offset)
{
  const gchar *s = str;

  if (offset >= 0)
    {
      while (offset--)
        s = g_utf8_next_char (s);
    }
  else
    {
      /* Walk backwards; previously a negative offset made the forward
       * loop run far beyond the string. */
      for (; offset < 0; offset++)
        {
          do
            s--;
          while ((*s & 0xc0) == 0x80);
        }
    }

  return (gchar *)s;
}

/**
 * g_utf8_pointer_to_offset:
 * @str: a UTF-8 encoded string
 * @pos: a pointer to a position within @str
 * 
 * Converts from a pointer to a position within a string to an integer
 * character offset, by counting the characters stepped over while
 * walking from @str up to @pos. If @pos is not after @str the result
 * is 0.
 * 
 * Return value: the resulting character offset
 **/
gint
g_utf8_pointer_to_offset (const gchar *str,
                          const gchar *pos)
{
  const gchar *cursor;
  gint count = 0;

  for (cursor = str; cursor < pos; cursor = g_utf8_next_char (cursor))
    count++;

  return count;
}


/**
 * g_utf8_strncpy:
 * @dest: buffer to copy the characters into
 * @src: a nul-terminated UTF-8 encoded string
 * @n: the maximum number of characters (not bytes) to copy
 *
 * Like strncpy(), but counts characters rather than bytes: copies at
 * most @n characters of @src to @dest and always writes a terminating
 * nul byte after them. @dest must have room for the copied bytes plus
 * the terminator.
 *
 * Return value: @dest
 **/
gchar *
g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
{
  const gchar *end = src;
  size_t remaining;
  size_t n_bytes;

  /* Find the byte position that follows the first n characters. */
  for (remaining = n; remaining != 0 && *end != '\0'; remaining--)
    end = g_utf8_next_char (end);

  n_bytes = end - src;
  strncpy (dest, src, n_bytes);
  dest[n_bytes] = 0;

  return dest;
}

/*
 * g_utf8_get_charset_internal:
 * @a: location of a charset name pointer, or NULL. The pointer is only
 *     written when it is currently NULL.
 *
 * Works out the character set of the environment. The CHARSET
 * environment variable is consulted first and, when HAVE_CODESET is
 * defined, nl_langinfo (CODESET) second. The first name found is the
 * one stored in *@a; if no source provides a name, *@a is set to
 * "US-ASCII".
 *
 * Returns TRUE if CHARSET contains "UTF-8" or if the codeset is exactly
 * "UTF-8", FALSE otherwise.
 */
static gboolean
g_utf8_get_charset_internal (char **a)
{
  char *name;

  name = getenv("CHARSET");
  if (name != NULL)
    {
      if (a != NULL && *a == NULL)
        *a = name;
      /* Substring match, unlike the exact comparisons below. */
      if (strstr (name, "UTF-8") != NULL)
        return TRUE;
    }

#ifdef HAVE_CODESET
  name = nl_langinfo(CODESET);
  if (name != NULL)
    {
      if (a != NULL && *a == NULL)
        *a = name;
      if (strcmp (name, "UTF-8") == 0)
        return TRUE;
    }
#endif

#if 0 /* #ifdef _NL_CTYPE_CODESET_NAME */
  name = nl_langinfo (_NL_CTYPE_CODESET_NAME);
  if (name != NULL)
    {
      if (a != NULL && *a == NULL)
        *a = name;
      if (strcmp (name, "UTF-8") == 0)
        return TRUE;
    }
#endif

  /* No source named a charset: assume this for compatibility at
   * present. */
  if (a != NULL && *a == NULL)
    *a = "US-ASCII";

  return FALSE;
}

/* Result of the charset lookup: -1 until the first call to
 * g_get_charset(), afterwards the value it returned. */
static int utf8_locale_cache = -1;
/* Charset name found by the first lookup. */
static char *utf8_charset_cache = NULL;

/**
 * g_get_charset:
 * @charset: return location for the character set name, or %NULL
 *
 * Obtains the character set of the environment. The lookup is done on
 * the first call only; later calls return the cached result. The
 * string stored in @charset is not a copy and must not be modified or
 * freed.
 *
 * Return value: %TRUE if the character set is UTF-8
 **/
gboolean
g_get_charset (char **charset) 
{
  if (utf8_locale_cache == -1)
    utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);

  if (charset != NULL)
    *charset = utf8_charset_cache;

  return utf8_locale_cache;
}

/* unicode_strchr */

/**
 * g_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *          If %NULL, the length is computed and returned and
 *          nothing is written.
 * 
 * Convert a single character to UTF-8. No terminating nul byte is
 * written.
 * 
 * Return value: number of bytes written (or that would be written
 *               if @outbuf is %NULL), between 1 and 6
 **/
int
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
{
  int len;
  int first;
  int i;

  /* Pick the sequence length and the marker bits of the first byte
   * from the magnitude of the code point. */
  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf != NULL)
    {
      /* Fill the continuation bytes from the end, six bits each; what
       * is left of c goes into the first byte. */
      for (i = len - 1; i > 0; --i)
        {
          outbuf[i] = (c & 0x3f) | 0x80;
          c >>= 6;
        }
      outbuf[0] = c | first;
    }

  return len;
}

/**
 * g_utf8_strchr:
 * @p: a nul-terminated UTF-8 string
 * @c: a ISO10646 character
 * 
 * Find the leftmost occurrence of the given ISO10646 character
 * in a UTF-8 string. The character is encoded as UTF-8 and searched
 * for with strstr(); as a consequence, a @c of 0 yields an empty
 * search string and the result is @p itself.
 * 
 * Return value: %NULL if the string does not contain the character,
 *               otherwise a pointer to the start of the leftmost
 *               occurrence of the character in the string.
 **/
gchar *
g_utf8_strchr (const char *p, gunichar c)
{
  gchar needle[10];
  gint n_bytes;

  /* At most 6 bytes are written, leaving room for the terminator. */
  n_bytes = g_unichar_to_utf8 (c, needle);
  needle[n_bytes] = '\0';

  return strstr(p, needle);
}

#if 0
/**
 * g_utf8_strrchr:
 * @p: a nul-terminated UTF-8 string
 * @c: a ISO10646 character
 * 
 * Find the rightmost occurrence of the given ISO10646 character
 * in a UTF-8 string.
 * 
 * Return value: %NULL if the string does not contain the character,
 *               otherwise a pointer to the start of the rightmost
 *               occurrence of the character in the string.
 **/

/* This is ifdefed out atm as there is no strrstr function in libc.
 *
 * NOTE(review): before enabling this, note that the function below is
 * still named unicode_strrchr while the documentation above calls it
 * g_utf8_strrchr, and that `len' is used without being declared.
 */
gchar *
unicode_strrchr (const char *p, gunichar c)
{
  gchar ch[10];

  /* Encode c as UTF-8 and nul-terminate it to use it as a search string. */
  len = g_unichar_to_utf8 (c, ch);
  ch[len] = '\0';
  
  return strrstr(p, ch);
}
#endif


/**
 * g_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum number of bytes of @str to convert, or a negative
 *       value if @str is nul-terminated
 * 
 * Convert a string from UTF-8 to a 32-bit fixed width
 * representation as UCS-4. The number of characters converted is the
 * one reported by g_utf8_strlen() for @str and @len. A byte sequence
 * that is not valid UTF-8 is stored as (gunichar)-1.
 *
 * The result is terminated with a 0 element so that its length can be
 * determined by the caller; the terminator is not part of the
 * converted characters.
 * 
 * Return value: a pointer to a newly allocated, 0-terminated UCS-4
 *               string. This value must be freed with g_free()
 **/
gunichar *
g_utf8_to_ucs4 (const char *str, int len)
{
  gunichar *result;
  gint n_chars, i;
  const gchar *p;
  
  n_chars = g_utf8_strlen (str, len);
  /* One extra element for the terminating 0. */
  result = g_new (gunichar, n_chars + 1);
  
  p = str;
  for (i = 0; i < n_chars; i++)
    {
      result[i] = g_utf8_get_char (p);
      p = g_utf8_next_char (p);
    }
  result[n_chars] = 0;

  return result;
}
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`/* gutf8.c - Operations on UTF-8 strings.`
			`*`
			`* Copyright (C) 1999 Tom Tromey`
			`* Copyright (C) 2000 Red Hat, Inc.`
			`*`
			`* This library is free software; you can redistribute it and/or`
applied patch from Andreas Persenius <ndap@swipnet.se> that updates the Wed Jul 26 12:59:31 2000 Tim Janik <timj@gtk.org> * *.[hc]: applied patch from Andreas Persenius <ndap@swipnet.se> that updates the license headers to the GNU Lesser General Public License, as well as updating the copyright year to 2000. 2000-07-26 13:02:02 +02:00			`* modify it under the terms of the GNU Lesser General Public`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`* License as published by the Free Software Foundation; either`
			`* version 2 of the License, or (at your option) any later version.`
			`*`
			`* This library is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
applied patch from Andreas Persenius <ndap@swipnet.se> that updates the Wed Jul 26 12:59:31 2000 Tim Janik <timj@gtk.org> * *.[hc]: applied patch from Andreas Persenius <ndap@swipnet.se> that updates the license headers to the GNU Lesser General Public License, as well as updating the copyright year to 2000. 2000-07-26 13:02:02 +02:00			`* Lesser General Public License for more details.`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`*`
applied patch from Andreas Persenius <ndap@swipnet.se> that updates the Wed Jul 26 12:59:31 2000 Tim Janik <timj@gtk.org> * *.[hc]: applied patch from Andreas Persenius <ndap@swipnet.se> that updates the license headers to the GNU Lesser General Public License, as well as updating the copyright year to 2000. 2000-07-26 13:02:02 +02:00			`* You should have received a copy of the GNU Lesser General Public`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`* License along with this library; if not, write to the`
			`* Free Software Foundation, Inc., 59 Temple Place - Suite 330,`
			`* Boston, MA 02111-1307, USA.`
			`*/`

			`#include <config.h>`

			`#include <stdlib.h>`
Fix up to correspond to configure.in checks. Mon Jul 3 17:58:02 2000 Owen Taylor <otaylor@redhat.com> * gutf8.c (g_utf8_get_charset_internal): Fix up to correspond to configure.in checks. 2000-07-04 00:01:58 +02:00			`#ifdef HAVE_CODESET`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`#include <langinfo.h>`
			`#endif`
			`#include <string.h>`

			`#include "glib.h"`

			`#define UTF8_COMPUTE(Char, Mask, Len) \`
			`if (Char < 128) \`
			`{ \`
			`Len = 1; \`
			`Mask = 0x7f; \`
			`} \`
			`else if ((Char & 0xe0) == 0xc0) \`
			`{ \`
			`Len = 2; \`
			`Mask = 0x1f; \`
			`} \`
			`else if ((Char & 0xf0) == 0xe0) \`
			`{ \`
			`Len = 3; \`
			`Mask = 0x0f; \`
			`} \`
			`else if ((Char & 0xf8) == 0xf0) \`
			`{ \`
			`Len = 4; \`
			`Mask = 0x07; \`
			`} \`
			`else if ((Char & 0xfc) == 0xf8) \`
			`{ \`
			`Len = 5; \`
			`Mask = 0x03; \`
			`} \`
			`else if ((Char & 0xfe) == 0xfc) \`
			`{ \`
			`Len = 6; \`
			`Mask = 0x01; \`
			`} \`
			`else \`
			`Len = -1;`

			`#define UTF8_GET(Result, Chars, Count, Mask, Len) \`
			`(Result) = (Chars)[0] & (Mask); \`
			`for ((Count) = 1; (Count) < (Len); ++(Count)) \`
			`{ \`
			`if (((Chars)[(Count)] & 0xc0) != 0x80) \`
			`{ \`
			`(Result) = -1; \`
			`break; \`
			`} \`
			`(Result) <<= 6; \`
			`(Result) |= ((Chars)[(Count)] & 0x3f); \`
			`}`

			`gchar g_utf8_skip[256] = {`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,`
			`3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0`
			`};`

			`/**`
			`* g_utf8_find_prev_char:`
			`* @str: pointer to the beginning of a UTF-8 string`
			`* @p: pointer to some position within @str`
			`*`
			`* Given a position @p with a UTF-8 encoded string @str, find the start`
			`* of the previous UTF-8 character starting before @p. Returns %NULL if no`
			`* UTF-8 characters are present in @p before @str.`
			`*`
			`* @p does not have to be at the beginning of a UTF-8 chracter. No check`
			`* is made to see if the character found is actually valid other than`
			`* it starts with an appropriate byte.`
			`*`
			`* Return value: a pointer to the found character or %NULL.`
			`**/`
			`gchar *`
			`g_utf8_find_prev_char (const char *str,`
			`const char *p)`
			`{`
			`for (--p; p > str; --p)`
			`{`
			`if ((*p & 0xc0) != 0x80)`
			`return (gchar *)p;`
			`}`
			`return NULL;`
			`}`

			`/**`
			`* g_utf8_find_next_char:`
			`* @p: a pointer to a position within a UTF-8 encoded string`
			`* @end: a pointer to the end of the string, or %NULL to indicate`
			`* that the string is NULL terminated, in which case`
			`* the returned value will be`
			`*`
			`* Find the start of the next utf-8 character in the string after @p`
			`*`
			`* @p does not have to be at the beginning of a UTF-8 chracter. No check`
			`* is made to see if the character found is actually valid other than`
			`* it starts with an appropriate byte.`
			`*`
			`* Return value: a pointer to the found character or %NULL`
			`**/`
			`gchar *`
			`g_utf8_find_next_char (const gchar *p,`
			`const gchar *end)`
			`{`
			`if (*p)`
			`{`
			`if (end)`
			`for (++p; p < end && (*p & 0xc0) == 0x80; ++p)`
			`;`
			`else`
			`for (++p; (*p & 0xc0) == 0x80; ++p)`
			`;`
			`}`
			`return (p == end) ? NULL : (gchar *)p;`
			`}`

			`/**`
			`* g_utf8_prev_char:`
			`* @p: a pointer to a position within a UTF-8 encoded string`
			`*`
			`* Find the previous UTF-8 character in the string before @p`
			`*`
			`* @p does not have to be at the beginning of a UTF-8 character. No check`
			`* is made to see if the character found is actually valid other than`
			`* it starts with an appropriate byte. If @p might be the first`
			`* character of the string, you must use g_utf8_find_prev_char instead.`
			`*`
			`* Return value: a pointer to the found character.`
			`**/`
			`gchar *`
			`g_utf8_prev_char (const gchar *p)`
			`{`
			`while (TRUE)`
			`{`
			`p--;`
			`if ((*p & 0xc0) != 0x80)`
			`return (gchar *)p;`
			`}`
			`}`

			`/**`
			`* g_utf8_strlen:`
			`* @p: pointer to the start of a UTF-8 string.`
			`* @max: the maximum number of bytes to examine. If @max`
			`* is less than 0, then the string is assumed to be`
			`* nul-terminated.`
			`*`
			`* Return value: the length of the string in characters`
			`*/`
			`gint`
			`g_utf8_strlen (const gchar *p, gint max)`
			`{`
			`int len = 0;`
			`const gchar *start = p;`
			`/* special case for the empty string */`
			`if (!*p)`
			`return 0;`
			`/* Note that the test here and the test in the loop differ subtly.`
			`In the loop we want to see if we've passed the maximum limit --`
			`for instance if the buffer ends mid-character. Here at the top`
			`of the loop we want to see if we've just reached the last byte. */`
			`while (max < 0 || p - start < max)`
			`{`
			`p = g_utf8_next_char (p);`
			`++len;`
			`if (! *p || (max > 0 && p - start > max))`
			`break;`
			`}`
			`return len;`
			`}`

			`/**`
			`* g_utf8_get_char:`
			`* @p: a pointer to unicode character encoded as UTF-8`
			`*`
			`* Convert a sequence of bytes encoded as UTF-8 to a unicode character.`
			`*`
			`* Return value: the resulting character or (gunichar)-1 if @p does`
			`* not point to a valid UTF-8 encoded unicode character`
			`**/`
			`gunichar`
			`g_utf8_get_char (const gchar *p)`
			`{`
			`int i, mask = 0, len;`
			`gunichar result;`
			`unsigned char c = (unsigned char) *p;`

			`UTF8_COMPUTE (c, mask, len);`
			`if (len == -1)`
			`return (gunichar)-1;`
			`UTF8_GET (result, p, i, mask, len);`

			`return result;`
			`}`

			`/**`
			`* g_utf8_offset_to_pointer:`
			`* @str: a UTF-8 encoded string`
			`* @offset: a character offset within the string.`
			`*`
			`* Converts from an integer character offset to a pointer to a position`
			`* within the string.`
			`*`
			`* Return value: the resulting pointer`
			`**/`
			`gchar *`
			`g_utf8_offset_to_pointer (const gchar *str,`
			`gint offset)`
			`{`
			`const gchar *s = str;`
			`while (offset--)`
			`s = g_utf8_next_char (s);`

			`return (gchar *)s;`
			`}`

			`/**`
			`* g_utf8_pointer_to_offset:`
			`* @str: a UTF-8 encoded string`
			`* @pos: a pointer to a position within @str`
			`*`
			`* Converts from a pointer to position within a string to a integer`
			`* character offset`
			`*`
			`* Return value: the resulting character offset`
			`**/`
			`gint`
			`g_utf8_pointer_to_offset (const gchar *str,`
			`const gchar *pos)`
			`{`
			`const gchar *s = str;`
			`gint offset = 0;`

			`while (s < pos)`
			`{`
			`s = g_utf8_next_char (s);`
			`offset++;`
			`}`

			`return offset;`
			`}`


			`gchar *`
			`g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)`
			`{`
			`const gchar *s = src;`
			`while (n && *s)`
			`{`
			`s = g_utf8_next_char(s);`
			`n--;`
			`}`
			`strncpy(dest, src, s - src);`
			`dest[s - src] = 0;`
			`return dest;`
			`}`

			`static gboolean`
			`g_utf8_get_charset_internal (char **a)`
			`{`
			`char *charset = getenv("CHARSET");`

			`if (charset && a && ! *a)`
			`*a = charset;`

			`if (charset && strstr (charset, "UTF-8"))`
			`return TRUE;`

Fix up to correspond to configure.in checks. Mon Jul 3 17:58:02 2000 Owen Taylor <otaylor@redhat.com> * gutf8.c (g_utf8_get_charset_internal): Fix up to correspond to configure.in checks. 2000-07-04 00:01:58 +02:00			`#ifdef HAVE_CODESET`
			`charset = nl_langinfo(CODESET);`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`if (charset)`
			`{`
			`if (a && ! *a)`
			`*a = charset;`
			`if (strcmp (charset, "UTF-8") == 0)`
			`return TRUE;`
			`}`
Fix up to correspond to configure.in checks. Mon Jul 3 17:58:02 2000 Owen Taylor <otaylor@redhat.com> * gutf8.c (g_utf8_get_charset_internal): Fix up to correspond to configure.in checks. 2000-07-04 00:01:58 +02:00			`#endif`

			`#if 0 /* #ifdef _NL_CTYPE_CODESET_NAME */`
			`charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00			`if (charset)`
			`{`
			`if (a && ! *a)`
			`*a = charset;`
			`if (strcmp (charset, "UTF-8") == 0)`
			`return TRUE;`
			`}`
Fix up to correspond to configure.in checks. Mon Jul 3 17:58:02 2000 Owen Taylor <otaylor@redhat.com> * gutf8.c (g_utf8_get_charset_internal): Fix up to correspond to configure.in checks. 2000-07-04 00:01:58 +02:00			`#endif`
Initial pass at adding unicode support functions. A few things still need Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com> * gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h Makefile.am glib.h: Initial pass at adding unicode support functions. A few things still need to be implemented, a bit of cleanup needs to be done, tests need to be added, and the docs need to be finished, but this should allow replacing most or all use of libunicode. 2000-06-21 18:11:21 +02:00
			`if (a && ! *a)`
			`*a = "US-ASCII";`
			`/* Assume this for compatibility at present. */`
			`return FALSE;`
			`}`

			`static int utf8_locale_cache = -1;`
			`static char *utf8_charset_cache = NULL;`

			`gboolean`
			`g_get_charset (char **charset)`
			`{`
			`if (utf8_locale_cache != -1)`
			`{`
			`if (charset)`
			`*charset = utf8_charset_cache;`
			`return utf8_locale_cache;`
			`}`
			`utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);`
			`if (charset)`
			`*charset = utf8_charset_cache;`
			`return utf8_locale_cache;`
			`}`

			`/* unicode_strchr */`

			`/**`
			`* g_unichar_to_utf8:`
			`* @ch: a ISO10646 character code`
			`* @out: output buffer, must have at least 6 bytes of space.`
			`*`
			`* Convert a single character to utf8`
			`*`
			`* Return value: number of bytes written`
			`**/`
			`int`
			`g_unichar_to_utf8 (gunichar c, gchar *outbuf)`
			`{`
			`size_t len = 0;`
			`int first;`
			`int i;`

			`if (c < 0x80)`
			`{`
			`first = 0;`
			`len = 1;`
			`}`
			`else if (c < 0x800)`
			`{`
			`first = 0xc0;`
			`len = 2;`
			`}`
			`else if (c < 0x10000)`
			`{`
			`first = 0xe0;`
			`len = 3;`
			`}`
			`else if (c < 0x200000)`
			`{`
			`first = 0xf0;`
			`len = 4;`
			`}`
			`else if (c < 0x4000000)`
			`{`
			`first = 0xf8;`
			`len = 5;`
			`}`
			`else`
			`{`
			`first = 0xfc;`
			`len = 6;`
			`}`

			`for (i = len - 1; i > 0; --i)`
			`{`
			`outbuf[i] = (c & 0x3f) | 0x80;`
			`c >>= 6;`
			`}`
			`outbuf[0] = c | first;`

			`return len;`
			`}`

			`/**`
			`* g_utf8_strchr:`
			`* @p: a nul-terminated utf-8 string`
			`* @c: a iso-10646 character/`
			`*`
			`* Find the leftmost occurence of the given iso-10646 character`
			`* in a UTF-8 string.`
			`*`
			`* Return value: NULL if the string does not contain the character, otherwise, a`
			`* a pointer to the start of the leftmost of the character in the string.`
			`**/`
			`gchar *`
			`g_utf8_strchr (const char *p, gunichar c)`
			`{`
			`gchar ch[10];`

			`gint len = g_unichar_to_utf8 (c, ch);`
			`ch[len] = '\0';`

			`return strstr(p, ch);`
			`}`

			`#if 0`
			`/**`
			`* g_utf8_strrchr:`
			`* @p: a nul-terminated utf-8 string`
			`* @c: a iso-10646 character/`
			`*`
			`* Find the rightmost occurence of the given iso-10646 character`
			`* in a UTF-8 string.`
			`*`
			`* Return value: NULL if the string does not contain the character, otherwise, a`
			`* a pointer to the start of the rightmost of the character in the string.`
			`**/`

			`/* This is ifdefed out atm as there is no strrstr function in libc.`
			`*/`
			`gchar *`
			`unicode_strrchr (const char *p, gunichar c)`
			`{`
			`gchar ch[10];`

			`len = g_unichar_to_utf8 (c, ch);`
			`ch[len] = '\0';`

			`return strrstr(p, ch);`
			`}`
			`#endif`


			`/**`
			`* g_utf8_to_ucs4:`
			`* @str: a UTF-8 encoded strnig`
			`* @len: the length of @`
			`*`
			`* Convert a string from UTF-8 to a 32-bit fixed width`
			`* representation as UCS-4.`
			`*`
			`* Return value: a pointer to a newly allocated UCS-4 string.`
			`* This value must be freed with g_free()`
			`**/`
			`gunichar *`
			`g_utf8_to_ucs4 (const char *str, int len)`
			`{`
			`gunichar *result;`
			`gint n_chars, i;`
			`const gchar *p;`

			`n_chars = g_utf8_strlen (str, len);`
			`result = g_new (gunichar, n_chars);`

			`p = str;`
			`for (i=0; i < n_chars; i++)`
			`{`
			`result[i] = g_utf8_get_char (p);`
			`p = g_utf8_next_char (p);`
			`}`

			`return result;`
			`}`