Initial pass at adding unicode support functions. A few things still need

Wed Jun 21 12:09:03 2000  Owen Taylor  <otaylor@redhat.com>

	* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
	Makefile.am glib.h: Initial pass at adding unicode support
	functions. A few things still need to be implemented, a bit
	of cleanup needs to be done, tests need to be added, and
	the docs need to be finished, but this should allow replacing
	most or all use of libunicode.
This commit is contained in:
Owen Taylor 2000-06-21 16:11:21 +00:00 committed by Owen Taylor
parent 876a6767eb
commit 0891c64816
24 changed files with 16676 additions and 4 deletions

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -1,3 +1,12 @@
Wed Jun 21 12:09:03 2000 Owen Taylor <otaylor@redhat.com>
* gunicode.h gutf8.c guniprop.c gunidecomp.[ch] gunichartables.h
Makefile.am glib.h: Initial pass at adding unicode support
functions. A few things still need to be implemented, a bit
of cleanup needs to be done, tests need to be added, and
the docs need to be finished, but this should allow replacing
most or all use of libunicode.
2000-06-06 Tor Lillqvist <tml@iki.fi>
* giowin32.c (g_io_channel_win32_pipe_readable): If we are

View File

@ -62,9 +62,14 @@ libglib_la_SOURCES = \
gthreadpool.c \
gtimer.c \
gtree.c \
guniprop.c \
gutf8.c \
gunichartables.h \
gunidecomp.h \
gunidecomp.c \
gutils.c
include_HEADERS = glib.h glib-object.h
include_HEADERS = glib.h glib-object.h gunicode.h
configexecincludedir = $(pkglibdir)/include
#configexecinclude_DATA = glibconfig.h

3
glib.h
View File

@ -3321,9 +3321,10 @@ guint g_thread_pool_get_num_unused_threads (void);
/* Stop all currently unused threads, but leave the limit untouched */
void g_thread_pool_stop_unused_threads (void);
#include <gunicode.h>
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* __G_LIB_H__ */

View File

@ -62,9 +62,14 @@ libglib_la_SOURCES = \
gthreadpool.c \
gtimer.c \
gtree.c \
guniprop.c \
gutf8.c \
gunichartables.h \
gunidecomp.h \
gunidecomp.c \
gutils.c
include_HEADERS = glib.h glib-object.h
include_HEADERS = glib.h glib-object.h gunicode.h
configexecincludedir = $(pkglibdir)/include
#configexecinclude_DATA = glibconfig.h

View File

@ -3321,9 +3321,10 @@ guint g_thread_pool_get_num_unused_threads (void);
/* Stop all currently unused threads, but leave the limit untouched */
void g_thread_pool_stop_unused_threads (void);
#include <gunicode.h>
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* __G_LIB_H__ */

5390
glib/gunichartables.h Normal file

File diff suppressed because it is too large Load Diff

178
glib/gunicode.h Normal file
View File

@ -0,0 +1,178 @@
/* gunicode.h - Unicode manipulation functions
*
* Copyright (C) 1999, 2000 Tom Tromey
* Copyright 2000 Red Hat, Inc.
*
* The Gnome Library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* The Gnome Library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifndef __GUNICODE_H__
#define __GUNICODE_H__
#include <stdlib.h> /* For size_t */
#ifdef __cplusplus
extern "C"
{
#endif
/* A single Unicode character (code point), held in 32 bits. */
typedef guint32 gunichar;
/* A 16-bit unit, used by the UTF-16 conversion prototypes in this header. */
typedef guint16 gunichar2;
/* These are the possible character classifications.
 *
 * Each value names one Unicode general category; the two-letter code in
 * the trailing comment is the standard abbreviation for that category.
 * NOTE(review): the numeric values are presumably baked into the generated
 * tables in gunichartables.h, so the order should not be changed - confirm
 * against the table generator before touching it.
 */
typedef enum {
G_UNICODE_CONTROL, /* Cc */
G_UNICODE_FORMAT, /* Cf */
G_UNICODE_UNASSIGNED, /* Cn */
G_UNICODE_PRIVATE_USE, /* Co */
G_UNICODE_SURROGATE, /* Cs */
G_UNICODE_LOWERCASE_LETTER, /* Ll */
G_UNICODE_MODIFIER_LETTER, /* Lm */
G_UNICODE_OTHER_LETTER, /* Lo */
G_UNICODE_TITLECASE_LETTER, /* Lt */
G_UNICODE_UPPERCASE_LETTER, /* Lu */
G_UNICODE_COMBINING_MARK, /* Mc */
G_UNICODE_ENCLOSING_MARK, /* Me */
G_UNICODE_NON_SPACING_MARK, /* Mn */
G_UNICODE_DECIMAL_NUMBER, /* Nd */
G_UNICODE_LETTER_NUMBER, /* Nl */
G_UNICODE_OTHER_NUMBER, /* No */
G_UNICODE_CONNECT_PUNCTUATION, /* Pc */
G_UNICODE_DASH_PUNCTUATION, /* Pd */
G_UNICODE_CLOSE_PUNCTUATION, /* Pe */
G_UNICODE_FINAL_PUNCTUATION, /* Pf */
G_UNICODE_INITIAL_PUNCTUATION, /* Pi */
G_UNICODE_OTHER_PUNCTUATION, /* Po */
G_UNICODE_OPEN_PUNCTUATION, /* Ps */
G_UNICODE_CURRENCY_SYMBOL, /* Sc */
G_UNICODE_MODIFIER_SYMBOL, /* Sk */
G_UNICODE_MATH_SYMBOL, /* Sm */
G_UNICODE_OTHER_SYMBOL, /* So */
G_UNICODE_LINE_SEPARATOR, /* Zl */
G_UNICODE_PARAGRAPH_SEPARATOR, /* Zp */
G_UNICODE_SPACE_SEPARATOR /* Zs */
} GUnicodeType;
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
 * not null, sets *CHARSET to the name of the current locale's
 * charset. This value is statically allocated: the caller must not
 * modify or free it.
 */
gboolean g_get_charset (char **charset);
/* These are all analogs of the <ctype.h> functions. Each one takes a
 * single Unicode character and returns TRUE or FALSE.
 */
gboolean g_unichar_isalnum (gunichar c);
gboolean g_unichar_isalpha (gunichar c);
gboolean g_unichar_iscntrl (gunichar c);
gboolean g_unichar_isdigit (gunichar c);
gboolean g_unichar_isgraph (gunichar c);
gboolean g_unichar_islower (gunichar c);
gboolean g_unichar_isprint (gunichar c);
gboolean g_unichar_ispunct (gunichar c);
gboolean g_unichar_isspace (gunichar c);
gboolean g_unichar_isupper (gunichar c);
gboolean g_unichar_isxdigit (gunichar c);
gboolean g_unichar_istitle (gunichar c);
gboolean g_unichar_isdefined (gunichar c);
gboolean g_unichar_iswide (gunichar c);
/* More <ctype.h> functions. These convert between the three cases
 * (upper, lower and title).
 * See the Unicode book to understand title case. */
gunichar g_unichar_toupper (gunichar c);
gunichar g_unichar_tolower (gunichar c);
gunichar g_unichar_totitle (gunichar c);
/* If C is a digit (according to `g_unichar_isdigit'), then return its
numeric value. Otherwise return -1. g_unichar_xdigit_value is the
hexadecimal counterpart and additionally accepts 'a'-'f' and 'A'-'F'. */
gint g_unichar_digit_value (gunichar c);
gint g_unichar_xdigit_value (gunichar c);
/* Return the Unicode character type of a given character. */
GUnicodeType g_unichar_type (gunichar c);
/* Compute canonical ordering of a string in-place. This rearranges
decomposed characters in the string according to their combining
classes. See the Unicode manual for more information.
STRING is an array of LEN characters; it is not nul-terminated. */
void g_unicode_canonical_ordering (gunichar *string,
size_t len);
/* Compute canonical decomposition of a character. Returns g_malloc()d
string of Unicode characters. RESULT_LEN is set to the resulting
length of the string (in characters, not bytes). A character without
a decomposition yields a one-element string holding the character
itself. */
gunichar *g_unicode_canonical_decomposition (gunichar ch,
size_t *result_len);
/* Array of skip-bytes-per-initial character: indexed by the first byte
 * of a UTF-8 sequence, it gives the number of bytes to step over to
 * reach the start of the following sequence.
 */
extern char g_utf8_skip[256];
/* Return a pointer to the character following the one P points at.
 * P must point at the first byte of a character. The argument is
 * evaluated twice, so it must not have side effects. The expansion is
 * fully parenthesized so that the result can be indexed or
 * dereferenced directly, e.g. g_utf8_next_char (p)[0].
 */
#define g_utf8_next_char(p) ((char *)((p) + g_utf8_skip[*(guchar *)(p)]))
/* Decode the UTF-8 sequence starting at P into a single character. */
gunichar g_utf8_get_char (const gchar *p);
/* Convert between a character offset and a byte pointer within STR. */
gchar * g_utf8_offset_to_pointer (const gchar *str,
gint offset);
gint g_utf8_pointer_to_offset (const gchar *str,
const gchar *pos);
/* Step backwards/forwards to the start of a neighbouring character.
 * g_utf8_prev_char performs no bounds check; the find_ variants take a
 * limit and may return NULL. */
gchar * g_utf8_prev_char (const gchar *p);
gchar * g_utf8_find_next_char (const gchar *p,
const gchar *bound);
gchar * g_utf8_find_prev_char (const gchar *str,
const gchar *p);
/* Length of P in characters; MAX is a byte limit, or negative if P is
 * nul-terminated. */
gint g_utf8_strlen (const gchar *p,
gint max);
/* Copies n characters from src to dest */
gchar *g_utf8_strncpy (gchar *dest,
const gchar *src,
size_t n);
/* Find the UTF-8 character corresponding to ch, in string p. These
functions are equivalents to strchr and strrchr */
gchar *g_utf8_strchr (const gchar *p,
gunichar ch);
gchar *g_utf8_strrchr (const gchar *p,
gunichar ch);
/* Conversions between UTF-8, UTF-16 and UCS-4 strings.
 * NOTE(review): g_ucs4_to_utf16 is declared as returning gunichar *,
 * while the other UTF-16 producer, g_utf8_to_utf16, returns
 * gunichar2 * - presumably this should be gunichar2 * too; confirm
 * before the function is implemented. */
gunichar2 *g_utf8_to_utf16 (const gchar *str,
gint len);
gunichar * g_utf8_to_ucs4 (const gchar *str,
gint len);
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
gint len);
gchar * g_utf16_to_utf8 (const gunichar2 *str,
gint len);
gunichar * g_ucs4_to_utf16 (const gunichar *str,
gint len);
gchar * g_ucs4_to_utf8 (const gunichar *str,
gint len);
/* Convert a single character into UTF-8. outbuf must have at
* least 6 bytes of space. Returns the number of bytes in the
* result.
*/
gint g_unichar_to_utf8 (gunichar c,
char *outbuf);
#ifdef __cplusplus
}
#endif
#endif /* GUNICODE_H */

133
glib/gunidecomp.c Normal file
View File

@ -0,0 +1,133 @@
/* decomp.c - Character decomposition.
*
* Copyright (C) 1999, 2000 Tom Tromey
* Copyright 2000 Red Hat, Inc.
*
* The Gnome Library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* The Gnome Library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "glib.h"
#include "gunidecomp.h"
#include <config.h>
#include <stdlib.h>
/* We cheat a bit and cast type values to (char *). We detect these
using the &0xff trick.
CC looks up the combining class of character Char on page Page (a page
being the character value shifted right by 8). An entry of
combining_class_table is treated as a literal class value shared by the
whole page when it is unchanged by masking with 0xff, and as a pointer
to a per-character array otherwise.
NOTE(review): the test casts the table entry to int, which discards the
upper half of a pointer on platforms where pointers are wider than int
- confirm this is acceptable for the supported targets. */
#define CC(Page, Char) \
(((((int) (combining_class_table[Page])) & 0xff) \
== ((int) combining_class_table[Page])) \
? ((int) combining_class_table[Page]) \
: (combining_class_table[Page][Char]))
/* Combining class of Char; characters above UNICODE_LAST_CHAR are given
class 0. Char is evaluated more than once. */
#define COMBINING_CLASS(Char) \
(((Char) > (UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
/* Compute the canonical ordering of a string in-place.
 *
 * STRING is an array of LEN characters. Characters with a non-zero
 * combining class are moved leftward past any immediately preceding
 * characters with a higher combining class; characters of class 0 are
 * never moved across. The passes repeat until one makes no exchange.
 */
void
g_unicode_canonical_ordering (gunichar *string,
                              size_t    len)
{
  size_t i;
  int swap = 1;

  /* Fewer than two characters cannot be out of order. Returning here
     also keeps `len - 1' below from wrapping around when len is 0 and
     avoids reading string[0] of an empty string. */
  if (len < 2)
    return;

  while (swap)
    {
      int last;

      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);

          if (next != 0 && last > next)
            {
              size_t j;

              /* Percolate item leftward through string. The scan
                 starts at the item itself (i + 1) and compares with
                 the element to its left, so that string[0] takes part
                 in the exchange as well. */
              for (j = i + 1; j > 0; --j)
                {
                  gunichar t;

                  if (COMBINING_CLASS (string[j - 1]) <= next)
                    break;
                  t = string[j];
                  string[j] = string[j - 1];
                  string[j - 1] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again. */
              next = last;
            }
          last = next;
        }
    }
}
/* Compute the canonical decomposition of CH.
 *
 * Returns a newly g_malloc()ed array of characters and stores its
 * length (in characters) in *RESULT_LEN. If CH has no entry in
 * decomp_table the result is the one-element array { CH }. The
 * caller owns the returned array.
 */
gunichar *
g_unicode_canonical_decomposition (gunichar ch,
                                   size_t  *result_len)
{
  gunichar *r = NULL;

  if (ch <= 0xffff)
    {
      /* Binary search over the half-open range [start, end). The lower
         bound moves to half + 1 so that the range shrinks on every
         iteration and the loop terminates when CH is absent. */
      int start = 0;
      int end = G_N_ELEMENTS (decomp_table);

      while (start < end)
        {
          int half = start + (end - start) / 2;

          if (ch == decomp_table[half].ch)
            {
              /* Found it. */
              int i, len;

              /* We store as a double-nul terminated string. */
              for (len = 0; (decomp_table[half].expansion[len]
                             || decomp_table[half].expansion[len + 1]);
                   len += 2)
                ;

              /* We've counted twice as many bytes as there are
                 characters. */
              *result_len = len / 2;
              r = g_malloc (len / 2 * sizeof (gunichar));

              /* Each character is stored as two bytes, high byte first. */
              for (i = 0; i < len; i += 2)
                {
                  r[i / 2] = (decomp_table[half].expansion[i] << 8
                              | decomp_table[half].expansion[i + 1]);
                }
              break;
            }
          else if (ch > decomp_table[half].ch)
            start = half + 1;
          else
            end = half;
        }
    }

  if (r == NULL)
    {
      /* Not in our table. */
      r = g_malloc (sizeof (gunichar));
      *r = ch;
      *result_len = 1;
    }

  /* Supposedly following the Unicode 2.1.9 table means that the
     decompositions come out in canonical order. I haven't tested
     this, but we rely on it here. */
  return r;
}

1755
glib/gunidecomp.h Normal file

File diff suppressed because it is too large Load Diff

355
glib/guniprop.c Normal file
View File

@ -0,0 +1,355 @@
/* guniprop.c - Unicode character properties.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "glib.h"
#include "gunichartables.h"
#include <config.h>
#include <stddef.h>
/* Number of elements in the array x (x must be an array, not a pointer). */
#define asize(x) ((sizeof (x)) / sizeof (x[0]))
/* Attribute stored for character Char of page Page, or 0 when the page
has no attribute data. The callers in this file read it as a case
mapping for letters and as the numeric value for decimal digits. */
#define ATTTABLE(Page, Char) \
((attr_table[Page] == 0) ? 0 : (attr_table[Page][Char]))
/* We cheat a bit and cast type values to (char *). We detect these
using the &0xff trick.
An entry of type_table is treated as a literal type shared by the whole
page when masking with 0xff leaves it unchanged, and as a pointer to a
per-character array otherwise.
NOTE(review): the test casts the entry to int, which discards the upper
half of a pointer where pointers are wider than int - confirm. */
#define TTYPE(Page, Char) \
(((((int) type_table[Page]) & 0xff) == ((int) type_table[Page])) \
? ((int) (type_table[Page])) \
: (type_table[Page][Char]))
/* Type of Char; anything above G_UNICODE_LAST_CHAR is unassigned.
Char is evaluated more than once. */
#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
/* True for any of the three number types, not just decimal digits. */
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
|| (Type) == G_UNICODE_LETTER_NUMBER \
|| (Type) == G_UNICODE_OTHER_NUMBER)
/* True for any of the five letter types. */
#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
|| (Type) == G_UNICODE_UPPERCASE_LETTER \
|| (Type) == G_UNICODE_TITLECASE_LETTER \
|| (Type) == G_UNICODE_MODIFIER_LETTER \
|| (Type) == G_UNICODE_OTHER_LETTER)
/**
 * g_unichar_isalnum:
 * @c: a unicode character
 *
 * Determines if a character is a letter or a number. Every letter
 * type and every number type (decimal, letter and other numbers)
 * qualifies.
 *
 * Return value: %TRUE if @c is alphanumeric.
 **/
gboolean
g_unichar_isalnum (gunichar c)
{
  const int category = TYPE (c);

  return ISALPHA (category) || ISDIGIT (category);
}
/**
 * g_unichar_isalpha:
 * @c: a unicode character
 *
 * Determines if a character is a letter of any kind (lowercase,
 * uppercase, titlecase, modifier or other letter).
 *
 * Return value: %TRUE if @c is a letter.
 **/
gboolean
g_unichar_isalpha (gunichar c)
{
  const int category = TYPE (c);

  return ISALPHA (category);
}
/**
 * g_unichar_iscntrl:
 * @c: a unicode character
 *
 * Determines if a character is a control character.
 *
 * Return value: %TRUE if the type of @c is %G_UNICODE_CONTROL.
 **/
gboolean
g_unichar_iscntrl (gunichar c)
{
  const int category = TYPE (c);

  return category == G_UNICODE_CONTROL;
}
/**
 * g_unichar_isdigit:
 * @c: a unicode character
 *
 * Determines if a character is a decimal digit. Letter numbers and
 * other numbers do not count.
 *
 * Return value: %TRUE if the type of @c is %G_UNICODE_DECIMAL_NUMBER.
 **/
gboolean
g_unichar_isdigit (gunichar c)
{
  const int category = TYPE (c);

  return category == G_UNICODE_DECIMAL_NUMBER;
}
/**
 * g_unichar_isgraph:
 * @c: a unicode character
 *
 * Determines if a character is graphic, that is, neither a control,
 * format, unassigned, private use or surrogate character nor a
 * space separator.
 *
 * Return value: %TRUE if @c is graphic.
 **/
gboolean
g_unichar_isgraph (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_CONTROL:
    case G_UNICODE_FORMAT:
    case G_UNICODE_UNASSIGNED:
    case G_UNICODE_PRIVATE_USE:
    case G_UNICODE_SURROGATE:
    case G_UNICODE_SPACE_SEPARATOR:
      return 0;
    default:
      return 1;
    }
}
/**
 * g_unichar_islower:
 * @c: a unicode character
 *
 * Determines if a character is a lowercase letter.
 *
 * Return value: %TRUE if the type of @c is %G_UNICODE_LOWERCASE_LETTER.
 **/
gboolean
g_unichar_islower (gunichar c)
{
  const int category = TYPE (c);

  return category == G_UNICODE_LOWERCASE_LETTER;
}
/**
 * g_unichar_isprint:
 * @c: a unicode character
 *
 * Determines if a character is printable. Unlike g_unichar_isgraph(),
 * space separators count as printable; control, format, unassigned,
 * private use and surrogate characters do not.
 *
 * Return value: %TRUE if @c is printable.
 **/
gboolean
g_unichar_isprint (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_CONTROL:
    case G_UNICODE_FORMAT:
    case G_UNICODE_UNASSIGNED:
    case G_UNICODE_PRIVATE_USE:
    case G_UNICODE_SURROGATE:
      return 0;
    default:
      return 1;
    }
}
/**
 * g_unichar_ispunct:
 * @c: a unicode character
 *
 * Determines if a character is punctuation: connector, dash, close,
 * final, initial, other or open punctuation.
 *
 * Return value: %TRUE if @c is a punctuation character.
 **/
gboolean
g_unichar_ispunct (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_CONNECT_PUNCTUATION:
    case G_UNICODE_DASH_PUNCTUATION:
    case G_UNICODE_CLOSE_PUNCTUATION:
    case G_UNICODE_FINAL_PUNCTUATION:
    case G_UNICODE_INITIAL_PUNCTUATION:
    case G_UNICODE_OTHER_PUNCTUATION:
    case G_UNICODE_OPEN_PUNCTUATION:
      return 1;
    default:
      return 0;
    }
}
/**
 * g_unichar_isspace:
 * @c: a unicode character
 *
 * Determines if a character is a space, tab or line separator
 * (newline, carriage return, form feed, ...).
 *
 * Return value: %TRUE if @c is a space character.
 **/
gboolean
g_unichar_isspace (gunichar c)
{
  int t;

  switch (c)
    {
      /* Unicode files these under "control", so the type test below
         would reject them, yet callers of an isspace() analog expect
         them to be whitespace. */
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return 1;

    default:
      break;
    }

  t = TYPE (c);
  return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
          || t == G_UNICODE_PARAGRAPH_SEPARATOR);
}
/**
 * g_unichar_isupper:
 * @c: a unicode character
 *
 * Determines if a character is uppercase.
 *
 * Return value: %TRUE if the type of @c is %G_UNICODE_UPPERCASE_LETTER.
 **/
gboolean
g_unichar_isupper (gunichar c)
{
  const int category = TYPE (c);

  return category == G_UNICODE_UPPERCASE_LETTER;
}
/**
 * g_unichar_istitle:
 * @c: a unicode character
 *
 * Determines if a character is titlecase. Some characters in
 * Unicode which are composites, such as the DZ digraph,
 * have three case variants instead of just two. The titlecase
 * form is used at the beginning of a word where only the
 * first letter is capitalized. The titlecase form of the DZ
 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
 *
 * Return value: %TRUE if the character is titlecase.
 **/
gboolean
g_unichar_istitle (gunichar c)
{
  unsigned int row = 0;

  /* The first column of each title_table row is the titlecase form. */
  while (row < asize (title_table))
    {
      if (title_table[row][0] == c)
        return 1;
      ++row;
    }

  return 0;
}
/**
 * g_unichar_isxdigit:
 * @c: a unicode character.
 *
 * Determines if a character is a hexadecimal digit: one of the
 * letters 'a'-'f' or 'A'-'F', or a decimal digit. This is the
 * set of characters for which g_unichar_xdigit_value() returns a
 * value other than -1.
 *
 * Return value: %TRUE if the character is a hexadecimal digit.
 **/
gboolean
g_unichar_isxdigit (gunichar c)
{
  /* Only decimal digits qualify: letter numbers and other numbers
     (accepted by the broader ISDIGIT test) have no hex digit value. */
  return ((c >= 'a' && c <= 'f')
          || (c >= 'A' && c <= 'F')
          || TYPE (c) == G_UNICODE_DECIMAL_NUMBER);
}
/**
 * g_unichar_isdefined:
 * @c: a unicode character
 *
 * Determines if a given character is assigned in the Unicode
 * standard, i.e. its type is anything but %G_UNICODE_UNASSIGNED.
 *
 * Return value: %TRUE if the character has an assigned value.
 **/
gboolean
g_unichar_isdefined (gunichar c)
{
  return TYPE (c) != G_UNICODE_UNASSIGNED;
}
/**
 * g_unichar_iswide:
 * @c: a unicode character
 *
 * Determines if a character is typically rendered in a double-width
 * cell. The decision is made from a fixed list of code point ranges.
 *
 * Return value: %TRUE if the character is wide.
 **/
/* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>. */
gboolean
g_unichar_iswide (gunichar c)
{
  /* Nothing below the first Hangul Jamo is wide. */
  if (c < 0x1100)
    return 0;

  /* Hangul Jamo */
  if (c >= 0x1100 && c <= 0x115f)
    return 1;

  /* CJK ... Yi, minus the four code points matched by the mask test
     and U+303F */
  if (c >= 0x2e80 && c <= 0xa4cf
      && (c & ~0x0011) != 0x300a
      && c != 0x303f)
    return 1;

  /* Hangul Syllables */
  if (c >= 0xac00 && c <= 0xd7a3)
    return 1;

  /* CJK Compatibility Ideographs */
  if (c >= 0xf900 && c <= 0xfaff)
    return 1;

  /* CJK Compatibility Forms */
  if (c >= 0xfe30 && c <= 0xfe6f)
    return 1;

  /* Fullwidth Forms */
  if (c >= 0xff00 && c <= 0xff5f)
    return 1;

  return (c >= 0xffe0 && c <= 0xffe6);
}
/**
 * g_unichar_toupper:
 * @c: a unicode character
 *
 * Convert a character to uppercase.
 *
 * Return value: the result of converting @c to uppercase.
 *               If @c is not a lowercase or titlecase character,
 *               or has no uppercase equivalent, @c is returned
 *               unchanged.
 **/
gunichar
g_unichar_toupper (gunichar c)
{
  int t = TYPE (c);

  if (t == G_UNICODE_LOWERCASE_LETTER)
    {
      /* ATTTABLE yields 0 when no mapping is recorded; fall back to
         the character itself rather than returning NUL. */
      gunichar val = ATTTABLE (c >> 8, c & 0xff);

      return val ? val : c;
    }
  else if (t == G_UNICODE_TITLECASE_LETTER)
    {
      unsigned int i;

      /* title_table rows are { title, upper, lower }. */
      for (i = 0; i < asize (title_table); ++i)
        {
          if (title_table[i][0] == c)
            return title_table[i][1];
        }
    }
  return c;
}
/**
 * g_unichar_tolower:
 * @c: a unicode character.
 *
 * Convert a character to lower case.
 *
 * Return value: the result of converting @c to lower case.
 *               If @c is not an uppercase or titlecase character,
 *               or has no lowercase equivalent, @c is returned
 *               unchanged.
 **/
gunichar
g_unichar_tolower (gunichar c)
{
  int t = TYPE (c);

  if (t == G_UNICODE_UPPERCASE_LETTER)
    {
      /* ATTTABLE yields 0 when no mapping is recorded; fall back to
         the character itself rather than returning NUL. */
      gunichar val = ATTTABLE (c >> 8, c & 0xff);

      return val ? val : c;
    }
  else if (t == G_UNICODE_TITLECASE_LETTER)
    {
      unsigned int i;

      /* title_table rows are { title, upper, lower }. */
      for (i = 0; i < asize (title_table); ++i)
        {
          if (title_table[i][0] == c)
            return title_table[i][2];
        }
    }
  return c;
}
/**
 * g_unichar_totitle:
 * @c: a unicode character
 *
 * Convert a character to titlecase.
 *
 * Return value: the result of converting @c to titlecase.
 *               A character with a dedicated titlecase form yields
 *               that form; any other lowercase letter yields its
 *               uppercase mapping. If neither applies, @c is
 *               returned unchanged.
 **/
gunichar
g_unichar_totitle (gunichar c)
{
  unsigned int i;

  /* title_table rows are { title, upper, lower }; any of the three
     forms maps to the titlecase one. */
  for (i = 0; i < asize (title_table); ++i)
    {
      if (title_table[i][0] == c || title_table[i][1] == c
          || title_table[i][2] == c)
        return title_table[i][0];
    }

  if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER)
    {
      /* ATTTABLE yields 0 when no mapping is recorded; keep the
         character itself rather than returning NUL. */
      gunichar val = ATTTABLE (c >> 8, c & 0xff);

      if (val)
        return val;
    }

  return c;
}
/**
 * g_unichar_digit_value:
 * @c: a unicode character
 *
 * Determines the numeric value of a character as a decimal
 * digit.
 *
 * Return value: If @c is a decimal digit (according to
 * `g_unichar_isdigit'), its numeric value. Otherwise, -1.
 **/
int
g_unichar_digit_value (gunichar c)
{
  if (TYPE (c) != G_UNICODE_DECIMAL_NUMBER)
    return -1;

  /* For decimal digits the attribute table holds the digit's value. */
  return ATTTABLE (c >> 8, c & 0xff);
}
/**
 * g_unichar_xdigit_value:
 * @c: a unicode character
 *
 * Determines the numeric value of a character as a hexadecimal
 * digit.
 *
 * Return value: If @c is a hex digit (according to
 * `g_unichar_isxdigit'), its numeric value: 10 to 15 for the
 * letters 'a'-'f' and 'A'-'F', and the digit's value for a
 * decimal digit. Otherwise, -1.
 **/
int
g_unichar_xdigit_value (gunichar c)
{
  /* 'A' and 'a' stand for ten, not one. */
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
    return ATTTABLE (c >> 8, c & 0xff);
  return -1;
}
/**
 * g_unichar_type:
 * @c: a unicode character
 *
 * Classifies a unicode character by type. Characters beyond the
 * last one covered by the tables are reported as
 * %G_UNICODE_UNASSIGNED.
 *
 * Return value: the type of the character.
 **/
GUnicodeType
g_unichar_type (gunichar c)
{
  GUnicodeType category = TYPE (c);

  return category;
}

483
glib/gutf8.c Normal file
View File

@ -0,0 +1,483 @@
/* gutf8.c - Operations on UTF-8 strings.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <config.h>
#include <stdlib.h>
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#include <string.h>
#include "glib.h"
/* Classify the first byte of a UTF-8 sequence.
 * Char is the lead byte; the caller in this file passes an unsigned char.
 * Len is set to the length of the sequence in bytes (1 to 6) and Mask to
 * the bit mask that extracts the payload bits of the lead byte. For a
 * byte that cannot start a sequence Len is set to -1 and Mask is left
 * untouched.
 * The macro expands to an if/else chain ending in a statement, so it
 * must be used as a statement of its own; Char is evaluated several
 * times.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
Len = 1; \
Mask = 0x7f; \
} \
else if ((Char & 0xe0) == 0xc0) \
{ \
Len = 2; \
Mask = 0x1f; \
} \
else if ((Char & 0xf0) == 0xe0) \
{ \
Len = 3; \
Mask = 0x0f; \
} \
else if ((Char & 0xf8) == 0xf0) \
{ \
Len = 4; \
Mask = 0x07; \
} \
else if ((Char & 0xfc) == 0xf8) \
{ \
Len = 5; \
Mask = 0x03; \
} \
else if ((Char & 0xfe) == 0xfc) \
{ \
Len = 6; \
Mask = 0x01; \
} \
else \
Len = -1;
/* Decode the Len-byte UTF-8 sequence at Chars into Result.
 * Mask and Len are the values produced by UTF8_COMPUTE for the lead
 * byte; Count is a scratch loop variable supplied by the caller.
 * The lead byte contributes the bits selected by Mask and each
 * following byte contributes its low six bits. If a following byte
 * is not of the form 10xxxxxx, Result is set to -1 and decoding stops.
 * The macro expands to two statements, so it must not be used as the
 * unbraced body of an if or a loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
(Result) = (Chars)[0] & (Mask); \
for ((Count) = 1; (Count) < (Len); ++(Count)) \
{ \
if (((Chars)[(Count)] & 0xc0) != 0x80) \
{ \
(Result) = -1; \
break; \
} \
(Result) <<= 6; \
(Result) |= ((Chars)[(Count)] & 0x3f); \
}
/* Number of bytes to advance from the first byte of a UTF-8 sequence to
 * reach the next sequence, indexed by that first byte:
 *   0x00-0xbf  1   (ASCII, and stray continuation bytes)
 *   0xc0-0xdf  2
 *   0xe0-0xef  3
 *   0xf0-0xf7  4
 *   0xf8-0xfb  5
 *   0xfc-0xfd  6
 *   0xfe-0xff  1   (never valid in UTF-8)
 * No entry may be 0: g_utf8_next_char() adds the entry to its pointer,
 * so a 0 would make every loop built on it spin forever on such a byte.
 */
gchar g_utf8_skip[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/**
 * g_utf8_find_prev_char:
 * @str: pointer to the beginning of a UTF-8 string
 * @p: pointer to some position within @str
 *
 * Given a position @p within a UTF-8 encoded string @str, find the start
 * of the previous UTF-8 character starting before @p. Returns %NULL if no
 * UTF-8 characters are present in @str before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character or %NULL.
 **/
gchar *
g_utf8_find_prev_char (const char *str,
                       const char *p)
{
  /* Step back only while there is something before @p. This examines
     the first byte of @str too, and never forms a pointer that lies
     before the start of the string. */
  while (p > str)
    {
      --p;
      /* Anything but a 10xxxxxx continuation byte starts a character. */
      if ((*p & 0xc0) != 0x80)
        return (gchar *)p;
    }
  return NULL;
}
/**
 * g_utf8_find_next_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 * @end: a pointer to the end of the string, or %NULL to indicate
 *       that the string is nul-terminated, in which case
 *       the returned value is never %NULL
 *
 * Find the start of the next UTF-8 character in the string after @p.
 * If @p already points at a nul byte it is not advanced.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character, or %NULL if the
 *               search arrived at @end
 **/
gchar *
g_utf8_find_next_char (const gchar *p,
                       const gchar *end)
{
  if (*p)
    {
      ++p;
      if (end)
        {
          /* Skip continuation bytes, but never run past the limit. */
          while (p < end && (*p & 0xc0) == 0x80)
            ++p;
        }
      else
        {
          /* The terminating nul is not a continuation byte, so it
             stops the scan. */
          while ((*p & 0xc0) == 0x80)
            ++p;
        }
    }

  if (p == end)
    return NULL;

  return (gchar *)p;
}
/**
 * g_utf8_prev_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 *
 * Find the previous UTF-8 character in the string before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte. No bounds check is made either:
 * if @p might be the first character of the string, you must use
 * g_utf8_find_prev_char instead.
 *
 * Return value: a pointer to the found character.
 **/
gchar *
g_utf8_prev_char (const gchar *p)
{
  /* Back up at least one byte, then keep going while we are looking
     at a 10xxxxxx continuation byte. */
  do
    p--;
  while ((*p & 0xc0) == 0x80);

  return (gchar *)p;
}
/**
 * g_utf8_strlen:
 * @p: pointer to the start of a UTF-8 string.
 * @max: the maximum number of bytes to examine. If @max
 *       is less than 0, then the string is assumed to be
 *       nul-terminated.
 *
 * Counts the characters in a UTF-8 string. Only complete characters
 * are counted: a character whose bytes extend beyond @max is ignored.
 * Counting also stops at a nul byte. If @max is 0 nothing is read.
 *
 * Return value: the length of the string in characters
 */
gint
g_utf8_strlen (const gchar *p, gint max)
{
  gint len = 0;
  const gchar *start = p;

  if (max < 0)
    {
      while (*p)
        {
          p = g_utf8_next_char (p);
          ++len;
        }
    }
  else
    {
      if (max == 0 || !*p)
        return 0;

      /* p always sits one character ahead of the count, so that the
         final character is only counted after checking that it ended
         within the limit. */
      p = g_utf8_next_char (p);

      while (p - start < max && *p)
        {
          ++len;
          p = g_utf8_next_char (p);
        }

      /* Only count the last character if all of it fits into the
         first max bytes, i.e. the buffer did not end mid-character. */
      if (p - start <= max)
        ++len;
    }

  return len;
}
/**
 * g_utf8_get_char:
 * @p: a pointer to unicode character encoded as UTF-8
 *
 * Convert a sequence of bytes encoded as UTF-8 to a unicode character.
 * The first byte determines how many bytes are read; every following
 * byte must be a continuation byte.
 *
 * Return value: the resulting character or (gunichar)-1 if @p does
 * not point to a valid UTF-8 encoded unicode character
 **/
gunichar
g_utf8_get_char (const gchar *p)
{
  unsigned char lead = (unsigned char) *p;
  gunichar wc;
  int lead_mask = 0;
  int n_bytes;
  int k;

  /* Work out the sequence length and payload mask from the lead byte. */
  UTF8_COMPUTE (lead, lead_mask, n_bytes);
  if (n_bytes == -1)
    return (gunichar)-1;

  /* Accumulate the payload bits; wc becomes -1 on a bad continuation. */
  UTF8_GET (wc, p, k, lead_mask, n_bytes);

  return wc;
}
/**
 * g_utf8_offset_to_pointer:
 * @str: a UTF-8 encoded string
 * @offset: a character offset within the string.
 *
 * Converts from an integer character offset to a pointer to a position
 * within the string. The string is walked one character at a time and
 * no check for the end of the string is made, so @offset must not be
 * negative or exceed the length of the string in characters.
 *
 * Return value: the resulting pointer
 **/
gchar *
g_utf8_offset_to_pointer (const gchar *str,
                          gint         offset)
{
  const gchar *cursor;

  for (cursor = str; offset != 0; --offset)
    cursor = g_utf8_next_char (cursor);

  return (gchar *)cursor;
}
/**
 * g_utf8_pointer_to_offset:
 * @str: a UTF-8 encoded string
 * @pos: a pointer to a position within @str
 *
 * Converts from a pointer to position within a string to an integer
 * character offset, by counting the characters that start before @pos.
 *
 * Return value: the resulting character offset
 **/
gint
g_utf8_pointer_to_offset (const gchar *str,
                          const gchar *pos)
{
  const gchar *cursor;
  gint n_chars = 0;

  for (cursor = str; cursor < pos; cursor = g_utf8_next_char (cursor))
    n_chars++;

  return n_chars;
}
/**
 * g_utf8_strncpy:
 * @dest: buffer to fill with characters from @src
 * @src: UTF-8 encoded string
 * @n: character count
 *
 * Like the standard C strncpy() function, but copies a given number
 * of characters instead of a given number of bytes. Unlike strncpy()
 * the result is always nul-terminated, so @dest needs room for the
 * bytes of up to @n characters plus one nul byte.
 *
 * Return value: @dest
 **/
gchar *
g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
{
  const gchar *stop = src;

  /* Find the byte position just after the first n characters (or the
     end of the string, whichever comes first). */
  for (; n && *stop; n--)
    stop = g_utf8_next_char(stop);

  strncpy(dest, src, stop - src);
  dest[stop - src] = 0;

  return dest;
}
/* Work out the character set of the current locale.
 *
 * The CHARSET environment variable is consulted first, then
 * nl_langinfo() where the platform provides a codeset item.
 *
 * If A is non-NULL and *A is still NULL on entry, *A is set to the
 * first charset name found, or to "US-ASCII" if none was found. The
 * string is not copied, so the caller must not free it.
 *
 * Returns TRUE if the charset is UTF-8, FALSE otherwise.
 */
static gboolean
g_utf8_get_charset_internal (char **a)
{
  char *charset = getenv("CHARSET");

  if (charset && a && ! *a)
    *a = charset;

  if (charset && strstr (charset, "UTF-8"))
    return TRUE;

  /* Test for the *existence* of the langinfo items. CODESET may be an
     enumerator, or a macro whose value is 0, so `#elif CODESET' would
     wrongly compile this lookup out. */
#if defined (_NL_CTYPE_CODESET_NAME)
  charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
#elif defined (CODESET)
  charset = nl_langinfo (CODESET);
#else
  charset = NULL;
#endif

  if (charset)
    {
      if (a && ! *a)
        *a = charset;
      if (strcmp (charset, "UTF-8") == 0)
        return TRUE;
    }

  if (a && ! *a)
    *a = "US-ASCII";
  /* Assume this for compatibility at present. */
  return FALSE;
}
/* Result of the first charset lookup: -1 until g_get_charset() has run
 * once, afterwards the boolean it returned. */
static int utf8_locale_cache = -1;
/* Charset name found by that first lookup. */
static char *utf8_charset_cache = NULL;

/**
 * g_get_charset:
 * @charset: return location for the character set name, or %NULL
 *
 * Obtains the character set for the current locale. The lookup is
 * performed on the first call only; later calls return the values
 * remembered from it. The string stored in @charset must not be
 * modified or freed.
 *
 * Return value: %TRUE if the character set is UTF-8.
 **/
gboolean
g_get_charset (char **charset)
{
  /* Fill the cache on first use. */
  if (utf8_locale_cache == -1)
    utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);

  if (charset)
    *charset = utf8_charset_cache;

  return utf8_locale_cache;
}
/* unicode_strchr */
/**
 * g_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *
 * Convert a single character to UTF-8. The output is not
 * nul-terminated.
 *
 * Return value: number of bytes written
 **/
int
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
{
  size_t n_bytes = 0;
  int lead_bits;
  int pos;

  /* Pick the sequence length and the marker bits of the first byte
     from the magnitude of the character. */
  if (c < 0x80)
    {
      lead_bits = 0;
      n_bytes = 1;
    }
  else if (c < 0x800)
    {
      lead_bits = 0xc0;
      n_bytes = 2;
    }
  else if (c < 0x10000)
    {
      lead_bits = 0xe0;
      n_bytes = 3;
    }
  else if (c < 0x200000)
    {
      lead_bits = 0xf0;
      n_bytes = 4;
    }
  else if (c < 0x4000000)
    {
      lead_bits = 0xf8;
      n_bytes = 5;
    }
  else
    {
      lead_bits = 0xfc;
      n_bytes = 6;
    }

  /* Fill in the continuation bytes from the end, six bits at a time;
     whatever is left goes into the first byte. */
  pos = n_bytes - 1;
  while (pos > 0)
    {
      outbuf[pos] = (c & 0x3f) | 0x80;
      c >>= 6;
      --pos;
    }
  outbuf[0] = c | lead_bits;

  return n_bytes;
}
/**
 * g_utf8_strchr:
 * @p: a nul-terminated utf-8 string
 * @c: a iso-10646 character
 *
 * Find the leftmost occurrence of the given iso-10646 character
 * in a UTF-8 string. The character is encoded as UTF-8 and then
 * searched for as a byte string.
 *
 * Return value: NULL if the string does not contain the character,
 *               otherwise a pointer to the start of the leftmost
 *               occurrence of the character in the string.
 **/
gchar *
g_utf8_strchr (const char *p, gunichar c)
{
  /* Room for the longest (6 byte) encoding plus the terminator. */
  gchar needle[10];
  gint n_bytes;

  n_bytes = g_unichar_to_utf8 (c, needle);
  needle[n_bytes] = '\0';

  return strstr(p, needle);
}
#if 0
/**
 * g_utf8_strrchr:
 * @p: a nul-terminated utf-8 string
 * @c: a iso-10646 character
 *
 * Find the rightmost occurrence of the given iso-10646 character
 * in a UTF-8 string.
 *
 * Return value: %NULL if the string does not contain the character,
 *   otherwise a pointer to the start of its rightmost occurrence.
 **/
/* This is ifdefed out atm as there is no strrstr function in libc.
 *
 * NOTE(review): gunicode.h declares this function as g_utf8_strrchr;
 * the definition below is still called unicode_strrchr and must be
 * renamed when it is enabled.
 */
gchar *
unicode_strrchr (const char *p, gunichar c)
{
  gchar ch[10];
  gint len = g_unichar_to_utf8 (c, ch);   /* was used undeclared */
  ch[len] = '\0';
  return strrstr (p, ch);
}
#endif
/**
 * g_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum number of bytes of @str to convert; passed on to
 *   g_utf8_strlen(), so a negative value means @str is nul-terminated
 *
 * Convert a string from UTF-8 to a 32-bit fixed width
 * representation as UCS-4.
 *
 * Return value: a pointer to a newly allocated UCS-4 string,
 *   terminated by a 0 character. This value must be freed
 *   with g_free()
 **/
gunichar *
g_utf8_to_ucs4 (const char *str, int len)
{
  gunichar *result;
  gint n_chars, i;
  const gchar *p;

  n_chars = g_utf8_strlen (str, len);

  /* The function does not report the length, so reserve one extra
     slot for a terminating 0 that lets callers find the end. */
  result = g_new (gunichar, n_chars + 1);

  p = str;
  for (i = 0; i < n_chars; i++)
    {
      result[i] = g_utf8_get_char (p);
      p = g_utf8_next_char (p);
    }
  result[n_chars] = 0;

  return result;
}

5390
gunichartables.h Normal file

File diff suppressed because it is too large Load Diff

178
gunicode.h Normal file
View File

@ -0,0 +1,178 @@
/* gunicode.h - Unicode manipulation functions
*
* Copyright (C) 1999, 2000 Tom Tromey
* Copyright 2000 Red Hat, Inc.
*
* The Gnome Library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* The Gnome Library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifndef __GUNICODE_H__
#define __GUNICODE_H__
#include <stdlib.h> /* For size_t */
#ifdef __cplusplus
extern "C"
{
#endif
typedef guint32 gunichar;
typedef guint16 gunichar2;
/* These are the possible character classifications, as returned by
 * g_unichar_type().
 *
 * NOTE(review): guniprop.c obtains these values directly from
 * type_table (see its TTYPE macro), so the generated tables presumably
 * encode the numeric values of these enumerators; do not reorder or
 * insert entries without regenerating the tables -- confirm against
 * gunichartables.h.
 */
typedef enum {
G_UNICODE_CONTROL,
G_UNICODE_FORMAT,
G_UNICODE_UNASSIGNED,
G_UNICODE_PRIVATE_USE,
G_UNICODE_SURROGATE,
G_UNICODE_LOWERCASE_LETTER,
G_UNICODE_MODIFIER_LETTER,
G_UNICODE_OTHER_LETTER,
G_UNICODE_TITLECASE_LETTER,
G_UNICODE_UPPERCASE_LETTER,
G_UNICODE_COMBINING_MARK,
G_UNICODE_ENCLOSING_MARK,
G_UNICODE_NON_SPACING_MARK,
G_UNICODE_DECIMAL_NUMBER,
G_UNICODE_LETTER_NUMBER,
G_UNICODE_OTHER_NUMBER,
G_UNICODE_CONNECT_PUNCTUATION,
G_UNICODE_DASH_PUNCTUATION,
G_UNICODE_CLOSE_PUNCTUATION,
G_UNICODE_FINAL_PUNCTUATION,
G_UNICODE_INITIAL_PUNCTUATION,
G_UNICODE_OTHER_PUNCTUATION,
G_UNICODE_OPEN_PUNCTUATION,
G_UNICODE_CURRENCY_SYMBOL,
G_UNICODE_MODIFIER_SYMBOL,
G_UNICODE_MATH_SYMBOL,
G_UNICODE_OTHER_SYMBOL,
G_UNICODE_LINE_SEPARATOR,
G_UNICODE_PARAGRAPH_SEPARATOR,
G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
 * not null, sets *CHARSET to the name of the current locale's
 * charset. This value is statically allocated.
 */
gboolean g_get_charset (char **charset);

/* These are all analogs of the <ctype.h> functions.
 */
gboolean g_unichar_isalnum   (gunichar c);
gboolean g_unichar_isalpha   (gunichar c);
gboolean g_unichar_iscntrl   (gunichar c);
/* Was misspelled "g_unicphar_isdigit", which left the function defined
 * in guniprop.c without a prototype. */
gboolean g_unichar_isdigit   (gunichar c);
gboolean g_unichar_isgraph   (gunichar c);
gboolean g_unichar_islower   (gunichar c);
gboolean g_unichar_isprint   (gunichar c);
gboolean g_unichar_ispunct   (gunichar c);
gboolean g_unichar_isspace   (gunichar c);
gboolean g_unichar_isupper   (gunichar c);
gboolean g_unichar_isxdigit  (gunichar c);
gboolean g_unichar_istitle   (gunichar c);
gboolean g_unichar_isdefined (gunichar c);
gboolean g_unichar_iswide    (gunichar c);
/* More <ctype.h> functions. These convert between the three cases
 * (upper, lower and title case). A character that has no mapping is
 * meant to be returned unchanged.
 * See the Unicode book to understand title case. */
gunichar g_unichar_toupper (gunichar c);
gunichar g_unichar_tolower (gunichar c);
gunichar g_unichar_totitle (gunichar c);
/* If C is a digit (according to `g_unichar_isdigit'), then return its
 * numeric value. Otherwise return -1.
 * g_unichar_xdigit_value does the same for hexadecimal digits
 * (according to `g_unichar_isxdigit'). */
gint g_unichar_digit_value (gunichar c);
gint g_unichar_xdigit_value (gunichar c);
/* Return the Unicode character type of a given character. */
GUnicodeType g_unichar_type (gunichar c);
/* Compute canonical ordering of a string in-place. This rearranges
 * decomposed characters in the string according to their combining
 * classes. STRING holds LEN characters (not bytes).
 * See the Unicode manual for more information. */
void g_unicode_canonical_ordering (gunichar *string,
size_t len);
/* Compute canonical decomposition of a character. Returns g_malloc()d
 * string of Unicode characters. RESULT_LEN is set to the resulting
 * length of the string, counted in characters; the string is not
 * terminated. */
gunichar *g_unicode_canonical_decomposition (gunichar ch,
size_t *result_len);
/* Array of skip-bytes-per-initial character: indexed by the first
 * byte of a UTF-8 sequence, gives the number of bytes to step over.
 */
extern char g_utf8_skip[256];

/* Advance P to the next character. P is evaluated twice, so it must
 * not have side effects. The whole expansion is parenthesized so that
 * postfix operators applied to the macro (e.g. g_utf8_next_char(p)[0])
 * bind to the result and not to the inner expression before the cast.
 */
#define g_utf8_next_char(p) ((char *)((p) + g_utf8_skip[*(guchar *)(p)]))
/* Decode the UTF-8 sequence starting at P; yields (gunichar)-1 when P
 * does not start with a valid lead byte. */
gunichar g_utf8_get_char (const gchar *p);
/* Convert between a character offset and a pointer into STR. */
gchar * g_utf8_offset_to_pointer (const gchar *str,
gint offset);
gint g_utf8_pointer_to_offset (const gchar *str,
const gchar *pos);
/* Step backwards to the start of the preceding character; there is
 * no bounds check, use g_utf8_find_prev_char when P may be at the
 * start of the string. */
gchar * g_utf8_prev_char (const gchar *p);
/* Bounded searches for the start of the next / previous character;
 * both may return NULL. */
gchar * g_utf8_find_next_char (const gchar *p,
const gchar *bound);
gchar * g_utf8_find_prev_char (const gchar *str,
const gchar *p);
/* Length of P in characters; MAX is a byte limit, or negative for a
 * nul-terminated string. */
gint g_utf8_strlen (const gchar *p,
gint max);
/* Copies n characters from src to dest */
gchar *g_utf8_strncpy (gchar *dest,
const gchar *src,
size_t n);
/* Find the UTF-8 character corresponding to ch, in string p. These
 * functions are equivalents to strchr and strrchr.
 * NOTE(review): the definition of g_utf8_strrchr in gutf8.c is
 * currently compiled out (#if 0), so using it will fail to link. */
gchar *g_utf8_strchr (const gchar *p,
gunichar ch);
gchar *g_utf8_strrchr (const gchar *p,
gunichar ch);
/* Conversions between UTF-8, UTF-16 and UCS-4.
 * NOTE(review): of these only g_utf8_to_ucs4 has a definition in the
 * gutf8.c shown with this header -- confirm where the others live.
 * NOTE(review): g_ucs4_to_utf16 is declared as returning gunichar *,
 * but a UTF-16 result would presumably be gunichar2 * like
 * g_utf8_to_utf16 -- confirm before implementing. */
gunichar2 *g_utf8_to_utf16 (const gchar *str,
gint len);
gunichar * g_utf8_to_ucs4 (const gchar *str,
gint len);
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
gint len);
gchar * g_utf16_to_utf8 (const gunichar2 *str,
gint len);
gunichar * g_ucs4_to_utf16 (const gunichar *str,
gint len);
gchar * g_ucs4_to_utf8 (const gunichar *str,
gint len);
/* Convert a single character into UTF-8. outbuf must have at
 * least 6 bytes of space. Returns the number of bytes in the
 * result.
 */
gint g_unichar_to_utf8 (gunichar c,
char *outbuf);
#ifdef __cplusplus
}
#endif
#endif /* GUNICODE_H */

133
gunidecomp.c Normal file
View File

@ -0,0 +1,133 @@
/* gunidecomp.c - Character decomposition.
*
* Copyright (C) 1999, 2000 Tom Tromey
* Copyright 2000 Red Hat, Inc.
*
* The Gnome Library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* The Gnome Library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "glib.h"
#include "gunidecomp.h"
#include <config.h>
#include <stdlib.h>
/* CC(Page, Char): combining class of character Char within 256-entry
   page Page.

   We cheat a bit and cast type values to (char *): an entry of
   combining_class_table is either a real pointer to a per-page array,
   or a small integer stored in the pointer slot that applies to the
   whole page. The two are told apart with the &0xff trick -- a value
   that is unchanged by masking with 0xff is taken to be the integer.

   NOTE(review): the (int) casts truncate pointers on platforms where
   pointers are wider than int; presumably harmless in practice, but
   confirm. */
#define CC(Page, Char) \
(((((int) (combining_class_table[Page])) & 0xff) \
== ((int) combining_class_table[Page])) \
? ((int) combining_class_table[Page]) \
: (combining_class_table[Page][Char]))
/* COMBINING_CLASS(Char): combining class of Char, or 0 for characters
   beyond the last one covered by the table.
   NOTE(review): guniprop.c spells the limit G_UNICODE_LAST_CHAR;
   confirm that UNICODE_LAST_CHAR is really defined by gunidecomp.h. */
#define COMBINING_CLASS(Char) \
(((Char) > (UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 string, modified in place
 * @len: the number of characters in @string
 *
 * Computes the canonical ordering of a string in-place: within each
 * run of characters with a non-zero combining class, characters are
 * rearranged so that their combining classes are non-decreasing.
 * Characters with combining class 0 never move and are never crossed.
 **/
void
g_unicode_canonical_ordering (gunichar *string,
                              size_t    len)
{
  size_t i;
  int swap = 1;

  /* Fewer than two characters cannot be out of order. Returning here
     also keeps "len - 1" below from wrapping around when len is 0. */
  if (len < 2)
    return;

  while (swap)
    {
      int last;

      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);

          if (next != 0 && last > next)
            {
              size_t j;

              /* Percolate item leftward through string. j is the
                 current index of the moving character; the loop runs
                 down to j == 1 so that the element at index 0 is
                 compared (and swapped) as well. */
              for (j = i + 1; j > 0; --j)
                {
                  gunichar t;

                  if (COMBINING_CLASS (string[j - 1]) <= next)
                    break;
                  t = string[j];
                  string[j] = string[j - 1];
                  string[j - 1] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again.  */
              next = last;
            }
          last = next;
        }
    }
}
/**
 * g_unicode_canonical_decomposition:
 * @ch: a Unicode character
 * @result_len: location to store the length of the result, in
 *   characters
 *
 * Computes the canonical decomposition of a character by looking it
 * up in decomp_table. A character without an entry decomposes to
 * itself.
 *
 * Return value: a newly allocated array of *@result_len characters
 *   (not terminated), to be freed with g_free().
 **/
gunichar *
g_unicode_canonical_decomposition (gunichar ch,
                                   size_t  *result_len)
{
  gunichar *r = NULL;

  if (ch <= 0xffff)
    {
      /* Binary search over the half-open range [start, end). */
      int start = 0;
      int end = G_N_ELEMENTS (decomp_table);

      while (start < end)
        {
          int half = start + (end - start) / 2;

          if (ch == decomp_table[half].ch)
            {
              /* Found it.  */
              int i, len;

              /* We store as a double-nul terminated string.  */
              for (len = 0; (decomp_table[half].expansion[len]
                             || decomp_table[half].expansion[len + 1]);
                   len += 2)
                ;

              /* We've counted twice as many bytes as there are
                 characters.  */
              *result_len = len / 2;
              r = g_malloc (len / 2 * sizeof (gunichar));

              for (i = 0; i < len; i += 2)
                {
                  /* Go through guchar so that bytes >= 0x80 cannot be
                     sign-extended should expansion be a plain
                     (signed) char array. */
                  r[i / 2] = (((guchar) decomp_table[half].expansion[i]) << 8
                              | (guchar) decomp_table[half].expansion[i + 1]);
                }
              break;
            }
          else if (ch > decomp_table[half].ch)
            /* Exclude half itself; "start = half" never makes progress
               once end == start + 1 and loops forever. */
            start = half + 1;
          else
            end = half;
        }
    }

  if (r == NULL)
    {
      /* Not in our table.  */
      r = g_malloc (sizeof (gunichar));
      *r = ch;
      *result_len = 1;
    }

  /* Supposedly following the Unicode 2.1.9 table means that the
     decompositions come out in canonical order.  I haven't tested
     this, but we rely on it here.  */
  return r;
}

1755
gunidecomp.h Normal file

File diff suppressed because it is too large Load Diff

355
guniprop.c Normal file
View File

@ -0,0 +1,355 @@
/* guniprop.c - Unicode character properties.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "glib.h"
#include "gunichartables.h"
#include <config.h>
#include <stddef.h>
/* Number of elements of the array x (x must be a real array, not a
   pointer). */
#define asize(x) ((sizeof (x)) / sizeof (x[0]))
/* Attribute of character Char in page Page; 0 when attr_table has no
   entry for that page. The callers below use the attribute as the
   case-converted character or as the digit value, depending on the
   character's type. */
#define ATTTABLE(Page, Char) \
((attr_table[Page] == 0) ? 0 : (attr_table[Page][Char]))
/* Type of character Char in page Page.
   We cheat a bit and cast type values to (char *): an entry of
   type_table is either a pointer to a per-page array or a small
   integer stored in the pointer slot that applies to the whole page.
   We detect the latter using the &0xff trick -- a value unchanged by
   masking with 0xff is taken to be the integer.
   NOTE(review): the (int) casts truncate pointers on platforms where
   pointers are wider than int -- confirm this is harmless. */
#define TTYPE(Page, Char) \
(((((int) type_table[Page]) & 0xff) == ((int) type_table[Page])) \
? ((int) (type_table[Page])) \
: (type_table[Page][Char]))
/* Type of an arbitrary character; anything above G_UNICODE_LAST_CHAR
   is reported as unassigned. Char is evaluated more than once. */
#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
/* True for any of the three number types (decimal, letter, other);
   note this is wider than g_unichar_isdigit(), which only accepts
   decimal numbers. */
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
|| (Type) == G_UNICODE_LETTER_NUMBER \
|| (Type) == G_UNICODE_OTHER_NUMBER)
/* True for any of the five letter types. */
#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
|| (Type) == G_UNICODE_UPPERCASE_LETTER \
|| (Type) == G_UNICODE_TITLECASE_LETTER \
|| (Type) == G_UNICODE_MODIFIER_LETTER \
|| (Type) == G_UNICODE_OTHER_LETTER)
/**
 * g_unichar_isalnum:
 * @c: a unicode character
 *
 * Determines whether a character is a letter or a number (any of the
 * letter types, or a decimal, letter or other number).
 *
 * Return value: %TRUE if @c is alphanumeric.
 **/
gboolean
g_unichar_isalnum (gunichar c)
{
  int category = TYPE (c);

  if (ISALPHA (category))
    return 1;

  return ISDIGIT (category);
}
/**
 * g_unichar_isalpha:
 * @c: a unicode character
 *
 * Determines whether a character is a letter of any kind (lowercase,
 * uppercase, titlecase, modifier or other letter).
 *
 * Return value: %TRUE if @c is alphabetic.
 **/
gboolean
g_unichar_isalpha (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_LOWERCASE_LETTER:
    case G_UNICODE_UPPERCASE_LETTER:
    case G_UNICODE_TITLECASE_LETTER:
    case G_UNICODE_MODIFIER_LETTER:
    case G_UNICODE_OTHER_LETTER:
      return 1;
    default:
      return 0;
    }
}
/**
 * g_unichar_iscntrl:
 * @c: a unicode character
 *
 * Determines whether a character is a control character.
 *
 * Return value: %TRUE if the type of @c is %G_UNICODE_CONTROL.
 **/
gboolean
g_unichar_iscntrl (gunichar c)
{
  int category = TYPE (c);

  return category == G_UNICODE_CONTROL;
}
/**
 * g_unichar_isdigit:
 * @c: a unicode character
 *
 * Determines whether a character is a decimal digit. Letter numbers
 * and other numbers do not count.
 *
 * Return value: %TRUE if the type of @c is %G_UNICODE_DECIMAL_NUMBER.
 **/
gboolean
g_unichar_isdigit (gunichar c)
{
  int category = TYPE (c);

  return category == G_UNICODE_DECIMAL_NUMBER;
}
/**
 * g_unichar_isgraph:
 * @c: a unicode character
 *
 * Determines whether a character is printable and not a space
 * separator: everything except control, format, unassigned,
 * private-use, surrogate and space-separator characters.
 *
 * Return value: %TRUE if @c is a graphic character.
 **/
gboolean
g_unichar_isgraph (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_CONTROL:
    case G_UNICODE_FORMAT:
    case G_UNICODE_UNASSIGNED:
    case G_UNICODE_PRIVATE_USE:
    case G_UNICODE_SURROGATE:
    case G_UNICODE_SPACE_SEPARATOR:
      return 0;
    default:
      return 1;
    }
}
/**
 * g_unichar_islower:
 * @c: a unicode character
 *
 * Determines whether a character is a lowercase letter.
 *
 * Return value: %TRUE if the type of @c is
 *   %G_UNICODE_LOWERCASE_LETTER.
 **/
gboolean
g_unichar_islower (gunichar c)
{
  int category = TYPE (c);

  return category == G_UNICODE_LOWERCASE_LETTER;
}
/**
 * g_unichar_isprint:
 * @c: a unicode character
 *
 * Determines whether a character is printable: everything except
 * control, format, unassigned, private-use and surrogate characters.
 * Unlike g_unichar_isgraph(), space separators count as printable.
 *
 * Return value: %TRUE if @c is printable.
 **/
gboolean
g_unichar_isprint (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_CONTROL:
    case G_UNICODE_FORMAT:
    case G_UNICODE_UNASSIGNED:
    case G_UNICODE_PRIVATE_USE:
    case G_UNICODE_SURROGATE:
      return 0;
    default:
      return 1;
    }
}
/**
 * g_unichar_ispunct:
 * @c: a unicode character
 *
 * Determines whether a character is punctuation, i.e. has one of the
 * seven punctuation types. Symbols are not counted.
 *
 * Return value: %TRUE if @c is a punctuation character.
 **/
gboolean
g_unichar_ispunct (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_CONNECT_PUNCTUATION:
    case G_UNICODE_DASH_PUNCTUATION:
    case G_UNICODE_CLOSE_PUNCTUATION:
    case G_UNICODE_FINAL_PUNCTUATION:
    case G_UNICODE_INITIAL_PUNCTUATION:
    case G_UNICODE_OTHER_PUNCTUATION:
    case G_UNICODE_OPEN_PUNCTUATION:
      return 1;
    default:
      return 0;
    }
}
/**
 * g_unichar_isspace:
 * @c: a unicode character
 *
 * Determines whether a character is a space, line or paragraph
 * separator.
 *
 * NOTE(review): only the three separator types are accepted. ASCII
 * whitespace such as tab or newline is presumably typed as a control
 * character in the tables and would then not be reported here,
 * unlike isspace() from <ctype.h> -- confirm whether that is intended.
 *
 * Return value: %TRUE if @c is a separator character.
 **/
gboolean
g_unichar_isspace (gunichar c)
{
  switch (TYPE (c))
    {
    case G_UNICODE_SPACE_SEPARATOR:
    case G_UNICODE_LINE_SEPARATOR:
    case G_UNICODE_PARAGRAPH_SEPARATOR:
      return 1;
    default:
      return 0;
    }
}
/**
 * g_unichar_isupper:
 * @c: a unicode character
 *
 * Determines if a character is uppercase.
 *
 * Return value: %TRUE if the type of @c is
 *   %G_UNICODE_UPPERCASE_LETTER.
 **/
gboolean
g_unichar_isupper (gunichar c)
{
  int category = TYPE (c);

  return category == G_UNICODE_UPPERCASE_LETTER;
}
/**
 * g_unichar_istitle:
 * @c: a unicode character
 *
 * Determines if a character is titlecase. Some characters in
 * Unicode which are composites, such as the DZ digraph,
 * have three case variants instead of just two. The titlecase
 * form is used at the beginning of a word where only the
 * first letter is capitalized. The titlecase form of the DZ
 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
 *
 * The answer comes from a linear scan of the first column of
 * title_table.
 *
 * Return value: %TRUE if the character is titlecase.
 **/
gboolean
g_unichar_istitle (gunichar c)
{
  unsigned int row = 0;

  while (row < asize (title_table))
    {
      if (title_table[row][0] == c)
        return 1;
      row++;
    }

  return 0;
}
/**
 * g_unichar_isxdigit:
 * @c: a unicode character.
 *
 * Determines if a character is a hexadecimal digit: one of the
 * letters a-f or A-F, or a decimal digit.
 *
 * Only decimal numbers are accepted besides the letters, so that
 * every character accepted here also has a value according to
 * g_unichar_xdigit_value(). Letter numbers and other numbers (which
 * the ISDIGIT() macro would admit) have no hexadecimal value.
 *
 * Return value: %TRUE if the character is a hexadecimal digit.
 **/
gboolean
g_unichar_isxdigit (gunichar c)
{
  return ((c >= 'a' && c <= 'f')
          || (c >= 'A' && c <= 'F')
          || TYPE (c) == G_UNICODE_DECIMAL_NUMBER);
}
/**
 * g_unichar_isdefined:
 * @c: a unicode character
 *
 * Determines if a given character is assigned in the Unicode
 * standard, i.e. its type is anything but %G_UNICODE_UNASSIGNED.
 *
 * Return value: %TRUE if the character has an assigned value.
 **/
gboolean
g_unichar_isdefined (gunichar c)
{
  return TYPE (c) != G_UNICODE_UNASSIGNED;
}
/**
 * g_unichar_iswide:
 * @c: a unicode character
 *
 * Determines if a character is typically rendered in a double-width
 * cell.
 *
 * Return value: %TRUE if the character is wide.
 **/
/* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.
   The ranges below are disjoint, so each can be decided on its own. */
gboolean
g_unichar_iswide (gunichar c)
{
  /* Nothing below U+1100 is wide. */
  if (c < 0x1100)
    return 0;

  /* Hangul Jamo */
  if (c <= 0x115f)
    return 1;

  /* CJK ... Yi, minus a few exceptions */
  if (c >= 0x2e80 && c <= 0xa4cf)
    return (c & ~0x0011) != 0x300a && c != 0x303f;

  /* Hangul Syllables */
  if (c >= 0xac00 && c <= 0xd7a3)
    return 1;

  /* CJK Compatibility Ideographs */
  if (c >= 0xf900 && c <= 0xfaff)
    return 1;

  /* CJK Compatibility Forms */
  if (c >= 0xfe30 && c <= 0xfe6f)
    return 1;

  /* Fullwidth Forms */
  if (c >= 0xff00 && c <= 0xff5f)
    return 1;

  return c >= 0xffe0 && c <= 0xffe6;
}
/**
 * g_unichar_toupper:
 * @c: a unicode character
 *
 * Convert a character to uppercase.
 *
 * Return value: the result of converting @c to uppercase.
 *               If @c is not a lowercase or titlecase character,
 *               or has no uppercase equivalent, @c is returned
 *               unchanged.
 **/
gunichar
g_unichar_toupper (gunichar c)
{
  int t = TYPE (c);

  if (t == G_UNICODE_LOWERCASE_LETTER)
    {
      gunichar upper = ATTTABLE (c >> 8, c & 0xff);

      /* ATTTABLE yields 0 when the page has no attribute table (and
         presumably when the letter has no uppercase form); U+0000 is
         never a valid mapping, so keep the character as it is. */
      return upper ? upper : c;
    }
  else if (t == G_UNICODE_TITLECASE_LETTER)
    {
      unsigned int i;

      /* title_table rows are { titlecase, uppercase, lowercase }. */
      for (i = 0; i < asize (title_table); ++i)
        {
          if (title_table[i][0] == c)
            return title_table[i][1];
        }
    }

  return c;
}
/**
 * g_unichar_tolower:
 * @c: a unicode character.
 *
 * Convert a character to lower case.
 *
 * Return value: the result of converting @c to lower case.
 *               If @c is not an uppercase or titlecase character,
 *               or has no lowercase equivalent, @c is returned
 *               unchanged.
 **/
gunichar
g_unichar_tolower (gunichar c)
{
  int t = TYPE (c);

  if (t == G_UNICODE_UPPERCASE_LETTER)
    {
      gunichar lower = ATTTABLE (c >> 8, c & 0xff);

      /* ATTTABLE yields 0 when the page has no attribute table (and
         presumably when the letter has no lowercase form); U+0000 is
         never a valid mapping, so keep the character as it is. */
      return lower ? lower : c;
    }
  else if (t == G_UNICODE_TITLECASE_LETTER)
    {
      unsigned int i;

      /* title_table rows are { titlecase, uppercase, lowercase }. */
      for (i = 0; i < asize (title_table); ++i)
        {
          if (title_table[i][0] == c)
            return title_table[i][2];
        }
    }

  return c;
}
/**
 * g_unichar_totitle:
 * @c: a unicode character
 *
 * Convert a character to titlecase.
 *
 * Characters that appear in any column of title_table map to the
 * titlecase form of their row. Other lowercase letters map to their
 * uppercase form.
 *
 * Return value: the result of converting @c to titlecase.
 *               If @c is not an uppercase or lowercase character,
 *               or has no such mapping, @c is returned unchanged.
 **/
gunichar
g_unichar_totitle (gunichar c)
{
  unsigned int i;

  for (i = 0; i < asize (title_table); ++i)
    {
      if (title_table[i][0] == c || title_table[i][1] == c
          || title_table[i][2] == c)
        return title_table[i][0];
    }

  if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER)
    {
      gunichar upper = ATTTABLE (c >> 8, c & 0xff);

      /* A result of 0 means ATTTABLE has no mapping for this letter;
         fall through and return it unchanged instead of U+0000. */
      if (upper)
        return upper;
    }

  return c;
}
/**
 * g_unichar_digit_value:
 * @c: a unicode character
 *
 * Determines the numeric value of a character as a decimal
 * digit.
 *
 * Return value: If @c is a decimal digit (according to
 * `g_unichar_isdigit'), its numeric value. Otherwise, -1.
 **/
int
g_unichar_digit_value (gunichar c)
{
  if (TYPE (c) != G_UNICODE_DECIMAL_NUMBER)
    return -1;

  /* For decimal numbers the attribute table holds the digit value. */
  return ATTTABLE (c >> 8, c & 0xff);
}
/**
 * g_unichar_xdigit_value:
 * @c: a unicode character
 *
 * Determines the numeric value of a character as a hexadecimal
 * digit.
 *
 * Return value: If @c is a hex digit (according to
 * `g_unichar_isxdigit'), its numeric value. Otherwise, -1.
 **/
int
g_unichar_xdigit_value (gunichar c)
{
  /* The letters A-F / a-f stand for the values 10-15. */
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;

  /* For decimal numbers the attribute table holds the digit value. */
  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
    return ATTTABLE (c >> 8, c & 0xff);

  return -1;
}
/**
 * g_unichar_type:
 * @c: a unicode character
 *
 * Classifies a unicode character by type.
 *
 * Return value: the type of the character.
 **/
GUnicodeType
g_unichar_type (gunichar c)
{
  GUnicodeType category = TYPE (c);

  return category;
}

483
gutf8.c Normal file
View File

@ -0,0 +1,483 @@
/* gutf8.c - Operations on UTF-8 strings.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <config.h>
#include <stdlib.h>
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#include <string.h>
#include "glib.h"
/* UTF8_COMPUTE (Char, Mask, Len):
 * Classify Char, the first byte of a UTF-8 sequence. Len is set to
 * the length of the sequence in bytes (1 to 6) and Mask to the mask
 * selecting the payload bits of the first byte. If Char cannot start
 * a sequence (a continuation byte 0x80-0xbf, or 0xfe / 0xff), Len is
 * set to -1 and Mask is left untouched.
 *
 * The macro expands to an if/else chain that ends in a semicolon, not
 * to an expression; Char is evaluated several times and is not
 * parenthesized, so pass a plain unsigned variable.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
Len = 1; \
Mask = 0x7f; \
} \
else if ((Char & 0xe0) == 0xc0) \
{ \
Len = 2; \
Mask = 0x1f; \
} \
else if ((Char & 0xf0) == 0xe0) \
{ \
Len = 3; \
Mask = 0x0f; \
} \
else if ((Char & 0xf8) == 0xf0) \
{ \
Len = 4; \
Mask = 0x07; \
} \
else if ((Char & 0xfc) == 0xf8) \
{ \
Len = 5; \
Mask = 0x03; \
} \
else if ((Char & 0xfe) == 0xfc) \
{ \
Len = 6; \
Mask = 0x01; \
} \
else \
Len = -1;
/* UTF8_GET (Result, Chars, Count, Mask, Len):
 * Decode the Len-byte sequence at Chars into Result, using the Mask
 * and Len computed by UTF8_COMPUTE. Count is a scratch loop variable.
 * Each following byte must look like 10xxxxxx and contributes six
 * bits; if one does not, Result is set to -1 and decoding stops.
 *
 * The macro expands to two statements (an assignment and a for loop),
 * so it must not be used as the unbraced body of an if or a loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
(Result) = (Chars)[0] & (Mask); \
for ((Count) = 1; (Count) < (Len); ++(Count)) \
{ \
if (((Chars)[(Count)] & 0xc0) != 0x80) \
{ \
(Result) = -1; \
break; \
} \
(Result) <<= 6; \
(Result) |= ((Chars)[(Count)] & 0x3f); \
}
/* Number of bytes to skip for each possible first byte of a UTF-8
 * sequence, as used by g_utf8_next_char().
 *
 * 0x00-0xbf: 1 (ASCII, and stray continuation bytes)
 * 0xc0-0xdf: 2, 0xe0-0xef: 3, 0xf0-0xf7: 4, 0xf8-0xfb: 5, 0xfc-0xfd: 6
 * 0xfe-0xff: 1 -- these bytes never occur in UTF-8. They must still
 *   advance by one byte: with a skip of 0, every loop built on
 *   g_utf8_next_char() (g_utf8_strlen() etc.) would spin forever on
 *   such input.
 */
gchar g_utf8_skip[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/**
 * g_utf8_find_prev_char:
 * @str: pointer to the beginning of a UTF-8 string
 * @p: pointer to some position within @str
 *
 * Given a position @p within a UTF-8 encoded string @str, find the
 * start of the previous UTF-8 character starting before @p. Returns
 * %NULL if no UTF-8 characters are present in @str before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No
 * check is made to see if the character found is actually valid other
 * than it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character or %NULL.
 **/
gchar *
g_utf8_find_prev_char (const char *str,
                       const char *p)
{
  /* Walk back one byte at a time, down to and including the first
     byte of the string: the previous character may well start at
     @str itself. The pointer is never moved below @str. */
  while (p > str)
    {
      --p;
      if ((*p & 0xc0) != 0x80)
        return (gchar *)p;
    }

  return NULL;
}
/**
 * g_utf8_find_next_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 * @end: a pointer to the end of the string, or %NULL to indicate
 *        that the string is nul-terminated, in which case
 *        the returned value will never be %NULL
 *
 * Find the start of the next UTF-8 character in the string after @p.
 * If @p already points at a nul byte it is not advanced.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No
 * check is made to see if the character found is actually valid other
 * than it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character, or %NULL if the
 *   search stopped at @end
 **/
gchar *
g_utf8_find_next_char (const gchar *p,
                       const gchar *end)
{
  if (*p != '\0')
    {
      /* Step off the current byte, then over any continuation bytes
         (10xxxxxx), without running past @end when one was given. */
      ++p;
      while ((end == NULL || p < end) && (*p & 0xc0) == 0x80)
        ++p;
    }

  if (p != end)
    return (gchar *)p;

  return NULL;
}
/**
 * g_utf8_prev_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 *
 * Find the previous UTF-8 character in the string before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No
 * check is made to see if the character found is actually valid other
 * than it starts with an appropriate byte. There is no lower bound on
 * the search: if @p might be the first character of the string, you
 * must use g_utf8_find_prev_char instead.
 *
 * Return value: a pointer to the found character.
 **/
gchar *
g_utf8_prev_char (const gchar *p)
{
  /* Back up at least one byte, then keep going while we are looking
     at a continuation byte (10xxxxxx). */
  do
    p--;
  while ((*p & 0xc0) == 0x80);

  return (gchar *)p;
}
/**
 * g_utf8_strlen:
 * @p: pointer to the start of a UTF-8 string.
 * @max: the maximum number of bytes to examine. If @max
 *       is less than 0, then the string is assumed to be
 *       nul-terminated.
 *
 * Counts the characters in a UTF-8 string. Counting stops at a nul
 * byte or once @max bytes have been covered; a character that starts
 * before the limit but extends past it is still counted.
 *
 * Return value: the length of the string in characters
 */
gint
g_utf8_strlen (const gchar *p, gint max)
{
  const gchar *cur = p;
  gint n_chars = 0;

  /* special case for the empty string */
  if (*p == '\0')
    return 0;

  /* The loop condition asks whether the limit has just been reached;
     the check inside the loop asks whether the last step went beyond
     it, e.g. because the buffer ends in the middle of a character. */
  while (max < 0 || cur - p < max)
    {
      cur = g_utf8_next_char (cur);
      n_chars++;

      if (*cur == '\0')
        break;
      if (max > 0 && cur - p > max)
        break;
    }

  return n_chars;
}
/**
 * g_utf8_get_char:
 * @p: a pointer to unicode character encoded as UTF-8
 *
 * Convert a sequence of bytes encoded as UTF-8 to a unicode character.
 *
 * Return value: the resulting character or (gunichar)-1 if @p does
 * not point to a valid UTF-8 encoded unicode character
 **/
gunichar
g_utf8_get_char (const gchar *p)
{
  unsigned char lead = (unsigned char) *p;
  int payload_mask = 0;
  int seq_len;
  int idx;
  gunichar wc;

  /* The first byte determines the sequence length and which of its
     bits carry payload. */
  UTF8_COMPUTE (lead, payload_mask, seq_len);

  if (seq_len != -1)
    {
      UTF8_GET (wc, p, idx, payload_mask, seq_len);
      return wc;
    }

  return (gunichar)-1;
}
/**
 * g_utf8_offset_to_pointer:
 * @str: a UTF-8 encoded string
 * @offset: a character offset within the string.
 *
 * Converts from an integer character offset to a pointer to a position
 * within the string. No check for the end of the string is made, so
 * @offset must not exceed the length of @str.
 *
 * Return value: the resulting pointer
 **/
gchar *
g_utf8_offset_to_pointer (const gchar *str,
                          gint         offset)
{
  const gchar *cursor = str;

  for (; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *)cursor;
}
/**
 * g_utf8_pointer_to_offset:
 * @str: a UTF-8 encoded string
 * @pos: a pointer to a position within @str
 *
 * Converts from a pointer to position within a string to a integer
 * character offset, by counting the characters stepped over between
 * @str and @pos.
 *
 * Return value: the resulting character offset
 **/
gint
g_utf8_pointer_to_offset (const gchar *str,
                          const gchar *pos)
{
  const gchar *cursor;
  gint n_chars = 0;

  for (cursor = str; cursor < pos; cursor = g_utf8_next_char (cursor))
    n_chars++;

  return n_chars;
}
/**
 * g_utf8_strncpy:
 * @dest: buffer to fill with characters from @src
 * @src: UTF-8 string
 * @n: character count
 *
 * Like the standard C strncpy() function, but copies a given number
 * of characters instead of a given number of bytes. Unlike strncpy(),
 * the result is always nul-terminated; @dest must have room for the
 * copied bytes plus the terminator.
 *
 * Return value: @dest
 **/
gchar *
g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
{
  const gchar *stop = src;
  size_t n_bytes;

  /* Find the end of the first n characters (or of the string). */
  for (; n != 0 && *stop != '\0'; n--)
    stop = g_utf8_next_char (stop);

  n_bytes = stop - src;
  strncpy (dest, src, n_bytes);
  dest[n_bytes] = 0;

  return dest;
}
/* Determine the charset of the current locale.
 *
 * The CHARSET environment variable is consulted first, then
 * nl_langinfo() where available. If @a is non-NULL and *@a is still
 * NULL, the first charset name found is stored there, or "US-ASCII"
 * when nothing was found; *@a is never overwritten once set.
 *
 * The stored pointers come straight from getenv() / nl_langinfo() and
 * are not copied. NOTE(review): such strings may be invalidated by
 * later environment or locale changes -- confirm that callers can
 * live with that.
 *
 * Returns TRUE if the charset is UTF-8: a CHARSET value merely has to
 * contain "UTF-8", the nl_langinfo() result has to match exactly.
 */
static gboolean
g_utf8_get_charset_internal (char **a)
{
  char *charset = getenv ("CHARSET");

  if (charset && a && ! *a)
    *a = charset;

  if (charset && strstr (charset, "UTF-8"))
    return TRUE;

  /* Test whether CODESET is defined rather than whether it is
     non-zero: "#elif CODESET" silently skips the query when the macro
     has the value 0 or expands to an enumerator. */
#if defined (_NL_CTYPE_CODESET_NAME)
  charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
#elif defined (CODESET)
  charset = nl_langinfo (CODESET);
#else
  charset = NULL;
#endif

  if (charset)
    {
      if (a && ! *a)
        *a = charset;
      if (strcmp (charset, "UTF-8") == 0)
        return TRUE;
    }

  if (a && ! *a)
    *a = "US-ASCII";

  /* Assume this for compatibility at present.  */
  return FALSE;
}
/* Result of the locale probe: -1 until g_get_charset() has run once,
 * afterwards the value returned by g_utf8_get_charset_internal(). */
static int utf8_locale_cache = -1;
/* Charset name stored by the first probe; NULL until then. */
static char *utf8_charset_cache = NULL;

/**
 * g_get_charset:
 * @charset: location to store the charset name, or %NULL
 *
 * Reports whether the current locale uses UTF-8. The locale is only
 * probed on the first call; later calls hand back the cached answer.
 *
 * Return value: the (cached) result of g_utf8_get_charset_internal().
 **/
gboolean
g_get_charset (char **charset)
{
  if (utf8_locale_cache == -1)
    utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);

  if (charset != NULL)
    *charset = utf8_charset_cache;

  return utf8_locale_cache;
}
/* unicode_strchr */
/**
 * g_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *
 * Encodes a single character as UTF-8. No terminating nul byte is
 * written.
 *
 * Return value: number of bytes written (1 to 6)
 **/
int
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
{
  int lead_bits;   /* marker bits of the first byte */
  int n_bytes;     /* total length of the encoding */
  int pos;

  if (c < 0x80)
    {
      lead_bits = 0;
      n_bytes = 1;
    }
  else if (c < 0x800)
    {
      lead_bits = 0xc0;
      n_bytes = 2;
    }
  else if (c < 0x10000)
    {
      lead_bits = 0xe0;
      n_bytes = 3;
    }
  else if (c < 0x200000)
    {
      lead_bits = 0xf0;
      n_bytes = 4;
    }
  else if (c < 0x4000000)
    {
      lead_bits = 0xf8;
      n_bytes = 5;
    }
  else
    {
      lead_bits = 0xfc;
      n_bytes = 6;
    }

  /* Fill the continuation bytes from the end, six payload bits each. */
  pos = n_bytes;
  while (--pos > 0)
    {
      outbuf[pos] = (c & 0x3f) | 0x80;
      c >>= 6;
    }

  /* Whatever is left of c belongs in the lead byte. */
  outbuf[0] = c | lead_bits;

  return n_bytes;
}
/**
 * g_utf8_strchr:
 * @p: a nul-terminated utf-8 string
 * @c: a iso-10646 character
 *
 * Finds the leftmost occurrence of the given iso-10646 character
 * in a UTF-8 string. The character is encoded as UTF-8 and the
 * resulting byte sequence is searched for with strstr().
 *
 * Return value: %NULL if the string does not contain the character,
 *   otherwise a pointer to the start of its leftmost occurrence.
 **/
gchar *
g_utf8_strchr (const char *p, gunichar c)
{
  gchar needle[10];
  gint needle_len;

  needle_len = g_unichar_to_utf8 (c, needle);
  needle[needle_len] = '\0';

  return strstr (p, needle);
}
#if 0
/**
 * g_utf8_strrchr:
 * @p: a nul-terminated utf-8 string
 * @c: a iso-10646 character
 *
 * Find the rightmost occurrence of the given iso-10646 character
 * in a UTF-8 string.
 *
 * Return value: %NULL if the string does not contain the character,
 *   otherwise a pointer to the start of its rightmost occurrence.
 **/
/* This is ifdefed out atm as there is no strrstr function in libc.
 *
 * NOTE(review): gunicode.h declares this function as g_utf8_strrchr;
 * the definition below is still called unicode_strrchr and must be
 * renamed when it is enabled.
 */
gchar *
unicode_strrchr (const char *p, gunichar c)
{
  gchar ch[10];
  gint len = g_unichar_to_utf8 (c, ch);   /* was used undeclared */
  ch[len] = '\0';
  return strrstr (p, ch);
}
#endif
/**
 * g_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum number of bytes of @str to convert; passed on to
 *   g_utf8_strlen(), so a negative value means @str is nul-terminated
 *
 * Convert a string from UTF-8 to a 32-bit fixed width
 * representation as UCS-4.
 *
 * Return value: a pointer to a newly allocated UCS-4 string,
 *   terminated by a 0 character. This value must be freed
 *   with g_free()
 **/
gunichar *
g_utf8_to_ucs4 (const char *str, int len)
{
  gunichar *result;
  gint n_chars, i;
  const gchar *p;

  n_chars = g_utf8_strlen (str, len);

  /* The function does not report the length, so reserve one extra
     slot for a terminating 0 that lets callers find the end. */
  result = g_new (gunichar, n_chars + 1);

  p = str;
  for (i = 0; i < n_chars; i++)
    {
      result[i] = g_utf8_get_char (p);
      p = g_utf8_next_char (p);
    }
  result[n_chars] = 0;

  return result;
}