Add Hangul composition and decomposition to Unicode normalization.

2003-12-04  Noah Levitt  <nlevitt@columbia.edu>

	* glib/gunidecomp.c: Add hangul composition and decomposition to
	unicode normalization. (#100456)

	* tests/unicode-normalize.c: Test hangul.
This commit is contained in:
Noah Levitt 2003-12-04 19:47:52 +00:00 committed by Noah Levitt
parent c9ca629947
commit 45b1590923
8 changed files with 167 additions and 23 deletions

View File

@ -1,3 +1,10 @@
2003-12-04 Noah Levitt <nlevitt@columbia.edu>
* glib/gunidecomp.c: Add hangul composition and decomposition to
unicode normalization. (#100456)
* tests/unicode-normalize.c: Test hangul.
Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de> Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de>
Fix for #103710, Mark Jones: Fix for #103710, Mark Jones:

View File

@ -1,3 +1,10 @@
2003-12-04 Noah Levitt <nlevitt@columbia.edu>
* glib/gunidecomp.c: Add hangul composition and decomposition to
unicode normalization. (#100456)
* tests/unicode-normalize.c: Test hangul.
Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de> Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de>
Fix for #103710, Mark Jones: Fix for #103710, Mark Jones:

View File

@ -1,3 +1,10 @@
2003-12-04 Noah Levitt <nlevitt@columbia.edu>
* glib/gunidecomp.c: Add hangul composition and decomposition to
unicode normalization. (#100456)
* tests/unicode-normalize.c: Test hangul.
Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de> Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de>
Fix for #103710, Mark Jones: Fix for #103710, Mark Jones:

View File

@ -1,3 +1,10 @@
2003-12-04 Noah Levitt <nlevitt@columbia.edu>
* glib/gunidecomp.c: Add hangul composition and decomposition to
unicode normalization. (#100456)
* tests/unicode-normalize.c: Test hangul.
Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de> Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de>
Fix for #103710, Mark Jones: Fix for #103710, Mark Jones:

View File

@ -1,3 +1,10 @@
2003-12-04 Noah Levitt <nlevitt@columbia.edu>
* glib/gunidecomp.c: Add hangul composition and decomposition to
unicode normalization. (#100456)
* tests/unicode-normalize.c: Test hangul.
Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de> Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de>
Fix for #103710, Mark Jones: Fix for #103710, Mark Jones:

View File

@ -1,3 +1,10 @@
2003-12-04 Noah Levitt <nlevitt@columbia.edu>
* glib/gunidecomp.c: Add hangul composition and decomposition to
unicode normalization. (#100456)
* tests/unicode-normalize.c: Test hangul.
Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de> Tue Dec 2 02:29:41 2003 Matthias Clasen <maclas@gmx.de>
Fix for #103710, Mark Jones: Fix for #103710, Mark Jones:

View File

@ -52,6 +52,17 @@ _g_unichar_combining_class (gunichar uc)
return COMBINING_CLASS (uc); return COMBINING_CLASS (uc);
} }
/* constants for hangul syllable [de]composition */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
/** /**
* g_unicode_canonical_ordering: * g_unicode_canonical_ordering:
* @string: a UCS-4 encoded string. * @string: a UCS-4 encoded string.
@ -100,6 +111,47 @@ g_unicode_canonical_ordering (gunichar *string,
} }
} }
/* Arithmetic decomposition of a precomposed hangul syllable, see
 * http://www.unicode.org/unicode/reports/tr15/#Hangul
 *
 * s:          the character to decompose.
 * r:          output buffer, or NULL to only compute the length.  When
 *             non-NULL it must have room for the result; three characters
 *             is always enough.
 * result_len: set to the number of characters in the decomposition:
 *             1 if s is not a hangul syllable (the result is s itself),
 *             2 for an LV syllable, 3 for an LVT syllable.
 */
static void
decompose_hangul (gunichar s,
                  gunichar *r,
                  gsize *result_len)
{
  gint syllable_index = s - SBase;
  gint trailing_index;

  /* Outside the precomposed syllable block: s decomposes to itself. */
  if (syllable_index < 0 || syllable_index >= SCount)
    {
      if (r)
        r[0] = s;
      *result_len = 1;
      return;
    }

  /* Index 0 means the syllable has no trailing consonant. */
  trailing_index = syllable_index % TCount;
  *result_len = (trailing_index != 0) ? 3 : 2;

  /* Length-only query: nothing to write. */
  if (!r)
    return;

  r[0] = LBase + syllable_index / NCount;
  r[1] = VBase + (syllable_index % NCount) / TCount;
  if (trailing_index != 0)
    r[2] = TBase + trailing_index;
}
/* returns a pointer to a null-terminated UTF-8 string */ /* returns a pointer to a null-terminated UTF-8 string */
static const gchar * static const gchar *
find_decomposition (gunichar ch, find_decomposition (gunichar ch,
@ -159,11 +211,18 @@ gunichar *
g_unicode_canonical_decomposition (gunichar ch, g_unicode_canonical_decomposition (gunichar ch,
gsize *result_len) gsize *result_len)
{ {
const gchar *decomp = find_decomposition (ch, FALSE); const gchar *decomp;
const gchar *p; const gchar *p;
gunichar *r; gunichar *r;
if (decomp) /* Hangul syllable */
if (ch >= 0xac00 && ch <= 0xd7af)
{
decompose_hangul (ch, NULL, result_len);
r = g_malloc (*result_len * sizeof (gunichar));
decompose_hangul (ch, r, result_len);
}
else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
{ {
/* Found it. */ /* Found it. */
int i; int i;
@ -188,6 +247,34 @@ g_unicode_canonical_decomposition (gunichar ch,
return r; return r;
} }
/* Arithmetic composition of hangul: L,V => LV and LV,T => LVT
 *
 * a:      the first character (a leading consonant jamo, or an LV syllable).
 * b:      the second character (a vowel jamo, or a trailing consonant jamo).
 * result: on success, receives the composed syllable; untouched otherwise.
 *
 * Returns TRUE if a and b compose to a hangul syllable, FALSE if they do
 * not form a composable hangul pair.
 */
static gboolean
combine_hangul (gunichar a,
                gunichar b,
                gunichar *result)
{
  gint LIndex = a - LBase;
  gint SIndex = a - SBase;
  gint VIndex = b - VBase;
  gint TIndex = b - TBase;

  /* leading consonant + vowel => LV syllable */
  if (0 <= LIndex && LIndex < LCount
      && 0 <= VIndex && VIndex < VCount)
    {
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
      return TRUE;
    }
  /* LV syllable (no trailing consonant yet) + trailing consonant => LVT.
   * TIndex must be strictly between 0 and TCount: TBase itself is not a
   * trailing consonant (index 0 stands for "none"), and TBase + TCount is
   * past the last one, so adding it would produce a different LV syllable. */
  else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
           && 0 < TIndex && TIndex < TCount)
    {
      *result = a + TIndex;
      return TRUE;
    }

  return FALSE;
}
#define CI(Page, Char) \ #define CI(Page, Char) \
((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
@ -203,6 +290,9 @@ combine (gunichar a,
{ {
gushort index_a, index_b; gushort index_a, index_b;
if (combine_hangul (a, b, result))
return TRUE;
index_a = COMPOSE_INDEX(a); index_a = COMPOSE_INDEX(a);
if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
@ -262,14 +352,24 @@ _g_utf8_normalize_wc (const gchar *str,
p = str; p = str;
while ((max_len < 0 || p < str + max_len) && *p) while ((max_len < 0 || p < str + max_len) && *p)
{ {
const gchar *decomp;
gunichar wc = g_utf8_get_char (p); gunichar wc = g_utf8_get_char (p);
const gchar *decomp = find_decomposition (wc, do_compat); if (wc >= 0xac00 && wc <= 0xd7af)
{
gint result_len;
decompose_hangul (wc, NULL, &result_len);
n_wc += result_len;
}
else
{
decomp = find_decomposition (wc, do_compat);
if (decomp) if (decomp)
n_wc += g_utf8_strlen (decomp, -1); n_wc += g_utf8_strlen (decomp, -1);
else else
n_wc++; n_wc++;
}
p = g_utf8_next_char (p); p = g_utf8_next_char (p);
} }
@ -286,6 +386,14 @@ _g_utf8_normalize_wc (const gchar *str,
int cc; int cc;
gsize old_n_wc = n_wc; gsize old_n_wc = n_wc;
if (wc >= 0xac00 && wc <= 0xd7af)
{
gint result_len;
decompose_hangul (wc, wc_buffer + n_wc, &result_len);
n_wc += result_len;
}
else
{
decomp = find_decomposition (wc, do_compat); decomp = find_decomposition (wc, do_compat);
if (decomp) if (decomp)
@ -296,6 +404,7 @@ _g_utf8_normalize_wc (const gchar *str,
} }
else else
wc_buffer[n_wc++] = wc; wc_buffer[n_wc++] = wc;
}
if (n_wc > 0) if (n_wc > 0)
{ {

View File

@ -23,13 +23,6 @@ decode (const gchar *input)
exit (1); exit (1);
} }
/* FIXME: We don't handle the Hangul syllables */
if (ch >= 0xac00 && ch <= 0xd7ff) /* Hangul syllables */
{
g_string_free (result, TRUE);
return NULL;
}
g_string_append_unichar (result, ch); g_string_append_unichar (result, ch);
while (input[offset] && input[offset] != ' ') while (input[offset] && input[offset] != ' ')