From 0584fe33de86f9fd8effe006d911cf5948b4ad05 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Mon, 18 Jul 2011 17:52:40 -0400 Subject: [PATCH] Bug 654651 - Better g_unicode_canonical_decomposition() Add g_unichar_fully_decompose(). Deprecate g_unicode_canonical_decomposition(). --- docs/reference/glib/glib-sections.txt | 1 + glib/glib.symbols | 3 +- glib/gunicode.h | 9 ++-- glib/gunidecomp.c | 78 +++++++++++++++++++++++++-- glib/tests/unicode.c | 72 +++++++++++++++++++++++-- 5 files changed, 150 insertions(+), 13 deletions(-) diff --git a/docs/reference/glib/glib-sections.txt b/docs/reference/glib/glib-sections.txt index 627bf67c5..3a0160da6 100644 --- a/docs/reference/glib/glib-sections.txt +++ b/docs/reference/glib/glib-sections.txt @@ -2704,6 +2704,7 @@ g_unichar_digit_value g_unichar_xdigit_value g_unichar_compose g_unichar_decompose +g_unichar_fully_decompose GUnicodeType g_unichar_type GUnicodeBreakType diff --git a/glib/glib.symbols b/glib/glib.symbols index fe6babaca..df978ec41 100644 --- a/glib/glib.symbols +++ b/glib/glib.symbols @@ -1196,7 +1196,6 @@ g_tree_search g_tree_steal g_tree_traverse g_unichar_break_type -g_unicode_canonical_ordering g_unichar_combining_class g_unichar_compose g_unichar_decompose @@ -1216,6 +1215,7 @@ g_unichar_iswide g_unichar_iswide_cjk g_unichar_isxdigit g_unichar_iszerowidth +g_unichar_fully_decompose g_unichar_tolower g_unichar_totitle g_unichar_toupper @@ -1226,6 +1226,7 @@ g_unichar_digit_value g_unichar_xdigit_value g_unichar_type g_unicode_canonical_decomposition +g_unicode_canonical_ordering g_utf8_casefold g_utf8_collate g_utf8_collate_key diff --git a/glib/gunicode.h b/glib/gunicode.h index ca0569e6e..6c08e7d29 100644 --- a/glib/gunicode.h +++ b/glib/gunicode.h @@ -295,6 +295,11 @@ gboolean g_unichar_decompose (gunichar ch, gunichar *a, gunichar *b); +gsize g_unichar_fully_decompose (gunichar ch, + gboolean compat, + gunichar *result, + gsize result_len); + /* Compute canonical ordering of a string in-place. 
This rearranges decomposed characters in the string according to their combining classes. See the Unicode manual for more information. */ @@ -302,9 +307,7 @@ void g_unicode_canonical_ordering (gunichar *string, gsize len); -/* Compute canonical decomposition of a character. Returns g_malloc()d - string of Unicode characters. RESULT_LEN is set to the resulting - length of the string. */ +/* Deprecated. Use g_unichar_fully_decompose() */ gunichar *g_unicode_canonical_decomposition (gunichar ch, gsize *result_len) G_GNUC_MALLOC; diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c index d9ef8f0ba..d1f98975b 100644 --- a/glib/gunidecomp.c +++ b/glib/gunidecomp.c @@ -127,7 +127,7 @@ g_unicode_canonical_ordering (gunichar *string, * only calculate the result_len; however, a buffer with space for three * characters will always be big enough. */ static void -decompose_hangul (gunichar s, +decompose_hangul (gunichar s, gunichar *r, gsize *result_len) { @@ -217,6 +217,9 @@ find_decomposition (gunichar ch, * * Return value: a newly allocated string of Unicode characters. * @result_len is set to the resulting length of the string. + * + * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose() + * instead. 
**/ gunichar * g_unicode_canonical_decomposition (gunichar ch, @@ -227,7 +230,7 @@ g_unicode_canonical_decomposition (gunichar ch, gunichar *r; /* Hangul syllable */ - if (ch >= 0xac00 && ch <= 0xd7a3) + if (ch >= SBase && ch < SBase + SCount) { decompose_hangul (ch, NULL, result_len); r = g_malloc (*result_len * sizeof (gunichar)); @@ -363,7 +366,7 @@ _g_utf8_normalize_wc (const gchar *str, const gchar *decomp; gunichar wc = g_utf8_get_char (p); - if (wc >= 0xac00 && wc <= 0xd7a3) + if (wc >= SBase && wc < SBase + SCount) { gsize result_len; decompose_hangul (wc, NULL, &result_len); @@ -394,7 +397,7 @@ _g_utf8_normalize_wc (const gchar *str, int cc; gsize old_n_wc = n_wc; - if (wc >= 0xac00 && wc <= 0xd7a3) + if (wc >= SBase && wc < SBase + SCount) { gsize result_len; decompose_hangul (wc, wc_buffer + n_wc, &result_len); @@ -592,7 +595,7 @@ decompose_hangul_step (gunichar ch, * further, but @a may itself decompose. To get the full * canonical decomposition for @ch, one would need to * recursively call this function on @a. Or use - * g_unicode_canonical_decomposition(). + * g_unichar_fully_decompose(). * * See UAX#15 * for details. @@ -678,3 +681,68 @@ g_unichar_compose (gunichar a, *ch = 0; return FALSE; } + +/** + * g_unichar_fully_decompose: + * @ch: a Unicode character. + * @compat: whether to perform canonical or compatibility decomposition + * @result: location to store decomposed result, or %NULL + * @result_len: length of @result + * + * Computes the canonical or compatibility decomposition of a + * Unicode character. For compatibility decomposition, + * pass %TRUE for @compat; for canonical decomposition + * pass %FALSE for @compat. + * + * The decomposed sequence is placed in @result. Only up to + * @result_len characters are written into @result. The length + * of the full decomposition (irrespective of @result_len) is + * returned by the function.
For canonical decomposition, a + * result buffer of length 4 is always enough, whereas for + * compatibility decomposition, a buffer of 18 is enough. + * + * See UAX#15 + * for details. + * + * Return value: the length of the full decomposition. + * + * Since: 2.30 + **/ +gsize +g_unichar_fully_decompose (gunichar ch, + gboolean compat, + gunichar *result, + gsize result_len) +{ + const gchar *decomp; + const gchar *p; + + /* Hangul syllable */ + if (ch >= SBase && ch < SBase + SCount) + { + gsize len, i; + gunichar buffer[3]; + decompose_hangul (ch, result ? buffer : NULL, &len); + if (result) + for (i = 0; i < len && i < result_len; i++) + result[i] = buffer[i]; + return len; + } + else if ((decomp = find_decomposition (ch, compat)) != NULL) + { + /* Found it; as in the other branches, nothing is stored when result is NULL. */ + gsize len, i; + + len = g_utf8_strlen (decomp, -1); + + for (p = decomp, i = 0; result && i < len && i < result_len; p = g_utf8_next_char (p), i++) + result[i] = g_utf8_get_char (p); + + return len; + } + + /* Does not decompose */ + if (result && result_len >= 1) + *result = ch; + return 1; +} diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c index 8a43388bc..91f224b3c 100644 --- a/glib/tests/unicode.c +++ b/glib/tests/unicode.c @@ -482,19 +482,18 @@ test_decompose (void) } static void -test_canonical_decomposition (void) +test_fully_decompose_canonical (void) { - gunichar *decomp; + gunichar decomp[5]; gsize len; #define TEST_DECOMP(ch, expected_len, a, b, c, d) \ - decomp = g_unicode_canonical_decomposition (ch, &len); \ + len = g_unichar_fully_decompose (ch, FALSE, decomp, G_N_ELEMENTS (decomp)); \ g_assert_cmpint (expected_len, ==, len); \ if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \ if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \ if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \ if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \ - g_free (d); #define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0) #define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0) @@
-523,6 +522,54 @@ test_canonical_decomposition (void) TEST2 (0xD4CC, 0x1111, 0x1171); TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8); TEST2 (0xCE20, 0x110E, 0x1173); + +#undef TEST_DECOMP +} + +static void +test_canonical_decomposition (void) +{ + gunichar *decomp; + gsize len; + +#define TEST_DECOMP(ch, expected_len, a, b, c, d) \ + decomp = g_unicode_canonical_decomposition (ch, &len); \ + g_assert_cmpint (expected_len, ==, len); \ + if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \ + if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \ + if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \ + if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \ + g_free (decomp); + +#define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0) +#define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0) +#define TEST2(ch, a, b) TEST_DECOMP (ch, 2, a, b, 0, 0) +#define TEST3(ch, a, b, c) TEST_DECOMP (ch, 3, a, b, c, 0) +#define TEST4(ch, a, b, c, d) TEST_DECOMP (ch, 4, a, b, c, d) + + /* Not decomposable */ + TEST0 (0x0041); + TEST0 (0xFB01); + + /* Singletons */ + TEST2 (0x212B, 0x0041, 0x030A); + TEST1 (0x2126, 0x03A9); + + /* General */ + TEST2 (0x00C5, 0x0041, 0x030A); + TEST2 (0x00F4, 0x006F, 0x0302); + TEST3 (0x1E69, 0x0073, 0x0323, 0x0307); + TEST2 (0x1E63, 0x0073, 0x0323); + TEST2 (0x1E0B, 0x0064, 0x0307); + TEST2 (0x1E0D, 0x0064, 0x0323); + + /* Hangul */ + TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6); + TEST2 (0xD4CC, 0x1111, 0x1171); + TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8); + TEST2 (0xCE20, 0x110E, 0x1173); + +#undef TEST_DECOMP } static void @@ -540,6 +587,21 @@ test_decompose_tail (void) g_assert (a == ch && b == 0); } +static void +test_fully_decompose_len (void) +{ + gunichar ch; + + /* For every ch below 0x110000, check the buffer sizes stated in the g_unichar_fully_decompose() documentation: canonical decompositions are at most 4 in length, + * and compatibility decompositions are at most 18 in length. Passing (NULL, 0) only queries the length; nothing is stored.
+ */ + + for (ch = 0; ch < 0x110000; ch++) { + g_assert_cmpint (g_unichar_fully_decompose (ch, FALSE, NULL, 0), <=, 4); + g_assert_cmpint (g_unichar_fully_decompose (ch, TRUE, NULL, 0), <=, 18); + } +} + int main (int argc, char *argv[]) @@ -557,8 +619,10 @@ main (int argc, g_test_add_func ("/unicode/wide", test_wide); g_test_add_func ("/unicode/compose", test_compose); g_test_add_func ("/unicode/decompose", test_decompose); + g_test_add_func ("/unicode/fully-decompose-canonical", test_fully_decompose_canonical); g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition); g_test_add_func ("/unicode/decompose-tail", test_decompose_tail); + g_test_add_func ("/unicode/fully-decompose-len", test_fully_decompose_len); return g_test_run(); }