mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-02-26 12:12:10 +01:00
Bug 654651 - Better g_unicode_canonical_decomposition()
Add g_unichar_fully_decompose(). Deprecate g_unicode_canonical_decomposition().
This commit is contained in:
parent
7f289c924b
commit
0584fe33de
@ -2704,6 +2704,7 @@ g_unichar_digit_value
|
|||||||
g_unichar_xdigit_value
|
g_unichar_xdigit_value
|
||||||
g_unichar_compose
|
g_unichar_compose
|
||||||
g_unichar_decompose
|
g_unichar_decompose
|
||||||
|
g_unichar_fully_decompose
|
||||||
GUnicodeType
|
GUnicodeType
|
||||||
g_unichar_type
|
g_unichar_type
|
||||||
GUnicodeBreakType
|
GUnicodeBreakType
|
||||||
|
@ -1196,7 +1196,6 @@ g_tree_search
|
|||||||
g_tree_steal
|
g_tree_steal
|
||||||
g_tree_traverse
|
g_tree_traverse
|
||||||
g_unichar_break_type
|
g_unichar_break_type
|
||||||
g_unicode_canonical_ordering
|
|
||||||
g_unichar_combining_class
|
g_unichar_combining_class
|
||||||
g_unichar_compose
|
g_unichar_compose
|
||||||
g_unichar_decompose
|
g_unichar_decompose
|
||||||
@ -1216,6 +1215,7 @@ g_unichar_iswide
|
|||||||
g_unichar_iswide_cjk
|
g_unichar_iswide_cjk
|
||||||
g_unichar_isxdigit
|
g_unichar_isxdigit
|
||||||
g_unichar_iszerowidth
|
g_unichar_iszerowidth
|
||||||
|
g_unichar_fully_decompose
|
||||||
g_unichar_tolower
|
g_unichar_tolower
|
||||||
g_unichar_totitle
|
g_unichar_totitle
|
||||||
g_unichar_toupper
|
g_unichar_toupper
|
||||||
@ -1226,6 +1226,7 @@ g_unichar_digit_value
|
|||||||
g_unichar_xdigit_value
|
g_unichar_xdigit_value
|
||||||
g_unichar_type
|
g_unichar_type
|
||||||
g_unicode_canonical_decomposition
|
g_unicode_canonical_decomposition
|
||||||
|
g_unicode_canonical_ordering
|
||||||
g_utf8_casefold
|
g_utf8_casefold
|
||||||
g_utf8_collate
|
g_utf8_collate
|
||||||
g_utf8_collate_key
|
g_utf8_collate_key
|
||||||
|
@ -295,6 +295,11 @@ gboolean g_unichar_decompose (gunichar ch,
|
|||||||
gunichar *a,
|
gunichar *a,
|
||||||
gunichar *b);
|
gunichar *b);
|
||||||
|
|
||||||
|
gsize g_unichar_fully_decompose (gunichar ch,
|
||||||
|
gboolean compat,
|
||||||
|
gunichar *result,
|
||||||
|
gsize result_len);
|
||||||
|
|
||||||
/* Compute canonical ordering of a string in-place. This rearranges
|
/* Compute canonical ordering of a string in-place. This rearranges
|
||||||
decomposed characters in the string according to their combining
|
decomposed characters in the string according to their combining
|
||||||
classes. See the Unicode manual for more information. */
|
classes. See the Unicode manual for more information. */
|
||||||
@ -302,9 +307,7 @@ void g_unicode_canonical_ordering (gunichar *string,
|
|||||||
gsize len);
|
gsize len);
|
||||||
|
|
||||||
|
|
||||||
/* Compute canonical decomposition of a character. Returns g_malloc()d
|
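/* Deprecated since 2.30: use g_unichar_fully_decompose() instead. Returns a g_malloc()d string of Unicode characters; RESULT_LEN is set to its length. */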
|
||||||
string of Unicode characters. RESULT_LEN is set to the resulting
|
|
||||||
length of the string. */
|
|
||||||
gunichar *g_unicode_canonical_decomposition (gunichar ch,
|
gunichar *g_unicode_canonical_decomposition (gunichar ch,
|
||||||
gsize *result_len) G_GNUC_MALLOC;
|
gsize *result_len) G_GNUC_MALLOC;
|
||||||
|
|
||||||
|
@ -127,7 +127,7 @@ g_unicode_canonical_ordering (gunichar *string,
|
|||||||
* only calculate the result_len; however, a buffer with space for three
|
* only calculate the result_len; however, a buffer with space for three
|
||||||
* characters will always be big enough. */
|
* characters will always be big enough. */
|
||||||
static void
|
static void
|
||||||
decompose_hangul (gunichar s,
|
decompose_hangul (gunichar s,
|
||||||
gunichar *r,
|
gunichar *r,
|
||||||
gsize *result_len)
|
gsize *result_len)
|
||||||
{
|
{
|
||||||
@ -217,6 +217,9 @@ find_decomposition (gunichar ch,
|
|||||||
*
|
*
|
||||||
* Return value: a newly allocated string of Unicode characters.
|
* Return value: a newly allocated string of Unicode characters.
|
||||||
* @result_len is set to the resulting length of the string.
|
* @result_len is set to the resulting length of the string.
|
||||||
|
*
|
||||||
|
* Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose()
|
||||||
|
* instead.
|
||||||
**/
|
**/
|
||||||
gunichar *
|
gunichar *
|
||||||
g_unicode_canonical_decomposition (gunichar ch,
|
g_unicode_canonical_decomposition (gunichar ch,
|
||||||
@ -227,7 +230,7 @@ g_unicode_canonical_decomposition (gunichar ch,
|
|||||||
gunichar *r;
|
gunichar *r;
|
||||||
|
|
||||||
/* Hangul syllable */
|
/* Hangul syllable */
|
||||||
if (ch >= 0xac00 && ch <= 0xd7a3)
|
if (ch >= SBase && ch < SBase + SCount)
|
||||||
{
|
{
|
||||||
decompose_hangul (ch, NULL, result_len);
|
decompose_hangul (ch, NULL, result_len);
|
||||||
r = g_malloc (*result_len * sizeof (gunichar));
|
r = g_malloc (*result_len * sizeof (gunichar));
|
||||||
@ -363,7 +366,7 @@ _g_utf8_normalize_wc (const gchar *str,
|
|||||||
const gchar *decomp;
|
const gchar *decomp;
|
||||||
gunichar wc = g_utf8_get_char (p);
|
gunichar wc = g_utf8_get_char (p);
|
||||||
|
|
||||||
if (wc >= 0xac00 && wc <= 0xd7a3)
|
if (wc >= SBase && wc < SBase + SCount)
|
||||||
{
|
{
|
||||||
gsize result_len;
|
gsize result_len;
|
||||||
decompose_hangul (wc, NULL, &result_len);
|
decompose_hangul (wc, NULL, &result_len);
|
||||||
@ -394,7 +397,7 @@ _g_utf8_normalize_wc (const gchar *str,
|
|||||||
int cc;
|
int cc;
|
||||||
gsize old_n_wc = n_wc;
|
gsize old_n_wc = n_wc;
|
||||||
|
|
||||||
if (wc >= 0xac00 && wc <= 0xd7a3)
|
if (wc >= SBase && wc < SBase + SCount)
|
||||||
{
|
{
|
||||||
gsize result_len;
|
gsize result_len;
|
||||||
decompose_hangul (wc, wc_buffer + n_wc, &result_len);
|
decompose_hangul (wc, wc_buffer + n_wc, &result_len);
|
||||||
@ -592,7 +595,7 @@ decompose_hangul_step (gunichar ch,
|
|||||||
* further, but @a may itself decompose. To get the full
|
* further, but @a may itself decompose. To get the full
|
||||||
* canonical decomposition for @ch, one would need to
|
* canonical decomposition for @ch, one would need to
|
||||||
* recursively call this function on @a. Or use
|
* recursively call this function on @a. Or use
|
||||||
* g_unicode_canonical_decomposition().
|
* g_unichar_fully_decompose().
|
||||||
*
|
*
|
||||||
* See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
|
* See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
|
||||||
* for details.
|
* for details.
|
||||||
@ -678,3 +681,68 @@ g_unichar_compose (gunichar a,
|
|||||||
*ch = 0;
|
*ch = 0;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* g_unichar_fully_decompose:
|
||||||
|
* @ch: a Unicode character.
|
||||||
|
* @compat: whether perform canonical or compatibility decomposition
|
||||||
|
* @result: location to store decomposed result, or %NULL
|
||||||
|
* @result_len: length of @result
|
||||||
|
*
|
||||||
|
* Computes the canonical or compatibility decomposition of a
|
||||||
|
* Unicode character. For compatibility decomposition,
|
||||||
|
* pass %TRUE for @compat; for canonical decomposition
|
||||||
|
* pass %FALSE for @compat.
|
||||||
|
*
|
||||||
|
* The decomposed sequence is placed in @result. Only up to
|
||||||
|
* @result_len characters are written into @result. The length
|
||||||
|
* of the full decomposition (irrespective of @result_len) is
|
||||||
|
* returned by the function. For canonical decomposition, a
|
||||||
|
* result buffer of length 4 is always enough, whereas for
|
||||||
|
* compatibility decomposition, a buffer of 18 is enough.
|
||||||
|
*
|
||||||
|
* See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
|
||||||
|
* for details.
|
||||||
|
*
|
||||||
|
* Return value: the length of the full decomposition.
|
||||||
|
*
|
||||||
|
* Since: 2.30
|
||||||
|
**/
|
||||||
|
gsize
|
||||||
|
g_unichar_fully_decompose (gunichar ch,
|
||||||
|
gboolean compat,
|
||||||
|
gunichar *result,
|
||||||
|
gsize result_len)
|
||||||
|
{
|
||||||
|
const gchar *decomp;
|
||||||
|
const gchar *p;
|
||||||
|
|
||||||
|
/* Hangul syllable */
|
||||||
|
if (ch >= SBase && ch < SBase + SCount)
|
||||||
|
{
|
||||||
|
gsize len, i;
|
||||||
|
gunichar buffer[3];
|
||||||
|
decompose_hangul (ch, result ? buffer : NULL, &len);
|
||||||
|
if (result)
|
||||||
|
for (i = 0; i < len && i < result_len; i++)
|
||||||
|
result[i] = buffer[i];
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
else if ((decomp = find_decomposition (ch, compat)) != NULL)
|
||||||
|
{
|
||||||
|
/* Found it. */
|
||||||
|
gsize len, i;
|
||||||
|
|
||||||
|
len = g_utf8_strlen (decomp, -1);
|
||||||
|
|
||||||
|
for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++)
|
||||||
|
result[i] = g_utf8_get_char (p);
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Does not decompose */
|
||||||
|
if (result && result_len >= 1)
|
||||||
|
*result = ch;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
@ -482,19 +482,18 @@ test_decompose (void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_canonical_decomposition (void)
|
test_fully_decompose_canonical (void)
|
||||||
{
|
{
|
||||||
gunichar *decomp;
|
gunichar decomp[5];
|
||||||
gsize len;
|
gsize len;
|
||||||
|
|
||||||
#define TEST_DECOMP(ch, expected_len, a, b, c, d) \
|
#define TEST_DECOMP(ch, expected_len, a, b, c, d) \
|
||||||
decomp = g_unicode_canonical_decomposition (ch, &len); \
|
len = g_unichar_fully_decompose (ch, FALSE, decomp, G_N_ELEMENTS (decomp)); \
|
||||||
g_assert_cmpint (expected_len, ==, len); \
|
g_assert_cmpint (expected_len, ==, len); \
|
||||||
if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
|
if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
|
||||||
if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
|
if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
|
||||||
if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
|
if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
|
||||||
if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
|
if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
|
||||||
g_free (d);
|
|
||||||
|
|
||||||
#define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0)
|
#define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0)
|
||||||
#define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0)
|
#define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0)
|
||||||
@ -523,6 +522,54 @@ test_canonical_decomposition (void)
|
|||||||
TEST2 (0xD4CC, 0x1111, 0x1171);
|
TEST2 (0xD4CC, 0x1111, 0x1171);
|
||||||
TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
|
TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
|
||||||
TEST2 (0xCE20, 0x110E, 0x1173);
|
TEST2 (0xCE20, 0x110E, 0x1173);
|
||||||
|
|
||||||
|
#undef TEST_DECOMP
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_canonical_decomposition (void)
|
||||||
|
{
|
||||||
|
gunichar *decomp;
|
||||||
|
gsize len;
|
||||||
|
|
||||||
|
#define TEST_DECOMP(ch, expected_len, a, b, c, d) \
|
||||||
|
decomp = g_unicode_canonical_decomposition (ch, &len); \
|
||||||
|
g_assert_cmpint (expected_len, ==, len); \
|
||||||
|
if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
|
||||||
|
if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
|
||||||
|
if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
|
||||||
|
if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
|
||||||
|
g_free (decomp);
|
||||||
|
|
||||||
|
#define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0)
|
||||||
|
#define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0)
|
||||||
|
#define TEST2(ch, a, b) TEST_DECOMP (ch, 2, a, b, 0, 0)
|
||||||
|
#define TEST3(ch, a, b, c) TEST_DECOMP (ch, 3, a, b, c, 0)
|
||||||
|
#define TEST4(ch, a, b, c, d) TEST_DECOMP (ch, 4, a, b, c, d)
|
||||||
|
|
||||||
|
/* Not decomposable */
|
||||||
|
TEST0 (0x0041);
|
||||||
|
TEST0 (0xFB01);
|
||||||
|
|
||||||
|
/* Singletons */
|
||||||
|
TEST2 (0x212B, 0x0041, 0x030A);
|
||||||
|
TEST1 (0x2126, 0x03A9);
|
||||||
|
|
||||||
|
/* General */
|
||||||
|
TEST2 (0x00C5, 0x0041, 0x030A);
|
||||||
|
TEST2 (0x00F4, 0x006F, 0x0302);
|
||||||
|
TEST3 (0x1E69, 0x0073, 0x0323, 0x0307);
|
||||||
|
TEST2 (0x1E63, 0x0073, 0x0323);
|
||||||
|
TEST2 (0x1E0B, 0x0064, 0x0307);
|
||||||
|
TEST2 (0x1E0D, 0x0064, 0x0323);
|
||||||
|
|
||||||
|
/* Hangul */
|
||||||
|
TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6);
|
||||||
|
TEST2 (0xD4CC, 0x1111, 0x1171);
|
||||||
|
TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
|
||||||
|
TEST2 (0xCE20, 0x110E, 0x1173);
|
||||||
|
|
||||||
|
#undef TEST_DECOMP
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -540,6 +587,21 @@ test_decompose_tail (void)
|
|||||||
g_assert (a == ch && b == 0);
|
g_assert (a == ch && b == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_fully_decompose_len (void)
|
||||||
|
{
|
||||||
|
gunichar ch;
|
||||||
|
|
||||||
|
/* Test that all canonical decompositions are at most 4 in length,
|
||||||
|
* and compatibility decompositions are at most 18 in length.
|
||||||
|
*/
|
||||||
|
|
||||||
|
for (ch = 0; ch < 0x110000; ch++) {
|
||||||
|
g_assert_cmpint (g_unichar_fully_decompose (ch, FALSE, NULL, 0), <=, 4);
|
||||||
|
g_assert_cmpint (g_unichar_fully_decompose (ch, TRUE, NULL, 0), <=, 18);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main (int argc,
|
main (int argc,
|
||||||
char *argv[])
|
char *argv[])
|
||||||
@ -557,8 +619,10 @@ main (int argc,
|
|||||||
g_test_add_func ("/unicode/wide", test_wide);
|
g_test_add_func ("/unicode/wide", test_wide);
|
||||||
g_test_add_func ("/unicode/compose", test_compose);
|
g_test_add_func ("/unicode/compose", test_compose);
|
||||||
g_test_add_func ("/unicode/decompose", test_decompose);
|
g_test_add_func ("/unicode/decompose", test_decompose);
|
||||||
|
g_test_add_func ("/unicode/fully-decompose-canonical", test_fully_decompose_canonical);
|
||||||
g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition);
|
g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition);
|
||||||
g_test_add_func ("/unicode/decompose-tail", test_decompose_tail);
|
g_test_add_func ("/unicode/decompose-tail", test_decompose_tail);
|
||||||
|
g_test_add_func ("/unicode/fully-decompose-len", test_fully_decompose_len);
|
||||||
|
|
||||||
return g_test_run();
|
return g_test_run();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user