Add tests for Unicode canonical composition/decomposition

Also update compose()/decompose() API corner cases and docs.
This commit is contained in:
Behdad Esfahbod 2011-07-14 16:18:30 -04:00
parent 7ad2f0eb80
commit a73c66b134
2 changed files with 128 additions and 4 deletions

View File

@ -616,13 +616,20 @@ compose_hangul_step (gunichar a,
* @b: return location for the second component of @ch * @b: return location for the second component of @ch
* *
* Performs a single decomposition step of the * Performs a single decomposition step of the
* Unicode character normalization algorithm. * Unicode canonical decomposition algorithm.
* *
* This function does not include compatibility * This function does not include compatibility
* decompositions. It does, however, include algorithmic * decompositions. It does, however, include algorithmic
* Hangul Jamo decomposition, as well as 'singleton' * Hangul Jamo decomposition, as well as 'singleton'
* decompositions which replace a character by a single * decompositions which replace a character by a single
* other character. In this case, *@b will be set to zero. * other character. In the case of singletons *@b will
* be set to zero.
*
* If @ch is not decomposable, *@a is set to @ch and *@b
* is set to zero.
*
* See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
* for details.
* *
* Returns: %TRUE if the character could be decomposed * Returns: %TRUE if the character could be decomposed
* *
@ -661,6 +668,9 @@ g_unichar_decompose (gunichar ch,
} }
} }
*a = ch;
*b = 0;
return FALSE; return FALSE;
} }
@ -671,7 +681,7 @@ g_unichar_decompose (gunichar ch,
* @ch: return location for the composed character * @ch: return location for the composed character
* *
* Performs a single composition step of the * Performs a single composition step of the
* Unicode character normalization algorithm. * Unicode canonical composition algorithm.
* *
* This function does not perform algorithmic composition * This function does not perform algorithmic composition
* for Hangul characters, and does not include compatibility * for Hangul characters, and does not include compatibility
@ -679,6 +689,13 @@ g_unichar_decompose (gunichar ch,
* compositions which replace a character by a single * compositions which replace a character by a single
* other character. To obtain these, pass zero for @b. * other character. To obtain these, pass zero for @b.
* *
* This function includes algorithmic Hangul Jamo composition.
*
* If @a and @b do not compose a new character, @ch is set to zero.
*
* See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
* for details.
*
* Returns: %TRUE if the characters could be composed * Returns: %TRUE if the characters could be composed
* *
* Since: 2.30 * Since: 2.30
@ -715,5 +732,7 @@ g_unichar_compose (gunichar a,
} }
} }
*ch = 0;
return FALSE; return FALSE;
} }

View File

@ -436,7 +436,6 @@ test_title (void)
g_assert (g_unichar_istitle (0x01c5)); g_assert (g_unichar_istitle (0x01c5));
g_assert (g_unichar_istitle (0x1f88)); g_assert (g_unichar_istitle (0x1f88));
g_assert (g_unichar_istitle (0x1fcc)); g_assert (g_unichar_istitle (0x1fcc));
g_assert (!g_unichar_ismark ('a'));
g_assert (g_unichar_totitle (0x01c6) == 0x01c5); g_assert (g_unichar_totitle (0x01c6) == 0x01c5);
g_assert (g_unichar_totitle (0x01c4) == 0x01c5); g_assert (g_unichar_totitle (0x01c4) == 0x01c5);
@ -529,6 +528,109 @@ test_wide (void)
} }
}; };
/*
 * Checks single-step canonical composition via g_unichar_compose():
 * pairs that must not compose (leaving the output at zero), ordinary
 * base + combining-mark pairs, and algorithmic Hangul composition.
 */
static void
test_compose (void)
{
  gunichar ch;

/* Each helper expands to exactly one g_assert(), one case per line. */
#define ASSERT_NO_COMPOSE(first, second) \
  g_assert (!g_unichar_compose ((first), (second), &ch) && ch == 0)
#define ASSERT_COMPOSES(first, second, composed) \
  g_assert (g_unichar_compose ((first), (second), &ch) && ch == (composed))

  /* Not composable: the call must fail and zero the output. */
  ASSERT_NO_COMPOSE (0x0041, 0x0042);
  ASSERT_NO_COMPOSE (0x0041, 0);
  ASSERT_NO_COMPOSE (0x0066, 0x0069);

  /* Singletons should not compose, even with a zero second argument. */
  ASSERT_NO_COMPOSE (0x212B, 0);
  ASSERT_NO_COMPOSE (0x00C5, 0);
  ASSERT_NO_COMPOSE (0x2126, 0);
  ASSERT_NO_COMPOSE (0x03A9, 0);

  /* Pairs: a base character followed by a combining mark. */
  ASSERT_COMPOSES (0x0041, 0x030A, 0x00C5);
  ASSERT_COMPOSES (0x006F, 0x0302, 0x00F4);
  ASSERT_COMPOSES (0x1E63, 0x0307, 0x1E69);
  ASSERT_COMPOSES (0x0073, 0x0323, 0x1E63);
  ASSERT_COMPOSES (0x0064, 0x0307, 0x1E0B);
  ASSERT_COMPOSES (0x0064, 0x0323, 0x1E0D);

  /* Hangul: both syllable + trailing jamo and leading + vowel jamo. */
  ASSERT_COMPOSES (0xD4CC, 0x11B6, 0xD4DB);
  ASSERT_COMPOSES (0x1111, 0x1171, 0xD4CC);
  ASSERT_COMPOSES (0xCE20, 0x11B8, 0xCE31);
  ASSERT_COMPOSES (0x110E, 0x1173, 0xCE20);

#undef ASSERT_COMPOSES
#undef ASSERT_NO_COMPOSE
}
/*
 * Checks single-step canonical decomposition via g_unichar_decompose():
 * characters that do not decompose (returned unchanged with a zero second
 * component), singletons (one replacement character, zero second
 * component), ordinary pairs, and algorithmic Hangul decomposition.
 */
static void
test_decompose (void)
{
  gunichar a, b;

/* Each helper expands to exactly one g_assert(), one case per line. */
#define ASSERT_NOT_DECOMPOSABLE(ch) \
  g_assert (!g_unichar_decompose ((ch), &a, &b) && a == (ch) && b == 0)
#define ASSERT_DECOMPOSES(ch, first, second) \
  g_assert (g_unichar_decompose ((ch), &a, &b) && a == (first) && b == (second))

  /* Not decomposable: input is echoed in a, and b is zero. */
  ASSERT_NOT_DECOMPOSABLE (0x0041);
  ASSERT_NOT_DECOMPOSABLE (0xFB01);

  /* Singletons: a single replacement character, b stays zero. */
  ASSERT_DECOMPOSES (0x212B, 0x00C5, 0);
  ASSERT_DECOMPOSES (0x2126, 0x03A9, 0);

  /* Pairs: only one step is taken, so a may itself be decomposable. */
  ASSERT_DECOMPOSES (0x00C5, 0x0041, 0x030A);
  ASSERT_DECOMPOSES (0x00F4, 0x006F, 0x0302);
  ASSERT_DECOMPOSES (0x1E69, 0x1E63, 0x0307);
  ASSERT_DECOMPOSES (0x1E63, 0x0073, 0x0323);
  ASSERT_DECOMPOSES (0x1E0B, 0x0064, 0x0307);
  ASSERT_DECOMPOSES (0x1E0D, 0x0064, 0x0323);

  /* Hangul: one step splits off the trailing jamo or the vowel. */
  ASSERT_DECOMPOSES (0xD4DB, 0xD4CC, 0x11B6);
  ASSERT_DECOMPOSES (0xD4CC, 0x1111, 0x1171);
  ASSERT_DECOMPOSES (0xCE31, 0xCE20, 0x11B8);
  ASSERT_DECOMPOSES (0xCE20, 0x110E, 0x1173);

#undef ASSERT_DECOMPOSES
#undef ASSERT_NOT_DECOMPOSABLE
}
/*
 * Checks full canonical decomposition via
 * g_unicode_canonical_decomposition(): for each input character the
 * returned length and every returned code point are compared against the
 * expected sequence.
 */
static void
test_canonical_decomposition (void)
{
  gunichar *decomp;
  gsize len;

/*
 * Decomposes ch, checks the result length against expected_len and the
 * first expected_len entries against a, b, c, d, then releases the
 * buffer.  The buffer itself (decomp) must be freed: d is only the
 * fourth expected code point, so freeing it would leak every result.
 * Wrapped in do/while so the macro behaves as a single statement.
 */
#define TEST_DECOMP(ch, expected_len, a, b, c, d) \
  do \
    { \
      decomp = g_unicode_canonical_decomposition ((ch), &len); \
      g_assert_cmpint ((expected_len), ==, len); \
      if ((expected_len) >= 1) \
        { \
          g_assert_cmphex (decomp[0], ==, (a)); \
        } \
      if ((expected_len) >= 2) \
        { \
          g_assert_cmphex (decomp[1], ==, (b)); \
        } \
      if ((expected_len) >= 3) \
        { \
          g_assert_cmphex (decomp[2], ==, (c)); \
        } \
      if ((expected_len) >= 4) \
        { \
          g_assert_cmphex (decomp[3], ==, (d)); \
        } \
      g_free (decomp); \
    } \
  while (0)

/* TEST0: ch decomposes to itself; TESTn: ch decomposes to n code points. */
#define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0)
#define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0)
#define TEST2(ch, a, b) TEST_DECOMP (ch, 2, a, b, 0, 0)
#define TEST3(ch, a, b, c) TEST_DECOMP (ch, 3, a, b, c, 0)
#define TEST4(ch, a, b, c, d) TEST_DECOMP (ch, 4, a, b, c, d)

  /* Not decomposable */
  TEST0 (0x0041);
  TEST0 (0xFB01);

  /* Singletons (0x212B expands to two code points when fully decomposed) */
  TEST2 (0x212B, 0x0041, 0x030A);
  TEST1 (0x2126, 0x03A9);

  /* General */
  TEST2 (0x00C5, 0x0041, 0x030A);
  TEST2 (0x00F4, 0x006F, 0x0302);
  TEST3 (0x1E69, 0x0073, 0x0323, 0x0307);
  TEST2 (0x1E63, 0x0073, 0x0323);
  TEST2 (0x1E0B, 0x0064, 0x0307);
  TEST2 (0x1E0D, 0x0064, 0x0323);

  /* Hangul */
  TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6);
  TEST2 (0xD4CC, 0x1111, 0x1171);
  TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
  TEST2 (0xCE20, 0x110E, 0x1173);

/* Keep the helper macros local to this test. */
#undef TEST4
#undef TEST3
#undef TEST2
#undef TEST1
#undef TEST0
#undef TEST_DECOMP
}
int int
main (int argc, main (int argc,
char *argv[]) char *argv[])
@ -549,6 +651,9 @@ main (int argc,
g_test_add_func ("/unicode/mark", test_mark); g_test_add_func ("/unicode/mark", test_mark);
g_test_add_func ("/unicode/title", test_title); g_test_add_func ("/unicode/title", test_title);
g_test_add_func ("/unicode/wide", test_wide); g_test_add_func ("/unicode/wide", test_wide);
g_test_add_func ("/unicode/compose", test_compose);
g_test_add_func ("/unicode/decompose", test_decompose);
g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition);
return g_test_run(); return g_test_run();
} }