From 0584fe33de86f9fd8effe006d911cf5948b4ad05 Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod <behdad@behdad.org>
Date: Mon, 18 Jul 2011 17:52:40 -0400
Subject: [PATCH] Bug 654651 - Better g_unicode_canonical_decomposition()

Add g_unichar_fully_decompose().
Deprecate g_unicode_canonical_decomposition().
---
 docs/reference/glib/glib-sections.txt |  1 +
 glib/glib.symbols                     |  3 +-
 glib/gunicode.h                       |  9 ++--
 glib/gunidecomp.c                     | 78 +++++++++++++++++++++++++--
 glib/tests/unicode.c                  | 72 +++++++++++++++++++++++--
 5 files changed, 150 insertions(+), 13 deletions(-)

diff --git a/docs/reference/glib/glib-sections.txt b/docs/reference/glib/glib-sections.txt
index 627bf67c5..3a0160da6 100644
--- a/docs/reference/glib/glib-sections.txt
+++ b/docs/reference/glib/glib-sections.txt
@@ -2704,6 +2704,7 @@ g_unichar_digit_value
 g_unichar_xdigit_value
 g_unichar_compose
 g_unichar_decompose
+g_unichar_fully_decompose
 GUnicodeType
 g_unichar_type
 GUnicodeBreakType
diff --git a/glib/glib.symbols b/glib/glib.symbols
index fe6babaca..df978ec41 100644
--- a/glib/glib.symbols
+++ b/glib/glib.symbols
@@ -1196,7 +1196,6 @@ g_tree_search
 g_tree_steal
 g_tree_traverse
 g_unichar_break_type
-g_unicode_canonical_ordering
 g_unichar_combining_class
 g_unichar_compose
 g_unichar_decompose
@@ -1216,6 +1215,7 @@ g_unichar_iswide
 g_unichar_iswide_cjk
 g_unichar_isxdigit
 g_unichar_iszerowidth
+g_unichar_fully_decompose
 g_unichar_tolower
 g_unichar_totitle
 g_unichar_toupper
@@ -1226,6 +1226,7 @@ g_unichar_digit_value
 g_unichar_xdigit_value
 g_unichar_type
 g_unicode_canonical_decomposition
+g_unicode_canonical_ordering
 g_utf8_casefold
 g_utf8_collate
 g_utf8_collate_key
diff --git a/glib/gunicode.h b/glib/gunicode.h
index ca0569e6e..6c08e7d29 100644
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -295,6 +295,11 @@ gboolean g_unichar_decompose (gunichar  ch,
 			      gunichar *a,
 			      gunichar *b);
 
+gsize g_unichar_fully_decompose (gunichar  ch,
+				 gboolean  compat,
+				 gunichar *result,
+				 gsize     result_len);
+
 /* Compute canonical ordering of a string in-place.  This rearranges
    decomposed characters in the string according to their combining
    classes.  See the Unicode manual for more information.  */
@@ -302,9 +307,7 @@ void g_unicode_canonical_ordering (gunichar *string,
 				   gsize     len);
 
 
-/* Compute canonical decomposition of a character.  Returns g_malloc()d
-   string of Unicode characters.  RESULT_LEN is set to the resulting
-   length of the string.  */
+/* Deprecated since 2.30: use g_unichar_fully_decompose() instead.  */
 gunichar *g_unicode_canonical_decomposition (gunichar  ch,
 					     gsize    *result_len) G_GNUC_MALLOC;
 
diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c
index d9ef8f0ba..d1f98975b 100644
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -127,7 +127,7 @@ g_unicode_canonical_ordering (gunichar *string,
  * only calculate the result_len; however, a buffer with space for three
  * characters will always be big enough. */
 static void
-decompose_hangul (gunichar s, 
+decompose_hangul (gunichar s,
                   gunichar *r,
                   gsize *result_len)
 {
@@ -217,6 +217,9 @@ find_decomposition (gunichar ch,
  * 
  * Return value: a newly allocated string of Unicode characters.
  *   @result_len is set to the resulting length of the string.
+ *
+ * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose()
+ *   instead.
  **/
 gunichar *
 g_unicode_canonical_decomposition (gunichar ch,
@@ -227,7 +230,7 @@ g_unicode_canonical_decomposition (gunichar ch,
   gunichar *r;
 
   /* Hangul syllable */
-  if (ch >= 0xac00 && ch <= 0xd7a3)
+  if (ch >= SBase && ch < SBase + SCount)
     {
       decompose_hangul (ch, NULL, result_len);
       r = g_malloc (*result_len * sizeof (gunichar));
@@ -363,7 +366,7 @@ _g_utf8_normalize_wc (const gchar    *str,
       const gchar *decomp;
       gunichar wc = g_utf8_get_char (p);
 
-      if (wc >= 0xac00 && wc <= 0xd7a3)
+      if (wc >= SBase && wc < SBase + SCount)
         {
           gsize result_len;
           decompose_hangul (wc, NULL, &result_len);
@@ -394,7 +397,7 @@ _g_utf8_normalize_wc (const gchar    *str,
       int cc;
       gsize old_n_wc = n_wc;
 	  
-      if (wc >= 0xac00 && wc <= 0xd7a3)
+      if (wc >= SBase && wc < SBase + SCount)
         {
           gsize result_len;
           decompose_hangul (wc, wc_buffer + n_wc, &result_len);
@@ -592,7 +595,7 @@ decompose_hangul_step (gunichar  ch,
  * further, but @a may itself decompose.  To get the full
  * canonical decomposition for @ch, one would need to
  * recursively call this function on @a.  Or use
- * g_unicode_canonical_decomposition().
+ * g_unichar_fully_decompose().
  *
  * See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
  * for details.
@@ -678,3 +681,68 @@ g_unichar_compose (gunichar  a,
   *ch = 0;
   return FALSE;
 }
+
+/**
+ * g_unichar_fully_decompose:
+ * @ch: a Unicode character.
+ * @compat: whether to perform canonical or compatibility decomposition
+ * @result: location to store decomposed result, or %NULL
+ * @result_len: length of @result
+ *
+ * Computes the canonical or compatibility decomposition of a
+ * Unicode character.  For compatibility decomposition,
+ * pass %TRUE for @compat; for canonical decomposition
+ * pass %FALSE for @compat.
+ *
+ * The decomposed sequence is placed in @result.  Only up to
+ * @result_len characters are written into @result; if @result is
+ * %NULL nothing is written.  The length of the full decomposition
+ * (irrespective of @result_len) is returned by the function.  For
+ * canonical decomposition, a result buffer of length 4 is always
+ * enough, whereas for compatibility decomposition, 18 is enough.
+ *
+ * See <ulink url="http://unicode.org/reports/tr15/">UAX#15</ulink>
+ * for details.
+ *
+ * Return value: the length of the full decomposition.
+ *
+ * Since: 2.30
+ **/
+gsize
+g_unichar_fully_decompose (gunichar  ch,
+			   gboolean  compat,
+			   gunichar *result,
+			   gsize     result_len)
+{
+  const gchar *decomp;
+  const gchar *p;
+
+  /* Hangul syllable: decompose_hangul() needs at most three characters */
+  if (ch >= SBase && ch < SBase + SCount)
+    {
+      gsize len, i;
+      gunichar buffer[3];
+      decompose_hangul (ch, result ? buffer : NULL, &len);
+      if (result)
+        for (i = 0; i < len && i < result_len; i++)
+	  result[i] = buffer[i];
+      return len;
+    }
+  else if ((decomp = find_decomposition (ch, compat)) != NULL)
+    {
+      /* Table hit: decomp is a UTF-8 string; skip the copy if result is NULL. */
+      gsize len, i;
+
+      len = g_utf8_strlen (decomp, -1);
+      if (result)
+        for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++)
+          result[i] = g_utf8_get_char (p);
+
+      return len;
+    }
+
+  /* Does not decompose */
+  if (result && result_len >= 1)
+    *result = ch;
+  return 1;
+}
diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c
index 8a43388bc..91f224b3c 100644
--- a/glib/tests/unicode.c
+++ b/glib/tests/unicode.c
@@ -482,19 +482,18 @@ test_decompose (void)
 }
 
 static void
-test_canonical_decomposition (void)
+test_fully_decompose_canonical (void)
 {
-  gunichar *decomp;
+  gunichar decomp[5];
   gsize len;
 
 #define TEST_DECOMP(ch, expected_len, a, b, c, d) \
-  decomp = g_unicode_canonical_decomposition (ch, &len); \
+  len = g_unichar_fully_decompose (ch, FALSE, decomp, G_N_ELEMENTS (decomp)); \
   g_assert_cmpint (expected_len, ==, len); \
   if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
   if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
   if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
   if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
-  g_free (d);
 
 #define TEST0(ch)		TEST_DECOMP (ch, 1, ch, 0, 0, 0)
 #define TEST1(ch, a)		TEST_DECOMP (ch, 1, a, 0, 0, 0)
@@ -523,6 +522,54 @@ test_canonical_decomposition (void)
   TEST2 (0xD4CC, 0x1111, 0x1171);
   TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
   TEST2 (0xCE20, 0x110E, 0x1173);
+
+#undef TEST_DECOMP
+}
+
+static void
+test_canonical_decomposition (void)
+{
+  gunichar *decomp;
+  gsize len;
+  /* Covers the deprecated allocating API; every result is g_free()d.  */
+#define TEST_DECOMP(ch, expected_len, a, b, c, d) \
+  decomp = g_unicode_canonical_decomposition (ch, &len); \
+  g_assert_cmpint (expected_len, ==, len); \
+  if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
+  if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
+  if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
+  if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
+  g_free (decomp);
+
+#define TEST0(ch)		TEST_DECOMP (ch, 1, ch, 0, 0, 0)
+#define TEST1(ch, a)		TEST_DECOMP (ch, 1, a, 0, 0, 0)
+#define TEST2(ch, a, b)		TEST_DECOMP (ch, 2, a, b, 0, 0)
+#define TEST3(ch, a, b, c)	TEST_DECOMP (ch, 3, a, b, c, 0)
+#define TEST4(ch, a, b, c, d)	TEST_DECOMP (ch, 4, a, b, c, d)
+
+  /* Not decomposable: the input itself is expected back */
+  TEST0 (0x0041);
+  TEST0 (0xFB01);
+
+  /* Singletons */
+  TEST2 (0x212B, 0x0041, 0x030A);
+  TEST1 (0x2126, 0x03A9);
+
+  /* General */
+  TEST2 (0x00C5, 0x0041, 0x030A);
+  TEST2 (0x00F4, 0x006F, 0x0302);
+  TEST3 (0x1E69, 0x0073, 0x0323, 0x0307);
+  TEST2 (0x1E63, 0x0073, 0x0323);
+  TEST2 (0x1E0B, 0x0064, 0x0307);
+  TEST2 (0x1E0D, 0x0064, 0x0323);
+
+  /* Hangul syllables: two or three characters expected */
+  TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6);
+  TEST2 (0xD4CC, 0x1111, 0x1171);
+  TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
+  TEST2 (0xCE20, 0x110E, 0x1173);
+
+#undef TEST_DECOMP
 }
 
 static void
@@ -540,6 +587,21 @@ test_decompose_tail (void)
       g_assert (a == ch && b == 0);
 }
 
+static void
+test_fully_decompose_len (void)
+{
+  gunichar ch;
+
+  /* Sweep every code point with a NULL buffer, so only the length is
+   * computed, and check the bounds documented for the function: at most
+   * 4 characters for canonical and 18 for compatibility decomposition.
+   */
+  for (ch = 0; ch < 0x110000; ch++) {
+    g_assert_cmpint (g_unichar_fully_decompose (ch, FALSE, NULL, 0), <=, 4);
+    g_assert_cmpint (g_unichar_fully_decompose (ch, TRUE,  NULL, 0), <=, 18);
+  }
+}
+
 int
 main (int   argc,
       char *argv[])
@@ -557,8 +619,10 @@ main (int   argc,
   g_test_add_func ("/unicode/wide", test_wide);
   g_test_add_func ("/unicode/compose", test_compose);
   g_test_add_func ("/unicode/decompose", test_decompose);
+  g_test_add_func ("/unicode/fully-decompose-canonical", test_fully_decompose_canonical);
   g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition);
   g_test_add_func ("/unicode/decompose-tail", test_decompose_tail);
+  g_test_add_func ("/unicode/fully-decompose-len", test_fully_decompose_len);
 
   return g_test_run();
 }