Bug 654195 - Add g_unichar_compose() and g_unichar_decompose()

2025-03-14 19:55:12 +01:00 · 2011-07-13 22:07:11 -04:00 · 2011-07-13 22:07:11 -04:00 · 761a1841ee
commit 761a1841ee
parent 8c7de592ff
6 changed files with 3321 additions and 1 deletions
--- a/docs/reference/glib/glib-sections.txt
+++ b/docs/reference/glib/glib-sections.txt
@ -2702,6 +2702,8 @@ g_unichar_tolower
 g_unichar_totitle
 g_unichar_digit_value
 g_unichar_xdigit_value
+g_unichar_compose
+g_unichar_decompose
 GUnicodeType
 g_unichar_type
 GUnicodeBreakType
--- a/glib/gen-unicode-tables.pl
+++ b/glib/gen-unicode-tables.pl
@ -870,6 +870,46 @@ sub print_decomp

    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;

+    print OUT "typedef struct\n{\n";
+    print OUT "  gunichar ch;\n";
+    print OUT "  gunichar a;\n";
+    print OUT "  gunichar b;\n";
+    print OUT "} decomposition_step;\n\n";
+
+    print OUT "static const decomposition_step decomp_step_table[] =\n{\n";
+    $first = 1;
+    my @steps = ();
+    for ($count = 0; $count <= $last; ++$count)
+    {
+        if ((defined $decompositions[$count]) && (!$decompose_compat[$count]))
+        {
+            print OUT ",\n"
+                if ! $first;
+            $first = 0;
+            my @list;
+            @list = (split(' ', $decompositions[$count]), "0");
+            printf OUT qq(  { 0x%05x, 0x%05x, 0x%05x }), $count, hex($list[0]), hex($list[1]);
+            # don't include 1:1 in the compose table
+            push @steps, [ ($count, hex($list[0]), hex($list[1])) ]
+                if hex($list[1])
+        }
+    }
+    print OUT "\n};\n\n";
+
+    print OUT "static const decomposition_step comp_step_table[] =\n{\n";
+    my @inverted;
+    @inverted = sort {  @{$a}[1] <=> @{$b}[1] ||
+                        @{$a}[2] <=> @{$b}[2] } @steps;
+    $first = 1;
+    foreach my $i ( 0 .. $#inverted )
+    {
+        print OUT ",\n"
+            if ! $first;
+        $first = 0;
+        printf OUT qq(  { 0x%05x, 0x%05x, 0x%05x }), $inverted[$i][0], $inverted[$i][1], $inverted[$i][2];
+    }
+    print OUT "\n};\n\n";
+
    print OUT "#endif /* DECOMP_H */\n";

    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
--- a/glib/glib.symbols
+++ b/glib/glib.symbols
@ -1198,6 +1198,8 @@ g_tree_traverse
 g_unichar_break_type
 g_unicode_canonical_ordering
 g_unichar_combining_class
+g_unichar_compose
+g_unichar_decompose
 g_unichar_isalnum
 g_unichar_isalpha
 g_unichar_iscntrl
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@ -280,12 +280,21 @@ GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
 gint g_unichar_combining_class (gunichar uc) G_GNUC_CONST;


+/* Pairwise canonical compose/decompose */
+gboolean g_unichar_compose (gunichar  a,
+			    gunichar  b,
+			    gunichar *ch);
+gboolean g_unichar_decompose (gunichar  ch,
+			      gunichar *a,
+			      gunichar *b);
+
 /* Compute canonical ordering of a string in-place.  This rearranges
   decomposed characters in the string according to their combining
   classes.  See the Unicode manual for more information.  */
 void g_unicode_canonical_ordering (gunichar *string,
 				   gsize     len);

+
 /* Compute canonical decomposition of a character.  Returns g_malloc()d
   string of Unicode characters.  RESULT_LEN is set to the resulting
   length of the string.  */
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@ -152,7 +152,7 @@ decompose_hangul (gunichar s,
          r[1] = V;
        }

-      if (T != TBase) 
+      if (T != TBase)
        {
          if (r)
            r[2] = T;
@ -530,3 +530,190 @@ g_utf8_normalize (const gchar    *str,

  return result;
 }
+
+/*
+ * decompose_hangul_step:
+ * @ch: character to split
+ * @a: receives the first half (LV syllable, or L)
+ * @b: receives the second half (T, or V)
+ *
+ * Performs one decomposition step on a Hangul syllable: LVT -> LV,T
+ * or LV -> L,V.  Returns FALSE, leaving *a and *b untouched, when ch
+ * is not one of the SCount code points starting at SBase.
+ */
+static gboolean
+decompose_hangul_step (gunichar  ch,
+                       gunichar *a,
+                       gunichar *b)
+{
+  gint s_index = ch - SBase;
+  gint l_index, v_index, t_index;
+
+  if (s_index < 0 || s_index >= SCount)
+    return FALSE;  /* not a hangul syllable */
+
+  l_index = s_index / NCount;
+  v_index = (s_index % NCount) / TCount;
+  t_index = s_index % TCount;
+
+  if (t_index == 0)
+    {
+      /* nothing in the T position: split LV -> L,V */
+      *a = LBase + l_index;
+      *b = VBase + v_index;
+      return TRUE;
+    }
+
+  /* split LVT -> LV,T; the LV syllable is rebuilt from l_index, v_index */
+  *a = SBase + (l_index * VCount + v_index) * TCount;
+  *b = TBase + t_index;
+
+  return TRUE;
+}
+
+/*
+ * compose_hangul_step:
+ * @a: first character of the pair
+ * @b: second character of the pair
+ * @ch: receives the composed syllable on success
+ *
+ * Performs one Hangul composition step: L,V -> LV or LV,T -> LVT.
+ * Returns FALSE, leaving *ch untouched, for any other pair.
+ */
+static gboolean
+compose_hangul_step (gunichar a,
+                     gunichar b,
+                     gunichar *ch)
+{
+  gint l_index = a - LBase;
+  gint v_index = b - VBase;
+  gint s_index = a - SBase;
+  gint t_index = b - TBase;
+
+  /* first try L,V -> LV */
+  if (l_index >= 0 && l_index < LCount &&
+      v_index >= 0 && v_index < VCount)
+    {
+      *ch = SBase + (l_index * VCount + v_index) * TCount;
+      return TRUE;
+    }
+
+  /* next try LV,T -> LVT; a must sit on an LV boundary, t_index 0 is out */
+  if (s_index >= 0 && s_index < SCount && s_index % TCount == 0 &&
+      t_index > 0 && t_index < TCount)
+    {
+      *ch = a + t_index;
+      return TRUE;
+    }
+
+  return FALSE;
+}
+
+/**
+ * g_unichar_decompose:
+ * @ch: a Unicode character
+ * @a: return location for the first component of @ch
+ * @b: return location for the second component of @ch
+ *
+ * Performs a single decomposition step of the Unicode character
+ * normalization algorithm.  Compatibility decompositions are not
+ * included; algorithmic Hangul decomposition and 'singleton'
+ * decompositions (one replacement character, *@b set to zero) are.
+ * If @ch cannot be decomposed, *@a is set to @ch and *@b to zero.
+ *
+ * Returns: %TRUE if the character could be decomposed
+ *
+ * Since: 2.30
+ */
+gboolean
+g_unichar_decompose (gunichar  ch,
+                     gunichar *a,
+                     gunichar *b)
+{
+  gint start = 0;
+  gint end = G_N_ELEMENTS (decomp_step_table);
+
+  if (decompose_hangul_step (ch, a, b))
+    return TRUE;
+
+  if (ch >= decomp_step_table[start].ch &&
+      ch <= decomp_step_table[end - 1].ch)
+    {
+      while (TRUE)
+        {
+          gint half = (start + end) / 2;  /* bisect [start, end) by ch */
+          const decomposition_step *p = &(decomp_step_table[half]);
+          if (ch == p->ch)
+            {
+              *a = p->a;
+              *b = p->b;
+              return TRUE;
+            }
+          else if (half == start)
+            break;
+          else if (ch > p->ch)
+            start = half;
+          else
+            end = half;
+        }
+    }
+
+  /* no decomposition: give the outputs defined values */
+  *a = ch;
+  *b = 0;
+  return FALSE;
+}
+
+/**
+ * g_unichar_compose:
+ * @a: a Unicode character
+ * @b: a Unicode character
+ * @ch: return location for the composed character
+ *
+ * Performs a single composition step of the Unicode character
+ * normalization algorithm.  Algorithmic Hangul composition is
+ * included; compatibility compositions and 'singleton' mappings
+ * (one character replaced by a single other character) are not, so
+ * this is not an exact inverse of g_unichar_decompose().
+ * If @a and @b do not compose, *@ch is set to zero.
+ *
+ * Returns: %TRUE if the characters could be composed
+ *
+ * Since: 2.30
+ */
+gboolean
+g_unichar_compose (gunichar  a,
+                   gunichar  b,
+                   gunichar *ch)
+{
+  gint start = 0;
+  gint end = G_N_ELEMENTS (comp_step_table);
+
+  if (compose_hangul_step (a, b, ch))
+    return TRUE;
+
+  if (a >= comp_step_table[start].a &&
+      a <= comp_step_table[end - 1].a)
+    {
+      while (TRUE)
+        {
+          gint half = (start + end) / 2;  /* bisect [start, end) on (a, b) */
+          const decomposition_step *p = &(comp_step_table[half]);
+          if (a == p->a && b == p->b)
+            {
+              *ch = p->ch;
+              return TRUE;
+            }
+          else if (half == start)
+            break;
+          else if (a > p->a || (a == p->a && b > p->b))
+            start = half;
+          else
+            end = half;
+        }
+    }
+
+  /* no composition: give the output a defined value */
+  *ch = 0;
+  return FALSE;
+}
--- a/glib/gunidecomp.h
+++ b/glib/gunidecomp.h