Bug 654195 - Add g_unichar_compose() and g_unichar_decompose()

2025-09-28 01:57:14 +02:00 · 2011-07-13 22:07:11 -04:00
parent 8c7de592ff
commit 761a1841ee
6 changed files with 3321 additions and 1 deletions
--- a/docs/reference/glib/glib-sections.txt
+++ b/docs/reference/glib/glib-sections.txt
@@ -2702,6 +2702,8 @@ g_unichar_tolower
 g_unichar_totitle
 g_unichar_digit_value
 g_unichar_xdigit_value
 g_unichar_compose
 g_unichar_decompose
 GUnicodeType
 g_unichar_type
 GUnicodeBreakType
--- a/glib/gen-unicode-tables.pl
+++ b/glib/gen-unicode-tables.pl
@@ -870,6 +870,46 @@ sub print_decomp
    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
    # Emit the C struct shared by both step tables: one code point (ch)
    # together with the two components (a, b) of its decomposition.
    print OUT "typedef struct\n{\n";
    print OUT "  gunichar ch;\n";
    print OUT "  gunichar a;\n";
    print OUT "  gunichar b;\n";
    print OUT "} decomposition_step;\n\n";
    # decomp_step_table: one row per code point that has a decomposition
    # and is not flagged in @decompose_compat, written in ascending
    # code point order because the loop walks 0 .. $last.
    print OUT "static const decomposition_step decomp_step_table[] =\n{\n";
    $first = 1;
    # Rows that have a non-zero second component are collected here and
    # reused below to build the inverse (composition) table.
    my @steps = ();
    for ($count = 0; $count <= $last; ++$count)
    {
        if ((defined $decompositions[$count]) && (!$decompose_compat[$count]))
        {
            # Separate rows with ",\n" so the last row has no trailing comma.
            print OUT ",\n"
                if ! $first;
            $first = 0;
            my @list;
            # The extra "0" guarantees $list[1] exists: a decomposition
            # with a single element gets b = 0.
            @list = (split(' ', $decompositions[$count]), "0");
            printf OUT qq(  { 0x%05x, 0x%05x, 0x%05x }), $count, hex($list[0]), hex($list[1]);
            # Keep 1:1 (single-element) decompositions out of the compose
            # table: only rows whose second component is non-zero are stored.
            push @steps, [ ($count, hex($list[0]), hex($list[1])) ]
                if hex($list[1])
        }
    }
    print OUT "\n};\n\n";
    # comp_step_table: the collected rows, re-sorted numerically by first
    # component (a) and then by second component (b).
    print OUT "static const decomposition_step comp_step_table[] =\n{\n";
    my @inverted;
    @inverted = sort {  @{$a}[1] <=> @{$b}[1] ||
                        @{$a}[2] <=> @{$b}[2] } @steps;
    $first = 1;
    foreach my $i ( 0 .. $#inverted )
    {
        print OUT ",\n"
            if ! $first;
        $first = 0;
        # Each row is still written as { ch, a, b }; only the order differs.
        printf OUT qq(  { 0x%05x, 0x%05x, 0x%05x }), $inverted[$i][0], $inverted[$i][1], $inverted[$i][2];
    }
    print OUT "\n};\n\n";
    print OUT "#endif /* DECOMP_H */\n";
    # NOTE(review): nothing in the lines above adds to $bytes_out, so the
    # two step tables are presumably not included in this figure -- confirm.
    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
--- a/glib/glib.symbols
+++ b/glib/glib.symbols
@@ -1198,6 +1198,8 @@ g_tree_traverse
 g_unichar_break_type
 g_unicode_canonical_ordering
 g_unichar_combining_class
 g_unichar_compose
 g_unichar_decompose
 g_unichar_isalnum
 g_unichar_isalpha
 g_unichar_iscntrl
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -280,12 +280,21 @@ GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
 gint g_unichar_combining_class (gunichar uc) G_GNUC_CONST;
 /* Pairwise canonical compose/decompose.
    g_unichar_compose() stores in *ch the character composed from the
    pair (a, b); g_unichar_decompose() stores in *a and *b the result of
    one decomposition step of ch.  Both return TRUE on success and FALSE
    when no composition or decomposition applies.  */
 gboolean g_unichar_compose (gunichar  a,
 			    gunichar  b,
 			    gunichar *ch);
 gboolean g_unichar_decompose (gunichar  ch,
 			      gunichar *a,
 			      gunichar *b);
 /* Compute canonical ordering of a string in-place.  This rearranges
   decomposed characters in the string according to their combining
   classes.  See the Unicode manual for more information.  */
 void g_unicode_canonical_ordering (gunichar *string,
 				   gsize     len);
 /* Compute canonical decomposition of a character.  Returns g_malloc()d
   string of Unicode characters.  RESULT_LEN is set to the resulting
   length of the string.  */
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -152,7 +152,7 @@ decompose_hangul (gunichar s,
          r[1] = V;
        }
-      if (T != TBase) 
+      if (T != TBase)
        {
          if (r)
            r[2] = T;
@@ -530,3 +530,190 @@ g_utf8_normalize (const gchar    *str,
  return result;
 }
 /* Split a precomposed Hangul syllable by exactly one step.
  *
  * An LVT syllable is split into its LV syllable and the trailing
  * consonant T; an LV syllable is split into its leading consonant L
  * and vowel V.  The two parts are stored in *a and *b.
  *
  * Returns TRUE if ch is a Hangul syllable; otherwise returns FALSE
  * and leaves *a and *b untouched.
  */
 static gboolean
 decompose_hangul_step (gunichar  ch,
                        gunichar *a,
                        gunichar *b)
 {
  gint syllable_index = ch - SBase;
  gint lead_index, vowel_index, trail_index;

  if (syllable_index < 0 || syllable_index >= SCount)
    return FALSE;  /* outside the Hangul syllable block */

  lead_index = syllable_index / NCount;
  vowel_index = (syllable_index % NCount) / TCount;
  trail_index = syllable_index % TCount;

  if (trail_index == 0)
    {
      /* no trailing consonant: LV -> L,V */
      *a = LBase + lead_index;
      *b = VBase + vowel_index;
      return TRUE;
    }

  /* trailing consonant present: LVT -> LV,T */
  *a = SBase + (lead_index * VCount + vowel_index) * TCount;
  *b = TBase + trail_index;
  return TRUE;
 }
 /* Combine two characters into a Hangul syllable by exactly one step.
  *
  * Two combinations are recognised: a leading consonant followed by a
  * vowel (L,V -> LV), and an LV syllable followed by a trailing
  * consonant (LV,T -> LVT).  On success the syllable is stored in *ch.
  *
  * Returns TRUE if the pair was composed; otherwise returns FALSE and
  * leaves *ch untouched.
  */
 static gboolean
 compose_hangul_step (gunichar a,
                      gunichar b,
                      gunichar *ch)
 {
  gint lead_index = a - LBase;
  gint vowel_index = b - VBase;
  gint syllable_index = a - SBase;
  gint trail_index = b - TBase;

  /* L,V -> LV */
  if (lead_index >= 0 && lead_index < LCount &&
      vowel_index >= 0 && vowel_index < VCount)
    {
      *ch = SBase + (lead_index * VCount + vowel_index) * TCount;
      return TRUE;
    }

  /* LV,T -> LVT: a must be a syllable without a trailing consonant,
   * and b must be a real trailing consonant (index 0 means "none"). */
  if (syllable_index >= 0 && syllable_index < SCount &&
      (syllable_index % TCount) == 0 &&
      trail_index > 0 && trail_index < TCount)
    {
      *ch = a + trail_index;
      return TRUE;
    }

  return FALSE;
 }
 /**
 * g_unichar_decompose:
 * @ch: a Unicode character
 * @a: return location for the first component of @ch
 * @b: return location for the second component of @ch
 *
 * Performs a single decomposition step of the
 * Unicode character normalization algorithm.
 *
 * This function does not include compatibility
 * decompositions. It does, however, include algorithmic
 * Hangul Jamo decomposition, as well as 'singleton'
 * decompositions which replace a character by a single
 * other character. In the case of singletons, *@b will
 * be set to zero.
 *
 * If @ch is not decomposable, *@a is set to @ch and *@b
 * is set to zero, so the return locations always hold
 * well-defined values.
 *
 * Returns: %TRUE if the character could be decomposed
 *
 * Since: 2.30
 */
 gboolean
 g_unichar_decompose (gunichar  ch,
                      gunichar *a,
                      gunichar *b)
 {
  gint start = 0;
  gint end = G_N_ELEMENTS (decomp_step_table);

  /* Hangul syllables are decomposed arithmetically, not via the table. */
  if (decompose_hangul_step (ch, a, b))
    return TRUE;

  /* Cheap rejection of characters outside the range the table covers. */
  if (ch >= decomp_step_table[start].ch &&
      ch <= decomp_step_table[end - 1].ch)
    {
      /* Binary search over the half-open range [start, end);
       * the table is sorted by ch. */
      while (start < end)
        {
          gint half = start + (end - start) / 2;
          const decomposition_step *p = &(decomp_step_table[half]);

          if (ch == p->ch)
            {
              *a = p->a;
              *b = p->b;
              return TRUE;
            }
          else if (ch > p->ch)
            start = half + 1;
          else
            end = half;
        }
    }

  /* Not decomposable: report the character itself, with no second part,
   * rather than leaving the return locations unwritten. */
  *a = ch;
  *b = 0;

  return FALSE;
 }
 /**
 * g_unichar_compose:
 * @a: a Unicode character
 * @b: a Unicode character
 * @ch: return location for the composed character
 *
 * Performs a single composition step of the
 * Unicode character normalization algorithm.
 *
 * This function includes algorithmic Hangul Jamo composition
 * (L,V to LV and LV,T to LVT), but it does not include
 * compatibility compositions. It is not the exact inverse of
 * g_unichar_decompose(): 'singleton' decompositions, which
 * replace a character by a single other character, have no
 * entry in the composition table, so no composition is found
 * when @b is zero.
 *
 * If @a and @b do not compose, *@ch is set to zero.
 *
 * Returns: %TRUE if the characters could be composed
 *
 * Since: 2.30
 */
 gboolean
 g_unichar_compose (gunichar  a,
                    gunichar  b,
                    gunichar *ch)
 {
  gint start = 0;
  gint end = G_N_ELEMENTS (comp_step_table);

  /* Hangul syllables are composed arithmetically, not via the table. */
  if (compose_hangul_step (a, b, ch))
    return TRUE;

  /* Cheap rejection of first characters outside the table's range. */
  if (a >= comp_step_table[start].a &&
      a <= comp_step_table[end - 1].a)
    {
      /* Binary search over the half-open range [start, end);
       * the table is sorted by a, then by b. */
      while (start < end)
        {
          gint half = start + (end - start) / 2;
          const decomposition_step *p = &(comp_step_table[half]);

          if (a == p->a && b == p->b)
            {
              *ch = p->ch;
              return TRUE;
            }
          else if (a > p->a || (a == p->a && b > p->b))
            start = half + 1;
          else
            end = half;
        }
    }

  /* No composition exists: give the return location a defined value. */
  *ch = 0;

  return FALSE;
 }
--- a/glib/gunidecomp.h
+++ b/glib/gunidecomp.h