Bug 654195 - Add g_unichar_compose() and g_unichar_decompose()

This commit is contained in:
Matthias Clasen 2011-07-13 22:07:11 -04:00 committed by Behdad Esfahbod
parent 8c7de592ff
commit 761a1841ee
6 changed files with 3321 additions and 1 deletions

View File

@ -2702,6 +2702,8 @@ g_unichar_tolower
g_unichar_totitle
g_unichar_digit_value
g_unichar_xdigit_value
g_unichar_compose
g_unichar_decompose
GUnicodeType
g_unichar_type
GUnicodeBreakType

View File

@ -870,6 +870,46 @@ sub print_decomp
printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
print OUT "typedef struct\n{\n";
print OUT " gunichar ch;\n";
print OUT " gunichar a;\n";
print OUT " gunichar b;\n";
print OUT "} decomposition_step;\n\n";
print OUT "static const decomposition_step decomp_step_table[] =\n{\n";
$first = 1;
my @steps = ();
for ($count = 0; $count <= $last; ++$count)
{
if ((defined $decompositions[$count]) && (!$decompose_compat[$count]))
{
print OUT ",\n"
if ! $first;
$first = 0;
my @list;
@list = (split(' ', $decompositions[$count]), "0");
printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $count, hex($list[0]), hex($list[1]);
# don't include 1:1 in the compose table
push @steps, [ ($count, hex($list[0]), hex($list[1])) ]
if hex($list[1])
}
}
print OUT "\n};\n\n";
print OUT "static const decomposition_step comp_step_table[] =\n{\n";
my @inverted;
@inverted = sort { @{$a}[1] <=> @{$b}[1] ||
@{$a}[2] <=> @{$b}[2] } @steps;
$first = 1;
foreach my $i ( 0 .. $#inverted )
{
print OUT ",\n"
if ! $first;
$first = 0;
printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $inverted[$i][0], $inverted[$i][1], $inverted[$i][2];
}
print OUT "\n};\n\n";
print OUT "#endif /* DECOMP_H */\n";
printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;

View File

@ -1198,6 +1198,8 @@ g_tree_traverse
g_unichar_break_type
g_unicode_canonical_ordering
g_unichar_combining_class
g_unichar_compose
g_unichar_decompose
g_unichar_isalnum
g_unichar_isalpha
g_unichar_iscntrl

View File

@ -280,12 +280,21 @@ GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
gint g_unichar_combining_class (gunichar uc) G_GNUC_CONST;
/* Pairwise canonical compose/decompose.
   g_unichar_compose() tries to combine the pair A, B into a single
   character stored in *CH.  g_unichar_decompose() performs one
   decomposition step, splitting CH into *A and *B; for a decomposition
   that yields a single character, *B is set to zero.
   Both return TRUE on success and FALSE when no (de)composition
   exists for the given input. */
gboolean g_unichar_compose (gunichar a,
gunichar b,
gunichar *ch);
gboolean g_unichar_decompose (gunichar ch,
gunichar *a,
gunichar *b);
/* Compute canonical ordering of a string in-place. This rearranges
decomposed characters in the string according to their combining
classes. See the Unicode manual for more information. */
void g_unicode_canonical_ordering (gunichar *string,
gsize len);
/* Compute canonical decomposition of a character. Returns g_malloc()d
string of Unicode characters. RESULT_LEN is set to the resulting
length of the string. */

View File

@ -152,7 +152,7 @@ decompose_hangul (gunichar s,
r[1] = V;
}
if (T != TBase)
if (T != TBase)
{
if (r)
r[2] = T;
@ -530,3 +530,190 @@ g_utf8_normalize (const gchar *str,
return result;
}
/* Performs one algorithmic decomposition step on a precomposed Hangul
 * syllable.
 *
 * An LVT syllable is split into its LV syllable and trailing jamo T;
 * an LV syllable is split into its leading jamo L and vowel jamo V.
 * The two halves are stored in *a and *b.
 *
 * Returns FALSE, leaving *a and *b untouched, when ch lies outside the
 * SBase .. SBase + SCount - 1 syllable range; TRUE otherwise.
 */
static gboolean
decompose_hangul_step (gunichar  ch,
                       gunichar *a,
                       gunichar *b)
{
  gint syllable_index = ch - SBase;
  gint lead_index, vowel_index, trail_index;

  if (syllable_index < 0 || syllable_index >= SCount)
    return FALSE;  /* not a hangul syllable */

  lead_index = syllable_index / NCount;
  vowel_index = (syllable_index % NCount) / TCount;
  trail_index = syllable_index % TCount;

  if (trail_index == 0)
    {
      /* no trailing consonant: split LV -> L,V */
      *a = LBase + lead_index;
      *b = VBase + vowel_index;
      return TRUE;
    }

  /* trailing consonant present: split LVT -> LV,T */
  *a = SBase + (lead_index * VCount + vowel_index) * TCount;
  *b = TBase + trail_index;

  return TRUE;
}
/* Performs one algorithmic composition step for Hangul.
 *
 * Two pairings are recognised:
 *   - a leading jamo L followed by a vowel jamo V gives the LV syllable;
 *   - an LV syllable followed by a trailing jamo T (other than TBase
 *     itself) gives the LVT syllable.
 *
 * On success the result is stored in *ch and TRUE is returned.
 * Otherwise FALSE is returned and *ch is not written.
 */
static gboolean
compose_hangul_step (gunichar  a,
                     gunichar  b,
                     gunichar *ch)
{
  gint lead_index = a - LBase;
  gint syllable_index = a - SBase;

  /* L,V -> LV */
  if (lead_index >= 0 && lead_index < LCount)
    {
      gint vowel_index = b - VBase;

      if (vowel_index >= 0 && vowel_index < VCount)
        {
          *ch = SBase + (lead_index * VCount + vowel_index) * TCount;
          return TRUE;
        }
    }

  /* LV,T -> LVT; only syllables without a trailing jamo qualify as LV */
  if (syllable_index >= 0 && syllable_index < SCount &&
      (syllable_index % TCount) == 0)
    {
      gint trail_index = b - TBase;

      if (trail_index > 0 && trail_index < TCount)
        {
          *ch = a + trail_index;
          return TRUE;
        }
    }

  return FALSE;
}
/**
 * g_unichar_decompose:
 * @ch: a Unicode character
 * @a: return location for the first component of @ch
 * @b: return location for the second component of @ch
 *
 * Performs a single decomposition step of the
 * Unicode character normalization algorithm.
 *
 * Compatibility decompositions are not considered. Algorithmic
 * Hangul Jamo decomposition is included, and so are 'singleton'
 * decompositions, which replace a character by a single other
 * character; for those, *@b is set to zero.
 *
 * If @ch cannot be decomposed, @a and @b are left unmodified.
 *
 * Returns: %TRUE if the character could be decomposed
 *
 * Since: 2.30
 */
gboolean
g_unichar_decompose (gunichar  ch,
                     gunichar *a,
                     gunichar *b)
{
  gint lo, hi;

  /* Hangul syllables are handled arithmetically rather than by lookup. */
  if (decompose_hangul_step (ch, a, b))
    return TRUE;

  /* Binary search over decomp_step_table, which is emitted in
   * ascending order of its ch field. */
  lo = 0;
  hi = G_N_ELEMENTS (decomp_step_table);

  while (lo < hi)
    {
      gint mid = lo + (hi - lo) / 2;
      const decomposition_step *entry = &decomp_step_table[mid];

      if (ch < entry->ch)
        hi = mid;
      else if (ch > entry->ch)
        lo = mid + 1;
      else
        {
          *a = entry->a;
          *b = entry->b;
          return TRUE;
        }
    }

  return FALSE;
}
/**
 * g_unichar_compose:
 * @a: a Unicode character
 * @b: a Unicode character
 * @ch: return location for the composed character
 *
 * Performs a single composition step of the
 * Unicode character normalization algorithm.
 *
 * This function includes algorithmic Hangul Jamo composition
 * (L,V to LV and LV,T to LVT), but does not include compatibility
 * compositions. It is not an exact inverse of
 * g_unichar_decompose(): 'singleton' decompositions, which replace
 * a character by a single other character, are left out of the
 * composition table, so passing zero for @b never yields a
 * composition.
 *
 * If @a and @b cannot be composed, @ch is left unmodified.
 *
 * Returns: %TRUE if the characters could be composed
 *
 * Since: 2.30
 */
gboolean
g_unichar_compose (gunichar  a,
                   gunichar  b,
                   gunichar *ch)
{
  gint start = 0;
  gint end = G_N_ELEMENTS (comp_step_table);

  /* Hangul syllables are composed arithmetically, not via the table. */
  if (compose_hangul_step (a, b, ch))
    return TRUE;

  /* comp_step_table is sorted by its a field, then by its b field, so
   * a binary search keyed on the (a, b) pair finds the entry if any. */
  while (start < end)
    {
      gint half = start + (end - start) / 2;
      const decomposition_step *p = &comp_step_table[half];

      if (a == p->a && b == p->b)
        {
          *ch = p->ch;
          return TRUE;
        }
      else if (a > p->a || (a == p->a && b > p->b))
        start = half + 1;
      else
        end = half;
    }

  return FALSE;
}

File diff suppressed because it is too large Load Diff